In [67]:
import json
import random
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.svm as svm
import sklearn.metrics

In [2]:
with open("imdb_train.json") as f:
    data = json.load(f)
random.shuffle(data)
print(data[0])

{'class': 'pos', 'text': "almost every review of this movie I'd seen was pretty bad. It's not pretty bad, it's actually pretty good, though not great. The Judy Garland character could have gotten annoying, but she didn't allow it to. Somewhere along the line, i've become a fan of brooding, overbearing, overacting Van Heflin, at least in the early 40's. Judy's singing is great, but the film missed a great chance by not showing more of their relationship. I gave it a 7."}


In [3]:
texts =[text["text"] for text in data]
labels =[label["class"] for label in data]

# 1. countVectorizer/tfidfVectorizer

In [4]:
countVectorizer = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,1))
tfidfVectorizer = TfidfVectorizer(max_features=100000, binary=True, ngram_range=(1,1))

count_matrix = countVectorizer.fit_transform(texts)
tfidf_matrix = tfidfVectorizer.fit_transform(texts)

In [5]:
features = {"CountVectorizer": countVectorizer.get_feature_names()[:15], "TfidfVectorizer": tfidfVectorizer.get_feature_names()[:15]}
features_df = pd.DataFrame.from_dict(features)

In [6]:
features_df

Unnamed: 0,CountVectorizer,TfidfVectorizer
0,00,00
1,000,000
2,0000000000001,0000000000001
3,00001,00001
4,00015,00015
5,000s,000s
6,001,001
7,003830,003830
8,006,006
9,007,007


In [21]:
train_texts, dev_texts, train_labels, dev_labels = train_test_split(texts, labels, test_size=0.2)

In [22]:
count_matrix_train = countVectorizer.fit_transform(train_texts)
count_matrix_dev = countVectorizer.transform(dev_texts)
tfidf_matrix_train = tfidfVectorizer.fit_transform(train_texts)
tfidf_matrix_dev = tfidfVectorizer.transform(dev_texts)

In [23]:
print(count_matrix_train.shape)
print(count_matrix_dev.shape)
print("--")
print(tfidf_matrix_train.shape)
print(tfidf_matrix_dev.shape)

(20000, 68377)
(5000, 68377)
--
(20000, 68377)
(5000, 68377)


In [90]:
count_classifier = svm.LinearSVC(C=0.005, verbose=1)
count_classifier.fit(count_matrix_train, train_labels)

tfidf_classifier = svm.LinearSVC(C=0.05, verbose=1)
tfidf_classifier.fit(tfidf_matrix_train, train_labels)

[LibLinear][LibLinear]

LinearSVC(C=0.05, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=1)

In [91]:
print("Count dev:", count_classifier.score(count_matrix_dev, dev_labels))
print("Count train:", count_classifier.score(count_matrix_train, train_labels))
print("---------")
print("Tfidf dev:", tfidf_classifier.score(tfidf_matrix_dev, dev_labels))
print("Tfidf train:", tfidf_classifier.score(tfidf_matrix_train, train_labels))

Count dev: 0.8834
Count train: 0.95565
---------
Tfidf dev: 0.8834
Tfidf train: 0.92275


In [92]:
predictions_count_dev = count_classifier.predict(count_matrix_dev)
print(predictions_count_dev)
print(sklearn.metrics.confusion_matrix(dev_labels, predictions_count_dev))
print(sklearn.metrics.accuracy_score(dev_labels, predictions_count_dev))

['neg' 'pos' 'neg' ... 'neg' 'pos' 'pos']
[[2175  320]
 [ 263 2242]]
0.8834


In [93]:
predictions_tfidf_dev = tfidf_classifier.predict(tfidf_matrix_dev)
print(predictions_tfidf_dev)
print(sklearn.metrics.confusion_matrix(dev_labels, predictions_tfidf_dev))
print(sklearn.metrics.accuracy_score(dev_labels, predictions_tfidf_dev))

['neg' 'pos' 'neg' ... 'pos' 'pos' 'pos']
[[2141  354]
 [ 229 2276]]
0.8834


# 2. CountVectorizer Ngram

In [100]:
def count_vectorizer(ngram=(1,1), c=0.005):
    countVectorizer = CountVectorizer(max_features=100000, binary=True, ngram_range=ngram)
    count_matrix = countVectorizer.fit_transform(texts)
    train_texts, dev_texts, train_labels, dev_labels = train_test_split(texts, labels, test_size=0.2)
    count_matrix_train = countVectorizer.fit_transform(train_texts)
    count_matrix_dev = countVectorizer.transform(dev_texts)
    count_classifier = svm.LinearSVC(C=c, verbose=1)
    count_classifier.fit(count_matrix_train, train_labels)
    print("Count dev:", count_classifier.score(count_matrix_dev, dev_labels))
    print("Count train:", count_classifier.score(count_matrix_train, train_labels))
    print("---------")
    predictions_count_dev = count_classifier.predict(count_matrix_dev)
    print(predictions_count_dev)
    print(sklearn.metrics.confusion_matrix(dev_labels, predictions_count_dev))
    print(sklearn.metrics.accuracy_score(dev_labels, predictions_count_dev))

In [101]:
count_vectorizer(ngram=(1,2), c=0.005)

[LibLinear]Count dev: 0.8982
Count train: 0.9948
---------
['neg' 'pos' 'neg' ... 'neg' 'neg' 'neg']
[[2227  269]
 [ 240 2264]]
0.8982


In [102]:
count_vectorizer(ngram=(2,2), c=0.005)

[LibLinear]Count dev: 0.8696
Count train: 0.99155
---------
['neg' 'neg' 'pos' ... 'neg' 'pos' 'pos']
[[2118  360]
 [ 292 2230]]
0.8696


In [103]:
count_vectorizer(ngram=(2,3), c=0.005)

[LibLinear]Count dev: 0.8746
Count train: 0.9943
---------
['neg' 'neg' 'neg' ... 'pos' 'neg' 'neg']
[[2119  345]
 [ 282 2254]]
0.8746


In [104]:
count_vectorizer(ngram=(3,3), c=0.005)

[LibLinear]Count dev: 0.8266
Count train: 0.9756
---------
['pos' 'pos' 'neg' ... 'pos' 'pos' 'pos']
[[1980  476]
 [ 391 2153]]
0.8266


# 3. Language recognition SVM