In [41]:
#Load the data
#
# Every non-blank line of "reviews.jl" is parsed as one JSON object; its
# "text" field is collected as a sample and its "rating" field as the
# matching label.

import json

X_text = []
y_text = []

# Decode explicitly as UTF-8 (the encoding JSON text is specified to use) so
# the result does not depend on the platform's default encoding, and stream
# the file line by line instead of holding a second copy of it in memory.
with open("reviews.jl", encoding="utf-8") as f:
    for line in f:

        # Skip blank lines (e.g. an empty line at the end of the file),
        # which json.loads would reject.
        if not line.strip():
            continue

        parsed_json = json.loads(line)

        X_text.append(parsed_json["text"])
        y_text.append(parsed_json["rating"])

print("Size of the dataset: {}".format(len(X_text)))

Size of the dataset: 11147


In [42]:
#Reduce the classes from 5 to 2
#
# Labels "1", "2" and "3" are mapped to "1-3"; every other label is mapped
# to "4-5".

# "1-3" is listed as well so that re-running this cell on labels that were
# already reduced leaves them unchanged instead of turning all of them
# into "4-5".
low_labels = ("1", "2", "3", "1-3")

# Snapshot of the labels as they were when this cell started.
y_text_old = y_text.copy()

y_text = ["1-3" if label in low_labels else "4-5" for label in y_text_old]

In [43]:
#Encode the labels
#
# Fit a LabelEncoder on the reduced labels and list the classes it found.

from sklearn.preprocessing import LabelEncoder

# fit() hands back the fitted encoder, so creation and fitting are chained.
le = LabelEncoder().fit(y_text)

print("Classes: {}".format(", ".join(le.classes_)))

Classes: 1-3, 4-5


In [44]:
#Create training and testing set
#
# Hold out a `test_size` fraction of the samples for testing; random_state
# is fixed so the same split is produced on every run.

from sklearn.model_selection import train_test_split

test_size = 0.1

X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_text, y_text, test_size=test_size, random_state=0)

# Report the size of each of the four parts.
for caption, part in (("X train", X_train_text),
                      ("y train", y_train_text),
                      ("X test", X_test_text),
                      ("y test", y_test_text)):
    print("{} size: \t{}".format(caption, len(part)))

X train size: 	10032
y train size: 	10032
X test size: 	1115
y test size: 	1115


In [45]:
#Create the pipeline
#
# Three steps, addressed later by their names:
#   'vect'  - CountVectorizer
#   'tfidf' - TfidfTransformer
#   'clf'   - SGDClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# SGD training is stochastic; the seed is fixed (same value as the one used
# for the train/test split) so that the scores are reproducible across runs.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(random_state=0)),])

In [46]:
#Fit the classifier
#
# Train the whole pipeline on the training texts, with the labels passed
# through the fitted label encoder.

text_clf = text_clf.fit(X=X_train_text,
                        y=le.transform(y_train_text))

In [47]:
#Make prediction on the test set
y_pred = text_clf.predict(X_test_text)

# Turn the predicted codes back into their text labels.
y_pred_text = le.inverse_transform(y_pred)

#Print comparison of first 15 elements predicted and true
print("Prediction:\t{}".format(", ".join(y_pred_text[:15])))
print("True values:\t{}".format(", ".join(y_test_text[:15])))

Prediction:	1-3, 4-5, 4-5, 4-5, 4-5, 1-3, 1-3, 4-5, 4-5, 4-5, 4-5, 4-5, 4-5, 1-3, 1-3
True values:	1-3, 1-3, 4-5, 4-5, 4-5, 1-3, 1-3, 4-5, 4-5, 1-3, 4-5, 4-5, 4-5, 1-3, 1-3


In [48]:
#Compute F1-score, Precision and Recall
#
# average=None is passed to every metric; the resulting arrays are indexed
# per class when the results are printed.

from sklearn.metrics import f1_score, precision_score, recall_score


y_true = le.transform(y_test_text)

# Same call for each metric, evaluated in the order precision, recall, F1.
precision, recall, f1_s = (
    score(y_true, y_pred, average=None)
    for score in (precision_score, recall_score, f1_score)
)

In [49]:
#Analyze results
#
# One tab-separated row per class: label, F1-score, precision, recall.

print("Label\tF1-score\tPrecision\tRecall")
for i, class_name in enumerate(le.classes_):
    scores = (f1_s[i], precision[i], recall[i])
    print("{}-Star\t{:.4f}\t{:.4f}\t{:.4f}".format(class_name, *scores))

Label	F1-score	Precision	Recall
1-3-Star	0.7650	0.8342	0.7064
4-5-Star	0.8502	0.8075	0.8977


In [53]:
#Tune the classifier
#
# Grid search over settings of the three pipeline steps; the keys are
# "<step name>__<parameter>". Each of the five parameters has three
# candidate values, i.e. 3**5 = 243 combinations, so this cell is slow.

from sklearn.model_selection import GridSearchCV

param = {
    'vect__ngram_range': [(1, 2), (1, 3), (1, 4)],
    'vect__strip_accents': ('ascii', 'unicode', None),
    'tfidf__norm': ('l1', 'l2', None),
    'clf__alpha': (0.0001, 0.0003, 0.001),
    'clf__loss': ('hinge', 'squared_hinge', 'perceptron'),
}

# n_jobs=-1: run the candidate fits in parallel.
gs_clf = GridSearchCV(text_clf, param, n_jobs=-1)

text_clf_gs = gs_clf.fit(X=X_train_text,
                         y=le.transform(y_train_text))

# Predict on the held-out test set with the fitted search object.
y_pred_gs = text_clf_gs.predict(X_test_text)

#Compute F1-score, Precision and Recall

from sklearn.metrics import f1_score, precision_score, recall_score

y_true = le.transform(y_test_text)

# Same call for each metric, evaluated in the order precision, recall, F1.
precision_gs, recall_gs, f1_s_gs = (
    score(y_true, y_pred_gs, average=None)
    for score in (precision_score, recall_score, f1_score)
)

#Analyze results
#
# One tab-separated row per class: label, F1-score, precision, recall.

print("Label\tF1-score\tPrecision\tRecall")
for i, class_name in enumerate(le.classes_):
    scores_gs = (f1_s_gs[i], precision_gs[i], recall_gs[i])
    print("{}-Star\t{:.4f}\t{:.4f}\t{:.4f}".format(class_name, *scores_gs))

# Best Parameters

print("\nBest parameters:")
print(text_clf_gs.best_params_)

Label	F1-score	Precision	Recall
1-3-Star	0.7973	0.8635	0.7404
4-5-Star	0.8696	0.8287	0.9147

Best parameters:
{'clf__alpha': 0.0001, 'clf__loss': 'hinge', 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 3), 'vect__strip_accents': None}
