In [1]:
#Load the data

import json

X_text = []
y_text = []

with open("reviews.jl") as f:
    lines = f.readlines()

for line in lines:

    parsed_json = json.loads(line)

    X_text.append(parsed_json["text"])
    y_text.append(parsed_json["rating"])

print("Size of the dataset: {}".format(len(X_text)))

Size of the dataset: 11147


In [2]:
#View random records

print("RATING: {}\nTEXT: {}\n".format(y_text[3],X_text[3]))
print("RATING: {}\nTEXT: {}\n".format(y_text[20],X_text[20]))
print("RATING: {}\nTEXT: {}\n".format(y_text[450],X_text[450]))

RATING: 5
TEXT: e' un ottimo traduttore. Eccezzionale il fatto di poterci scrivere in cinese. se si e' spesso in cina e' indispensabile.

RATING: 4
TEXT: Non ho dato il massimo di valutazione perchè non ho ancora fatto il backup del vecchio hard disk e quindi non l'ho messo alla prova. Devo soltanto dire che la consegna è stata perfetta, come sempre, rispetta al massimo la descrizione del prodotto e poi devo aggiungere che difficilmente potrò dare un giudizio negativo,perchè il samsung SSD 850 EVO è uno dei migliori. A lavoro finito ,nel caso dovessi trovare dei difetti, aggiungerò immediatamente le note negative.

RATING: 3
TEXT: Non ha la stessa resa che ha il "leggi tutto" che tutti conosciamo....VLC. Spero che ci siano aggiornamenti che lo rendano piu fruibile. Per ora poco utilizzo su Fire 2015. Confido le cose cambino presto!!



In [3]:
#Encode the labels

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(y_text)
    
print("Classes: " + ", ".join(le.classes_) + "\n")

print("Label\tType Label\t\tTransformed\tType Transformed")

for c in le.classes_:
    print("{}\t{}\t\t{}\t\t{}".format(c,
                                  type(str(c)),
                                  le.transform([str(c)])[0],
                                  type(le.transform([str(c)])[0])))

Classes: 1, 2, 3, 4, 5

Label	Type Label		Transformed	Type Transformed
1	<class 'str'>		0		<class 'numpy.int64'>
2	<class 'str'>		1		<class 'numpy.int64'>
3	<class 'str'>		2		<class 'numpy.int64'>
4	<class 'str'>		3		<class 'numpy.int64'>
5	<class 'str'>		4		<class 'numpy.int64'>


In [4]:
#Create training and testing set

from sklearn.model_selection import train_test_split

test_size = 0.1

X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(X_text,
                                                                        y_text,
                                                                        test_size=test_size,
                                                                        random_state=0)

print("X train size: \t{}".format(len(X_train_text)))
print("y train size: \t{}".format(len(y_train_text)))
print("X test size: \t{}".format(len(X_test_text)))
print("y test size: \t{}".format(len(y_test_text)))

X train size: 	10032
y train size: 	10032
X test size: 	1115
y test size: 	1115


In [5]:
#Create the pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()),])

In [6]:
#Fit the classifier

text_clf = text_clf.fit(X_train_text, le.transform(y_train_text))

In [7]:
#Make prediction on the test set
y_pred = text_clf.predict(X_test_text)

y_pred_text = le.inverse_transform(y_pred)

#Print comparison of first 15 elements predicted and true
print("Prediction:\t" + ", ".join(y_pred_text[0:15]))
print("True values:\t" + ", ".join(y_test_text[0:15]))

Prediction:	1, 4, 5, 5, 4, 1, 3, 4, 5, 5, 5, 5, 5, 1, 1
True values:	1, 3, 4, 5, 4, 3, 3, 5, 5, 3, 4, 5, 5, 1, 1


In [8]:
#Compute F1-score for each class

from sklearn.metrics import f1_score

y_true = le.transform(y_test_text)

f1_s = f1_score(y_true, y_pred, average=None)


#Create Random Agent Classifier prediction and F1-score

import random

y_pred_random_agent = []
for _ in range(len(y_pred)):
    y_pred_random_agent += random.choice(le.classes_) 

f1_s_random_agent = f1_score(y_true, le.transform(y_pred_random_agent), average=None)

In [9]:
#Create a similar classifier using an equivalent english dataset

import json

X_text_eng = []
y_text_eng = []

with open("reviews_eng.jl") as f:
    lines = f.readlines()

#Same length of the Italian dataset
for line in lines:

    parsed_json = json.loads(line)

    X_text_eng.append(parsed_json["reviewText"])
    y_text_eng.append(str(parsed_json["overall"])[0])

from sklearn.model_selection import train_test_split

X_train_text_eng, X_test_text_eng, y_train_text_eng, y_test_text_eng =\
    train_test_split(X_text_eng, y_text_eng, test_size=test_size, random_state=0)

from sklearn.preprocessing import LabelEncoder

le_eng = LabelEncoder()

le_eng.fit(y_train_text_eng)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

text_clf_eng = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier()), ])

text_clf_eng = text_clf_eng.fit(X_train_text_eng, le_eng.transform(y_train_text_eng))

#Compiute F1-score of the English classifier
y_pred_eng = text_clf_eng.predict(X_test_text_eng)

y_true_eng = le_eng.transform(y_test_text_eng)

from sklearn.metrics import f1_score

f1_s_eng = f1_score(y_true_eng, y_pred_eng, average=None)

In [10]:
#Create another classifier using an english dataset 10 times bigger than the previous

import json

X_text_eng_big = []
y_text_eng_big = []

with open("reviews_eng_big.jl") as f:
    lines = f.readlines()


for line in lines:

    parsed_json = json.loads(line)

    X_text_eng_big.append(parsed_json["reviewText"])
    y_text_eng_big.append(str(parsed_json["overall"])[0])

from sklearn.model_selection import train_test_split

X_train_text_eng_big, X_test_text_eng_big, y_train_text_eng_big, y_test_text_eng_big =\
    train_test_split(X_text_eng_big, y_text_eng_big, test_size=0.1, random_state=0)

from sklearn.preprocessing import LabelEncoder

le_eng_big = LabelEncoder()

le_eng_big.fit(y_train_text_eng_big)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

text_clf_eng_big = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier()), ])

text_clf_eng_big = text_clf_eng_big.fit(X_train_text_eng_big, le_eng_big.transform(y_train_text_eng_big))

#Compiute F1-score of the Big English classifier
y_pred_eng_big = text_clf_eng_big.predict(X_test_text_eng_big)

y_true_eng_big = le_eng_big.transform(y_test_text_eng_big)

from sklearn.metrics import f1_score

f1_s_eng_big = f1_score(y_true_eng_big, y_pred_eng_big, average=None)

In [11]:
#Tune the classifier

from sklearn.model_selection import GridSearchCV

param = {'vect__ngram_range': [(1, 2), (1, 3), (1, 4)],
         'vect__strip_accents': ('ascii', 'unicode', None),
         'tfidf__norm': ('l1', 'l2', None),
         'clf__alpha': (0.0001, 0.0003, 0.001),
         'clf__loss': ('hinge', 'squared_hinge', 'perceptron')}

gs_clf = GridSearchCV(text_clf, param, n_jobs=-1)

text_clf_gs = gs_clf.fit(X_train_text, le.transform(y_train_text))

y_pred_gs = text_clf_gs.predict(X_test_text)

#Compute F1-score, Precision ans Recall

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score

y_true = le.transform(y_test_text)

precision_gs = precision_score(y_true, y_pred_gs, average=None)

recall_gs = recall_score(y_true, y_pred_gs, average=None)

f1_s_gs = f1_score(y_true, y_pred_gs, average=None)

In [12]:
print("\nBest parameters:")
print(text_clf_gs.best_params_)


Best parameters:
{'clf__alpha': 0.0003, 'clf__loss': 'hinge', 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 3), 'vect__strip_accents': None}


In [14]:
#Compare results

print("Label\tF1 Italian\tF1 Tuned\tF1 Random\tF1 English\tF1 English Big")
for i in range(len(le.classes_)):
    print("{}-Star\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}".format(le.classes_[i],
                                                                   f1_s[i],
                                                                   f1_s_gs[i],
                                                                   f1_s_random_agent[i],
                                                                   f1_s_eng[i],
                                                                   f1_s_eng_big[i]))

Label	F1 Italian	F1 Tuned	F1 Random	F1 English	F1 English Big
1-Star	0.5269	0.5706	0.1463	0.6096	0.6058
2-Star	0.3226	0.2766	0.1689	0.1395	0.1163
3-Star	0.3497	0.4387	0.2060	0.2819	0.2641
4-Star	0.3609	0.3687	0.2222	0.4603	0.3038
5-Star	0.6443	0.6564	0.2820	0.7227	0.6922


In [15]:
#create confusion matrix

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred)

print("\t" + "-Star\t".join(le.classes_) + "-Star")
for i in range(len(le.classes_)):
    print(le.classes_[i] + "-Star\t" + "\t".join(str(x) for x in cm[i]))

	1-Star	2-Star	3-Star	4-Star	5-Star
1-Star	88	19	6	12	27
2-Star	46	35	17	15	22
3-Star	29	13	50	39	52
4-Star	9	5	22	85	151
5-Star	10	10	8	48	297


In [16]:
#Compute precision and recall for the classes

from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_true, y_pred, average=None)

recall = recall_score(y_true, y_pred, average=None)

print("Label\tPrecision\tRecall")
for i in range(len(le.classes_)):
    print("{}-Star\t{:.2f}\t{:.2f}".format(le.classes_[i],
                                              precision[i],
                                              recall[i]))

Label	Precision	Recall
1-Star	0.48	0.58
2-Star	0.43	0.26
3-Star	0.49	0.27
4-Star	0.43	0.31
5-Star	0.54	0.80
