# Scikit Learn

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## <span style='background :yellow'>Import dei dati

In [2]:
df = pd.read_json('Dataset/Books_small_10000.json', lines=True)
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1F2H80A1ZNN1N,B00GDM3NQC,Connie Correll,"[0, 0]","I bought both boxed sets, books 1-5. Really a...",5,Can't stop reading!,1390435200,"01 23, 2014"
1,AI3DRTKCSK4KX,B00A5MREAM,Grandma,"[0, 0]",I enjoyed this short book. But it was way way ...,3,A leaf on the wind of all hallows,1399593600,"05 9, 2014"
2,A3KAKFHY9DAC8A,0446547573,"toobusyreading ""Inspired Kathy""","[1, 1]",I love Nicholas Sparks. I&#8217;ve read everyt...,4,Great writing from Nicholas Sparks.,1404518400,"07 5, 2014"
3,ATYBCYD6BIXVL,0955809215,Chrissie,"[0, 0]",I really enjoyed this adventure and look forwa...,4,great,1389225600,"01 9, 2014"
4,A17K95SEU3J68U,0991500776,"Sirde ""artist761""","[0, 0]",It was a decent read.. typical story line. Not...,3,It was a decent read.. typical story line ...,1404864000,"07 9, 2014"
...,...,...,...,...,...,...,...,...,...
9995,A32FV8MF6KTRYE,0615676863,S.Wilson,"[0, 0]",The whole series was great! Melody is a fanta...,5,Great,1389830400,"01 16, 2014"
9996,A2IYZ41783TX4R,B00GG2DWO2,"Lynn Demsky ""Lynn/MI""","[0, 0]",I didn't thing that much of this book. I am a...,3,an anchor might have helped....,1391731200,"02 7, 2014"
9997,A1GHQOXZME7VLL,1482012294,CABorgford,"[0, 0]",It is an emotional TRIP to the past with Trip ...,5,Must Read Series - Timing is Everything,1388880000,"01 5, 2014"
9998,A37ZXVSFNM1UAZ,1481978063,"V. Jones ""Phoenix Rising""","[0, 0]",This definitely got under my veins whereby I h...,5,This story grew on me!,1392076800,"02 11, 2014"


## <span style='background :yellow'>Creazione di una nuova colonna
Sulla base del numero di stelle (1 a 5), viene creata la colonna **sentiment** che indica se la recensione è positiva, negativa o neutrale

In [3]:
# creo funzione da applicare
def set_sentiment(x):
    if x <= 2:
        return 'NEGATIVE'
    elif x == 3:
        return 'NEUTRAL'
    else:
        return 'POSITIVE'

df['sentiment'] = df['overall'].apply(lambda x: set_sentiment(x))
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
0,A1F2H80A1ZNN1N,B00GDM3NQC,Connie Correll,"[0, 0]","I bought both boxed sets, books 1-5. Really a...",5,Can't stop reading!,1390435200,"01 23, 2014",POSITIVE
1,AI3DRTKCSK4KX,B00A5MREAM,Grandma,"[0, 0]",I enjoyed this short book. But it was way way ...,3,A leaf on the wind of all hallows,1399593600,"05 9, 2014",NEUTRAL
2,A3KAKFHY9DAC8A,0446547573,"toobusyreading ""Inspired Kathy""","[1, 1]",I love Nicholas Sparks. I&#8217;ve read everyt...,4,Great writing from Nicholas Sparks.,1404518400,"07 5, 2014",POSITIVE
3,ATYBCYD6BIXVL,0955809215,Chrissie,"[0, 0]",I really enjoyed this adventure and look forwa...,4,great,1389225600,"01 9, 2014",POSITIVE
4,A17K95SEU3J68U,0991500776,"Sirde ""artist761""","[0, 0]",It was a decent read.. typical story line. Not...,3,It was a decent read.. typical story line ...,1404864000,"07 9, 2014",NEUTRAL
...,...,...,...,...,...,...,...,...,...,...
9995,A32FV8MF6KTRYE,0615676863,S.Wilson,"[0, 0]",The whole series was great! Melody is a fanta...,5,Great,1389830400,"01 16, 2014",POSITIVE
9996,A2IYZ41783TX4R,B00GG2DWO2,"Lynn Demsky ""Lynn/MI""","[0, 0]",I didn't thing that much of this book. I am a...,3,an anchor might have helped....,1391731200,"02 7, 2014",NEUTRAL
9997,A1GHQOXZME7VLL,1482012294,CABorgford,"[0, 0]",It is an emotional TRIP to the past with Trip ...,5,Must Read Series - Timing is Everything,1388880000,"01 5, 2014",POSITIVE
9998,A37ZXVSFNM1UAZ,1481978063,"V. Jones ""Phoenix Rising""","[0, 0]",This definitely got under my veins whereby I h...,5,This story grew on me!,1392076800,"02 11, 2014",POSITIVE


## <span style='background :yellow'>Eliminazione delle colonne superflue

In [4]:
df = df[['reviewText','sentiment']]
df

Unnamed: 0,reviewText,sentiment
0,"I bought both boxed sets, books 1-5. Really a...",POSITIVE
1,I enjoyed this short book. But it was way way ...,NEUTRAL
2,I love Nicholas Sparks. I&#8217;ve read everyt...,POSITIVE
3,I really enjoyed this adventure and look forwa...,POSITIVE
4,It was a decent read.. typical story line. Not...,NEUTRAL
...,...,...
9995,The whole series was great! Melody is a fanta...,POSITIVE
9996,I didn't thing that much of this book. I am a...,NEUTRAL
9997,It is an emotional TRIP to the past with Trip ...,POSITIVE
9998,This definitely got under my veins whereby I h...,POSITIVE


## <span style='background :yellow'>Split del dataframe in train e test

In [5]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df,test_size=0.33, random_state=42)

In [6]:
train # (67% del dataframe viene utilizzato per allenare il modello)

Unnamed: 0,reviewText,sentiment
8371,Olivia Hampton arrives at the Dunraven family ...,POSITIVE
5027,"Perhaps one of the funniest, yet saddest stori...",POSITIVE
9234,One of Francine Rivers best series books!,POSITIVE
3944,It was just one of those books that never went...,NEGATIVE
6862,This was a real pleasure. I really like that ...,POSITIVE
...,...,...
5734,These kids had to grow up fast due to the moth...,POSITIVE
5191,This book described the development of the tec...,POSITIVE
5390,"I've hardly ever given five stars, as I had be...",POSITIVE
860,When I was reading the first series of &#34;Th...,POSITIVE


In [7]:
test  # (33% del dataframe viene utilizzato per allenare il modello)

Unnamed: 0,reviewText,sentiment
6252,was sent an Arc of this book for an honest rev...,POSITIVE
4684,Yet another wonderful book by Elle. I really h...,POSITIVE
1731,Well written and easy to listen to while drivi...,POSITIVE
4742,Unknown Seas tells the story of exploration fr...,POSITIVE
4521,"Westlake written, you will be surprised at who...",POSITIVE
...,...,...
1744,Loved it!!! too bad it was short. can't wait f...,POSITIVE
9754,I cannot say enough good about this book. I wa...,POSITIVE
6094,I enjoyed the story line and writing style of ...,POSITIVE
8781,The plot was believable and fast moving. The w...,POSITIVE


## <span style='background :yellow'>Scelta di un numero uguale di commenti positivi e negativi
Dato che il numero di commenti positivi era molto maggiore rispetto a quelli negativi, si è deciso di prendere solo un numero di commenti positivi pari a quello dei negativi, in modo da allenare il modello in maniera migliore

### <span style='background :yellow'>Train


In [8]:
# numero di commenti positivi
number_of_positive_reviews = train[train['sentiment'] == 'POSITIVE'].shape[0]
number_of_positive_reviews

5611

In [9]:
# numero di commenti negativi
number_of_negative_reviews = train[train['sentiment'] == 'NEGATIVE'].shape[0]
number_of_negative_reviews

436

In [10]:
train_negative = train[train['sentiment'] == 'NEGATIVE'] # prendo tutte le righe dei commenti negativi
train_positive = train[train['sentiment'] == 'POSITIVE'].iloc[:number_of_negative_reviews] # prendo solo le prime n righe dei commenti positivi (n = numero di commenti negativi)

In [11]:
# unione e shuffle delle recensioni positive e negative
train = pd.concat([train_positive, train_negative])
train = train.sample(n=train.shape[0])
train

Unnamed: 0,reviewText,sentiment
9387,I read the first two books and then bought thi...,NEGATIVE
2797,I was confused while reading this book. I kep...,NEGATIVE
3661,WHAT I READI loved it as with any of arials b...,POSITIVE
7124,Moon Wreck - Secrets of Ceres follows the main...,NEGATIVE
8498,I just finished reading Chained by S.K. Fordha...,POSITIVE
...,...,...
3977,I bought this and it was OK but I lost patienc...,NEGATIVE
796,The only thing that kept me reading this book ...,NEGATIVE
2992,"I found the first book, The Fallen Star, to be...",NEGATIVE
3247,I did not like this book. It started out good...,NEGATIVE


### <span style='background :yellow'>Test
Eseguo la stessa procedura per la parte destinata al test

In [12]:
# numero di commenti positivi
number_of_positive_reviews = test[test['sentiment'] == 'POSITIVE'].shape[0]
number_of_positive_reviews

2767

In [13]:
# numero di commenti negativi
number_of_negative_reviews = test[test['sentiment'] == 'NEGATIVE'].shape[0]
number_of_negative_reviews

208

In [14]:
test_negative = test[test['sentiment'] == 'NEGATIVE'] # prendo tutte le righe dei commenti negativi
test_positive = test[test['sentiment'] == 'POSITIVE'].iloc[:number_of_negative_reviews] # prendo solo le prime n righe dei commenti positivi (n = numero di commenti negativi)

In [15]:
# unione e shuffle delle recensioni positive e negative
test = pd.concat([test_positive, test_negative])
test = test.sample(n=test.shape[0])
test

Unnamed: 0,reviewText,sentiment
5170,You can find all this info out in one well wri...,NEGATIVE
4819,Recommended by a friend as &#34;one of the bes...,POSITIVE
9644,"This book was really helpful, there is nothing...",POSITIVE
2545,My 13 yr old daughter wrote the author this em...,POSITIVE
871,While the attempt to write something new and w...,NEGATIVE
...,...,...
487,This was only my second book by R.L. Mathewson...,POSITIVE
9001,The title initially refers to the ranking syst...,POSITIVE
9189,I love HM Ward and I love the Fero boys. But b...,POSITIVE
5331,I could not really get into this book... his w...,NEGATIVE


## <span style='background :yellow'>Split in features e label

In [16]:
X_train = train['reviewText'] # feature
y_train = train['sentiment'] # label

X_test = test['reviewText'] # feature
y_test = test['sentiment'] # label

## <span style='background :yellow'>Creazione delle bag of words

In [17]:
# Da ricerca su internet TF-IDF risulta molto migliore in quanto non si focalizza soltanto sulla frequenza delle parole
# presenti ma attribuisce anche una certa importanza ad esse

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

X_train_counts = vectorizer.fit_transform(X_train) # fit e trasformazione
X_test_counts = vectorizer.transform(X_test) # fit è già stato eseguito, si procede soltanto alla trasformazione

# uguale a sopra ma con due step 
# vectorizer.fit(X)
# X_train_counts = vectorizer.transform(X_train)

In [18]:
vectorizer.get_feature_names_out() # tutte le parole che vengono considerate

array(['00', '000', '01', ..., 'zombies', 'zone', 'zora'], dtype=object)

In [19]:
X_train_counts.shape # come ci si aspetta la variabile è una matrice di 872 righe e 8906 colonne (che sono le parole considerate)

(872, 8906)

In [20]:
print(X_train.iloc[0]) # prima riga della variabile X_train_counts
X_train_counts[0].toarray() # conversione della riga sopra in parole con relativo valore assegnato

I read the first two books and then bought this one. Really wish I hadn't. It has so many mistakes and something will be mentioned and then it's like it wasn't mentioned. It's like the author forgot she wrote something and then talks about it and it's totally different from what she already wrote!  I thought the first two read slowly. Meaning you don't really get anywhere and discovered that in this book. I will not buy another one!! This could be a great story if it was told and not dragged out with all the dramatics!!


array([[0., 0., 0., ..., 0., 0., 0.]])

## <span style='background :yellow'>Modelli

### <span style='background :yellow'>Linear SVM

In [21]:
from sklearn.svm import SVC

In [22]:
model_SVM = SVC(kernel='linear')
model_SVM.fit(X_train_counts,y_train)

predictions_SVM = model_SVM.predict(X_test_counts)

### <span style='background :yellow'>Decision tree

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
model_DecisionTreeClassifier = DecisionTreeClassifier()
model_DecisionTreeClassifier.fit(X_train_counts,y_train)

predictions_DecisionTreeClassifier = model_DecisionTreeClassifier.predict(X_test_counts)

### <span style='background :yellow'>Naive Bayes

In [25]:
from sklearn.naive_bayes import GaussianNB

In [26]:
model_GaussianNB = GaussianNB()
model_GaussianNB.fit(X_train_counts.toarray(),y_train)

predictions_GaussianNB = model_GaussianNB.predict(X_test_counts.toarray())

### <span style='background :yellow'>Logistic regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
model_LogisticRegression = LogisticRegression()
model_LogisticRegression.fit(X_train_counts,y_train)

predictions_LogisticRegression = model_LogisticRegression.predict(X_test_counts)

## <span style='background :yellow'>Valutazione dei modelli

In [29]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

### <span style='background :yellow'>Score

In [30]:
score_SVM = accuracy_score(y_test, predictions_SVM) # valutazione accuratezza
score_DecisionTreeClassifier = accuracy_score(y_test, predictions_DecisionTreeClassifier) # valutazione accuratezza
score_GaussianNB = accuracy_score(y_test, predictions_GaussianNB) # valutazione accuratezza
score_LogisticRegression = accuracy_score(y_test, predictions_LogisticRegression) # valutazione accuratezza

print(f'score_SVM: {score_SVM}')
print(f'score_DecisionTreeClassifier: {score_DecisionTreeClassifier}')
print(f'score_GaussianNB: {score_GaussianNB}')
print(f'score_LogisticRegression: {score_LogisticRegression}')

score_SVM: 0.8076923076923077
score_DecisionTreeClassifier: 0.6538461538461539
score_GaussianNB: 0.6610576923076923
score_LogisticRegression: 0.8052884615384616


### <span style='background :yellow'>F1 score

In [31]:
f1_score_SVM = f1_score(y_test, predictions_SVM, average=None, labels=['POSITIVE','NEGATIVE']) # valutazione accuratezza
f1_score_DecisionTreeClassifier = f1_score(y_test, predictions_DecisionTreeClassifier, average=None, labels=['POSITIVE','NEGATIVE']) # valutazione accuratezza
f1_score_GaussianNB = f1_score(y_test, predictions_GaussianNB, average=None, labels=['POSITIVE','NEGATIVE']) # valutazione accuratezza
f1_score_LogisticRegression = f1_score(y_test, predictions_LogisticRegression, average=None, labels=['POSITIVE','NEGATIVE']) # valutazione accuratezza

print(f'f1 score_SVM: {f1_score_SVM}')
print(f'f1 score_DecisionTreeClassifier: {f1_score_DecisionTreeClassifier}')
print(f'f1 score_GaussianNB: {f1_score_GaussianNB}')
print(f'f1 score_LogisticRegression: {f1_score_LogisticRegression}')

f1 score_SVM: [0.80582524 0.80952381]
f1 score_DecisionTreeClassifier: [0.65048544 0.65714286]
f1 score_GaussianNB: [0.65693431 0.66508314]
f1 score_LogisticRegression: [0.80291971 0.80760095]


## <span style='background :yellow'>Test

In [32]:
X_test_set = ['Fantastic bike', 'bad book do not buy', 'horrible waste of time']
X_test_set_count = vectorizer.transform(X_test_set)

model_SVM.predict(X_test_set_count)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype=object)

## <span style='background :yellow'>Tuning del modello tramite GridSearchCV

In [33]:
from sklearn.model_selection import GridSearchCV

### <span style='background :yellow'>Linear SVM

In [34]:
model_SVM_parameters = {'C': (1,2,4,8,16,32), 'kernel': ('linear','rbf','sigmoid')}

model_SVM = SVC()
gridSearch = GridSearchCV(model_SVM, model_SVM_parameters, cv = 5)
gridSearch.fit(X_train_counts, y_train)

In [35]:
pd.DataFrame(gridSearch.cv_results_).sort_values('rank_test_score').head() # ordino da migliore a peggiore e visualizzo i primi 5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.253238,0.023857,0.055471,0.011172,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.88,0.834286,0.781609,0.821839,0.821839,0.827915,0.031533,1
4,0.333962,0.136597,0.079852,0.022355,2,rbf,"{'C': 2, 'kernel': 'rbf'}",0.862857,0.822857,0.787356,0.833333,0.827586,0.826798,0.024144,2
7,0.243249,0.003073,0.059174,0.002326,4,rbf,"{'C': 4, 'kernel': 'rbf'}",0.862857,0.822857,0.787356,0.833333,0.827586,0.826798,0.024144,2
16,0.251683,0.032937,0.063128,0.001266,32,rbf,"{'C': 32, 'kernel': 'rbf'}",0.862857,0.822857,0.787356,0.833333,0.827586,0.826798,0.024144,2
10,0.244465,0.002336,0.058762,0.001714,8,rbf,"{'C': 8, 'kernel': 'rbf'}",0.862857,0.822857,0.787356,0.833333,0.827586,0.826798,0.024144,2


Per quanto riguarda il modello SVM, performa al meglio con **C = 4** e **kernel = rbf**

### <span style='background :yellow'>Decision tree

In [36]:
model_DecisionTreeClassifier_parameters = {'criterion': ('gini','entropy','log_loss'), 'splitter': ('best','random')}

model_DecisionTreeClassifier = DecisionTreeClassifier()
gridSearch = GridSearchCV(model_DecisionTreeClassifier, model_DecisionTreeClassifier_parameters, cv = 5)
gridSearch.fit(X_train_counts, y_train)

In [37]:
pd.DataFrame(gridSearch.cv_results_).sort_values('rank_test_score') # ordino da migliore a peggiore

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.078893,0.003384,0.0,0.0,entropy,random,"{'criterion': 'entropy', 'splitter': 'random'}",0.657143,0.68,0.695402,0.712644,0.649425,0.678923,0.02347,1
4,0.109363,0.009881,0.0,0.0,log_loss,best,"{'criterion': 'log_loss', 'splitter': 'best'}",0.662857,0.662857,0.666667,0.689655,0.66092,0.668591,0.010696,2
2,0.108468,0.008537,0.003325,0.006162,entropy,best,"{'criterion': 'entropy', 'splitter': 'best'}",0.685714,0.651429,0.678161,0.683908,0.643678,0.668578,0.017519,3
5,0.09333,0.016295,0.0,0.0,log_loss,random,"{'criterion': 'log_loss', 'splitter': 'random'}",0.714286,0.634286,0.655172,0.66092,0.678161,0.668565,0.026815,4
0,0.095973,0.007568,0.0002,0.0004,gini,best,"{'criterion': 'gini', 'splitter': 'best'}",0.634286,0.657143,0.62069,0.689655,0.632184,0.646791,0.024477,5
1,0.074998,0.011693,0.003125,0.00625,gini,random,"{'criterion': 'gini', 'splitter': 'random'}",0.708571,0.622857,0.637931,0.614943,0.649425,0.646745,0.033134,6


Per quanto riguarda il modello DecisionTree, performa al meglio con **criterion = gini** e **splitter = random**

### <span style='background :yellow'>Logistic regression

In [38]:
model_LogisticRegression_parameters = {'C': (1,2,4,8,16), 'solver': ('newton-cg','lbfgs','liblinear')}

model_LogisticRegression = LogisticRegression()
gridSearch = GridSearchCV(model_LogisticRegression, model_LogisticRegression_parameters, cv = 5)
gridSearch.fit(X_train_counts, y_train)

In [39]:
pd.DataFrame(gridSearch.cv_results_).sort_values('rank_test_score') # ordino da migliore a peggiore

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,0.057966,0.013808,0.000804,0.000756,4,lbfgs,"{'C': 4, 'solver': 'lbfgs'}",0.868571,0.811429,0.793103,0.833333,0.821839,0.825655,0.025208,1
8,0.01019,0.001471,0.0006,0.00049,4,liblinear,"{'C': 4, 'solver': 'liblinear'}",0.874286,0.811429,0.793103,0.827586,0.821839,0.825649,0.027001,2
9,0.040575,0.005206,0.000998,0.000629,8,newton-cg,"{'C': 8, 'solver': 'newton-cg'}",0.862857,0.805714,0.804598,0.833333,0.816092,0.824519,0.021767,3
10,0.068163,0.01988,0.000798,0.000399,8,lbfgs,"{'C': 8, 'solver': 'lbfgs'}",0.862857,0.805714,0.804598,0.833333,0.816092,0.824519,0.021767,3
11,0.007399,0.000495,0.0006,0.00049,8,liblinear,"{'C': 8, 'solver': 'liblinear'}",0.862857,0.805714,0.804598,0.833333,0.816092,0.824519,0.021767,3
6,0.033985,0.004047,0.0008,0.0004,4,newton-cg,"{'C': 4, 'solver': 'newton-cg'}",0.868571,0.811429,0.793103,0.827586,0.821839,0.824506,0.024962,6
14,0.008198,0.000747,0.0006,0.00049,16,liblinear,"{'C': 16, 'solver': 'liblinear'}",0.862857,0.805714,0.798851,0.833333,0.816092,0.823369,0.02291,7
12,0.036579,0.00186,0.000997,7e-06,16,newton-cg,"{'C': 16, 'solver': 'newton-cg'}",0.862857,0.805714,0.798851,0.827586,0.816092,0.82222,0.022522,8
13,0.122352,0.052358,0.000799,0.000747,16,lbfgs,"{'C': 16, 'solver': 'lbfgs'}",0.862857,0.805714,0.798851,0.827586,0.816092,0.82222,0.022522,8
3,0.042176,0.005152,0.001,0.000633,2,newton-cg,"{'C': 2, 'solver': 'newton-cg'}",0.874286,0.817143,0.781609,0.816092,0.816092,0.821044,0.029846,10


Per quanto riguarda il modello LogisticRegression, performa al meglio con **C = 4** e **solver = newton-cg**