# Scikit Learn

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

## <span style='background :yellow'>Import dei dati</span>

In [2]:
# Empty DataFrame used as the starting value for the combined dataset.
complete_dataframe = pd.DataFrame()

In [3]:
# Load every JSON-lines file found in the 'Dataset' directory into one DataFrame.
# The category label comes from the file name: the text before the first
# underscore, upper-cased (a name starting with 'books_' yields 'BOOKS').
# Entries are visited in sorted order because os.listdir() returns them in
# arbitrary order, which would make the row order depend on the machine.
loaded_frames = []
for filename in sorted(os.listdir('Dataset')):
    single_dataframe = pd.read_json(os.path.join('Dataset', filename), lines=True)
    single_dataframe['field'] = filename.split('_')[0].upper()  # new label column
    loaded_frames.append(single_dataframe)

# Concatenate once instead of growing the frame on every iteration: this avoids
# repeated copying of the accumulated rows and makes the cell idempotent
# (re-running it no longer appends the same files a second time).
# An empty directory still yields an empty DataFrame, as before.
complete_dataframe = pd.concat(loaded_frames) if loaded_frames else pd.DataFrame()

In [4]:
# Shuffle the rows and rebuild a clean 0..n-1 index.
# random_state is fixed so the shuffle itself is repeatable: without it the
# seeded train/test split further down still receives differently ordered rows
# on every run.
complete_dataframe = complete_dataframe.sample(frac=1, random_state=42).reset_index(drop=True)

In [5]:
# Display the combined dataset (all columns, including the derived 'field' label).
complete_dataframe

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,field
0,A22KGO6Q2YWS0C,B00H727J6M,Dean!,"[0, 0]",Ironically this is made by the company that se...,5,"Great tasting, naturally sweet!",1400630400,"05 21, 2014",GROCERY
1,AW40UPUC4UFDU,1469241943,Mama C,"[0, 0]","This book started rather slowly, but became a ...",5,Historical fiction,1398556800,"04 27, 2014",BOOKS
2,A3NXO2KE2X6ZJE,B00GTGETFG,Jeff,"[1, 1]",I have been looking for a charger to charge my...,5,Charges a Dell Venue Pro 8,1401235200,"05 28, 2014",ELECTRONICS
3,AUV3OR951650C,B000FCGS6I,N. Porter,"[1, 1]","Living on a farm, I've tried a lot of differen...",5,The best mouse traps are the classic ones like...,1403308800,"06 21, 2014",PATIO
4,A3Q8EG8F181J2P,B004VLV922,Bluebirdie,"[0, 0]",I've been having BRM Flaxseed Meal every morni...,5,Got to have this every day!,1393459200,"02 27, 2014",GROCERY
...,...,...,...,...,...,...,...,...,...,...
4995,A2LYK1AGZ4U9L8,B00D68UXQE,tbyg,"[0, 0]","It's a little pricey, but it seems to be the o...",4,Works,1389139200,"01 8, 2014",ELECTRONICS
4996,A1E2HNBZU9SD2A,B00G9XM36G,Unity Bekoe,"[0, 1]",Received the dress. Very nice but it's just to...,5,Too Long,1391472000,"02 4, 2014",CLOTHING
4997,A641JBBT6V2PT,B000MOIWWM,Happy Daddy,"[1, 1]",I've used this a few times now in an 18' above...,2,Not Professional Heavy Duty Quality,1396828800,"04 7, 2014",PATIO
4998,A2VV4JDBMOHNF8,B000LKVDLO,T Post,"[0, 0]","Having recently gone gluten free, I missed Ore...",5,Good gluten free option,1390176000,"01 20, 2014",GROCERY


## <span style='background :yellow'>Eliminazione delle colonne superflue</span>

In [6]:
# Keep only the review text (model input) and its category label (target);
# every other column is left out of the working frame.
df = complete_dataframe.loc[:, ['reviewText', 'field']]
df

Unnamed: 0,reviewText,field
0,Ironically this is made by the company that se...,GROCERY
1,"This book started rather slowly, but became a ...",BOOKS
2,I have been looking for a charger to charge my...,ELECTRONICS
3,"Living on a farm, I've tried a lot of differen...",PATIO
4,I've been having BRM Flaxseed Meal every morni...,GROCERY
...,...,...
4995,"It's a little pricey, but it seems to be the o...",ELECTRONICS
4996,Received the dress. Very nice but it's just to...,CLOTHING
4997,I've used this a few times now in an 18' above...,PATIO
4998,"Having recently gone gluten free, I missed Ore...",GROCERY


## <span style='background :yellow'>Split del dataframe in train e test</span>

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the remaining 67% is used for training.
# The fixed seed makes the split repeatable for a given input frame.
train, test = train_test_split(
    df,
    test_size=0.33,
    random_state=42,
)

In [8]:
train # training split (67% of the dataframe), used to fit the models

Unnamed: 0,reviewText,field
1522,First the pros - it's very a compact hose with...,PATIO
835,I have a well established bird feeding hobby a...,PATIO
358,I had not previously read this author but hear...,BOOKS
138,"Amazon has a good price for the variety pack, ...",GROCERY
299,Man I hit the Jack pot I have a lot of peanut...,GROCERY
...,...,...
4426,I get these for my class store. The students l...,GROCERY
466,"I got this coffee for free, because who doesn'...",GROCERY
3092,The third story in the Lux Series follows suit...,BOOKS
3772,"For commercial spaces, like warehouses, these ...",PATIO


In [9]:
test  # held-out split (33% of the dataframe), used to evaluate the models, not to train them

Unnamed: 0,reviewText,field
1501,I bought this to help keep me cool in summer. ...,CLOTHING
2586,Perhaps these are included to convince us that...,GROCERY
2653,Have looked all over for this product. Is har...,GROCERY
1055,Garden safe take root rooting hormone is an ex...,PATIO
705,Just when it was getting excited when a book h...,BOOKS
...,...,...
908,"I don't game very much, but coming from a regu...",ELECTRONICS
2114,I got these for my husband that wears big and ...,CLOTHING
3896,This was another very good book in the series....,BOOKS
1627,I am quite pleased with this little backpack. ...,CLOTHING


## <span style='background :yellow'>Split in features e label</span>

In [10]:
# Separate the model input (review text) from the target (category label)
# for both splits.
X_train, y_train = train['reviewText'], train['field']  # training feature / label
X_test, y_test = test['reviewText'], test['field']      # evaluation feature / label

## <span style='background :yellow'>Creazione delle bag of words</span>

In [11]:
# Text vectorisation.
# Author's note (from online research): TF-IDF performs much better than raw
# counts because it does not look only at how often a word occurs, it also
# assigns each word an importance weight.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Alternative kept for reference: vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

# Fit on the training text only and encode it in the same call.
X_train_counts = vectorizer.fit_transform(X_train)
# The vectorizer is already fitted, so the test text is only transformed.
X_test_counts = vectorizer.transform(X_test)

# NOTE: the "_counts" suffix is historical; with TfidfVectorizer these matrices
# hold TF-IDF weights, not raw counts.
#
# fit_transform above is equivalent to the two-step form:
#   vectorizer.fit(X_train)
#   X_train_counts = vectorizer.transform(X_train)

In [12]:
vectorizer.get_feature_names_out() # every term the fitted vectorizer takes into account

array(['00', '000', '00am', ..., 'zoysia', 'zuiko', 'zurich'],
      dtype=object)

In [13]:
X_train_counts.shape # (rows, columns): one row per training review, one column per term considered by the vectorizer

(3350, 16591)

In [14]:
print(X_train.iloc[0]) # raw text of the first training review
X_train_counts[0].toarray() # the same review as a dense row: one weight per term (mostly zeros)

First the pros - it's very a compact hose with a small footprint when dry/collapsed, the brass fittings seem to be sturdy and connected easily to my house and it's lightweight/easy to move around. For the cons - you need to follow the directions for it to work properly, the water pressure is disappointing and it is rather fragile when compared to a 'normal' hose. I've found it's better for watering plants and washing the car rather than for running a sprinkler or connecting to the pressure washer. Overall, I can see the promise in the idea but most of our water usage outside requires a bit more ooomph, so I see us sticking with a more traditional hose for now.


array([[0., 0., 0., ..., 0., 0., 0.]])

## <span style='background :yellow'>Modelli</span>

### <span style='background :yellow'>Linear SVM</span>

In [15]:
from sklearn.svm import SVC

In [16]:
# Support-vector classifier with a linear kernel, trained on the vectorised
# training reviews. fit() returns the estimator itself, so construction and
# training are chained into one statement.
model_SVM = SVC(kernel='linear').fit(X_train_counts, y_train)

# Predicted category for every review in the held-out split.
predictions_SVM = model_SVM.predict(X_test_counts)

### <span style='background :yellow'>Decision tree</span>

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
# Decision-tree classifier trained on the vectorised training reviews.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed the fitted tree, and therefore the reported
# scores, can differ from one run to the next on identical data.
model_DecisionTreeClassifier = DecisionTreeClassifier(random_state=42)
model_DecisionTreeClassifier.fit(X_train_counts, y_train)

# Predicted category for every review in the held-out split.
predictions_DecisionTreeClassifier = model_DecisionTreeClassifier.predict(X_test_counts)

### <span style='background :yellow'>Naive Bayes</span>

In [19]:
from sklearn.naive_bayes import GaussianNB

In [20]:
# Gaussian Naive Bayes trained on the vectorised training reviews.
# The sparse matrices are converted with .toarray() before being passed in
# (presumably because GaussianNB does not accept sparse input -- confirm);
# note that the dense copies can be large for a big vocabulary.
model_GaussianNB = GaussianNB().fit(X_train_counts.toarray(), y_train)

# Predicted category for every review in the held-out split.
predictions_GaussianNB = model_GaussianNB.predict(X_test_counts.toarray())

### <span style='background :yellow'>Logistic regression</span>

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Logistic regression with scikit-learn's default settings, trained on the
# vectorised training reviews (fit() returns the estimator, hence the chaining).
model_LogisticRegression = LogisticRegression().fit(X_train_counts, y_train)

# Predicted category for every review in the held-out split.
predictions_LogisticRegression = model_LogisticRegression.predict(X_test_counts)

## <span style='background :yellow'>Valutazione dei modelli</span>

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

### <span style='background :yellow'>Score</span>

In [24]:
# Accuracy of each model on the held-out split
# (true labels first, predictions second).
score_SVM = accuracy_score(y_test, predictions_SVM)
score_DecisionTreeClassifier = accuracy_score(y_test, predictions_DecisionTreeClassifier)
score_GaussianNB = accuracy_score(y_test, predictions_GaussianNB)
score_LogisticRegression = accuracy_score(y_test, predictions_LogisticRegression)

# One "name: value" line per model; sep='\n' gives the same output as four
# separate print() calls.
print(
    f'score_SVM: {score_SVM}',
    f'score_DecisionTreeClassifier: {score_DecisionTreeClassifier}',
    f'score_GaussianNB: {score_GaussianNB}',
    f'score_LogisticRegression: {score_LogisticRegression}',
    sep='\n',
)

score_SVM: 0.9315151515151515
score_DecisionTreeClassifier: 0.686060606060606
score_GaussianNB: 0.8103030303030303
score_LogisticRegression: 0.926060606060606


### <span style='background :yellow'>F1 score</span>

In [25]:
# Per-class F1 score of each model on the held-out split.
# average=None asks for one score per class instead of a single aggregate, and
# `labels` selects which classes are scored and in which order they appear.
class_labels = ['BOOKS','CLOTHING','ELECTRONICS','GROCERY','PATIO']

f1_score_SVM = f1_score(y_test, predictions_SVM, average=None, labels=class_labels)
f1_score_DecisionTreeClassifier = f1_score(y_test, predictions_DecisionTreeClassifier, average=None, labels=class_labels)
f1_score_GaussianNB = f1_score(y_test, predictions_GaussianNB, average=None, labels=class_labels)
f1_score_LogisticRegression = f1_score(y_test, predictions_LogisticRegression, average=None, labels=class_labels)

# One line per model; sep='\n' gives the same output as four separate print() calls.
print(
    f'f1 score_SVM: {f1_score_SVM}',
    f'f1 score_DecisionTreeClassifier: {f1_score_DecisionTreeClassifier}',
    f'f1 score_GaussianNB: {f1_score_GaussianNB}',
    f'f1 score_LogisticRegression: {f1_score_LogisticRegression}',
    sep='\n',
)

f1 score_SVM: [0.96130952 0.92571429 0.89985052 0.94339623 0.92776886]
f1 score_DecisionTreeClassifier: [0.87240356 0.6975398  0.5356623  0.7        0.62654321]
f1 score_GaussianNB: [0.84992571 0.74433657 0.80601504 0.85950413 0.79296346]
f1 score_LogisticRegression: [0.96263079 0.92285714 0.8962406  0.94285714 0.90566038]


## <span style='background :yellow'>Test</span>

In [26]:
# Quick sanity check on three hand-written reviews.
X_test_set = [
    'Fantastic iPhone',
    'bad book do not buy',
    'This bread was very delicious',
]
# Encode with the already-fitted vectorizer (transform only, no re-fitting).
X_test_set_count = vectorizer.transform(X_test_set)

# Last expression of the cell: the predicted category for each sentence.
model_SVM.predict(X_test_set_count)

array(['ELECTRONICS', 'BOOKS', 'GROCERY'], dtype=object)

## <span style='background :yellow'>Tuning del modello tramite GridSearchCV</span>

In [27]:
from sklearn.model_selection import GridSearchCV

### <span style='background :yellow'>Linear SVM</span>

In [28]:
# Hyper-parameter grid for the SVM: 6 values of C x 3 kernels, each evaluated
# with 5-fold cross-validation on the training matrix.
model_SVM_parameters = {'C': (1,2,4,8,16,32), 'kernel': ('linear','rbf','sigmoid')}

# A fresh SVC() is passed straight to GridSearchCV instead of being assigned to
# model_SVM: rebinding that name would replace the already-fitted linear SVM
# with an unfitted estimator, so the earlier cell that calls
# model_SVM.predict() would fail if re-run after this one.
gridSearch = GridSearchCV(SVC(), model_SVM_parameters, cv=5)
gridSearch.fit(X_train_counts, y_train)

In [29]:
pd.DataFrame(gridSearch.cv_results_).sort_values('rank_test_score').head() # sort from best to worst rank and show the first 5 rows

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3.170687,0.088211,0.607606,0.020882,1,linear,"{'C': 1, 'kernel': 'linear'}",0.916418,0.90597,0.922388,0.91194,0.907463,0.912836,0.006015,1
2,2.706063,0.011691,0.562455,1.1e-05,1,sigmoid,"{'C': 1, 'kernel': 'sigmoid'}",0.913433,0.90597,0.922388,0.91194,0.90597,0.91194,0.006044,2
6,3.89347,0.020727,0.596841,0.011687,4,linear,"{'C': 4, 'kernel': 'linear'}",0.91791,0.902985,0.920896,0.901493,0.910448,0.910746,0.00775,3
4,5.279498,0.013168,0.742704,0.010788,2,rbf,"{'C': 2, 'kernel': 'rbf'}",0.908955,0.901493,0.923881,0.91194,0.901493,0.909552,0.008262,4
5,2.493113,0.027425,0.53434,0.006258,2,sigmoid,"{'C': 2, 'kernel': 'sigmoid'}",0.916418,0.897015,0.923881,0.901493,0.908955,0.909552,0.009746,4


Per quanto riguarda il modello SVM, performa al meglio con **C = 1** e **kernel = linear**

### <span style='background :yellow'>Decision tree</span>

In [30]:
# Hyper-parameter grid for the decision tree: 3 split criteria x 2 splitter
# strategies, each evaluated with 5-fold cross-validation.
model_DecisionTreeClassifier_parameters = {'criterion': ('gini','entropy','log_loss'), 'splitter': ('best','random')}

# The estimator is created inline (not assigned to model_DecisionTreeClassifier)
# so the tree fitted earlier in the notebook is not replaced by an unfitted one.
# random_state is fixed so every parameter combination is compared under the
# same randomness; otherwise the ranking can change from run to run,
# especially with splitter='random'.
gridSearch = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    model_DecisionTreeClassifier_parameters,
    cv=5,
)
gridSearch.fit(X_train_counts, y_train)

In [31]:
pd.DataFrame(gridSearch.cv_results_).sort_values('rank_test_score') # sort from best to worst rank

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.473679,0.019787,0.0,0.0,gini,random,"{'criterion': 'gini', 'splitter': 'random'}",0.7,0.680597,0.710448,0.679104,0.685075,0.691045,0.012199,1
0,0.600551,0.025209,0.0,0.0,gini,best,"{'criterion': 'gini', 'splitter': 'best'}",0.704478,0.679104,0.656716,0.662687,0.628358,0.666269,0.025163,2
5,0.564346,0.01058,0.003125,0.00625,log_loss,random,"{'criterion': 'log_loss', 'splitter': 'random'}",0.652239,0.637313,0.649254,0.649254,0.679104,0.653433,0.013822,3
3,0.565585,0.011691,0.003126,0.006251,entropy,random,"{'criterion': 'entropy', 'splitter': 'random'}",0.632836,0.650746,0.610448,0.644776,0.629851,0.633731,0.013925,4
4,0.918689,0.020728,0.003121,0.006242,log_loss,best,"{'criterion': 'log_loss', 'splitter': 'best'}",0.677612,0.58806,0.613433,0.60597,0.628358,0.622687,0.030375,5
2,0.918685,0.020727,0.003125,0.006251,entropy,best,"{'criterion': 'entropy', 'splitter': 'best'}",0.677612,0.579104,0.608955,0.607463,0.628358,0.620299,0.032681,6


Per quanto riguarda il modello DecisionTree, performa al meglio con **criterion = gini** e **splitter = random**

### <span style='background :yellow'>Logistic regression</span>

In [34]:
# Hyper-parameter grid for logistic regression: 5 values of C x 2 solvers,
# each evaluated with 5-fold cross-validation.
model_LogisticRegression_parameters = {'C': (1,2,4,8,16), 'solver': ('newton-cg','liblinear')}

# The estimator is created inline (not assigned to model_LogisticRegression) so
# the model fitted earlier in the notebook is not replaced by an unfitted one.
gridSearch = GridSearchCV(LogisticRegression(), model_LogisticRegression_parameters, cv=5)
gridSearch.fit(X_train_counts, y_train)

In [35]:
pd.DataFrame(gridSearch.cv_results_).sort_values('rank_test_score') # sort from best to worst rank

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.20226,0.008608,0.000404,0.000809,16,liblinear,"{'C': 16, 'solver': 'liblinear'}",0.929851,0.916418,0.946269,0.916418,0.925373,0.926866,0.011008,1
7,0.174991,0.006246,0.0,0.0,8,liblinear,"{'C': 8, 'solver': 'liblinear'}",0.926866,0.914925,0.943284,0.920896,0.925373,0.926269,0.009468,2
5,0.201896,0.038718,0.0008,0.001165,4,liblinear,"{'C': 4, 'solver': 'liblinear'}",0.925373,0.907463,0.941791,0.919403,0.922388,0.923284,0.011073,3
8,0.550195,0.035482,0.0006,0.001199,16,newton-cg,"{'C': 16, 'solver': 'newton-cg'}",0.923881,0.907463,0.941791,0.919403,0.920896,0.922687,0.011065,4
6,0.513634,0.071287,0.003529,0.006099,8,newton-cg,"{'C': 8, 'solver': 'newton-cg'}",0.925373,0.901493,0.943284,0.920896,0.919403,0.92209,0.013363,5
4,0.529153,0.064926,0.0004,0.0008,4,newton-cg,"{'C': 4, 'solver': 'newton-cg'}",0.922388,0.898507,0.938806,0.919403,0.919403,0.919701,0.012818,6
3,0.147683,0.007145,0.006255,0.00766,2,liblinear,"{'C': 2, 'solver': 'liblinear'}",0.91791,0.90597,0.931343,0.91791,0.916418,0.91791,0.008065,7
2,0.411459,0.028484,0.000599,0.000799,2,newton-cg,"{'C': 2, 'solver': 'newton-cg'}",0.916418,0.9,0.932836,0.920896,0.913433,0.916716,0.010655,8
1,0.172588,0.024593,0.000803,0.000983,1,liblinear,"{'C': 1, 'solver': 'liblinear'}",0.910448,0.9,0.925373,0.916418,0.904478,0.911343,0.008935,9
0,0.386877,0.034597,0.000404,0.000495,1,newton-cg,"{'C': 1, 'solver': 'newton-cg'}",0.910448,0.901493,0.922388,0.916418,0.904478,0.911045,0.007646,10


Per quanto riguarda il modello LogisticRegression, performa al meglio con **C = 16** e **solver = liblinear**