#**Working with the Data**

### Adding the imports

In [0]:
# Jupyter-specific.
%matplotlib inline
import time
program_start = time.time()

# Colab specific.
from google.colab import files

# General
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold
from sklearn import neighbors
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() # Set the default styling. 
from io import BytesIO
import csv
import pickle
from scipy.sparse import coo_matrix, vstack, csr_matrix

# Plot style settings.
plt.style.use('fivethirtyeight') # I'm a fan of this one.

### Reading the reduced file, taking the review and rating column

In [0]:
y = []
X = []
# open reduced CSV
with open('reduced_amazon_ff_reviews.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        X.append(row[5])
        y.append(row[8])

FileNotFoundError: ignored

### Making a data frame called 'df' out of column 5 and 8

In [0]:
# columns 'Rating' and 'Text' are loaded into the df
df = pd.read_csv('reduced_amazon_ff_reviews.csv', usecols=[5, 8])

In [0]:
# top 5 records
df.head()

Unnamed: 0,Rating,Text
0,neutral,This seems a little more wholesome than some o...
1,positive,I bought these at Grocery Outlet here in the S...
2,positive,I bought this coffee because its much cheaper ...
3,negative,Herbal additives in this blend destroy real te...
4,positive,These Nature Valley Nut Lovers Variety Pack wa...


In [0]:
# basic info. 14906 records for the reduced
df.describe()

Unnamed: 0,Rating,Text
count,14906,14906
unique,3,14287
top,positive,"This review will make me sound really stupid, ..."
freq,6699,16


### Getting the count of each type of rating

In [0]:
# counts the total number of different labels
rating_counts = df.groupby('Rating')['Rating'].count()
rating_counts.head()

Rating
negative    5417
neutral     2790
positive    6699
Name: Rating, dtype: int64

In [0]:
# getting the ratios of the ratings
rating_counts / len(df)

Rating
negative    0.363411
neutral     0.187173
positive    0.449416
Name: Rating, dtype: float64

### Preparing the data for classification

In [0]:
# using a tfidf vecotrizer to convert review text into vectors also filters stopwords
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Text'])

print(X.shape)
print(vectorizer.get_feature_names())

(14906, 24625)


### Splitting the labels from the review text

In [0]:
# putting the ratings into the variable y
y = df['Rating']
y.head()

0     neutral
1    positive
2    positive
3    negative
4    positive
Name: Rating, dtype: object

### Splitting up the training/testing with an 80/20 split

In [0]:
# split the data 80/20 and add random state 55
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)

In [0]:
# X_train containts text as a vector, and y_train contains the label
print(X_train)
print(y_train)

  (0, 2800)	0.20249946955992829
  (0, 9220)	0.20610249335546876
  (0, 22804)	0.2223036920019355
  (0, 11583)	0.20610249335546876
  (0, 6730)	0.20249946955992829
  (0, 12330)	0.17370009606253528
  (0, 16660)	0.4122049867109375
  (0, 22086)	0.15953264817297533
  (0, 12350)	0.20249946955992829
  (0, 23790)	0.1433314495265086
  (0, 3735)	0.13264660585163504
  (0, 6000)	0.16927686920049767
  (0, 4235)	0.10957462876027817
  (0, 14406)	0.33696895009477307
  (0, 13111)	0.09754733846318929
  (0, 12753)	0.1507747924445192
  (0, 7625)	0.12892377969773905
  (0, 3915)	0.12118251756451574
  (0, 20231)	0.1239097788146052
  (0, 24070)	0.1236794972578539
  (0, 5170)	0.10230855276100193
  (0, 23876)	0.11293866939108049
  (0, 7232)	0.09913586835037033
  (0, 14661)	0.12067327326109742
  (0, 13670)	0.11602875822361754
  :	:
  (11923, 19319)	0.07747314212909547
  (11923, 19066)	0.13375508847990547
  (11923, 7463)	0.05945104781646569
  (11923, 9570)	0.08404349135240155
  (11923, 1402)	0.08424614324312861
  (

In [0]:
# X_test containts text as a vector, and y_test contains the label
print(X_test)
print(y_test)

  (0, 21131)	0.4439561842194609
  (0, 10593)	0.2976862255027319
  (0, 19952)	0.3090762545105446
  (0, 10977)	0.2353844403303505
  (0, 19041)	0.22904996613364018
  (0, 5709)	0.23204834210414302
  (0, 2062)	0.2536194086442817
  (0, 18455)	0.2149542370959537
  (0, 12994)	0.20523740430619172
  (0, 14248)	0.20938776574994486
  (0, 19348)	0.27269223032208856
  (0, 3419)	0.22362655408202992
  (0, 21816)	0.16958498390821947
  (0, 17125)	0.12100447768342433
  (0, 9901)	0.11160232453584615
  (0, 10072)	0.13346190950501766
  (0, 21768)	0.11530469724999953
  (0, 3363)	0.16593482421815078
  (0, 3818)	0.1472414238167197
  (1, 1869)	0.1067694945977203
  (1, 18923)	0.11132121613151008
  (1, 18924)	0.11132121613151008
  (1, 19897)	0.10103500365087663
  (1, 14132)	0.09725779102684333
  (1, 17948)	0.09218383921310282
  :	:
  (2980, 10854)	0.1595681164335756
  (2980, 11535)	0.1352129779634383
  (2980, 8587)	0.15839071876826963
  (2980, 5985)	0.1709019091131299
  (2980, 2946)	0.10806220791231941
  (2980, 4

#**Naive Bayes Classifier**

In [0]:
# Create instance of the NB classifier
modelNB = MultinomialNB()

# Fit to data (also called training the model
modelNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# create NB prediction variable for the labels 
y_predNB = modelNB.predict(X_test)

# compute accuracy
accuracy_score(y_test, y_predNB)

0.6492287055667337

This model correctly predicted the rating of a review about 65% of the time

### Precision Score
The precision is the ability of the classifier not to label as postiive a sample that is negative

In [0]:
# best value is 1 and worst value is 0
print("Macro Precision:",precision_score(y_test, y_predNB, average='macro'))
print("Micro Precision:",precision_score(y_test, y_predNB, average='micro'))
print("Weighted Precision:", precision_score(y_test, y_predNB, average='weighted'))


Macro Precision: 0.4529694927656925
Micro Precision: 0.6492287055667337
Weighted Precision: 0.5435689763648446


  'precision', 'predicted', average, warn_for)


**Macro Precision:** Calculates metrics for each label, and find the unweighted mean. Does not take label imbalance into account. <br>
**Micro Precision:** Calculates Metrics globally by counting the total true positives, false negatives, and false positives. <br>
**Weighted Precision:** Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance.<br><br>
Definitions taken from Scikit Learn documentation found [here](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html)

### Recall Score
The revall is intuitively the ability of the classifier to find all the positive samples

In [0]:
# best value is 1 and worst value is 0
print("\nMacro Recall:", recall_score(y_test, y_predNB, average='macro'))
print("Micro Recall:", recall_score(y_test, y_predNB, average='micro'))
print("Weighted Recall:", recall_score(y_test, y_predNB, average='weighted'))


Macro Recall: 0.5195389606355865
Micro Recall: 0.6492287055667337
Weighted Recall: 0.6492287055667337


Defintitions for the different types of recall are the same as precision. Documentation used can be found from SciKit [here](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html)

### 10 Fold Cross Validation

Evalulates a score by cross-validation. Takes the model, test attributes, and number of splits as parameters. SciKit documentation can be found [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)

In [0]:
scoresNB = cross_val_score(modelNB, X_test, y_test, cv=10)

In [0]:
print("Accuracy: %0.2f(+/-%0.2f)" %(scoresNB.mean(), scoresNB.std()*2))
print(scoresNB)

Accuracy: 0.56(+/-0.04)
[0.55       0.54       0.58862876 0.53020134 0.55704698 0.56711409
 0.57718121 0.55218855 0.55555556 0.58249158]


#**kNN Classifier**

In [0]:
# create the kNN classifier 
modelK = neighbors.KNeighborsClassifier(n_neighbors=3)

In [0]:
# fit the model
modelK.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [0]:
# create prediction variable 
y_predK = modelK.predict(X_test)

In [0]:
print(y_predK)

['negative' 'negative' 'negative' ... 'positive' 'negative' 'positive']


### Accuracy score with 3NN

In [0]:
# accuracy score of 3NN
accuracy_score(y_test, y_predK) * 100

52.8504359490275

Predicted correctly about 53% of the time

### Trying different K values for kNN

Code chunk below checks the accuracy for different values of K from 1 to 10. <br> Code provided by Jason from the Collab Notebook

In [0]:
# Code chunk provided by Jason in the Collab Notebook

# Create list to store results in outer scope.
acc_vs_k = []

# Let's try from k = 1 to k = 10.
for k in range(1, 11): # range(from inclusive beginning, to exclusive end)
  # Build model with k value.
  modelKnn = neighbors.KNeighborsClassifier(n_neighbors=k)
  
  # Fit the model with the training data.
  modelKnn.fit(X_train, y_train)

  # Use the fitted model to predict the classes for the test data.
  y_predKnn = modelKnn.predict(X_test)
  
  # Add the accuracy to the list for plotting.
  acc_vs_k.append(accuracy_score(y_test, y_predKnn))
  
print(acc_vs_k)

[0.5013413816230717, 0.5040241448692153, 0.528504359490275, 0.528504359490275, 0.539570757880617, 0.5449362843729041, 0.5516431924882629, 0.54560697518444, 0.5576794097920859, 0.5516431924882629]


Returns the K with the highest accuracy from above. Code chunk provided by Jason 

In [0]:
# Code chunk prvided by Jason in the Collab Notebook

idx_of_best_k = np.argmax(acc_vs_k)

# To avoid super-long lines, use a string with placeholders, pass it into
# print() calling format() on it to replace the placeholders.
# This is the so-called Pythonic way of doing things, which helps keep code
# clear and readable.
format_string = 'Best K value is {k_value} at index {idx} with acc of {acc}%.'
print(format_string.format(k_value=idx_of_best_k+1,
      idx=idx_of_best_k,
      acc=acc_vs_k[idx_of_best_k]*100))

Best K value is 9 at index 8 with acc of 55.76794097920859%.


### Precision and Recall for 3NN

In [0]:
# best value is 1 and worst value is 0
print("Macro Precision:",precision_score(y_test, y_predK, average='macro'))
print("Micro Precision:",precision_score(y_test, y_predK, average='micro'))
print("Weighted Precision:", precision_score(y_test, y_predK, average='weighted'))

Macro Precision: 0.4777554213123231
Micro Precision: 0.528504359490275
Weighted Precision: 0.5044440625645044


In [0]:
print("Macro Recall:", recall_score(y_test, y_predK, average='macro'))
print("Micro Recall:", recall_score(y_test, y_predK, average='micro'))
print("Weighted Recall:", recall_score(y_test, y_predK, average='weighted'))

Macro Recall: 0.43658022364468696
Micro Recall: 0.528504359490275
Weighted Recall: 0.528504359490275


### 10 Fold Cross Validation witk 3NN

In [0]:
scoresK = cross_val_score(modelK, X_test, y_test, cv=10)

In [0]:
print("Accuracy: %0.2f(+/-%0.2f)" %(scoresK.mean(), scoresK.std()*2))
print(scoresK)

Accuracy: 0.49(+/-0.08)
[0.56       0.47333333 0.47157191 0.43288591 0.50671141 0.48322148
 0.5033557  0.46464646 0.48484848 0.56565657]


### 10 Fold Cross Validation with 1-10 NN from above

In [0]:
scoresKNN = cross_val_score(modelKnn, X_test, y_test, cv = 10)

print("Accuracy: %0.2f(+/-%0.2f)" %(scoresKNN.mean(), scoresKNN.std()*2))
print(scoresKNN)

Accuracy: 0.52(+/-0.07)
[0.54       0.54333333 0.52173913 0.4966443  0.58724832 0.52013423
 0.52348993 0.45117845 0.50505051 0.54208754]


#**Decision Tree Classifier**

Decision Tree Classifiers use example code from the SciKit documentation found [here.](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [0]:
# this classifier is using the gini index ("default") and no max depth
# it is also using the best split

# create decision tree classifier
dTreeClf = DecisionTreeClassifier() 

# train decision tree classifier
dTreeClf = dTreeClf.fit(X_train, y_train)

# predict response for test data set
y_predTree = dTreeClf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_predTree))

print("\nMacro Precision:", precision_score(y_test, y_predTree, average='macro'))
print("Micro Precision:", precision_score(y_test, y_predTree, average='micro'))
print("Weighted Precision:", precision_score(y_test, y_predTree, average='weighted' ))

print("\nMacro Recall:", recall_score(y_test, y_predTree, average='macro'))
print("Micro Recall:", recall_score(y_test, y_predTree, average='micro'))
print("Weighted Recall:", recall_score(y_test, y_predTree, average='weighted'))

scoresT1 = cross_val_score(dTreeClf, X_test, y_test, cv=10)
print("\n10-Fold Accuracy: %0.2f(+/-%0.2f)" %(scoresT1.mean(), scoresT1.std()*2))
print(scoresT1)


Accuracy: 0.56841046277666

Macro Precision: 0.5107480205592941
Micro Precision: 0.56841046277666
Weighted Precision: 0.5617607215527445

Macro Recall: 0.510046433007273
Micro Recall: 0.56841046277666
Weighted Recall: 0.56841046277666

10-Fold Accuracy: 0.52(+/-0.04)
[0.53       0.50666667 0.52842809 0.5033557  0.52013423 0.55033557
 0.52684564 0.49494949 0.47474747 0.53198653]


We got a low accuracy so let's expeirment with dTree parameters

In [0]:
# this classifier is using entropy for the information gain and a max depth of 3
# it is also using the best split

# create decision tree classifier
dTreeClf = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# train decision tree classifier
dTreeClf = dTreeClf.fit(X_train, y_train)

# predict response for test data set
y_predTree = dTreeClf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_predTree))

print("\nMacro Precision:", precision_score(y_test, y_predTree, average='macro'))
print("Micro Precision:", precision_score(y_test, y_predTree, average='micro'))
print("Weighted Precision:", precision_score(y_test, y_predTree, average='weighted' ))

print("\nMacro Recall:", recall_score(y_test, y_predTree, average='macro'))
print("Micro Recall:", recall_score(y_test, y_predTree, average='micro'))
print("Weighted Recall:", recall_score(y_test, y_predTree, average='weighted'))

scoresT2 = cross_val_score(dTreeClf, X_test, y_test, cv=10)
print("\n10-Fold Accuracy: %0.2f(+/-%0.2f)" %(scoresT2.mean(), scoresT2.std()*2))
print(scoresT2)



Accuracy: 0.4597585513078471

Macro Precision: 0.36303550193457834
Micro Precision: 0.4597585513078471
Weighted Precision: 0.4571280263053682

Macro Recall: 0.40301470567783243
Micro Recall: 0.4597585513078471
Weighted Recall: 0.4597585513078471


  'precision', 'predicted', average, warn_for)



10-Fold Accuracy: 0.48(+/-0.02)
[0.47333333 0.48       0.48494983 0.47651007 0.48322148 0.48322148
 0.48993289 0.48484848 0.46801347 0.46127946]


In [0]:
# this classifier is using entropy for the gini and a max depth of 10
# it is also using the best split

# create decision tree classifier
dTreeClf = DecisionTreeClassifier(criterion="gini", max_depth=10)

# train decision tree classifier
dTreeClf = dTreeClf.fit(X_train, y_train)

# predict response for test data set
y_predTree = dTreeClf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_predTree))

print("\nMacro Precision:", precision_score(y_test, y_predTree, average='macro'))
print("Micro Precision:", precision_score(y_test, y_predTree, average='micro'))
print("Weighted Precision:", precision_score(y_test, y_predTree, average='weighted' ))

print("\nMacro Recall:", recall_score(y_test, y_predTree, average='macro'))
print("Micro Recall:", recall_score(y_test, y_predTree, average='micro'))
print("Weighted Recall:", recall_score(y_test, y_predTree, average='weighted'))

scoresT3 = cross_val_score(dTreeClf, X_test, y_test, cv=10)
print("\n10-Fold Accuracy: %0.2f(+/-%0.2f)" %(scoresT3.mean(), scoresT3.std()*2))
print(scoresT3)

Accuracy: 0.562374245472837

Macro Precision: 0.46247753530166885
Micro Precision: 0.562374245472837
Weighted Precision: 0.5160728039611223

Macro Recall: 0.46759969969515836
Micro Recall: 0.562374245472837
Weighted Recall: 0.562374245472837

10-Fold Accuracy: 0.53(+/-0.04)
[0.52       0.50666667 0.5451505  0.53020134 0.56040268 0.53691275
 0.58389262 0.52861953 0.52188552 0.51178451]


Gini is the best criteria. And accuracy goes up slightly with a higher mex_depth but it isn't too noticable. Also as the Max Depth goes higher we run the risk of over classifying the tree

#**Random Forest Classifier**

### Making an instance of the classifier

Both Random Forest and SVM Classifiers are similar to Nathalie's Research code

In [0]:
# creating an instance of the classification model
modelRF = ensemble.RandomForestClassifier(n_estimators=100)

# training the model
modelRF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

###Finding the accuracy score of the random forest model

In [0]:
# making a RF prediction variable and calulcating the accuracy
y_predRF = modelRF.predict(X_test)

accuracy_score(y_test, y_predRF)

0.6961770623742455

### Different precision and recall scores

In [0]:
print("\nMacro Precision:", precision_score(y_test, y_predRF, average='macro'))
print("Micro Precision:", precision_score(y_test, y_predRF, average='micro'))
print("Weighted Precision:", precision_score(y_test, y_predRF, average='weighted' ))

print("\nMacro Recall:", recall_score(y_test, y_predRF, average='macro'))
print("Micro Recall:", recall_score(y_test, y_predRF, average='micro'))
print("Weighted Recall:", recall_score(y_test, y_predRF, average='weighted'))


Macro Precision: 0.7535834993148581
Micro Precision: 0.6961770623742455
Weighted Precision: 0.7274275454743321

Macro Recall: 0.5789062052927014
Micro Recall: 0.6961770623742455
Weighted Recall: 0.6961770623742455


In [0]:
scoresRF = cross_val_score(modelRF, X_test, y_test, cv=10)
print("\n10-Fold Accuracy: %0.2f(+/-%0.2f)" %(scoresRF.mean(), scoresRF.std()*2))
print(scoresRF)


10-Fold Accuracy: 0.64(+/-0.05)
[0.62666667 0.62333333 0.65551839 0.66778523 0.67114094 0.6442953
 0.66442953 0.60606061 0.62962963 0.60606061]


#**SVM Classifiers**

Function below trains the model so multiple SVM methods can be tested

In [0]:
#function for accuracy and precision, see svm for parameters
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # Fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # Predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return accuracy_score(y_test, predictions), precision_score(y_test, predictions, average=None).mean(), recall_score(y_test, predictions, average=None).mean()

SVM Methods used are:


*   SVM Linear
*   SVM RBF
*   SVM Polynomial
*   SVM Sigmoid 
*   RF<br>

Each method has it's accuracy, precision score, and recall score calculated. In addition to it's 10-Fold Cross Validation accuracy





In [0]:
# SVM Linear
metrics = train_model(svm.SVC(kernel='linear', gamma='scale', random_state=42), X_train, y_train, X_test)
print("SVM, Kernel = 'linear',  ", metrics)


SVM, Kernel = 'linear',   (0.7025486250838363, 0.6434568541100872, 0.6113409212500162)


In [0]:
# SVM RBF
metrics = train_model(svm.SVC(kernel='rbf', gamma='scale'), X_train, y_train, X_test, )
print("SVM, Kernel = 'RBF' ,  ", metrics)

SVM, Kernel = 'RBF' ,   (0.710261569416499, 0.6852531635585281, 0.6009422503614107)


In [0]:
# SVM Polynomial
metrics = train_model(svm.SVC(kernel='poly', gamma='scale'), X_train, y_train, X_test)
print("SVM, Kernel = 'polynomial' ,  ", metrics)

SVM, Kernel = 'polynomial' ,   (0.5710932260228034, 0.7710210651765962, 0.45160311397095426)


In [0]:
# SVM Sigmoid
metrics = train_model(svm.SVC(kernel='sigmoid', gamma='scale'), X_train, y_train, X_test)
print("SVM, Kernel = 'sigmoid' ,  ", metrics)

SVM, Kernel = 'sigmoid' ,   (0.6981891348088531, 0.6361996291285522, 0.6014817053080835)


In [0]:
# RF
metrics = train_model(ensemble.RandomForestClassifier(n_estimators=100), X_train, y_train, X_test)
print("RF, n_estimators = 100,  ", metrics)

RF, n_estimators = 100,   (0.6854460093896714, 0.7516697911795323, 0.5698788412749746)


In [0]:
# 10-Fold Cross validation

print("10-Fold Cross Validation Scores on TFIDF Vectors using SVM (Kernel = Linear)")
# 10-Fold Cross validation
clf_svm = svm.SVC(kernel='linear', gamma='scale', C=1, random_state=42)
scores_svm = cross_val_score(clf_svm, X_train, y_train, cv=10)
print(scores_svm)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores_svm.mean(), scores_svm.std() * 2))


10-Fold Cross Validation Scores on TFIDF Vectors using SVM (Kernel = Linear)
[0.69932998 0.69011725 0.6789606  0.68063705 0.69237217 0.68540268
 0.6795302  0.67422334 0.70277078 0.69437448]
Accuracy: 0.69 (+/- 0.02)



In [0]:
print("10-Fold Cross Validation Scores on TFIDF Vectors using SVM (Kernel = Polynomial)")
# 10-Fold Cross validation
clf_svm = svm.SVC(kernel='poly', gamma='scale', C=1, random_state=42)
scores_svm = cross_val_score(clf_svm, X_train, y_train, cv=10)
print(scores_svm)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores_svm.mean(), scores_svm.std() * 2))


10-Fold Cross Validation Scores on TFIDF Vectors using SVM (Kernel = Polynomial)
[0.54690117 0.5561139  0.54484493 0.54400671 0.55155071 0.54781879
 0.54278523 0.55667506 0.56171285 0.54575987]
Accuracy: 0.55 (+/- 0.01)



In [0]:
print("10-Fold Cross Validation Scores on TFIDF Vectors using SVM (Kernel = Sigmoid)")
# 10-Fold Cross validation
clf_svm = svm.SVC(kernel='sigmoid', gamma='scale', C=1, random_state=42)
scores_svm = cross_val_score(clf_svm, X_train, y_train, cv=10)
print(scores_svm)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores_svm.mean(), scores_svm.std() * 2))


10-Fold Cross Validation Scores on TFIDF Vectors using SVM (Kernel = Sigmoid)
[0.69346734 0.67755444 0.67225482 0.67309304 0.67812238 0.6795302
 0.6761745  0.67086482 0.6868178  0.68849706]
Accuracy: 0.68 (+/- 0.01)



In [0]:
print("10-Fold Cross Validation Scores on TFIDF Vectors using SVM (Kernel = RBF)")
# 10-Fold Cross validation
clf_svm = svm.SVC(kernel='rbf', gamma='scale', C=1, random_state=42)
scores_svm = cross_val_score(clf_svm, X_train, y_train, cv=10)
print(scores_svm)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores_svm.mean(), scores_svm.std() * 2))


10-Fold Cross Validation Scores on TFIDF Vectors using SVM (Kernel = RBF)
[0.7160804  0.71105528 0.69069573 0.69572506 0.6898575  0.69966443
 0.68624161 0.70361041 0.71284635 0.70696893]
Accuracy: 0.70 (+/- 0.02)





---


#**Classification Methods using Full Data Set**

---



To classify the full data set, two methods are used. **Naive Bayes using a TFIDF vectorizer and partial fit** and using a **HashingVectorizer** on all classifiers

#**Naive Bayes on Full Data Set Using TfidfVectorizer**

In [0]:
# set the full data set
full_data_set = "full_amazon_ff_reviews.csv"

# create the model
model = MultinomialNB()

# make lists for each chunk
X_test_chunk_list = []
y_test_chunk_list = []

# creating the vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
do_once = 1

# read the data into the frame within a loop
for chunk in pd.read_csv(full_data_set, usecols=[5, 8], chunksize=100000):

    if(do_once == 1):
        X = vectorizer.fit_transform(chunk['Text'])
        do_once = 0
    else:
        X = vectorizer.transform(chunk['Text'])

    y = chunk['Rating']

    # split the data 80/20 and add random state 55
    X_train, X_test_chunk, y_train, y_test_chunk = train_test_split(X, y, test_size=0.2, random_state=55)

    # appened the chunks onto their respective lists
    X_test_chunk_list.append(X_test_chunk)
    y_test_chunk_list.append(y_test_chunk)

    model.partial_fit(X_train, y_train, classes=np.unique(y_train))

y_pred_list = [model.predict(test_chunk) for test_chunk in X_test_chunk_list]

# acccuracy score list
accuracy_score_list = [accuracy_score(y_test_chunk_list[index], y_pred_list[index]) for index in range(len(y_pred_list))] 

# precision score list
precision_score_list = [precision_score(y_test_chunk_list[index], y_pred_list[index], average=None).mean() for index in range(len(y_pred_list))]

# recall score list 
recall_score_list = [recall_score(y_test_chunk_list[index], y_pred_list[index], average=None).mean() for index in range(len(y_pred_list))]

print("---------------" + "AVG ACCURACY" + "---------------\n")
print(str(sum(accuracy_score_list)/len(accuracy_score_list))+'\n')

print("---------------" + "AVG PRECISION" + "---------------\n")
print(str(sum(precision_score_list)/len(precision_score_list))+ '\n')

print("---------------" + "AVG RECALL" + "---------------\n")
print(str(sum(recall_score_list)/len(recall_score_list))+ '\n')


---------------AVG ACCURACY---------------

0.79558

---------------AVG PRECISION---------------

0.7695693669527404

---------------AVG RECALL---------------

0.4311439260374394



In [0]:
# function that prints restils for the HashingVectorizer
def results_function(model, X_test, y_test, y_pred):
  print("Accuracy:", accuracy_score(y_test, y_pred))

  print("\nMacro Precision:", precision_score(y_test, y_pred, average='macro'))
  print("Micro Precision:", precision_score(y_test, y_pred, average='micro'))
  print("Weighted Precision:", precision_score(y_test, y_pred, average='weighted' ))

  print("\nMacro Recall:", recall_score(y_test, y_pred, average='macro'))
  print("Micro Recall:", recall_score(y_test, y_pred, average='micro'))
  print("Weighted Recall:", recall_score(y_test, y_pred, average='weighted'))

  scores = cross_val_score(model, X_test, y_test, cv=10)
  print("\n10-Fold Accuracy: %0.2f(+/-%0.2f)" %(scores.mean(), scores.std()*2))
  print(scores)

##Dataframe for full text

In [0]:
# create a dataframe that is using the full dataset 
H_df = pd.read_csv(full_data_set, usecols=[5, 8])

### Counting the Full Data

In [0]:
# counting the number of ratings in the full set
rating_counts_full = H_df.groupby('Rating')['Rating'].count()
rating_counts_full.head()

Rating
negative     82037
neutral      42640
positive    375323
Name: Rating, dtype: int64

In [0]:
# getting the ratios of the ratings
rating_counts_full / len(H_df)

Rating
negative    0.164074
neutral     0.085280
positive    0.750646
Name: Rating, dtype: float64

In [0]:
H_df.describe()

Unnamed: 0,Rating,Text
count,500000,500000
unique,3,352758
top,positive,"This review will make me sound really stupid, ..."
freq,375323,199


##Creating the HashingVectorizer

In [0]:
# creates the HashingVectorizer that will be used with the full data
H_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
H_X = H_vectorizer.transform(H_df['Text'])
H_y = H_df['Rating']

# splits the data 80/20 with random state 55
H_X_train, H_X_test, H_y_train, H_y_test = train_test_split(H_X, H_y, test_size=0.2, random_state=55)

##NB Classifier with HashingVectorizer

In [0]:
H_modelNB = MultinomialNB()
H_modelNB.fit(H_X_train, H_y_train)
H_y_predNB = H_modelNB.predict(H_X_test)

In [0]:
results_function(H_modelNB, H_X_test, H_y_test, H_y_predNB)

Accuracy: 0.75185


  'precision', 'predicted', average, warn_for)



Macro Precision: 0.5506117278394506
Micro Precision: 0.75185
Weighted Precision: 0.7115921359135915

Macro Recall: 0.33351334448065684
Micro Recall: 0.75185
Weighted Recall: 0.75185

10-Fold Accuracy: 0.75(+/-0.00)
[0.75172483 0.75172483 0.75172483 0.75172483 0.75172483 0.7518
 0.7518     0.75177518 0.75185037 0.75185037]


##kNN Classifier with HashingVectorizer and K = 3

In [0]:
H_modelK = neighbors.KNeighborsClassifier(n_neighbors=3)
H_modelK.fit(H_X_train, H_y_train)
H_y_predK = H_modelK.predict(H_X_test)

In [0]:
results_function(H_modelK, H_X_test, H_y_test, H_y_predK)

Accuracy: 0.81724

Macro Precision: 0.7735438074714115
Micro Precision: 0.81724
Weighted Precision: 0.8072352400481443

Macro Recall: 0.5555860774546658
Micro Recall: 0.81724
Weighted Recall: 0.81724

10-Fold Accuracy: 0.76(+/-0.00)
[0.75482452 0.75492451 0.76142386 0.75912409 0.75642436 0.7574
 0.7569     0.75537554 0.75645129 0.76205241]


##Decision Tree Classifier with HashingVectorizer

###No params

In [0]:
H_dTreeClf = DecisionTreeClassifier()
H_dTreeClf = H_dTreeClf.fit(H_X_train, H_y_train)
H_y_predTREE = H_dTreeClf.predict(H_X_test)

In [0]:
results_function(H_dTreeClf, H_X_test, H_y_test, H_y_predTREE)

###Params entropy & max_depth 3

In [0]:
H_dTreeClf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
H_dTreeClf = H_dTreeClf.fit(H_X_train, H_y_train)
H_y_predTREE3 = H_dTreeClf.predict(H_X_test)

In [0]:
results_function(H_dTreeClf, H_X_test, H_y_test, H_y_predTREE3)

###Params gini & max_depth 10

In [0]:
H_dTreeClf = DecisionTreeClassifier(criterion="gini", max_depth=10)
H_dTreeClf = H_dTreeClf.fit(H_X_train, H_y_train)
H_y_predTREE10 = H_dTreeClf.predict(H_X_test)

In [0]:
results_function(H_dTreeClf, H_X_test, H_y_test, H_y_predTREE10)

##Random Forest Classifier with HashingVectorizer

In [0]:
H_modelRF = ensemble.RandomForestClassifier(n_estimators=100)
H_modelRF = H_modelRF.fit(H_X_train, H_y_train)
H_y_predRF = H_modelRF.predict(H_X_test)

In [0]:
results_function(H_dTreeClf, H_X_test, H_y_test, H_y_predRF)

##SVM Classifiers with HashingVectorizer

###SVM LINEAR

In [0]:
H_SVML = svm.SVC(kernel='linear', gamma='scale', random_state=42)
H_SVML = H_SVML.fit(H_X_train, H_y_train)
H_y_predL = H_SVML.fit.predict(H_X_test)

In [0]:
results_function(H_SVML, H_X_test, H_y_test, H_y_predL)

###SVM RBF

In [0]:
H_SVMRBF = svm.SVC(kernel='rbf', gamma='scale', random_state=42)
H_SVMRBF = H_SVMRBF.fit(H_X_train, H_y_train)
H_y_predRBF = H_SVMRBF.fit.predict(H_X_test)

In [0]:
results_function(H_SVMRBF, H_X_test, H_y_test, H_y_predRBF)

###SVM POLY

In [0]:
H_SVMP = svm.SVC(kernel='poly', gamma='scale', random_state=42)
H_SVMP = H_SVMP.fit(H_X_train, H_y_train)
H_y_predP = H_SVMP.fit.predict(H_X_test)

In [0]:
results_function(H_SVMP, H_X_test, H_y_test, H_y_predP)

###SVM SIGMOID

In [0]:
H_SVMSIG = svm.SVC(kernel='sigmoid', gamma='scale', random_state=42)
H_SVMSIG = H_SVMSIG.fit(H_X_train, H_y_train)
H_y_predSIG = H_SVMSIG.fit.predict(H_X_test)

In [0]:
results_function(H_SVMSIG, H_X_test, H_y_test, H_y_predSIG)

In [0]:
program_end = time.time()
print(program_end - program_start)