In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
import json
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#### Loading in the data 

In [2]:
class Sentiment:
    NEUTRAL = 'Neutral'
    NEGATIVE = 'Negative'
    POSITIVE = 'Positive'

class Review:
    def __init__(self,text,rating):
        self.text = text
        self.rating = rating
        self.sentiment = self.get_sentiment()
    def get_sentiment(self):
        if self.rating <= 2:
            return Sentiment.NEGATIVE
        elif self.rating == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer():
    def __init__(self, reviews):
        self.reviews = reviews
    def get_text(self):
        return [i.text for i in self.reviews]
    def get_sentiment(self):
        return [i.sentiment for i in self.reviews]
    def evenly_distribute(self):
        negative = list(filter(lambda i: i.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda i: i.sentiment == Sentiment.POSITIVE, self.reviews))
        equal_pos = positive[:len(negative)]
        self.reviews = negative + equal_pos
        random.shuffle(self.reviews)
    

In [3]:
file_name = './Books_reviews.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))

In [4]:
reviews[5].rating

5.0

In [5]:
train,test = train_test_split(reviews, test_size=0.2,random_state=27)

In [6]:
train[0].text

'it was a good book overall. I am 10 years old. this book is meant for 8 and 9 year-olds. it was trying to make it appealing to ages 8-12, but it just isn\'t working out. I didn\'t understand the "romance" between Cassandra and Apllo.'

In [7]:
train_x, train_y = [i.text for i in train], [i.sentiment for i in train]
test_x, test_y = [i.text for i in test], [i.sentiment for i in test]

### Bag of words vectorization

In [8]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [9]:
vectorizer = TfidfVectorizer()
vector_x = vectorizer.fit_transform(train_x)
vector_test_x = vectorizer.transform(test_x)

### Building models

In [10]:
from sklearn import svm

clf_svc = svm.SVC(kernel='linear')
clf_svc.fit(vector_x,train_y)

SVC(kernel='linear')

In [11]:
clf_svc.predict(vector_test_x[0])

array(['Positive'], dtype='<U8')

In [12]:
from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier(random_state=27)
clf_tree.fit(vector_x,train_y)

DecisionTreeClassifier(random_state=27)

In [13]:
clf_tree.predict(vector_test_x[0])

array(['Positive'], dtype='<U8')

In [14]:
from sklearn.naive_bayes import GaussianNB

clf_g = GaussianNB()
clf_g.fit(vector_x.toarray(),train_y)

GaussianNB()

In [15]:
clf_g.predict(vector_test_x[0].toarray())

array(['Positive'], dtype='<U8')

In [16]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression(max_iter=5000)
clf_log.fit(vector_x,train_y)

LogisticRegression(max_iter=5000)

In [17]:
clf_log.predict(vector_test_x[0])

array(['Positive'], dtype='<U8')

#### Scoring the models

Mean Accuracy

In [18]:
print('SVC Score: ', clf_svc.score(vector_test_x,test_y))
print('Tree Score: ', clf_tree.score(vector_test_x,test_y))
print('GNB Score: ', clf_g.score(vector_test_x.toarray(),test_y))
print('Log Score: ', clf_log.score(vector_test_x,test_y))

SVC Score:  0.8505
Tree Score:  0.764
GNB Score:  0.6335
Log Score:  0.85


F1 Score

In [19]:
from sklearn.metrics import f1_score

In [20]:
print('SVC f1 Score: ',f1_score(test_y, clf_svc.predict(vector_test_x),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print('Tree f1 Score: ',f1_score(test_y, clf_tree.predict(vector_test_x),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print('GNB f1 Score: ',f1_score(test_y, clf_g.predict(vector_test_x.toarray()),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print('LOG f1 Score: ',f1_score(test_y, clf_log.predict(vector_test_x),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

SVC f1 Score:  [0.92200557 0.12448133 0.36686391]
Tree f1 Score:  [0.87301587 0.1160221  0.18644068]
GNB f1 Score:  [0.77510427 0.12747253 0.14018692]
LOG f1 Score:  [0.92166667 0.16064257 0.2781457 ]


- We can see all models are performing inadequately on negative and neutral predictions, which leads me to believe we have uneven data 

In [21]:
print('POSITIVE:', train_y.count(Sentiment.POSITIVE))
print('NEGATIVE:', train_y.count(Sentiment.NEGATIVE))
print('NEUTRAL:', train_y.count(Sentiment.NEUTRAL))

POSITIVE: 6708
NEGATIVE: 523
NEUTRAL: 769


- We can see above that a vast majority of the data is positive so the models cannot train properly on negative and neutral

- We created a container class to allow us to shrink the number of positive reviews in the training data and shuffling the data

In [22]:
train_container = ReviewContainer(train)
train_container.evenly_distribute()
test_container = ReviewContainer(test)
test_container.evenly_distribute()

In [23]:
c_train_x = train_container.get_text()
c_train_y = train_container.get_sentiment()

c_test_x = test_container.get_text()
c_test_y = test_container.get_sentiment()

In [24]:
print(c_train_y.count(Sentiment.POSITIVE))
print(c_train_y.count(Sentiment.NEGATIVE))

523
523


- We can see we have the same number of positive and negative observations now

In [25]:
vector_cont_x = vectorizer.fit_transform(c_train_x)
vector_cont_test_x = vectorizer.transform(c_test_x)

In [26]:
clf_svc = svm.SVC(kernel='linear')
clf_svc.fit(vector_cont_x,c_train_y)

SVC(kernel='linear')

In [27]:
clf_tree = DecisionTreeClassifier(random_state=27)
clf_tree.fit(vector_cont_x,c_train_y)

DecisionTreeClassifier(random_state=27)

In [28]:
clf_g = GaussianNB()
clf_g.fit(vector_cont_x.toarray(),c_train_y)

GaussianNB()

In [29]:
clf_log = LogisticRegression(max_iter=5000)
clf_log.fit(vector_cont_x,c_train_y)

LogisticRegression(max_iter=5000)

In [30]:
print('SVC Score: ', clf_svc.score(vector_cont_test_x,c_test_y))
print('Tree Score: ', clf_tree.score(vector_cont_test_x,c_test_y))
print('GNB Score: ', clf_g.score(vector_cont_test_x.toarray(),c_test_y))
print('Log Score: ', clf_log.score(vector_cont_test_x,c_test_y))

SVC Score:  0.859504132231405
Tree Score:  0.7148760330578512
GNB Score:  0.6074380165289256
Log Score:  0.8677685950413223


In [31]:
print('SVC f1 Score: ',f1_score(c_test_y, clf_svc.predict(vector_cont_test_x),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))
print('Tree f1 Score: ',f1_score(c_test_y, clf_tree.predict(vector_cont_test_x),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))
print('GNB f1 Score: ',f1_score(c_test_y, clf_g.predict(vector_cont_test_x.toarray()),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))
print('LOG f1 Score: ',f1_score(c_test_y, clf_log.predict(vector_cont_test_x),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))

SVC f1 Score:  [0.86178862 0.85714286]
Tree f1 Score:  [0.71369295 0.71604938]
GNB f1 Score:  [0.6090535  0.60580913]
LOG f1 Score:  [0.86885246 0.86666667]


#### Model tuning using gridsearch

- Going to be using the SVC model for tuning as it has the best scores as a base model 

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
param_grid = {'kernel':['linear','rbf','poly','sigmoid'],
              'C':[1,2,4,8,16,32],
              'gamma':['scale','auto'],
              'degree':[1,3,5]}
tuner = GridSearchCV(clf_svc, param_grid,verbose=1, cv=5)
tuner.fit(vector_cont_x,c_train_y)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


GridSearchCV(cv=5, estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 2, 4, 8, 16, 32], 'degree': [1, 3, 5],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
             verbose=1)

In [34]:
tuner.best_params_

{'C': 2, 'degree': 1, 'gamma': 'scale', 'kernel': 'rbf'}

In [35]:
print(tuner.score(vector_cont_test_x,c_test_y))
print('Tuned Score: ',f1_score(c_test_y, tuner.predict(vector_cont_test_x),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))

0.8636363636363636
Tuned Score:  [0.86639676 0.86075949]


### Saving the model

In [38]:
import pickle

with open('./sentiment_classifier.pkl','wb') as f:
    pickle.dump(tuner,f)