In [52]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Term frequency inverse document frequency

In [24]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

class ReviewContainer: 
    def __init__(self, reviews):
        self.reviews = reviews
    
    def get_text(self):
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self): # Whole point here is to have our score evaluate all sentiments evenly
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

In [3]:
import json

reviews = []
with open('Books_small_10000.json') as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))


In [25]:

training, testing = train_test_split(reviews, test_size = 0.2, random_state = 10)

train_container = ReviewContainer(training)
test_container = ReviewContainer(testing)

**Purpose CountVectorizer is to return a numerical matrix corresponding to the text we pass**

In [38]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

#train_y.count(Sentiment.POSITIVE)

In [53]:
count_vect = TfidfVectorizer()

train_x_vectors = count_vect.fit_transform(train_x)    # Each row is a vector of 0's or n's if the word in that index exists 
test_x_vectors = count_vect.transform(test_x) # note we dont want to fit again


## **Classification**

#### Linear SVM

In [7]:
from sklearn import svm

In [54]:
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [9]:
clf_svm.predict(test_x_vectors[2])

array(['NEUTRAL'], dtype='<U8')

#### Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [55]:
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [89]:
clf_dec.predict(train_x_vectors[10])

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [56]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

#### **Evaluation**

In [57]:
# Mean Accuracy

print('Score SVM: ' + str(clf_svm.score(test_x_vectors, test_y)))
print('Score Decision Tree: ' + str(clf_dec.score(test_x_vectors, test_y)))
print('Score Logistic: ' + str(clf_log.score(test_x_vectors, test_y)))

Score SVM: 0.8457943925233645
Score Decision Tree: 0.6448598130841121
Score Logistic: 0.8551401869158879


In [58]:
# F1 Score
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]) # SVM

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


array([0.85067873, 0.        , 0.84057971])

In [59]:
f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]) # Dec

array([0.65765766, 0.        , 0.63106796])

In [60]:
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]) # Log

array([0.85581395, 0.        , 0.85446009])

**Usage**

In [74]:
test_set = ['This was an unbelievable book. Every day I read it and wonder why I even bought it. I am really considering just selling it','What were they thinking?','Amazing! Thoroughly enjoyed']
new_test = count_vect.transform(test_set)

clf_log.predict(new_test)

array(['NEGATIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

**Tuning Model**

In [70]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear','rbf'), 'C':(1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('linear', 'rbf'), 'C': (1, 4, 8, 16, 32)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [72]:
clf_log.score(test_x_vectors, test_y)

0.8551401869158879

**Saving Model**

In [75]:
import pickle

with open('./models/sentiment_classifier.pkl','wb') as f:
    pickle.dump(clf_log, f)

In [77]:
with open('./models/sentiment_classifier.pkl','rb') as f:
    loaded_clf = pickle.load(f)

In [80]:
loaded_clf.predict(test_x_vectors[0:5])

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE'],
      dtype='<U8')

In [81]:
test_x[0:5]

["As a huge fan of Carly Phillips I can't believe she wrote this.  I can't believe I actually finished it - and only did it out of respect for her other books.  The main heroine is boring and flip floppy in every other paragraph.  It read like this.... blah blah blah..... no I changed I mind.... blah blah blah... no I changed my mind again.  Same crap in EVERY chapter.  Don't waste your time.  Try any other of her books than this one.",
 'I love description and appreciate philosophical musings....but...this was  like reading a speech by someone mildly intoxicated and very boring.you know those people who.keep going on even after you have given them every social cue to stop.',
 "This was OK. I had problems finishing it and actually read the last chapter at about 65% and decided that it wasn't worth it to finish. That's very unusual for me, but it just wasn't my cup of tea. It started out pretty good and the concept was good, but I became uncomfortable reading it at about 1/4 of the way 