In [2]:
import random

class Sentiment: 
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

# using classes makes it easier for others to interpret    
class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
        
    def get_text(self): # returning text list 
        return [x.text for x in self.reviews]
    
    def get_sentiment(self): # returning sentiment list
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):  # we see that the number of positive reviews outweights
        # our number of negative reviews which affects our final result and accuracy,
        # To balance the data to get better output use this function
        negative = list(filter(lambda x : x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x : x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)


In [3]:
import json

filename = 'Books_small_10000.json'


reviews = []

with open(filename) as f:
    for line in f:
        review = json.loads(line)
        #reading json file line by line, one line contains review and others stuff, we are reading only the data we need
        
        #print(review['reviewText'])
        #print(review['overall'])
        reviews.append(Review(review['reviewText'], review['overall']))
        
        
        
#print(reviews[5])
#print(reviews[5][0])

#pint(reviews[5].text)
#print(reviews[5].score)
#print(reviews[5].sentiment)




#press shift+TAB to see documentation details


In [15]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)


train_container = ReviewContainer(training)
test_container = ReviewContainer(test)


#cont.evenly_distribute()


#len(cont.reviews)

In [5]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()


test_container.evenly_distribute() # evenly distributing the data
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))


436
436


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The advantage of using Tfidfvectorizer OVER CountVectorizer is that, consider 2 sentence
# This book is so great!
# This was very bad
# Tfdidf is smart vectorizer, as 'This' occurs very frequently, its weight gets lower and lower, 
# whereas for the word 'great' creates a heavier impact of its less frequency and hence result higher weight
#vectorizer = CountVectorizer()


vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x) #train_x_vectors contains the results which is 0 and 1


test_x_vectors = vectorizer.transform(test_x)


print(train_x[0])
#print(train_x_vectors[0].toarray())




I typically do not read suspense, thrillers or books of such genres, but I thought I would try this book as the synopsis was very intriguing, reminding me of a Julia Roberts movie about an abused woman escaping her husband. The book indeed was suspenseful, and I will leave out details so as to not give away anything but I was suspended in what will happen. The writing is pretty good, which is why I usually avoid this genre. To my surprise it was well done and very entertaining to read.


In [7]:
# these are classifiers, do further studies to understand classifiers better
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors,train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0]) # predicting our value
# O/P = array(['POSITIVE'], dtype='<U8')




array(['NEGATIVE'], dtype='<U8')

In [8]:
from sklearn.tree import DecisionTreeClassifier

clf_dec =  DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y) 

clf_dec.predict(test_x_vectors[0])
# O/P = array(['POSITIVE'], dtype='<U8')

array(['POSITIVE'], dtype='<U8')

In [9]:
from sklearn.naive_bayes import GaussianNB

# this is showing error, GAlli didn't go through this

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])
# O/P = array(['POSITIVE'], dtype='<U8')


array(['POSITIVE'], dtype='<U8')

In [10]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

In [11]:
#Mean accuracy of our data
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_gnb.score(test_x_vectors,test_y))
print(clf_log.score(test_x_vectors,test_y))
# this is mean accuracy, but f1 accuracy is more accurate

0.8076923076923077
0.6298076923076923
0.6538461538461539
0.8052884615384616


In [12]:
from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,  Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,  Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_gnb.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,  Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,  Sentiment.NEGATIVE]))
# O/P BEFORE CHANGING TEST DATA (using evenly distribute)
# [0.85363477 0.28146853] We can see our previously calculated data increased signifcantly
# [0.77823074 0.18553689] This was run before changing our test data. Since most of the test data were positive
# [0.7590636  0.18646409] so, the answer was still biased towards positive
# [0.8783008  0.31077216]
# print(train_y.count(Sentiment.NEGATIVE))# 436
# print(train_y.count(Sentiment.NEGATIVE))# 436


# O/P AFTER CHANGING TEST DATA (using evenly distribute)
# [0.8028169  0.79310345]  # we can now see how high the accuracy got after balancing the data
# [0.62926829 0.63981043]
# [0.63356974 0.62102689]
# [0.82051282 0.808933  ]
 

[0.80582524 0.80952381]
[0.62254902 0.63679245]
[0.65714286 0.65048544]
[0.80291971 0.80760095]


In [13]:
test_set = ["great product", "bad book do not buy", "horrible waste of time"]
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

In [14]:
# didnt understand this much, learn later, this improves accuracy slightly

from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [16]:
print(clf.score(test_x_vectors, test_y))

0.8197115384615384


In [18]:
#pickle helps us to produce output of our model which we can later load and test data like we
# did in the cell below where we loaded and tested our own input

import pickle

with open('sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)
    


In [19]:
#loading data to test
with open('sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [50]:


test3 = ["I loved it! this is good", "horrible waste of time"]
test2 = vectorizer.transform(test3)
loaded_clf.predict(test2)

array(['POSITIVE', 'NEGATIVE'], dtype='<U8')