In [87]:
import random
class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <=2 :
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else : #score of 4 or 5
            return Sentiment.POSITIVE
        
class ReviewsContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self) :
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        print(negative[0].text)
        print(len(negative))
        print(len(positive))
        

    

In [38]:
ls

Books_small.json        Books_small_10000.json


## Loading the data from json file

In [66]:
import json

file_name = './books_small_10000.json'

reviews = []

with open(file_name) as f :
    for line in f:
        review = json.loads(line)
        #print(review['reviewText'])
        #print(review['overall'])
        reviews.append(Review(review['reviewText'], review['overall']))
    
reviews[5].text


'I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia\'s trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\'s.'

In [67]:
from sklearn.model_selection import train_test_split

In [120]:
train, test = train_test_split(reviews, test_size = 0.33, random_state = 42)

train_container = ReviewsContainer(train)
test_container = ReviewsContainer(test)

#cont.evenly_distribute()

#len(cont.reviews)

In [121]:
train_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

#print(train_y.count(Sentiment.POSITIVE))
#print(train_y.count(Sentiment.NEGATIVE))

It was just one of  those books that never went anywhere. I like books that get your attention in the beginning and not drag out until a quarter way through. I decided to give it an early death - delete!
436
5611
Story is very inaccurate with modern words, phrases and actions.  In the second chapter the author has the bagpipes playing "Amazing Grace" and according to her it is a song as old as time.  As someone who learned to play Amazing Grace on the piano I can state for a fact the song is not old as time. It was not even published until 1779; author has the book set in 1714. 65 years before John Newton wrote and published the songFiona and Juliet speak like they are in the 21 century. Not a young miss in the early 18th century.I have no problem reading about God in books. My problem is when authors take too much leeway and write using modern phrases in historical books.Really, wondering if this author did any 'real' research or just used what she remembered from high school world hi

In [122]:
len(train_x)

872

In [123]:
print(train[0].text)

Olivia Hampton arrives at the Dunraven family home as cataloger of their extensive library. What she doesn't expect is a broken carriage wheel on the way. Nor a young girl whose mind is clearly gone, an old man in need of care himself (and doesn&#8217;t quite seem all there in Olivia&#8217;s opinion). Furthermore, Marion Dunraven, the only sane one of the bunch and the one Olivia is inexplicable drawn to, seems captive to everyone in the dusty old house. More importantly, she doesn't expect to fall in love with Dunraven's daughter Marion.Can Olivia truly believe the stories of sadness and death that surround the house, or are they all just local neighborhood rumor?Was that carriage trouble just a coincidence or a supernatural sign to stay away? If she remains, will the Castle&#8217;s dark shadows take Olivia down with them or will she and Marion long enough to declare their love?Patty G. Henderson has created an atmospheric and intriguing story in her Gothic tale. I found this to be an

### Bag of words vectorization 

In [139]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

Yet another beautifully written and beautifully illustrated Jan Brett book. I think this is one of the prettiest ones yet.
[[0. 0. 0. ... 0. 0. 0.]]


# Classification 

### Linear SVM 

In [140]:
from sklearn import svm 
clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vectors, train_y)
test_x[0]
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Trees

In [141]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [142]:
from sklearn.naive_bayes import MultinomialNB
clf_mnb = MultinomialNB()

clf_mnb.fit(train_x_vectors,train_y) 
clf_mnb.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Let's evaluate our models

In [143]:
# Mean accuracy
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_mnb.score(test_x_vectors,test_y))

0.8076923076923077
0.6490384615384616
0.8125


In [74]:
# The three classifiers did appearently a good job, but still sceptical about the metrics, let's check the F1 score

In [144]:
from sklearn.metrics import f1_score 

f1_score(test_y, clf_svm.predict(test_x_vectors),average=None, labels = [#Sentiment.NEUTRAL, 
                                                                        Sentiment.NEGATIVE, Sentiment.POSITIVE])

array([0.80952381, 0.80582524])

In [145]:
f1_score(test_y, clf_dec.predict(test_x_vectors),average=None, labels = [#Sentiment.NEUTRAL, 
                                                                        Sentiment.NEGATIVE, Sentiment.POSITIVE])

array([0.66046512, 0.63681592])

In [146]:
f1_score(test_y, clf_mnb.predict(test_x_vectors),average=None, labels = [#Sentiment.NEUTRAL, 
                                                                        Sentiment.NEGATIVE, Sentiment.POSITIVE])

array([0.82969432, 0.79144385])

In [147]:
train_y.count(Sentiment.NEGATIVE)
test_y.count(Sentiment.NEGATIVE)

208

In [149]:
train_y[0:5]

['POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE']

In [150]:
## Qualitative check..
test_set = ['very fun', 'bad book,do not buy', 'horrible, just a waste of time']
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

# Fine-Tuning the model (with Grid Search)

In [152]:
from sklearn.model_selection import GridSearchCV

In [154]:
parameters = {'kernel' : ('linear','rbf'), 'C' : (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv = 5)
clf.fit(train_x_vectors,train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [156]:
print(clf.score(test_x_vectors,test_y))

0.8076923076923077


In [None]:
# we can use the rbf kernel, but i'm not sure it would improve much.. I would recommend rather improving the dataset
# quality as following, like stripping the punctuation marks

## SAVING the model 

### we want to save the classify so that we don't have to train it for further times..

In [169]:
import pickle

with open('./sentiment_classifier.pkl','wb') as f :
    pickle.dump(clf,f)
with open('./category_vectorizer.pkl','wb') as f :
    pickle.dump(vectorizer,f)

In [167]:
with open('./sentiment_classifier.pkl','rb') as f :
    loaded_clf = pickle.load(f)
with open('./sentiment_classifier.pkl','rb') as f :
    loaded_clf = pickle.load(f)



In [168]:
loaded_clf.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# This section is for metrics verification and heatmap plotting for features handling

In [181]:
#from sklearn.metrics import confusion_matrix
#import pandas as pd
#import seaborn as sns
#import matplotlib as plt

In [None]:
#y_pred = clf.predict(test_x)

#labels = [Category.ELECTRONICS, Category.BOOKS, Category.CLOTHING, Category.GROCERY, Category.PATIO]

#cm = confusion_matrix(test_y, y_pred, labels = labels)
#df_cm = pd.DataFrame(cm,index = reverse(labels), columns = labels)

#sns.heatmap(df_cm, annot = True, fmt = 'd')