In [1]:
###DATA CLASS
import random



class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text;
        self.score = score;
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <=2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #score of 4 or 5
            return Sentiment.POSITIVE

        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return[x.text for x in self.reviews]
    
    def get_sentiment(self):
        return[x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        
        #print(negative[0].text)
        #print(len(negative))
        #print(len(positive))

In [46]:
import json

file_name = './data/sentiment/books_small_10000.json'

reviews=[]
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[5].text
        

'I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia\'s trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\'s.'

In [47]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews,test_size=0.45, random_state = 43)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)
#cont.evenly_distribute()



In [48]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train = train_container.get_sentiment()


test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

362
362


In [49]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit_transform(train_x)

<724x8202 sparse matrix of type '<class 'numpy.float64'>'
	with 46300 stored elements in Compressed Sparse Row format>

In [50]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

#vectorizer.fit(train_x)

train_x_vectors = vectorizer.transform(train_x)

print(train_x[0])
print(train_x_vectors[0].toarray())


I enjoyed meeting a new pack.  The opening chapters grabbed me and I was hooked. The story was complete but the villain is still out there, so I'll be looking for the next installment.
[[0. 0. 0. ... 0. 0. 0.]]


In [51]:
###LINEAR SVM


from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

test_x[0]


clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [52]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)


clf_dec.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

In [53]:
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels =[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])



  _warn_prf(


array([0.82181818, 0.        , 0.83044983])

In [54]:
test_set = ['very good', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

AttributeError: 'list' object has no attribute 'todense'

Tuning our model with Grid Search

In [32]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C':(1,4,8,16,32)}


svc = svm.SVC()
clf = GridSearchCV (svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [33]:
print(clf.score(test_x_vectors, test_y))

0.8333333333333334


### SAVING MODEL

In [34]:
import pickle

with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [35]:
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [36]:
test_set = ['very quick speeds', "loved the necklace", 'bad']
new_test = vectorizer.transform(test_set)


clf.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

In [37]:
### MATRIX

ValueError: could not convert string to float: "I'm not sure what I'd expected from a book as broadly titled as &#34;Cooking with Beans,&#34; but it was something other than this. When I think about cooking with beans, I'm surely not thinking about cooking with meat cooking with beans, but evidently Ms. Jones is. A vegetarian recipe book this is NOT. When the first ingredient in a 4-bean casserole is a half pound of bacon, there's trouble brewing in this kitchen.I was interested in the garbanzo bean-based chocolate cake, but even here, the first ingredient isn't cacao or even cacao, but 2 cups (yet) of chocolate chips. I'm looking for more natural, healthier, wiser recipes than these. They don't seem to be in there, and I've deleted this book from my electronic library already."