In [2]:
import random
import pickle
import json

### Classes

In [3]:
class Sentiment:
    """Namespace holding the sentiment label strings used as class labels."""

    # Plain string constants; compared with == wherever labels are checked.
    positive, neutral, negative = "Positive", "Neutral", "Negative"

class Review:
    """A single review: its text, its numeric score and the derived sentiment.

    Attributes:
        text: The review body.
        score: The numeric rating given with the review.
        sentiment: Label computed once, at construction, from ``score``.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived eagerly so every instance always carries a label.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``score`` to a label: 3 and above is positive, otherwise negative.

        ``Sentiment.neutral`` is never returned by this method.
        """
        return Sentiment.positive if self.score >= 3 else Sentiment.negative
        
class ReviewContainer:
    """Wraps a list of review objects and exposes them as parallel lists.

    Each review is expected to provide ``text`` and ``sentiment`` attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance the container to hold equally many positive and negative reviews.

        Both classes are truncated to the size of the smaller one, so the
        result is balanced regardless of which class is the majority.
        Reviews whose sentiment is neither positive nor negative are dropped.
        The kept reviews are then shuffled in place using the global
        ``random`` module state; ``self.reviews`` is rebound to a new list.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.negative]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.positive]
        # Trim BOTH sides: trimming only the positives leaves the data
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = positive[:keep] + negative[:keep]
        random.shuffle(self.reviews)

### Load Data

In [4]:
file_name = "./datas/books_small.json"

reviews = []
# The file is read line by line, each line holding one JSON object.
# An explicit encoding keeps decoding independent of the platform default.
with open(file_name, encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would reject.
            continue
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))

# Quick look at the first parsed review.
print(f"{reviews[0].text} \n{reviews[0].score} \n{reviews[0].sentiment}")

I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with. 
5.0 
Positive


### Data Preparation

In [5]:
from sklearn.model_selection import train_test_split

# One seed for both the split and the shuffles below, so a fresh
# "Restart & Run All" reproduces the same train/test contents and ordering.
SEED = 50

training, test = train_test_split(reviews, test_size=1/5, random_state=SEED)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# evenly_distribute() shuffles via the global `random` module; without a seed
# the order of train_x / test_x changes on every run.
random.seed(SEED)

train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Class balance of the training labels after rebalancing.
print(train_y.count(Sentiment.positive))
print(train_y.count(Sentiment.negative))

500
500


### Word Vectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Turn the raw review text into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
# Fit on the training text only ...
train_x_vectors = vectorizer.fit_transform(train_x)

# ... and reuse the fitted vectorizer (no refit) for the test text.
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review for reference.
print(train_x[0])
# print(train_x_vectors[0].toarray())

Don is that lovable kind of person you know who drives you insane at the same time.  It was a good read with the right amount of humor.  Easy to get an image of each character.  I enjoyed it and I will look for the sequel.  I hope for some more wit!


### Classification (Linear SVM)

In [7]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the TF-IDF vectors.
class_svm = svm.SVC(kernel='linear')
class_svm.fit(train_x_vectors, train_y)

# Sanity check: predicted label for the first test review.
class_svm.predict(test_x_vectors[0])

array(['Negative'], dtype='<U8')

### Classification (Decision Tree)

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state: without it the fitted tree (and therefore its
# predictions and scores) can differ from one run to the next.
class_dec = DecisionTreeClassifier(random_state=0)
class_dec.fit(train_x_vectors, train_y)

# Sanity check: predicted label for the first test review.
class_dec.predict(test_x_vectors[0])

array(['Positive'], dtype='<U8')

### Classification (Naive Bayes)

In [9]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB is given dense arrays here, hence the .toarray() conversions.
class_gnb = GaussianNB()
class_gnb.fit(train_x_vectors.toarray(), train_y)

# Sanity check: predicted label for the first test review.
class_gnb.predict(test_x_vectors[0].toarray())

array(['Negative'], dtype='<U8')

### Classification (Logistic Regression)

In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, trained on the TF-IDF vectors.
class_log = LogisticRegression()
class_log.fit(train_x_vectors, train_y)

# Sanity check: predicted label for the first test review.
class_log.predict(test_x_vectors[0])

array(['Negative'], dtype='<U8')

### Classification (Perceptron)

In [11]:
from sklearn.linear_model import Perceptron

# Perceptron with a fixed seed so repeated runs give the same model.
class_per = Perceptron(tol=1e-3, random_state=0)
class_per.fit(train_x_vectors, train_y)

# Sanity check: predicted label for the first test review.
class_per.predict(test_x_vectors[0])

array(['Negative'], dtype='<U8')

### Evaluation

In [12]:
# Mean Accuracy
# Each classifier is paired with the feature matrix it is scored on;
# GaussianNB gets a dense copy, the others the sparse TF-IDF matrix.
scoring_inputs = [
    (class_svm, test_x_vectors),
    (class_dec, test_x_vectors),
    (class_gnb, test_x_vectors.toarray()),
    (class_log, test_x_vectors),
    (class_per, test_x_vectors),
]
for classifier, features in scoring_inputs:
    print(classifier.score(features, test_y))

0.8263888888888888
0.6701388888888888
0.5833333333333334
0.8159722222222222
0.7430555555555556


In [13]:
# F1 Scores
from sklearn.metrics import f1_score

# Per-class F1 (average=None), reported in the order [positive, negative].
label_order = [Sentiment.positive, Sentiment.negative]

# GaussianNB predicts from a dense copy; the others from the sparse matrix.
f1_inputs = [
    (class_svm, test_x_vectors),
    (class_dec, test_x_vectors),
    (class_gnb, test_x_vectors.toarray()),
    (class_log, test_x_vectors),
    (class_per, test_x_vectors),
]
for classifier, features in f1_inputs:
    print(f1_score(test_y, classifier.predict(features), average=None, labels=label_order))

[0.82638889 0.82638889]
[0.66898955 0.67128028]
[0.58333333 0.58333333]
[0.81911263 0.81272085]
[0.76282051 0.71969697]


In [14]:
# Test Data
# A few hand-written snippets to eyeball the SVM's behaviour.
test_set = [
    "very fun",
    "bad book do not buy",
    "horrible waste of time",
]
# Reuse the already-fitted vectorizer so the features line up with training.
new_test = vectorizer.transform(test_set)

class_svm.predict(new_test)

array(['Positive', 'Negative', 'Negative'], dtype='<U8')

### Tuning Model

In [15]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the SVC: kernel type and C value.
parameters = {
    "kernel": ["linear", "rbf"],
    "C": [1, 4, 8, 16, 32],
}

tuned_svc = svm.SVC()
tuned_class = GridSearchCV(tuned_svc, parameters)
tuned_class.fit(train_x_vectors, train_y)

# Report the winning parameters, then accuracy and per-class F1 on the test set.
print(tuned_class.best_params_)
print(tuned_class.score(test_x_vectors, test_y))
print(f1_score(test_y, tuned_class.predict(test_x_vectors), average=None, labels=[Sentiment.positive, Sentiment.negative]))

{'C': 1, 'kernel': 'rbf'}
0.8229166666666666
[0.82474227 0.82105263]


### Saving Model

In [16]:
import os

model_path = "./models/sentiment_classifier.pkl"

# Create the target directory on first run; open(..., "wb") does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# NOTE: only the tuned classifier is stored. The fitted `vectorizer` is not
# part of this file, so inputs must be vectorized with that same fitted
# vectorizer before calling predict on the reloaded model.
with open(model_path, "wb") as file:
    pickle.dump(tuned_class, file)

### Load Model

In [17]:
# Caution: pickle.load can execute arbitrary code while unpickling, so only
# load model files that you created yourself or otherwise trust.
with open("./models/sentiment_classifier.pkl", "rb") as model_file:
    tuned_model = pickle.load(model_file)

# Sanity check: the reloaded model predicts on an already-vectorized review.
first_test_vector = test_x_vectors[0]
tuned_model.predict(first_test_vector)

array(['Negative'], dtype='<U8')