In [2]:
import json
import os
import pickle
import random

from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfVectorizer as tf
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.metrics import f1_score 
from sklearn.model_selection import GridSearchCV as gs

In [3]:
class Sentiment:
    """String constants used as the class labels for a review."""

    POSITIVE = 'POSITIVE'
    NEGATIVE = 'NEGATIVE'

class Review:
    """One review: its raw text, its numeric score, and a derived sentiment label."""

    def __init__(self, text, score):
        """Store the text and score, and compute the sentiment label once.

        Args:
            text: The review body.
            score: Numeric rating; compared against 3 to derive the label.
        """
        self.text = text
        self.score = score
        # Computed at construction time only; reassigning `score` later does
        # not refresh this attribute.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return Sentiment.POSITIVE when score >= 3, else Sentiment.NEGATIVE."""
        return Sentiment.POSITIVE if self.score >= 3 else Sentiment.NEGATIVE

class ReviewContainer:
    """Wraps a list of Review objects and exposes them as feature/label lists."""

    def __init__(self, reviews):
        """Keep a reference to `reviews`, a list of objects with `.text` and `.sentiment`."""
        self.reviews = reviews

    def evenly_dist(self):
        """Downsample so negative and positive reviews are equally represented.

        Replaces `self.reviews` with a new, shuffled list containing the same
        number of NEGATIVE and POSITIVE reviews. The list object originally
        passed to the constructor is not modified.
        """
        neg = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        pos = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes to the size of the smaller one. Trimming only the
        # positives leaves the data unbalanced whenever negatives are the
        # majority class.
        keep = min(len(neg), len(pos))
        self.reviews = neg[:keep] + pos[:keep]
        # Uses the global `random` state; seed it beforehand for a
        # reproducible order.
        random.shuffle(self.reviews)

    def get_feature(self):
        """Return the review texts, in the current order of `self.reviews`."""
        return [x.text for x in self.reviews]

    def get_label(self):
        """Return the sentiment labels, aligned index-for-index with get_feature()."""
        return [x.sentiment for x in self.reviews]
        

In [4]:
# getting the data in and loading it into a list
file_name = './books_small.json'

reviews = []
# The file is read as JSON Lines: one review object per line. Decode as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # json.loads would reject.
        if not line.strip():
            continue
        review = json.loads(line)
        # Only the review body and the overall rating are kept.
        reviews.append(Review(review['reviewText'], review['overall']))
        
    

In [5]:
# splitting the data into test and training sets
# ReviewContainer.evenly_dist() shuffles with the global `random` module, so
# seed it here as well as passing random_state to the splitter; otherwise the
# row order of train/test changes on every run.
SEED = 42
random.seed(SEED)

# test_size is an absolute count (an int): roughly one third of all reviews.
train, test = tts(reviews, test_size=round((1/3)*len(reviews)), random_state=SEED)

# Balance each split so both sentiment classes are equally represented.
train_cont = ReviewContainer(train)
train_cont.evenly_dist()

test_cont = ReviewContainer(test)
test_cont.evenly_dist()

# getting the features and the labels
train_x = train_cont.get_feature()
train_y = train_cont.get_label()

test_x = test_cont.get_feature()
test_y = test_cont.get_label()


In [6]:
# Text vectorization: turn each review into a TF-IDF weighted term vector
# (tf is the TfidfVectorizer alias from the imports cell).
vectorizer = tf()
# Fit on the training text only, so nothing is learned from the test reviews.
train_x_vectors = vectorizer.fit_transform(train_x)

# Reuse the fitted vectorizer (transform, not fit_transform) for the test text.
test_x_vectors = vectorizer.transform(test_x)

In [7]:
# fitting svc model
# Linear-kernel support vector classifier trained on the TF-IDF vectors.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [8]:
# decision tree classifier
# Fix random_state: the tree's split search is randomized, so an unseeded
# tree can give a different model (and accuracy) on every run.
clf_dtree = dtc(random_state=42)
clf_dtree.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [9]:
# KNN time!
# 10 nearest neighbours, with closer neighbours weighted more heavily.
clf_knn = knn(n_neighbors=10, weights='distance')
clf_knn.fit(train_x_vectors, train_y)


KNeighborsClassifier(n_neighbors=10, weights='distance')

In [10]:
# more formal eval- accuracy score
# Report test-set accuracy for each fitted model, in the order they were trained.
for prefix, model in (
    ("SVM accuracy is ", clf_svm),
    ("Decision tree accuracy is ", clf_dtree),
    ("KNN accuracy is ", clf_knn),
):
    print(prefix + str(model.score(test_x_vectors, test_y)))

# f1 scores
# Per-class F1 for the SVM, returned in the order [POSITIVE, NEGATIVE].
f1_score(
    test_y,
    clf_svm.predict(test_x_vectors),
    average=None,
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
)
# To inspect the other models, pass clf_dtree.predict(test_x_vectors) or
# clf_knn.predict(test_x_vectors) as the second argument instead.

SVM accuracy is 0.7757009345794392
Decision tree accuracy is 0.6448598130841121
KNN accuracy is 0.6845794392523364


array([0.77464789, 0.77674419])

In [11]:
# tuning params automatically with grid search
# Search space: two kernels crossed with five values of C, each candidate
# evaluated with 5-fold cross-validation on the training vectors.
params = dict(kernel=('linear', 'rbf'), C=(1, 4, 8, 16, 32))
svc = svm.SVC()
clf = gs(estimator=svc, param_grid=params, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [12]:
# Score the grid-searched model on the held-out test vectors.
# In the recorded run this equals the plain linear SVM's accuracy printed
# earlier, i.e. the search did not improve on it for this data.
clf.score(test_x_vectors, test_y)

0.7757009345794392

In [14]:
# saving model
# Create the target directory first; open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./models', exist_ok=True)
# NOTE(review): only the fitted grid search is pickled. It expects TF-IDF
# vectors as input, so the fitted `vectorizer` must also be persisted (or
# rebuilt identically) before this model can score raw text in a new session.
with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [15]:
# Reload the pickled model to confirm it round-trips through disk.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('./models/sentiment_classifier.pkl', 'rb') as model_file:
    loaded_clf = pickle.load(model_file)

In [16]:
# Smoke test: the reloaded model should return one label for the first test row.
# The input must already be vectorized; raw text has to go through
# `vectorizer.transform` first.
loaded_clf.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')