# Importing modules

In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import json

# Load Data

In [2]:
class Sentiment:
    POSITIVE = 'POSITIVE'
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'

In [3]:
class Review:
    def __init__(self,t,s):
        self.text = t
        self.score = s
        self.sentiment = self.getSentiment()
    def getSentiment(self):
        if self.score <=2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: # for score 4 and 5
            return Sentiment.POSITIVE

In [4]:
reviews = []
i = 0
filePath = 'Data\Sentiment\Books_small_10000.json'
with open(filePath) as f:
    for line in f:
        # print(line)
        # line = line.replace("\'", "\"")
        # print(i)
        # i+=1
        review = json.loads(line)
        # print(review['overall'])
        reviews.append(Review(review['reviewText'],review['overall']))
        # break
reviews[0].text[:89]

'I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ag'

In [5]:
reviews[4].sentiment

'NEUTRAL'

# Prepared Data

In [6]:
training,testing = train_test_split(reviews,test_size=0.33,random_state=42)

In [7]:
trainX = [x.text for x in training]
trainY = [y.sentiment for y in training]
testX = [x.text for x in testing]
testY = [y.sentiment for y in testing]

# Bag of word Vectorization

In [8]:
vector = CountVectorizer()
# vector.fit(trainX)
# trainVectorX = vector.transform(trainX)
# vector.fit(testX)
trainVectorX = vector.fit_transform(trainX)
testVectorX = vector.transform(testX)
print(trainVectorX.shape)
print(testVectorX.shape)


(6700, 26615)
(3300, 26615)


In [9]:
testY[0]

'POSITIVE'

# Classification

## linear SVM

In [10]:
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(trainVectorX,trainY)
svm.predict(testVectorX[0])

array(['POSITIVE'], dtype='<U8')

## Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(trainVectorX,trainY)
dtc.predict(testVectorX[0])

array(['POSITIVE'], dtype='<U8')

## Naive Bayes

In [12]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(trainVectorX.todense(),trainY)
gnb.predict(testVectorX.todense()[0])

array(['POSITIVE'], dtype='<U8')

## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
lor = LogisticRegression(random_state=0)#.fit(X, y)
lor.fit(trainVectorX,trainY)
lor.predict(testVectorX[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

## Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=2, random_state=0)
rfc.fit(trainVectorX,trainY)
rfc.predict(testVectorX[0])

array(['POSITIVE'], dtype='<U8')

# Evalution (check accuracy)

## Mean Accuracy

In [15]:
print('SVM:\t\t\t',svm.score(testVectorX,testY)*100)
print('Decision Tree:\t\t',dtc.score(testVectorX,testY)*100)
print('Naive Bayes:\t\t',gnb.score(testVectorX.todense(),testY)*100)
print('Logistic Regression:\t',lor.score(testVectorX,testY)*100)
print('Random Forest:\t\t',rfc.score(testVectorX,testY)*100)

SVM:			 81.24242424242424
Decision Tree:		 76.87878787878788
Naive Bayes:		 65.87878787878788
Logistic Regression:	 84.0909090909091
Random Forest:		 83.84848484848484


## F1 Score

In [16]:
from sklearn.metrics import f1_score
print('SVM:\t\t\t',f1_score(testY,(svm.predict(testVectorX)),average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE,Sentiment.NEUTRAL]))
print('Decision Tree:\t\t',f1_score(testY,(dtc.predict(testVectorX)),average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE,Sentiment.NEUTRAL]))
print('Naive Bayes:\t\t',f1_score(testY,(gnb.predict(testVectorX.todense())),average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE,Sentiment.NEUTRAL]))
print('Logistic Regression\t',f1_score(testY,(lor.predict(testVectorX)),average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE,Sentiment.NEUTRAL]))
print('Random Forest\t\t',f1_score(testY,(rfc.predict(testVectorX)),average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE,Sentiment.NEUTRAL]))

SVM:			 [0.90738061 0.40268456 0.2656    ]
Decision Tree:		 [0.87384177 0.19259259 0.15780446]
Naive Bayes:		 [0.7996939  0.11851852 0.1260745 ]
Logistic Regression	 [0.92139968 0.40983607 0.29250457]
Random Forest		 [0.91214768 0.         0.        ]


In [17]:
trainY.count(Sentiment.NEUTRAL)

653

In [18]:
funTest = ['Equal']
FT = vector.transform(funTest)
print('SVM:',svm.predict(FT))
print('DTC:',dtc.predict(FT))
print('GNB:',gnb.predict(FT.todense()))
print('LOR:',lor.predict(FT))
print('RFC:',rfc.predict(FT))

SVM: ['POSITIVE']
DTC: ['POSITIVE']
GNB: ['POSITIVE']
LOR: ['POSITIVE']
RFC: ['POSITIVE']


# Tuning Our model with Grid Search

from sklearn.model_selection import GridSearchCV

paramiters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32)}
svm = SVC()
clf = GridSearchCV(svm,paramiters,cv=5)
clf.fit(trainVectorX,trainY)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [19]:
svm1 = SVC(kernel='rbf')
svm1.fit(trainVectorX,trainY)

SVC()

In [20]:
svm1.score(testVectorX,testY)

0.8384848484848485

# Pick a model

## save model

In [29]:
import pickle
with open('model\sentimentClassifierSvm.pkl', 'wb') as f:
    pickle.dump(svm1,f)

## Load Model

In [30]:
with open('model\sentimentClassifierSvm.pkl', 'rb') as f:
    loadedSvm = pickle.load(f)

In [31]:
testY[0]

'POSITIVE'

In [32]:
loadedSvm.predict(testVectorX[0])

array(['POSITIVE'], dtype='<U8')

In [33]:
# Logistic Regression

lor1 = LogisticRegression(random_state=None)#.fit(X, y)
lor1.fit(trainVectorX,trainY)
lor1.score(testVectorX,testY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8409090909090909

In [34]:
with open('model\sentimentClassifierLor.pkl', 'wb') as f:
    pickle.dump(lor1,f)

In [35]:
with open('model\sentimentClassifierLor.pkl', 'rb') as f:
    loadedLor = pickle.load(f)