## Using Class Functions to Improve Readability

In [1]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"


class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: 
            return Sentiment.POSITIVE

## Loading The Data

In [2]:
import json

file_name = '/Users/jackie/Downloads/Books small 10000.json'

reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append((Review(review['reviewText'],review['overall'])))




## Preparing The Data

In [3]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [4]:
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]




#### Bag of Words Vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

train_x_vector = vectorizer.fit_transform(train_x)
test_x_vector = vectorizer.transform(test_x)





## Classification

#### Linear SVM

In [6]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vector, train_y)

print(test_x[186])
clf_svm.predict(test_x_vector[186])

the book was meant for young adults  and I as an older adult still enjoyed the story and I have always enjoyed Sandra Dallas books and her style of writing whether for the young or old


array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_vector, train_y)

clf_dec.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB

train_x_vector_dense = train_x_vector.toarray()
test_x_vector_dense = test_x_vector.toarray()

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vector_dense, train_y)

single_test_vector = test_x_vector_dense[193].reshape(1, -1)

clf_gnb.predict(single_test_vector)



array(['NEGATIVE'], dtype='<U8')

#### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression()

clf_lr.fit(train_x_vector,train_y)
clf_lr.predict(test_x_vector[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

## Evaluation

In [10]:
print(clf_svm.score(test_x_vector, test_y))
print(clf_dec.score(test_x_vector,test_y))
print(clf_lr.score(test_x_vector,test_y))


0.8124242424242424
0.7666666666666667
0.8409090909090909


#### F1 Scores

In [11]:
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vector), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])
f1_score(test_y, clf_lr.predict(test_x_vector), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])

array([0.92139968, 0.29250457, 0.40983607])

In [12]:
train_y.count(Sentiment.NEUTRAL)

653