### Data Class

In [12]:
#enum class	
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

### Loading Data

In [13]:
import json

file_name = "./Books_small.json"
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))

print(reviews[5].sentiment)

POSITIVE


### Prepare data

In [14]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=69)

In [15]:
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]
test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

#### Bag of words

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)


### Classification 

#### Linear SVM

In [33]:
from sklearn import svm
clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(train_x_vectors, train_y)
clf_svm.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [37]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Logistic regression

In [38]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(train_x_vectors[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

### Evaluation

In [41]:
#Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8151515151515152
0.793939393939394
0.7818181818181819
0.8515151515151516


In [44]:
#F1 score
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL ,Sentiment.NEGATIVE])
f1_score(test_y, clf_gnb.predict(test_x_vectors), average = None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL ,Sentiment.NEGATIVE])

#The model predicts POSITIVE properly, but NEUTRAL and NEGATIVE are trash.

array([0.87985866, 0.26666667, 0.05882353])

In [46]:
train_y.count(Sentiment.NEUTRAL)

# So its data bad.

75