# Review Sentiment

In [69]:
class Sentiment:
    """Namespace of string labels for the three sentiment classes.

    Review.get_sentiment returns one of these constants, so they end up as
    the target labels of the classifiers trained on the reviews.
    """
    # Each constant's value is simply its own name as a plain string.
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    """A single review: its text, its numeric score and the derived sentiment."""

    def __init__(self, text, score):
        """Store the review text and score, then derive the sentiment label.

        Args:
            text: The body of the review.
            score: The numeric rating attached to the review.
        """
        self.text = text
        self.score = score
        # Computed once at construction; later changes to `score` are not
        # reflected unless get_sentiment() is called again.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the stored score onto a Sentiment label.

        Returns:
            Sentiment.NEGATIVE for scores of 2 or lower, Sentiment.NEUTRAL for
            a score of exactly 3, and Sentiment.POSITIVE for every other score
            (typically 4 or 5).
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        # Everything else, i.e. a score of 4 or 5.
        return Sentiment.POSITIVE

# Load Data

In [70]:
import json

file_name = './books_small.json'

# The file is read as JSON Lines: every non-empty line is one JSON object
# holding (at least) the keys 'reviewText' and 'overall'.
reviews = []
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default, which can mis-decode or fail on non-ASCII review text.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file); json.loads would raise on them.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Peek at one loaded review as a sanity check.
reviews[5].text



'Love the book, great story line, keeps you entertained.for a first novel from this author she did a great job,  Would definitely recommend!'

# Prep Data

In [71]:
from sklearn.model_selection import train_test_split

# Hold out a portion of the reviews for evaluation; the rest is used for training.
training, test = train_test_split(
    reviews,
    test_size=0.33,   # roughly one third of the reviews goes to the test split
    random_state=42,  # fixed seed so the split is the same on every run
)

In [72]:
# Training split: review texts are the model inputs, sentiment labels the targets.
X_train = [rev.text for rev in training]
y_train = [rev.sentiment for rev in training]

# Test split, unpacked the same way.
X_test = [rev.text for rev in test]
y_test = [rev.sentiment for rev in test]

# Bag-of-Words Vectorization

In [73]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: each review text is turned into a vector of token counts.
vectorizer = CountVectorizer()

# Learn the vocabulary from the training texts and vectorize them in one step.
X_train_vectors = vectorizer.fit_transform(X_train)

# Vectorize the test texts with the already-fitted vectorizer (transform only,
# no refit), so the test data has no influence on the learned vocabulary.
X_test_vectors = vectorizer.transform(X_test)



# Classification

### Linear SVM (Support Vector Machine)

In [74]:
from sklearn import svm

# Build a support vector classifier with a linear kernel and train it on the
# vectorized training reviews (fit returns the fitted estimator itself).
clf_svm = svm.SVC(kernel='linear').fit(X_train_vectors, y_train)

# Sanity check: show the first held-out review, then the label predicted for it.
print(X_test[0])
clf_svm.predict(X_test_vectors[0])

Every new Myke Cole book is better than the last, and this is no exception. If you haven't read the Shadow Ops series before start with Control Point, but go ahead and order Fortress Frontier and Breach Zone as well - you're going to want them.


array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [75]:
from sklearn.tree import DecisionTreeClassifier

# Tree construction is randomized (features are permuted at each split), so
# without a seed the fitted tree and its accuracy vary from run to run.
# Pin random_state, using the same seed as the train/test split, to keep the
# notebook reproducible under Restart & Run All.
clf_dec = DecisionTreeClassifier(random_state=42)

clf_dec.fit(X_train_vectors, y_train)

# Label predicted for the first held-out review.
clf_dec.predict(X_test_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [76]:
from sklearn.naive_bayes import GaussianNB

# The count matrices are converted to dense arrays with .toarray() before being
# handed to GaussianNB, both for training and for prediction.
clf_gnb = GaussianNB().fit(X_train_vectors.toarray(), y_train)

# Label predicted for the first held-out review.
clf_gnb.predict(X_test_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

### Logistic Regression

In [77]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier with default settings on the
# vectorized training reviews (fit returns the fitted estimator itself).
clf_log = LogisticRegression().fit(X_train_vectors, y_train)

# Label predicted for the first held-out review.
clf_log.predict(X_test_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

In [83]:
# MEAN ACCURACY
# .score() returns the mean accuracy of a classifier's predictions on the given
# test data, measured against the true test labels.
# Printed one value per line, in this order:
# SVM, decision tree, naive Bayes, logistic regression.
print(
    clf_svm.score(X_test_vectors, y_test),
    clf_dec.score(X_test_vectors, y_test),
    clf_gnb.score(X_test_vectors.toarray(), y_test),  # GaussianNB is given dense input
    clf_log.score(X_test_vectors, y_test),
    sep='\n',
)

0.8242424242424242
0.7818181818181819
0.8121212121212121
0.8303030303030303


In [84]:
# F1 SCORE
# The F1 score is the harmonic mean of precision and recall;
# it reaches its best value at 1 and its worst at 0.
from sklearn.metrics import f1_score

# average=None yields one F1 score per class, in the order given by `labels`.
f1_score(
    y_test,
    clf_svm.predict(X_test_vectors),
    average=None,
    labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE],
)
# output: array([0.91319444, 0.21052632, 0.22222222])
# Read in label order (positive, neutral, negative), this shows that the model
# is good at predicting positive reviews but poor at predicting neutral and
# negative ones.


array([0.91319444, 0.21052632, 0.22222222])