#### Methods

In [106]:
import random
class Sentiment:
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: # Amazon review is a 4 or 5
            return Sentiment.POSITIVE
            
            
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    def get_text(self):
        return [x.text for x in self.reviews] 
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

In [107]:
import json
file_name ="./Books_small_10000.json"
reviews=[]
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))

reviews[5]

<__main__.Review at 0x1736b7052e0>

#### Train/Test

In [108]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(reviews, test_size = 0.33, random_state=42)

train_container= ReviewContainer(train)
test_container= ReviewContainer(test)


train_container.evenly_distribute()
train_x = train_container.get_text()
train_y =train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment ()
       

#### Feature Extraction


In [148]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()
train_xv = vectorizer.fit_transform(train_x)
test_xv = vectorizer.transform(test_x)

#### Classifier

#### Support Vector MAchine

In [149]:
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_xv, train_y)

clf_svm.predict(test_xv[0])



array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [150]:
from sklearn.tree import DecisionTreeClassifier
clf_dtree = DecisionTreeClassifier()
clf_dtree = clf_dtree.fit(train_xv, train_y)

clf_dtree.predict(test_xv[0])

array(['NEGATIVE'], dtype='<U8')

#### KNNN


In [151]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=10)
clf_knn = clf_knn.fit(train_xv, train_y)

clf_knn.predict(test_xv[0])

array(['NEGATIVE'], dtype='<U8')

#### Nearest Centruoid


In [152]:
from sklearn.neighbors import NearestCentroid
clf_nc = NearestCentroid()
clf_nc = clf_nc.fit(train_xv, train_y)

#### Evaluation

#### Mean Acuracy

In [153]:
clf_svmscore = clf_svm.score(test_xv, test_y)
clf_dtreescore = clf_dtree.score(test_xv, test_y)
clf_knnscore = clf_knn.score(test_xv, test_y)
clf_ncscore = clf_nc.score(test_xv, test_y)
print(clf_svmscore, clf_dtreescore, clf_knnscore, clf_ncscore)

0.7412121212121212 0.57 0.45606060606060606 0.7018181818181818


#### F1 Score

In [155]:
from sklearn.metrics import f1_score
f1_score(test_y, clf_svm.predict(test_xv), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

array([0.87656461, 0.        , 0.3142329 ])