In [81]:
import random

class Sentiment:
    NEGATIVE='NEGATIVE'
    NEUTRAL='NEUTRAL'
    POSITIVE='POSITIVE'


class Review:
    def __init__(self,text,score):
        self.text=text
        self.score=score
        self.sentiment=self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score==3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE


class ReviewContainer:
    def __init__(self, reviews):
        self.reviews= reviews

    def get_text(self):
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk= positive[:len(negative)]
        self.reviews=negative + positive_shrunk
        random.shuffle(self.reviews)

### Load data

In [63]:
import json

file_name='Books_small_10000.json'

reviews=[]

with open(file_name) as f:
    for line in f:
        review= json.loads(line)     
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[12].sentiment


'POSITIVE'

### Prep data

shift+tab = info


In [103]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews,test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

len(cont.reviews)

872

In [107]:
train_container.evenly_distribute()
train_x=train_container.get_text()
train_y=train_container.get_sentiment()


test_container.evenly_distribute()
test_x=test_container.get_text()
test_y=test_container.get_sentiment()


In [108]:
from sklearn.feature_extraction.text import CountVectorizer 

vectorizer = CountVectorizer()
train_x_vectors=vectorizer.fit_transform(train_x)

test_x_vectors=vectorizer.transform(test_x)



### Classifaction

#### Linear SVM

In [109]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors,train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Decision Tree

In [110]:
from sklearn.tree import DecisionTreeClassifier

clf_dec=DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[:5])

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE'],
      dtype='<U8')

#### Logistic Regression

In [111]:
from sklearn.linear_model import LogisticRegression

clf_log=LogisticRegression()
clf_log.fit(train_x_vectors, train_y)


clf_log.predict(test_x_vectors[:5])

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE'],
      dtype='<U8')

In [None]:
## Evaluation

In [112]:
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_log.score(test_x_vectors,test_y))


0.7980769230769231
0.6418269230769231
0.8173076923076923


In [114]:
#f1 scores

from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])
#f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL, Sentiment.NEGATIVE])

array([0.8028169 , 0.79310345])

In [122]:
test_set = ['very good','do not buy','not worth of time','it was great']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')