## Imports

In [1]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from googletrans import Translator
# Shared googletrans client; sentiment() uses it to translate input reviews before classifying them.
translator = Translator()

## Data Classes

In [2]:
import random

class Sentiment:
    """String labels attached to reviews and used as the classifier targets."""

    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"
    NEGATIVE = "NEGATIVE"

class Review:
    """One review: its text, its numeric score and the sentiment label derived from the score."""

    def __init__(self,text,score):
        self.text, self.score = text, score
        # The label is computed once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a Sentiment label.

        A score of exactly 3 is NEUTRAL, a score <= 2 is NEGATIVE and
        everything else is POSITIVE.
        """
        if self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.NEGATIVE if self.score <= 2 else Sentiment.POSITIVE
        
        
class review_container:
    """Wraps a list of reviews and exposes their texts and labels as parallel lists."""

    def __init__(self,reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every held review, in the current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every held review, in the current order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance ``self.reviews`` so NEGATIVE and POSITIVE counts are equal.

        Reviews with any other label (e.g. NEUTRAL) are dropped. Both classes
        are truncated to the size of the smaller one, so the result is balanced
        whichever class is in the majority. The kept reviews are then shuffled.

        Args:
            seed: Optional seed for a reproducible shuffle. With the default
                ``None`` the module-level ``random`` generator is used.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes to the smaller count; trimming only the positives
        # leaves the data unbalanced whenever negatives are the majority.
        keep = min(len(negative), len(positive))
        balanced = negative[:keep] + positive[:keep]

        if seed is None:
            random.shuffle(balanced)
        else:
            # A private generator keeps the global random state untouched.
            random.Random(seed).shuffle(balanced)

        self.reviews = balanced
        

## Read json file

In [3]:
file_name = './datasets/sentiment/Books_small_10000.json'

reviews = []

# Each non-empty line of the file is parsed as one standalone JSON object.
# The encoding is explicit so the result does not depend on the platform
# default, and blank lines are skipped instead of crashing json.loads.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))

## Seeing the data

In [4]:
# Spot-check the derived sentiment label of a few loaded reviews (one per output line).
print(*(reviews[position].sentiment for position in (25, 1, 2)), sep="\n")

POSITIVE
NEUTRAL
POSITIVE


In [5]:
# Number of reviews loaded from the file.
len(reviews)

10000

## Prepare data

In [6]:
# Hold out 33% of the reviews for testing; random_state is fixed so the split is repeatable.
train, test = train_test_split(
    reviews,
    random_state=42,
    test_size=0.33,
)

# Wrap each split so texts and labels can be pulled out as parallel lists.
test_container = review_container(test)
train_container = review_container(train)

In [7]:
# Rebalance NEGATIVE/POSITIVE reviews in both splits (done in place, order shuffled).
train_container.evenly_distribute()
test_container.evenly_distribute()

# Texts are the model inputs; sentiment labels are the targets.
train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

## Bag of words (TF-IDF weighted)

In [8]:
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training texts only, then vectorize them.
train_x_vectors = vectorizer.fit_transform(train_x)

# Re-use the already-fitted vectorizer for the test texts (transform only, no re-fit).
test_x_vectors = vectorizer.transform(test_x)

## Classifications

#### Linear SVM

In [9]:
# Support-vector classifier with a linear kernel, trained on the training vectors.
clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(train_x_vectors,train_y)

# Predicted label for the first test review (last expression, so the cell displays it).
clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Decision Tree

In [10]:


# random_state is pinned so the fitted tree (and its scores below) are the same on
# every run; without it the tree can differ between runs on identical data.
clf_dec = DecisionTreeClassifier(random_state=42)

clf_dec.fit(train_x_vectors, train_y)

# Predicted label for the first test review.
clf_dec.predict(test_x_vectors[0])



array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [11]:
# Logistic regression with default settings; fit() returns the estimator itself.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Predicted label for the first test review.
clf_log.predict(test_x_vectors[0])



array(['NEGATIVE'], dtype='<U8')

## Testing which is the best

In [12]:
# score() of each classifier on the test set, one per line: SVM, decision tree, logistic regression.
print(
    *(fitted.score(test_x_vectors, test_y) for fitted in (clf_svm, clf_dec, clf_log)),
    sep="\n",
)

0.8076923076923077
0.6610576923076923
0.8028846153846154


In [13]:
# Per-class F1 (order: POSITIVE, NEGATIVE) for each classifier, one per line:
# SVM, decision tree, logistic regression.
print(
    *(
        f1_score(
            test_y,
            fitted.predict(test_x_vectors),
            average=None,
            labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
        )
        for fitted in (clf_svm, clf_dec, clf_log)
    ),
    sep="\n",
)

[0.80582524 0.80952381]
[0.65356265 0.66823529]
[0.80097087 0.8047619 ]


## And now, use the project

In [14]:
def sentiment(list):
    """Translate reviews to English and classify them with the linear SVM.

    Args:
        list: An iterable of review strings, or a single review string.
            (The parameter name shadows the builtin; it is kept so existing
            callers continue to work.)

    Returns:
        Whatever ``clf_svm.predict`` returns for the translated texts, one
        predicted label per review.
    """
    # A bare string would otherwise be iterated character by character, and
    # every character translated and classified as its own "review".
    texts = [list] if isinstance(list, str) else list

    # NOTE(review): assumes the synchronous googletrans API, where translate()
    # returns an object with a .text attribute. Confirm against the installed version.
    translates = [translator.translate(x, dest="en").text for x in texts]

    # Vectorize with the already-fitted vectorizer so features match the training data.
    new_test = vectorizer.transform(translates)
    return clf_svm.predict(new_test)
        

Using the translator library to classify reviews written in Portuguese

In [15]:
# Sample reviews written in Portuguese.
words = [
    'este livro é ruim',
    'Perda de tempo',
    'este livro é muito confuso',
    'gostei muito deste livro',
]
sentiment(words)

array(['NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')