In [962]:
import numpy as np
import pandas as pd
import json
import random
import string

In [963]:
class Sentiment:
    POSITIVE = 'Positive'
    NEUTRAL = 'Neutral'
    NEGATIVE = 'Negative'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.score_checker()
        
        
    def score_checker(self):
        if self.score >= 4:
            return Sentiment.POSITIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.NEGATIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews       
     
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    # Bem nosso modelo so esta performando bem em prever Positivos...bora arrumar isso    
    def distribuição_uniforme(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        neutral = list(filter(lambda x: x.sentiment == Sentiment.NEUTRAL, self.reviews))
        positive_diminui = positive[: len(negative)]
        neutral_diminui = neutral[: len(negative)]
        self.reviews = negative + positive_diminui + neutral_diminui
        random.shuffle(self.reviews)
        
        
    # funcao para remover pontuacoes digitos das reviews
    def cleaner(x):
        x = x.lower()
        x = [c for c in x if c not in string.punctuation]
        x = ''.join([str(elemento) for elemento in x if not elemento.isdigit()])
        return x

In [964]:
file = 'data/Books_small_10000.json'

In [965]:
reviews = []
with open(file) as f:
    for lines in f:
        review = json.loads(lines)
        reviews.append(Review(cleaner(review['reviewText']), review['overall']))

In [966]:
type(reviews)

list

In [967]:
reviews[5].score

5.0

In [968]:
reviews[5].text

'i hoped for mia to have some peace in this book but her story is so real and raw  broken world was so touching and emotional because you go from mias trauma to her trying to cope  i love the way the story displays how there is no just bouncing back from being sexually assaulted  mia showed us how those demons come for you every day and how sometimes they best you i was so in the moment with broken world and hurt with mia because she was surrounded by people but so alone and i understood her feelings  i found myself wishing i could give her some of my courage and strength or even just to be there for her  thank you lizzy for putting a great characters voice on a strong subject and making it so that other peoples story may be heard through mias'

In [969]:
reviews[5].sentiment

'Positive'

#### Preparando os dados

In [970]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [971]:
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [972]:
# Para treinarmos
len(training)

6700

In [973]:
# Para testarmos
len(test)

3300

In [974]:
train_container.distribuição_uniforme()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.distribuição_uniforme()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))
print(train_y.count(Sentiment.NEUTRAL))

436
436
436


In [975]:
train_x[0], train_y[0]

('i dont like to give bad reviews  and usually dont  but the music in this thing is  not bluegrass and  not carter style and  not classics of bluegrass guitarthere is no cd although another reviewer mentions how wonderful the examples are  and there is no author  the arrangements are so bad imho that im willing to bet they were entered into notationtab software by a guitar player whose ear is trained enough takes very little eartraining for this that heshe knows where the melody notes are on guitar and he keyed that quickly in put chord strums on the beats where no melody note lies and thats the arrangement  in other words this stuff im betting was never performed  any firstyear music student could so easily write this with no guitar anywhere aroundthis stuff is not what a bluegrass picker would ever stand up and play  in fact theyre so unlike real guitar music and so uninteresting i cant imagine even a rank beginner having real fun with them theres so much other stuff thats good and h

In [976]:
test_x[0], test_y[0]

('a very good beginning that never really went anywhere  it seemed as if she got tired of writing this but its a stephanie plum book so it was worth it',
 'Neutral')

#### Bag of Words e Vetorizacao

In [977]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

In [978]:
print(train_x[0])
print(train_x_vectors[0].toarray())

i dont like to give bad reviews  and usually dont  but the music in this thing is  not bluegrass and  not carter style and  not classics of bluegrass guitarthere is no cd although another reviewer mentions how wonderful the examples are  and there is no author  the arrangements are so bad imho that im willing to bet they were entered into notationtab software by a guitar player whose ear is trained enough takes very little eartraining for this that heshe knows where the melody notes are on guitar and he keyed that quickly in put chord strums on the beats where no melody note lies and thats the arrangement  in other words this stuff im betting was never performed  any firstyear music student could so easily write this with no guitar anywhere aroundthis stuff is not what a bluegrass picker would ever stand up and play  in fact theyre so unlike real guitar music and so uninteresting i cant imagine even a rank beginner having real fun with them theres so much other stuff thats good and has

#### Classification

#### Linear SVM

In [979]:
from sklearn.svm import SVC
svm = SVC(kernel='linear')

svm_clf = svm.fit(train_x_vectors, train_y)

predicao_svm = svm_clf.predict(test_x_vectors)

In [980]:
# Apenas observando os resultados obtidos
print(test_x[0])
print(train_x_vectors[0].toarray())

a very good beginning that never really went anywhere  it seemed as if she got tired of writing this but its a stephanie plum book so it was worth it
[[0. 0. 0. ... 0. 0. 0.]]


In [981]:
train_y[5], predicao_svm[5]

('Negative', 'Positive')

#### Decision Tree

In [982]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(criterion='entropy')

tree_clf.fit(train_x_vectors, train_y)

predicao_tree = tree_clf.predict(test_x_vectors)

#### Naive Bayes

In [983]:
from sklearn.naive_bayes import GaussianNB
bay_gnb = GaussianNB()

bay_gnb.fit(train_x_vectors.toarray(), train_y)

predicao_gnb = bay_gnb.predict(test_x_vectors.toarray())

#### Logistic Regression

In [984]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression(max_iter=500)

log_clf.fit(train_x_vectors, train_y)

predicao_log = log_clf.predict(test_x_vectors)

#### Evaluation

In [985]:
from sklearn.metrics import classification_report

In [986]:
# MEAN SCORE/Media de Acertos
print(svm_clf.score(test_x_vectors, test_y).round(2) *100)
print(tree_clf.score(test_x_vectors, test_y).round(2)*100)
print(bay_gnb.score(test_x_vectors.toarray(), test_y).round(2)*100)
print(log_clf.score(test_x_vectors, test_y).round(2)*100)

61.0
41.0
44.0
62.0


In [987]:
# f1-score
print(f'Linear Support Vector Machine: {svm_clf.score(test_x_vectors, test_y).round(2)}')
print('*'*55)
print(classification_report(predicao_svm, test_y))

Linear Support Vector Machine: 0.61
*******************************************************
              precision    recall  f1-score   support

    Negative       0.59      0.60      0.59       205
     Neutral       0.54      0.53      0.53       215
    Positive       0.69      0.71      0.70       204

    accuracy                           0.61       624
   macro avg       0.61      0.61      0.61       624
weighted avg       0.61      0.61      0.61       624



In [988]:
print(f'Decision Tree: {tree_clf.score(test_x_vectors, test_y).round(2)}')
print('*'*55)
print(classification_report(predicao_tree, test_y))

Decision Tree: 0.41
*******************************************************
              precision    recall  f1-score   support

    Negative       0.38      0.39      0.39       204
     Neutral       0.38      0.37      0.38       214
    Positive       0.46      0.46      0.46       206

    accuracy                           0.41       624
   macro avg       0.41      0.41      0.41       624
weighted avg       0.41      0.41      0.41       624



In [989]:
print(f'Naive Bayes: {bay_gnb.score(test_x_vectors.toarray(), test_y).round(2)}')
print('*'*55)
print(classification_report(predicao_gnb, test_y))

Naive Bayes: 0.44
*******************************************************
              precision    recall  f1-score   support

    Negative       0.46      0.46      0.46       207
     Neutral       0.41      0.37      0.39       231
    Positive       0.44      0.49      0.47       186

    accuracy                           0.44       624
   macro avg       0.44      0.44      0.44       624
weighted avg       0.44      0.44      0.44       624



In [990]:
print(f'Logistic Regression: {log_clf.score(test_x_vectors, test_y).round(2)}')
print('*'*55)
print(classification_report(predicao_log, test_y))

Logistic Regression: 0.62
*******************************************************
              precision    recall  f1-score   support

    Negative       0.60      0.62      0.61       200
     Neutral       0.53      0.53      0.53       208
    Positive       0.73      0.70      0.71       216

    accuracy                           0.62       624
   macro avg       0.62      0.62      0.62       624
weighted avg       0.62      0.62      0.62       624



In [991]:
# Regressao logistica foi o que melhor performou...

In [992]:
# testando
new_text = ['Wow amazing story I enjoyed this', 'Waste of time horrible', 'Too bad I would never recomend']
new_test = vectorizer.transform(new_text)

log_clf.predict(new_test)

array(['Positive', 'Negative', 'Negative'], dtype='<U8')

#### Tunning our model (Com Grid Search)

In [993]:
# Precisamos melhorar nosso modelo, e pra isso utilizaremos Grid Search
# Grid Search e praticamente um afinador de parametros, que ira encontrar
# o melhor parametro para nossos algoritmos de Machine Learning

In [994]:
# Aplicaremos o Tunning apenas nos algoritmos que melhor performaram porque levara um tempo para isso

In [995]:
from sklearn.model_selection import GridSearchCV

In [996]:
# SVM
parameters = {'tol':[0.001, 0.0001, 0.00001],'C': [1.0, 1.5, 2.0],'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}

grid_search_svm = GridSearchCV(estimator=SVC(), param_grid=parameters)
grid_search_svm.fit(train_x_vectors, train_y)
best_parameters = grid_search.best_params_
best_result = grid_search.best_score_

print(best_parameters)
print(best_result)

{'C': 1.5, 'solver': 'saga', 'tol': 1e-05}
0.6146793015705887


In [997]:
print(grid_search_svm.score(test_x_vectors, test_y).round(2) *100)

61.0


In [998]:
# Logistic Regression
parameters = {'tol': [0.0001, 0.00001, 0.000001],'C': [1.0, 1.5, 2.0],'solver': ['lbfgs', 'sag', 'saga']}

grid_search_log = GridSearchCV(estimator=LogisticRegression(max_iter=1000), param_grid=parameters)
grid_search_log.fit(train_x_vectors, train_y)
best_parameters = grid_search.best_params_
best_result = grid_search.best_score_

print(best_parameters)
print(best_result)

{'C': 1.5, 'solver': 'saga', 'tol': 1e-05}
0.6146793015705887


In [999]:
print(grid_search_log.score(test_x_vectors, test_y).round(2) *100)

62.0


#### Salvando nosso modelo

In [1000]:
import pickle

In [1003]:
with open('modelos/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(log_clf, f)

#### Carregando o modelo

In [1005]:
with open('modelos/sentiment_classifier.pkl', 'rb') as f:
    loaded_log_clf = pickle.load(f)

In [1008]:
print(test_x[0])
loaded_log_clf.predict(test_x_vectors[0])

a very good beginning that never really went anywhere  it seemed as if she got tired of writing this but its a stephanie plum book so it was worth it


array(['Negative'], dtype='<U8')

In [1014]:
new_text = ['I thought it would be better', 'i really enjoyed this journey', 'Too bad I would not recomend it']
new_test = vectorizer.transform(new_text)

loaded_log_clf.predict(new_test)

array(['Neutral', 'Positive', 'Negative'], dtype='<U8')