# Análise de Sentimento do meu TCC

### Importando as funções, classes e pacotes necessários

In [None]:
import csv

import pandas as pd # lembrando que o chromadb pode ser melhor que o pandas!

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

### Tratamento dos dados

In [None]:
# Load the reviews file amazon_baby.csv into a DataFrame.
df = pd.read_csv("amazon_baby.csv")

In [None]:
# Clean the data: discard every row that has a missing value,
# then display how many rows are left.
df = df.dropna()
df.shape[0]

## VAMOS DEFINIR OS SENTIMENTOS
SE RATING = 1, ENTÃO NEGATIVO; SE RATING = 5, ENTÃO POSITIVO.
REMOVER O RESTANTE DOS COMENTÁRIOS.

negativo -> 0

positivo -> 1

In [None]:
# Keep only the reviews rated 1 or 5; all other ratings are discarded.
df = df[df["rating"].isin([1, 5])]

In [None]:
# Binary sentiment label: rating 5 -> 1 (positive), rating 1 -> 0 (negative).
# The rule no longer mentions rating 2, which the notebook's labelling scheme
# does not include. `assign` builds a new frame instead of writing a column
# into the filtered slice, which avoids pandas' SettingWithCopyWarning, and
# the comparison is vectorised instead of a per-row lambda.
df = df.assign(sentimento=(df["rating"] == 5).astype(int))

### Visualizando a quantidade de textos negativos e positivos

In [None]:
# Histogram of the sentiment labels (0 = negative, 1 = positive).
sns.histplot(data=df, x="sentimento")

In [None]:
# Percentage of negative (0) and positive (1) reviews, rounded to 2 decimals.
# NOTE(review): `N_sent` was not defined anywhere in the visible notebook, so
# this cell raised NameError; it is assumed to be the per-class row count --
# confirm. `.get(label, 0)` keeps the cell working if one class is absent.
N_sent = df["sentimento"].value_counts()
round(N_sent.get(0, 0) / N_sent.sum() * 100, 2), round(N_sent.get(1, 0) / N_sent.sum() * 100, 2)

### Segmentando os textos em sentenças e contando os tokens de cada sentença

In [None]:
def count_tokens(sent):
    """Return a NumPy array with the token count of each sentence in ``sent``.

    Each element of ``sent`` is tokenised with NLTK's ``word_tokenize`` and
    the number of resulting tokens is recorded, in the same order as the input.
    """
    lengths = []
    for sentence in sent:
        lengths.append(len(word_tokenize(sentence)))
    return np.array(lengths)


def list_sent(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize`` and return the result."""
    sentences = sent_tokenize(text)
    return sentences

In [None]:
# Split every review into its sentences and store them in a new column.
df["sentences"] = df["review"].apply(list_sent)

In [None]:
# For each review, store the token count of each of its sentences.
df["count_tokens"] = df["sentences"].apply(count_tokens)

In [None]:
# Predicate used to keep only reviews whose sentences all have a token count
# strictly between the two cutoff values (with the default, 9 to 29 tokens).
# NOTE: the name shadows the builtin `filter`; it is kept because the
# notebook calls it by this name.
def filter(array, cutoff=(8, 30)):
    """Return True when every token count in ``array`` lies strictly inside ``cutoff``.

    Parameters
    ----------
    array : numpy.ndarray
        Per-sentence token counts of one review.
    cutoff : sequence of two numbers, optional
        ``(low, high)`` bounds; both are exclusive, i.e. the test is
        ``low < count < high`` for every sentence. An immutable tuple is used
        as the default instead of a shared mutable list.

    Returns
    -------
    bool
        False for an empty ``array`` (a review with no sentences cannot be
        embedded later, and ``min``/``max`` of an empty array would raise).
    """
    if array.size == 0:
        return False
    low, high = cutoff
    return bool(array.min() > low and array.max() < high)

In [None]:
# Keep only the reviews whose per-sentence token counts pass `filter`.
df = df[df["count_tokens"].apply(filter)]

In [None]:
def n_sent(lista):
    """Return the number of elements in ``lista``."""
    size = len(lista)
    return size

In [None]:
# Number of negative reviews (label 0); shown as the cell output.
length_neg = df["sentimento"].eq(0).sum()
length_neg

### Balanceando os dados (undersampling)

In [None]:
# Balance the classes by undersampling: keep every negative review and draw
# `length_neg` positive reviews at random. `random_state` is fixed so the
# balanced dataset (and everything trained on it) is reproducible between runs.
# NOTE(review): `sample` raises ValueError if there are fewer positive than
# negative reviews -- this assumes the positive class is the larger one.
df = pd.concat(
    [
        df[df["sentimento"] == 0],
        df[df["sentimento"] == 1].sample(n=length_neg, random_state=42),
    ],
    ignore_index=True,
)

In [None]:
# Total number of sentences in the dataset: each row of `count_tokens` holds
# one entry per sentence, so the row lengths are summed.
df["count_tokens"].apply(len).sum()

In [None]:
# Inspect the per-sentence token counts of every review.
df["count_tokens"]

In [None]:
# Inspect the sentences of the row labelled 0.
df.loc[0, "sentences"]

### Representando vetorialmente os textos através de embeddings (SBERT)

In [None]:
# Sentence-embedding model used by `embedder`; loaded by name ("all-MiniLM-L6-v2").
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Create the embeddings of every sentence of a text.
def embedder(lista):
    """Encode the sentences in ``lista`` with the global ``model`` and return the result."""
    encoded = model.encode(lista)
    return encoded

# Aggregate the embeddings of a text into one vector (mean value).
def mean_aggregation(embeddings):
    """Average ``embeddings`` over axis 0 and return the result as a single row.

    The mean is reshaped to ``(1, -1)`` so that the per-text vectors can later
    be stacked row by row.
    """
    centroid = np.mean(embeddings, axis=0)
    return np.reshape(centroid, (1, -1))

# Aggregate the embeddings of a text into one vector (max pooling).
def max_pooling(embeddings):
    """Pick, for every column, the entry with the largest absolute value.

    The selected entries keep their original sign (the absolute value is used
    only to choose the row), and the result is returned as a single row of
    shape ``(1, -1)``. Ties go to the first row, as ``argmax`` does.
    """
    magnitudes = np.abs(embeddings)
    rows = magnitudes.argmax(axis=0)
    cols = np.arange(embeddings.shape[1])
    picked = embeddings[rows, cols]
    return picked.reshape(1, -1)

In [None]:
# Encode the sentences of every review with `embedder` and store the result per row.
df["embeddings"] = df["sentences"].apply(embedder)

In [None]:
# Disabled alternative aggregation (max pooling instead of the mean); left commented out.
#df["emb_max_pooling"] = df["embeddings"].apply(max_pooling)

In [None]:
# One vector per review: the mean of its sentence embeddings, as a (1, -1) row.
df["emb_mean"] = df["embeddings"].apply(mean_aggregation)

### Reservando dados para testar os modelos após treinamento/validação 

In [None]:
def to_train(X):
    """Concatenate the arrays stored in the Series ``X`` along axis 0.

    With one ``(1, d)`` row per element this yields a single 2-D array with
    one row per element of ``X``.
    """
    rows = X.to_numpy()
    return np.concatenate(rows, axis=0)

In [None]:
# Hold out 20% of the data for the final test; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    to_train(df["emb_mean"]),
    df["sentimento"],
    test_size=0.20,
    random_state=22,
)

## Treinamento e Validação dos Algoritmos

### Modelo 1

In [None]:
# Linear classifier trained with stochastic gradient descent. The seed is
# fixed (same value as the random forest below) so that the data shuffling
# done during training -- and therefore the grid-search scores -- is
# reproducible between runs.
modelo1 = SGDClassifier(random_state=42)

In [None]:
# Hyper-parameter grid explored for the SGD classifier.
param_grid_model1 = {
    "loss": ["hinge", "log_loss", "modified_huber"],
    "penalty": ["l1", "l2", "elasticnet"],
    "alpha": [0.005, 0.0001],
    "l1_ratio": [0.15, 0.5, 0.8],
    "max_iter": [2000, 5000],
}

In [None]:
# 5-fold cross-validated grid search for model 1, keeping the training scores.
grid_search_model1 = GridSearchCV(
    modelo1,
    param_grid_model1,
    cv=5,
    return_train_score=True,
)
# The search must be fitted before `best_estimator_` / `cv_results_` exist;
# without this call the cells that read them raise AttributeError.
grid_search_model1.fit(X_train, y_train)

In [None]:
# Show the best estimator found by the search (attribute exists only after `fit`).
grid_search_model1.best_estimator_

In [None]:
# Cross-validation results of model 1's grid search (attribute exists only after `fit`).
cvres_model1 = grid_search_model1.cv_results_

In [None]:
# Model 2: random forest classifier with a fixed seed.
modelo2 = RandomForestClassifier(random_state=42)

In [None]:
# Hyper-parameter grid explored for the random forest
# (7 * 5 * 3 * 4 * 2 = 840 combinations).
param_grid_model2 = {
    "n_estimators": [100, 200, 300, 400, 500, 600, 700],
    "max_depth": [10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["sqrt", "log2"],
}

In [None]:
# 5-fold cross-validated grid search for model 2, with progress messages (verbose=2).
grid_search_model2 = GridSearchCV(
    modelo2,
    param_grid_model2,
    cv=5,
    verbose=2,
)

In [None]:
# Run the grid search for model 2 on the training split.
grid_search_model2.fit(X_train, y_train)

In [None]:
# Cross-validation results of model 2's grid search (attribute exists only after `fit`).
cvres_model2 = grid_search_model2.cv_results_