In [None]:
!pip install spacy==3.1.0
!python -m spacy download pt_core_news_lg

In [3]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_recall_fscore_support)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK stop-word corpus (read later through
# nltk.corpus.stopwords.words('portuguese')) and the 'punkt' resource.
nltk.download('stopwords')
nltk.download('punkt')

In [5]:
import spacy

# Load the Portuguese spaCy pipeline once; `spc` is the shared object that
# `lemma_words` calls on every document.
spacy_model_name = "pt_core_news_lg"
spc = spacy.load(spacy_model_name)

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at the given mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [10]:
# Root folder that holds the datasets used by this notebook.
default_dir = "/data/"

# Full CSV paths, built by appending each relative location to the root folder.
data_tcc_pos_neg, data_neg_emotions = (
    default_dir + relative_csv
    for relative_csv in (
        'labeled_data/dataset_label_pos_neg.csv',
        'labeled_data/dataset_neg_emotions.csv',
    )
)


In [12]:
# Read the positive/negative labelled CSV, add two derived columns and then
# drop every row that still contains a missing value.
data = (
    pd.read_csv(data_tcc_pos_neg)
    .assign(
        # Sentiment label prefixed with 'with_theme-' (used for stratification).
        type=lambda frame: 'with_theme-' + frame['sentiment'],
        # Copy of `clean_text` cast to a NumPy unicode-string array.
        cleaned_text=lambda frame: frame[["clean_text"]].values.astype("U"),
    )
    .dropna()
)

In [14]:
def lemma_words(doc):
  """Tokenize ``doc`` with spaCy, replacing verbs by their lemma.

  Args:
    doc: Document to analyse; it is converted with ``str()`` before parsing.

  Returns:
    A generator that yields one string per spaCy token: the lemma when the
    token's part of speech is ``'VERB'``, otherwise the token's own text.
  """
  # Parsing happens right away; only the per-token mapping below is lazy.
  parsed = spc(str(doc))

  def _lemma_if_verb(token):
    if token.pos_ == 'VERB':
      return token.lemma_
    return str(token)

  return (_lemma_if_verb(token) for token in parsed)

In [None]:
def create_splits(data):
    """Split ``data`` into stratified train and test partitions.

    The test partition receives 30% of the rows (truncated to an integer);
    the split is stratified on the ``type`` column and uses a fixed
    ``random_state`` of 42.

    Args:
        data: Table exposing ``shape`` and a ``type`` column.

    Returns:
        tuple: ``(train, test)`` as produced by ``train_test_split``.
    """
    n_rows = data.shape[0]
    holdout_rows = int(0.30 * n_rows)
    train_part, test_part = train_test_split(
        data,
        test_size=holdout_rows,
        random_state=42,
        stratify=data['type'],
    )
    return train_part, test_part

  
# Build the train/test partitions and report how many rows each one holds.
train, test = create_splits(data)
for caption, subset in (('Training samples:  ', train), ('Test samples:      ', test)):
    print(caption, subset.shape[0])



In [26]:
# scikit-learn only applies `stop_words` when `analyzer == 'word'`; with a
# callable analyzer the option is ignored, so the Portuguese stop words were
# never actually removed. Filter them explicitly inside the analyzer instead.
portuguese_stopwords = frozenset(nltk.corpus.stopwords.words('portuguese'))


def lemma_words_without_stopwords(doc):
    """Analyzer for the vectorizer: ``lemma_words`` output minus stop words.

    Args:
        doc: Raw document handed over by the vectorizer.

    Returns:
        list[str]: Tokens produced by ``lemma_words(doc)`` whose lower-cased
        form is not in the NLTK Portuguese stop-word list. The filter runs on
        the analyzer output, i.e. after verbs have been lemmatized.
    """
    return [
        token for token in lemma_words(doc)
        if token.lower() not in portuguese_stopwords
    ]


# TF-IDF features over the filtered tokens. Terms whose document frequency is
# below 0.01% or above 80% of the documents are discarded.
vectorizer = TfidfVectorizer(
    analyzer=lemma_words_without_stopwords,
    min_df=0.0001,
    max_df=0.8,
    use_idf=True,
    smooth_idf=True
)

In [27]:
# Text of each split as NumPy string arrays.
train_texts = train['clean_text'].values.astype('str')
test_texts = test['clean_text'].values.astype('str')

# Fit the vectorizer on the training split only, then reuse it for the test split.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [28]:
# Integer class id for each value expected in the `sentiment` column.
labels = {'Positivo': 0, 'Negativo': 1}

# Alternative mapping kept for reference (presumably meant for the
# negative-emotions dataset -- TODO confirm):
# encoding = {
#     'tristeza': 0,
#     'medo': 1,
#     'raiva': 2,
#     'desprezo' : 3,
# }

# Encode the targets of both splits as arrays of class ids.
y_train, y_test = (
    subset['sentiment'].map(labels).values for subset in (train, test)
)

In [29]:
# Multinomial Naive Bayes on the TF-IDF features: fit on the training split
# (fit returns the estimator itself) and predict the test split.
nb = MultinomialNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)


In [None]:
# Text report comparing the true test labels with the predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)


In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greens_r')

# Axis labels and title; binding the result to `_` keeps the cell output clean.
axis_text = dict(title='Naive Bayes.', xlabel='Previsto', ylabel='Correto')
_ = ax.set(**axis_text)

In [None]:
# Compute accuracy plus support-weighted precision, recall and F1.
# scikit-learn metrics expect (y_true, y_pred) in that order. Passing them
# swapped leaves accuracy unchanged but exchanges precision with recall and
# weights the average by predicted-label counts instead of true-label counts.
acc = accuracy_score(y_test, y_pred)
precision_recall_fscore = precision_recall_fscore_support(y_test, y_pred, average="weighted")
print(f'Acurácia: {acc}')
print(f'\nResult: {precision_recall_fscore}')