In [None]:
!pip install spacy==3.1.0
!python -m spacy download pt_core_news_lg

In [None]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_recall_fscore_support)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
import nltk

# Fetch the NLTK stop-word corpus; the Portuguese list from it is used when
# the TF-IDF vectorizer is configured further down.
nltk.download("stopwords")
  

In [None]:
import spacy

# Name of the spaCy pipeline downloaded by the install cell.
spacy_model_name = "pt_core_news_lg"

# Shared spaCy pipeline; `lemma_words` runs every document through it.
spc = spacy.load(spacy_model_name)

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime.
# NOTE(review): the dataset paths defined later are rooted at "/data/", not
# under this mount point — confirm the mount is still required.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Root directory of the project data (keeps its trailing slash so the paths
# below can be formed by plain concatenation).
default_dir = "/data/"

# Labelled CSV files: positive/negative sentiment and negative emotions.
data_tcc_pos_neg = f"{default_dir}labeled_data/dataset_label_pos_neg.csv"
data_neg_emotions = f"{default_dir}labeled_data/dataset_neg_emotions.csv"


In [None]:
# Load the labelled dataset and derive the `type` column (the sentiment value
# prefixed with 'with_theme-'), which `create_splits` stratifies on.
data = (
    pd.read_csv(data_neg_emotions)
    .assign(type=lambda frame: 'with_theme-' + frame['sentiment'])
)
data.head()

In [None]:
def lemma_words(doc):
    """Tokenize ``doc`` with the spaCy pipeline ``spc``, lemmatizing verbs only.

    ``doc`` is converted with ``str()`` before being parsed. Tokens whose
    part-of-speech tag is ``'VERB'`` are replaced by their lemma; every other
    token is emitted as its original text.

    Returns:
        A generator of token strings, in document order.
    """
    parsed = spc(str(doc))
    return (
        str(tok) if tok.pos_ != 'VERB' else tok.lemma_
        for tok in parsed
    )

In [None]:
def create_splits(data):
    """Split ``data`` into a train and a test partition.

    The test partition receives 30% of the rows (rounded down to an integer
    row count). The split is stratified on the ``type`` column and uses a
    fixed ``random_state`` of 42, so it is reproducible.

    Returns:
        A ``(train, test)`` tuple.
    """
    n_rows = data.shape[0]
    n_test_rows = int(n_rows * 0.30)
    train_part, test_part = train_test_split(
        data,
        test_size=n_test_rows,
        random_state=42,
        stratify=data['type'],
    )
    return train_part, test_part

  
# Build the train/test partitions and report how many rows landed in each.
train, test = create_splits(data)
for split_caption, split_frame in (
    ('Training samples:  ', train),
    ('Test samples:      ', test),
):
    print(split_caption, split_frame.shape[0])

In [None]:
# Portuguese stop words from NLTK (lower-case entries).
PT_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('portuguese'))


def lemma_words_without_stopwords(doc):
    """Analyze ``doc`` with ``lemma_words`` and drop Portuguese stop words.

    TfidfVectorizer only honours its ``stop_words`` parameter when
    ``analyzer == 'word'``; with a callable analyzer the list is ignored, so
    the filtering has to happen inside the analyzer itself.

    Tokens are compared case-insensitively against the stop-word list but are
    returned with their original casing.

    Returns:
        A list of token strings.
    """
    return [
        token for token in lemma_words(doc)
        if token.lower() not in PT_STOP_WORDS
    ]


# TF-IDF over the custom spaCy-based analyzer. `stop_words` is intentionally
# not passed: it would be ignored because `analyzer` is a callable.
vectorizer = TfidfVectorizer(
    analyzer=lemma_words_without_stopwords,
    min_df=0.0001,
    max_df=0.8,
    use_idf=True,
    smooth_idf=True,
)

In [None]:
# Cast the text column to a unicode array so every entry reaches the
# vectorizer as a string.
train_texts = train['cleaned_text'].values.astype('U')
test_texts = test['cleaned_text'].values.astype('U')

# Fit the vocabulary and IDF weights on the training texts only, then reuse
# them to transform the test texts.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [None]:
# Label encodings for the two labelled datasets this notebook can load.
LABEL_SCHEMES = (
    {'Positivo': 0, 'Negativo': 1},
    {'tristeza': 0, 'medo': 1, 'raiva': 2, 'desprezo': 3},
)


def encode_sentiment(sentiment, mapping):
    """Map a sentiment Series to integer class ids.

    ``Series.map`` silently yields NaN for values missing from ``mapping``;
    that would only surface later as an opaque error while fitting the model,
    so unmapped values are reported here instead.

    Raises:
        ValueError: if any value of ``sentiment`` is not a key of ``mapping``.

    Returns:
        A NumPy integer array of class ids.
    """
    encoded = sentiment.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(sentiment[unmapped].astype(str)))
        raise ValueError(
            f'Sentiment values without a label id: {unknown}; '
            f'known labels: {sorted(mapping)}'
        )
    return encoded.astype(int).values


# Pick the scheme that covers every sentiment value present in the loaded
# data, so the encoding follows the dataset instead of a manually toggled
# comment. Falls back to the binary scheme, in which case `encode_sentiment`
# reports exactly which values are unknown.
observed_sentiments = set(train['sentiment']) | set(test['sentiment'])
labels = next(
    (scheme for scheme in LABEL_SCHEMES if observed_sentiments <= scheme.keys()),
    LABEL_SCHEMES[0],
)

y_train = encode_sentiment(train['sentiment'], labels)
y_test = encode_sentiment(test['sentiment'], labels)

In [None]:
# Logistic regression with class weights balanced against label frequency;
# the seed is fixed and the solver reports its progress while fitting.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=500,
    random_state=0,
    verbose=True,
)
lr.fit(X_train, y_train)


In [None]:
# Predict the encoded class id of every test document with the fitted model.
prediction_logistic = lr.predict(X_test)

In [None]:
# Per-class precision, recall, F1 and support on the test partition.
report_text = classification_report(y_test, prediction_logistic)
print(report_text)


In [None]:
# Confusion matrix: rows are the true classes, columns the predicted ones,
# matching the y/x axis labels set below.
conf_matrix = confusion_matrix(y_test, prediction_logistic)
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Greens_r',
)
# Assign to `_` so the cell does not echo the list returned by `ax.set`.
_ = ax.set(
    title='Regressão Logística.',
    xlabel='Previsto',
    ylabel='Correto',
)

In [None]:
# Compute accuracy and the support-weighted precision / recall / F-score.
# scikit-learn metrics take (y_true, y_pred) in that order. Swapping them
# leaves accuracy unchanged but exchanges precision with recall and weights
# the average by predicted rather than true class counts.
acc = accuracy_score(y_test, prediction_logistic)
precision_recall_fscore = precision_recall_fscore_support(
    y_test, prediction_logistic, average='weighted'
)
print(f'acc: {acc}')
print(f'\nprf: {precision_recall_fscore}')