In [None]:
import math
import os
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_recall_fscore_support)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# Fetch the NLTK "stopwords" corpus; later cells read the Portuguese list
# via nltk.corpus.stopwords.words('portuguese').
# NOTE(review): `nltk` is already imported in the first cell, so this
# re-import is redundant.
import nltk
nltk.download('stopwords')
  

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): the data paths defined in the next cell start at "/data/",
# not at this mount point — confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root directory of the labelled datasets (trailing slash is part of the value).
default_dir = "/data/"
# CSV with positive/negative labels.
data_tcc_pos_neg = f"{default_dir}labeled_data/dataset_label_pos_neg.csv"
# CSV with the negative-emotion labels.
data_neg_emotions = f"{default_dir}labeled_data/dataset_neg_emotions.csv"


In [None]:
# Load the negative-emotions dataset and derive a `type` column
# ('with_theme-' + sentiment) that is used later to stratify the split.
data = (
    pd.read_csv(data_neg_emotions)
    .assign(type=lambda frame: 'with_theme-' + frame['sentiment'])
)
data.head()

Unnamed: 0,renderedContent,date,sentiment,cleaned_text,type
0,@CorreiosBR Já informei faz tempo.\n#raiva #ch...,2021-06-18 12:35:41+00:00,raiva,informei fazer tempo raiva chatear,with_theme-raiva
1,@MidiaNINJA @samanthaschmutz E desse modo que ...,2021-06-18 01:14:37+00:00,raiva,d esse modo muitas vezes sinto visualizar rede...,with_theme-raiva
2,a m**** deve achar demorado entrar em contato ...,2021-06-16 21:19:59+00:00,raiva,m deve achar demorar entrar contato outros dep...,with_theme-raiva
3,"Em uma propriedade, 4 animais morreram com o v...",2021-06-16 20:40:55+00:00,raiva,propriedade animais morrer vírus raivar foco d...,with_theme-raiva
4,Estudo da FGV expõe um país infeliz santosbanc...,2021-06-16 20:25:39+00:00,raiva,estudo fgv expor país infeliz santosbancarios ...,with_theme-raiva


In [None]:
# Snowball stemmer for Portuguese; applied token by token in `stemmed_words`.
stemmer = nltk.stem.snowball.PortugueseStemmer()

# Word-level analyzer (preprocessing + tokenization) reused by `stemmed_words`.
# Stop words are filtered here because the TfidfVectorizer built later
# receives a callable analyzer, and scikit-learn ignores its own `stop_words`
# argument in that case; without this, stop words are never removed.
# Requires the NLTK "stopwords" corpus to be downloaded first.
analyzer = TfidfVectorizer(
    stop_words=nltk.corpus.stopwords.words('portuguese'),
).build_analyzer()

In [None]:
def stemmed_words(doc):
    """Tokenize ``doc`` and yield the Portuguese stem of every token.

    ``@mentions`` are removed from the raw text *before* tokenization. The
    previous ``w[0] != '@'`` check ran after tokenization, where it could
    never match: the word analyzer only emits tokens made of word characters,
    so the ``@`` was already gone and the handle itself was kept as a token.

    Args:
        doc: Raw document text (str).

    Returns:
        A generator of stemmed tokens, suitable as a TfidfVectorizer
        ``analyzer`` callable.
    """
    # The lookbehind restricts the match to an '@' that starts a token, so
    # e-mail-like text such as "user@host" is not altered.
    without_mentions = re.sub(r'(?<!\w)@\w+', ' ', doc)
    return (stemmer.stem(token) for token in analyzer(without_mentions))

In [None]:
def create_splits(data, test_fraction=0.30, random_state=42, stratify_col='type'):
    """Split ``data`` into stratified train and test partitions.

    Args:
        data: DataFrame to split.
        test_fraction: Share of rows assigned to the test partition. The
            absolute test size is ``int(test_fraction * len(data))`` (rounded
            down). Must lie strictly between 0 and 1.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        stratify_col: Column whose class proportions are preserved in both
            partitions; pass ``None`` to disable stratification.

    Returns:
        ``(train, test)`` DataFrames.

    Raises:
        ValueError: If ``test_fraction`` is not in the open interval (0, 1).
    """
    if not 0 < test_fraction < 1:
        raise ValueError(f'test_fraction must be in (0, 1), got {test_fraction!r}')

    # An integer test_size is passed (rather than the float) to keep the
    # round-down behaviour of the row count.
    test_validation_size = int(test_fraction * data.shape[0])
    stratify = data[stratify_col] if stratify_col is not None else None
    train, test = train_test_split(
        data,
        test_size=test_validation_size,
        random_state=random_state,
        stratify=stratify,
    )
    return train, test

  
# Build the train/test partitions and report how many rows each one holds.
train, test = create_splits(data)
n_train, n_test = len(train), len(test)
print('Training samples:  ', n_train)
print('Test samples:      ', n_test)



Training samples:   54065
Test samples:       23170


In [None]:
# TF-IDF features over the stemmed tokens produced by `stemmed_words`.
# NOTE(review): scikit-learn documents `stop_words` as applying only when
# analyzer == 'word'; with the callable analyzer used here it presumably has
# no effect on the vectorizer itself — confirm stop words are removed elsewhere.
portuguese_stop_words = nltk.corpus.stopwords.words('portuguese')
vectorizer_params = dict(
    analyzer=stemmed_words,
    stop_words=portuguese_stop_words,
    min_df=0.0001,      # drop terms present in fewer than 0.01% of documents
    max_df=0.8,         # drop terms present in more than 80% of documents
    max_features=100000,
)
vectorizer = TfidfVectorizer(**vectorizer_params)

In [None]:
# Cast the cleaned text to unicode strings (astype('U')) before vectorizing.
train_texts = train['cleaned_text'].values.astype('U')
test_texts = test['cleaned_text'].values.astype('U')

# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [None]:
# Integer class ids for the values of the `sentiment` column.
# Both mappings used to be commented out, so `labels` was undefined and this
# cell raised NameError on a fresh kernel. The loaded dataset is the
# negative-emotions one, so the four-class mapping is enabled.
# For the binary positive/negative dataset use instead:
#     labels = {'Negativo': 1, 'Positivo': 0}
# NOTE(review): assumes the dataset contains exactly these four emotions;
# the check below fails loudly if it does not.
labels = {
    'tristeza': 0,
    'medo': 1,
    'raiva': 2,
    'desprezo': 3,
}

# Series.map() turns unmapped values into NaN, which would only surface later
# as an obscure error when fitting the model; fail here with a clear message.
unmapped = (set(train['sentiment']) | set(test['sentiment'])) - set(labels)
if unmapped:
    raise ValueError(
        f'Sentiment values without a label id: {sorted(map(str, unmapped))}'
    )

y_train = train['sentiment'].map(labels).values
y_test = test['sentiment'].map(labels).values

In [None]:
# Logistic-regression classifier; class_weight='balanced' reweights classes
# inversely to their frequency in y_train.
lr_params = dict(
    class_weight='balanced',
    max_iter=500,
    random_state=0,
    verbose=True,
)
lr = LogisticRegression(**lr_params)
lr.fit(X_train, y_train)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.8s finished


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=True,
                   warm_start=False)

In [None]:
# Predicted class ids for the held-out test features.
prediction_logistic = lr.predict(X_test)

In [None]:
# Text report of the classification metrics: true labels first, predictions second.
report_text = classification_report(y_test, prediction_logistic)
print(report_text)


In [None]:
# Confusion matrix as an annotated heatmap; the y axis is labelled as the
# correct class and the x axis as the predicted class.
cm = confusion_matrix(y_test, prediction_logistic)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Greens_r')
ax.set_xlabel('Previsto')
ax.set_ylabel('Correto')
_ = ax.set_title('Regressão Logística.')

In [None]:
# Compute accuracy and the support-weighted precision / recall / F-score.
# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the weighted precision and recall
# trade places and the averaging weights come from the predicted class counts
# instead of the true ones.
acc = accuracy_score(y_test, prediction_logistic)
precision_recall_fscore = precision_recall_fscore_support(
    y_test, prediction_logistic, average='weighted'
)
print(f'acc: {acc}')
print(f'\nprf: {precision_recall_fscore}')