In [None]:
!pip install spacy==3.1.0
!python -m spacy download pt_core_news_lg

In [None]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_recall_fscore_support)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
import nltk
# Fetch NLTK's 'stopwords' corpus; the TF-IDF vectorizer cell reads its
# Portuguese list through nltk.corpus.stopwords.words('portuguese').
nltk.download('stopwords')
  

In [None]:
import spacy

# Load the large Portuguese spaCy pipeline once at module level; lemma_words()
# calls this `spc` object on every document and reads each token's `pos_` and
# `lemma_` attributes.
spc = spacy.load("pt_core_news_lg")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (the google.colab package is only
# available inside Google Colab).
# NOTE(review): the dataset paths in the next cell are built from "/data/",
# not from this mount point — confirm where the CSV files actually live.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder of the labeled datasets (kept with its trailing slash so the
# file names can be appended directly).
default_dir = "/data/"

# CSV that this notebook loads into `data`.
data_tcc_pos_neg = f"{default_dir}labeled_data/dataset_label_pos_neg.csv"
# Second labeled CSV; it is not read by any cell visible in this notebook.
data_neg_emotions = f"{default_dir}labeled_data/dataset_neg_emotions.csv"


In [None]:
# Read the labeled CSV, derive the stratification key 'type' from the
# 'sentiment' column, and drop every row that contains a missing value.
data = (
    pd.read_csv(data_tcc_pos_neg)
    .assign(type=lambda frame: 'with_theme-' + frame['sentiment'])
    .dropna()
)
data.head()

In [None]:
def lemma_words(doc):
    """Tokenize ``doc`` with the module-level spaCy pipeline ``spc``.

    ``doc`` is converted with ``str()`` before parsing. Returns a generator
    that yields one string per token: the lemma for tokens whose ``pos_`` is
    ``'VERB'``, and ``str(token)`` for every other token.
    """
    parsed = spc(str(doc))

    def _lemma_if_verb(token):
        # Only verbs are reduced to their lemma; other tokens keep their text.
        if token.pos_ == 'VERB':
            return token.lemma_
        return str(token)

    return (_lemma_if_verb(token) for token in parsed)

In [None]:
def create_splits(data, test_fraction=0.20, random_state=42, stratify_col='type'):
    """Split ``data`` into stratified train and test partitions.

    Args:
        data: Table exposing ``shape`` and column access by name (a pandas
            DataFrame in this notebook); must contain ``stratify_col``.
        test_fraction: Share of rows assigned to the test partition. The
            absolute row count is ``int(test_fraction * n_rows)``, i.e. it is
            rounded down.
        random_state: Seed forwarded to ``train_test_split``.
        stratify_col: Column passed as the ``stratify`` argument.

    Returns:
        A ``(train, test)`` tuple.

    Raises:
        ValueError: If the computed test partition would contain no rows.
    """
    n_rows = data.shape[0]
    n_test = int(test_fraction * n_rows)
    if n_test < 1:
        # Fail early with a readable message instead of deferring to the
        # splitter's own error for an empty test set.
        raise ValueError(
            f'test_fraction={test_fraction} leaves no test rows for {n_rows} samples'
        )
    train, test = train_test_split(
        data,
        test_size=n_test,
        random_state=random_state,
        stratify=data[stratify_col],
    )
    return train, test

  
# Split once, then report how many rows landed in each partition.
train, test = create_splits(data)
for caption, subset in (('Training samples:  ', train), ('Test samples:      ', test)):
    print(caption, len(subset))



Training samples:   61788
Test samples:       15447


In [None]:
# TF-IDF features over the tokens produced by lemma_words().
#
# lemma_words is passed as `tokenizer` rather than `analyzer`: scikit-learn
# applies `stop_words` only when analyzer == 'word', so with a callable
# analyzer the Portuguese stop-word list was silently ignored.
# The stop words are matched against the strings lemma_words yields, so a verb
# form that gets lemmatized is compared by its lemma, not its surface form.
vectorizer = TfidfVectorizer(
    stop_words=nltk.corpus.stopwords.words('portuguese'),
    tokenizer=lemma_words,
    token_pattern=None,  # not used once a tokenizer is given; None avoids the warning
    lowercase=False,     # the callable analyzer never lowercased; keep text untouched
    min_df=0.0001,
    max_df=0.8,
    use_idf=True,
    smooth_idf=True
)

In [None]:
# Fit the vectorizer on the training texts only, then encode the test texts
# with the already-fitted vectorizer.
# astype('U') casts the 'clean_text' values to a NumPy unicode-string array.
X_train = vectorizer.fit_transform(train['clean_text'].values.astype('U'))
X_test = vectorizer.transform(test['clean_text'].values.astype('U'))

In [None]:
# Integer class id for each value expected in the 'sentiment' column.
labels = {
    'Positivo' : 0,
    'Negativo' : 1,
}

# Alternative mapping kept for reference
# (presumably for the negative-emotions dataset — TODO confirm):
# labels = {
#     'tristeza': 0,
#     'medo': 1,
#     'raiva': 2,
#     'desprezo' : 3,
# }


def _encode_sentiment(frame, split_name):
    """Map the 'sentiment' column of ``frame`` to class ids via ``labels``.

    Raises:
        ValueError: If a sentiment value has no entry in ``labels``;
            ``Series.map`` would otherwise turn it into NaN without notice.
    """
    encoded = frame['sentiment'].map(labels)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(frame.loc[unmapped, 'sentiment'].astype(str)))
        raise ValueError(
            f"{split_name} split has sentiment values missing from `labels`: {unknown}"
        )
    return encoded.values


y_train = _encode_sentiment(train, 'train')
y_test = _encode_sentiment(test, 'test')

In [None]:
# Create the decision tree classifier. The seed is fixed because scikit-learn
# randomly permutes the candidate features at each split, so an unseeded tree
# can differ from run to run. 42 matches the seed used in create_splits().
clf = DecisionTreeClassifier(random_state=42)

# Train the decision tree classifier (fit() returns the estimator itself).
clf = clf.fit(X_train, y_train)

In [None]:
# Predict the class id of every test document with the fitted tree.
prediction_tree = clf.predict(X_test)

In [None]:
# Print the text report for the test split; true labels are passed first and
# predictions second.
print(classification_report(y_test, prediction_tree))


In [None]:
# Confusion matrix of the tree on the test split, drawn as an annotated heatmap
# (rows: true class, columns: predicted class).
tree_confusion = confusion_matrix(y_test, prediction_tree)
ax = sns.heatmap(tree_confusion, annot=True, fmt='d', cmap='Greens_r')
# Assigning to `_` keeps the return value of ax.set() out of the cell output.
_ = ax.set(xlabel='Previsto', ylabel='Correto', title='Árvore de Decisão.')

In [None]:
# Compute accuracy and the weighted precision / recall / F-score on the test split.
# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but precision and recall are not: with the arguments swapped they
# are exchanged, and the 'weighted' average uses predicted-class counts instead
# of the true support.
acc = accuracy_score(y_test, prediction_tree)
precision_recall_fscore = precision_recall_fscore_support(y_test, prediction_tree, average='weighted')
print(f'acc: {acc}')
print(f'\nprf: {precision_recall_fscore}')