In [None]:
import pandas as pd
import numpy as np
import spacy
import re
from nltk.corpus import stopwords
from string import punctuation
from imblearn.over_sampling import SMOTE

# AI models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics import accuracy_score, classification_report

# Classifiers
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier

In [15]:
# Import data
stop_words = set(stopwords.words('portuguese'))
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

test = pd.read_excel("data\\test.xlsx")
test['Msg'] = test['Msg'].apply(clean_text)

train = pd.read_excel("data\\train.xlsx")
train['Msg'] = train['Msg'].apply(clean_text)

In [16]:
print(train['SentimentoClassificacao'].value_counts())
print("-------------------")
print(test['SentimentoClassificacao'].value_counts())

SentimentoClassificacao
 0    1114
-1     653
-2     262
 1     256
Name: count, dtype: int64
-------------------
SentimentoClassificacao
 0    936
-1    449
-2    292
 1    228
Name: count, dtype: int64


In [17]:
nan_counts = train.isna().sum()
print(nan_counts)
print("----------------------")
nan_counts = test.isna().sum()
print(nan_counts)

Dialog_ID                  0
SentimentoRegressao        0
SentimentoClassificacao    0
Msg                        0
dtype: int64
----------------------
Dialog_ID                  0
Msg                        0
Anotador1                  6
Anotador2                  2
SentimentoRegressao        0
SentimentoClassificacao    0
dtype: int64


In [18]:
y_train = train["SentimentoClassificacao"].values
x_train = train["Msg"].values
print(f"x_train dataset length: {len(x_train)}")

y_test = test["SentimentoClassificacao"].values
x_test = test["Msg"].values

vectorizer = TfidfVectorizer(max_features=100)
x_train = vectorizer.fit_transform(x_train.astype(str))
x_test = vectorizer.transform(x_test.astype(str))

smote = SMOTE(sampling_strategy='auto', random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)
print(f"resized x_train dataset length: {x_train.shape[0]}")

x_train dataset length: 2285
resized x_train dataset length: 4456


In [19]:
weights = ['uniform', 'distance']
for k in range(1, 11):
    for w in weights:
        clf = KNeighborsClassifier(n_neighbors=k, weights=w)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
        print(f"Accuracy of KNN for k={k} and w={w}: {accuracy}%")
    print("")

print("Best hyperparameters: k=9 & w=distance.")

Accuracy of KNN for k=1 and w=uniform: 37.38%
Accuracy of KNN for k=1 and w=distance: 37.38%

Accuracy of KNN for k=2 and w=uniform: 32.97%
Accuracy of KNN for k=2 and w=distance: 36.06%

Accuracy of KNN for k=3 and w=uniform: 33.39%
Accuracy of KNN for k=3 and w=distance: 36.27%

Accuracy of KNN for k=4 and w=uniform: 31.76%
Accuracy of KNN for k=4 and w=distance: 33.7%

Accuracy of KNN for k=5 and w=uniform: 26.82%
Accuracy of KNN for k=5 and w=distance: 32.07%

Accuracy of KNN for k=6 and w=uniform: 26.82%
Accuracy of KNN for k=6 and w=distance: 31.34%

Accuracy of KNN for k=7 and w=uniform: 28.24%
Accuracy of KNN for k=7 and w=distance: 30.97%

Accuracy of KNN for k=8 and w=uniform: 28.08%
Accuracy of KNN for k=8 and w=distance: 30.55%

Accuracy of KNN for k=9 and w=uniform: 29.29%
Accuracy of KNN for k=9 and w=distance: 31.34%

Accuracy of KNN for k=10 and w=uniform: 30.29%
Accuracy of KNN for k=10 and w=distance: 32.18%

Best hyperparameters: k=9 & w=distance.


In [20]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for c in range(1, 11):
    for k in kernels:
        clf = SVC(C=c, kernel=k)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
        print(f"Accuracy of SVM for c={c} and kernel={k}: {accuracy}%")
    print("")

print("Best hyperparameters: c=1 & kernel=rbf.")

Accuracy of SVM for c=1 and kernel=linear: 39.9%
Accuracy of SVM for c=1 and kernel=poly: 36.01%
Accuracy of SVM for c=1 and kernel=rbf: 43.31%
Accuracy of SVM for c=1 and kernel=sigmoid: 33.39%

Accuracy of SVM for c=2 and kernel=linear: 39.16%
Accuracy of SVM for c=2 and kernel=poly: 36.01%
Accuracy of SVM for c=2 and kernel=rbf: 42.99%
Accuracy of SVM for c=2 and kernel=sigmoid: 36.54%

Accuracy of SVM for c=3 and kernel=linear: 37.95%
Accuracy of SVM for c=3 and kernel=poly: 35.54%
Accuracy of SVM for c=3 and kernel=rbf: 41.89%
Accuracy of SVM for c=3 and kernel=sigmoid: 30.24%

Accuracy of SVM for c=4 and kernel=linear: 37.9%
Accuracy of SVM for c=4 and kernel=poly: 35.64%
Accuracy of SVM for c=4 and kernel=rbf: 41.05%
Accuracy of SVM for c=4 and kernel=sigmoid: 30.6%

Accuracy of SVM for c=5 and kernel=linear: 37.59%
Accuracy of SVM for c=5 and kernel=poly: 35.75%
Accuracy of SVM for c=5 and kernel=rbf: 40.94%
Accuracy of SVM for c=5 and kernel=sigmoid: 32.13%

Accuracy of SVM fo

In [21]:
criterions = ['gini', 'entropy', 'log_loss']

for criterion in criterions:
	for depth in range(1, 11):
		tree = DecisionTreeClassifier(criterion=criterion, max_depth=depth)
		tree.fit(x_train, y_train)
		y_pred = tree.predict(x_test)
		accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
		print(f"Accuracy for criterion={criterion} and depth={depth}: {accuracy}%")
	print("")

print("Best hyperparameters: criterion=log_loss & depth=3.")

Accuracy for criterion=gini and depth=1: 24.15%
Accuracy for criterion=gini and depth=2: 48.45%
Accuracy for criterion=gini and depth=3: 48.35%
Accuracy for criterion=gini and depth=4: 48.56%
Accuracy for criterion=gini and depth=5: 45.3%
Accuracy for criterion=gini and depth=6: 53.12%
Accuracy for criterion=gini and depth=7: 55.07%
Accuracy for criterion=gini and depth=8: 54.96%
Accuracy for criterion=gini and depth=9: 54.59%
Accuracy for criterion=gini and depth=10: 52.39%

Accuracy for criterion=entropy and depth=1: 49.71%
Accuracy for criterion=entropy and depth=2: 50.03%
Accuracy for criterion=entropy and depth=3: 50.24%
Accuracy for criterion=entropy and depth=4: 49.87%
Accuracy for criterion=entropy and depth=5: 48.14%
Accuracy for criterion=entropy and depth=6: 45.41%
Accuracy for criterion=entropy and depth=7: 52.39%
Accuracy for criterion=entropy and depth=8: 53.65%
Accuracy for criterion=entropy and depth=9: 55.85%
Accuracy for criterion=entropy and depth=10: 52.81%

Accurac

In [22]:
# Random Forest
random_forest = RandomForestClassifier()
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
accuracy_rf = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"Accuracy for Random Forest: {accuracy}%")
print(classification_report(y_test, y_pred))

# Bagging Classifier
bagging = BaggingClassifier()
bagging.fit(x_train, y_train)
y_pred_bagging = bagging.predict(x_test)
accuracy_bagging = round(accuracy_score(y_test, y_pred_bagging) * 100, 2)
print(f"Accuracy for Bagging: {accuracy_bagging}%")
print(classification_report(y_test, y_pred_bagging))

Accuracy for Random Forest: 52.86%
              precision    recall  f1-score   support

          -2       0.34      0.27      0.30       292
          -1       0.41      0.34      0.37       449
           0       0.61      0.48      0.53       936
           1       0.19      0.47      0.28       228

    accuracy                           0.41      1905
   macro avg       0.39      0.39      0.37      1905
weighted avg       0.47      0.41      0.43      1905

Accuracy for Bagging: 39.9%
              precision    recall  f1-score   support

          -2       0.41      0.28      0.33       292
          -1       0.38      0.32      0.35       449
           0       0.59      0.45      0.51       936
           1       0.18      0.49      0.27       228

    accuracy                           0.40      1905
   macro avg       0.39      0.38      0.36      1905
weighted avg       0.46      0.40      0.42      1905



In [23]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)
y_pred = logistic.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy for Logistic: {accuracy}%")
print(classification_report(y_test, y_pred))

perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy for Perceptron: {accuracy}%")
print(classification_report(y_test, y_pred))

Accuracy for Logistic: 40.21%
              precision    recall  f1-score   support

          -2       0.40      0.45      0.42       292
          -1       0.40      0.32      0.35       449
           0       0.72      0.38      0.50       936
           1       0.19      0.59      0.28       228

    accuracy                           0.40      1905
   macro avg       0.43      0.43      0.39      1905
weighted avg       0.53      0.40      0.43      1905

Accuracy for Perceptron: 31.6%
              precision    recall  f1-score   support

          -2       0.30      0.44      0.36       292
          -1       0.32      0.22      0.26       449
           0       0.58      0.28      0.38       936
           1       0.16      0.50      0.24       228

    accuracy                           0.32      1905
   macro avg       0.34      0.36      0.31      1905
weighted avg       0.42      0.32      0.33      1905



In [24]:
# TEST THE HYPERPARAMETERS
knn = KNeighborsClassifier(n_neighbors=9, weights="distance")
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of KNN: {accuracy}%")
print(classification_report(y_test, y_pred))

svm = SVC(C=1, kernel="rbf")
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of SVM: {accuracy}%")
print(classification_report(y_test, y_pred))

tree = DecisionTreeClassifier(criterion="log_loss", max_depth=3)
tree.fit(x_train, y_train)
y_pred = tree.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of tree: {accuracy}%")
print(classification_report(y_test, y_pred))

bernoulli = BernoulliNB()
bernoulli.fit(x_train, y_train)
y_pred = bernoulli.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of bernoulliNB: {accuracy}%")
print(classification_report(y_test, y_pred))

multi = MultinomialNB()
multi.fit(x_train, y_train)
y_pred = multi.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of multinomialNB: {accuracy}%")
print(classification_report(y_test, y_pred))

Accuracy of KNN: 31.34%
              precision    recall  f1-score   support

          -2       0.30      0.30      0.30       292
          -1       0.28      0.40      0.33       449
           0       0.53      0.27      0.36       936
           1       0.16      0.34      0.22       228

    accuracy                           0.31      1905
   macro avg       0.32      0.33      0.30      1905
weighted avg       0.39      0.31      0.32      1905

Accuracy of SVM: 43.31%
              precision    recall  f1-score   support

          -2       0.49      0.26      0.34       292
          -1       0.45      0.42      0.44       449
           0       0.68      0.48      0.56       936
           1       0.17      0.51      0.26       228

    accuracy                           0.43      1905
   macro avg       0.45      0.42      0.40      1905
weighted avg       0.54      0.43      0.46      1905

Accuracy of tree: 50.24%
              precision    recall  f1-score   support

  

In [25]:
# Stacking Classifier (usando Logistic Regression y Random Forest como modelos base)
stacking = StackingClassifier(
    estimators=[
        ('logistic', LogisticRegression()),		# 51.39% accuracy 
        ('svm', SVC(C=1, kernel="rbf")),		# 88% precision for -2
        ("tree", DecisionTreeClassifier(criterion="log_loss", max_depth=3)),	# 42% precision for -1
        ("Bernoulli", BernoulliNB()),			# 75% precision for 0
        ("Multinomial", MultinomialNB())		# 70% precision for 1
    ],
    final_estimator=SVC(C=1, kernel="rbf")
)
stacking.fit(x_train, y_train)
y_pred_stacking = stacking.predict(x_test)
accuracy_stacking = round(accuracy_score(y_test, y_pred_stacking) * 100, 2)
print(f"Accuracy for Stacking: {accuracy_stacking}%")
print(classification_report(y_test, y_pred_stacking))

Accuracy for Stacking: 43.94%
              precision    recall  f1-score   support

          -2       0.54      0.26      0.35       292
          -1       0.46      0.42      0.44       449
           0       0.68      0.49      0.57       936
           1       0.17      0.51      0.26       228

    accuracy                           0.44      1905
   macro avg       0.46      0.42      0.40      1905
weighted avg       0.55      0.44      0.47      1905



In [26]:
x_train_dense = x_train.toarray() if hasattr(x_train, "toarray") else x_train
y_train_dense = y_train.toarray() if hasattr(y_train, "toarray") else y_train
x_test_dense = x_test.toarray() if hasattr(x_test, "toarray") else x_test
y_test_dense = y_test.toarray() if hasattr(y_test, "toarray") else y_test

gauss = GaussianNB()
gauss.fit(x_train_dense, y_train_dense)
y_pred = gauss.predict(x_test_dense)
accuracy = round(accuracy_score(y_test_dense, y_pred)*100, 2)
print(f"Accuracy of gaussianNB: {accuracy}%")
print(classification_report(y_test, y_pred))

Accuracy of gaussianNB: 29.19%
              precision    recall  f1-score   support

          -2       0.26      0.52      0.35       292
          -1       0.41      0.05      0.08       449
           0       0.74      0.22      0.33       936
           1       0.18      0.79      0.29       228

    accuracy                           0.29      1905
   macro avg       0.40      0.39      0.27      1905
weighted avg       0.52      0.29      0.27      1905



As we can see the best classfier is the Decision Tree with a 55.7% of accuracy using TF-IDF

In [None]:
vectorizer = CountVectorizer()
x_train_bow = vectorizer.fit_transform(x_train)
x_test_bow = vectorizer.transform(x_test)
