In [10]:
import pandas as pd
import numpy as np
import spacy
import re
import nltk
from nltk.corpus import stopwords
from string import punctuation
from imblearn.over_sampling import SMOTE

# AI models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics import accuracy_score, classification_report

# Classifiers
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier

In [11]:
# Import data
nltk.download('stopwords')
stop_words = set(stopwords.words('portuguese'))
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

test = pd.read_excel("data\\test.xlsx")
test['Msg'] = test['Msg'].apply(clean_text)

train = pd.read_excel("data\\train.xlsx")
train['Msg'] = train['Msg'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bykon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
print(train['SentimentoClassificacao'].value_counts())
print("-------------------")
print(test['SentimentoClassificacao'].value_counts())

SentimentoClassificacao
 0    1114
-1     653
-2     262
 1     256
Name: count, dtype: int64
-------------------
SentimentoClassificacao
 0    936
-1    449
-2    292
 1    228
Name: count, dtype: int64


In [13]:
nan_counts = train.isna().sum()
print(nan_counts)
print("----------------------")
nan_counts = test.isna().sum()
print(nan_counts)

Dialog_ID                  0
SentimentoRegressao        0
SentimentoClassificacao    0
Msg                        0
dtype: int64
----------------------
Dialog_ID                  0
Msg                        0
Anotador1                  6
Anotador2                  2
SentimentoRegressao        0
SentimentoClassificacao    0
dtype: int64


x_train dataset length: 2285


In [60]:

y_train = train["SentimentoClassificacao"].values
x_train = train["Msg"].values
print(f"x_train dataset length: {len(x_train)}")

y_test = test["SentimentoClassificacao"].values
x_test = test["Msg"].values

from transformers import BertTokenizer, BertModel
import torch
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def get_bert_embedding(texts):
    if isinstance(texts, str):
        texts = [texts]  
    elif not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
        raise ValueError("La entrada debe ser una cadena o una lista de cadenas.")
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    for key in inputs:
        inputs[key] = inputs[key].to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_vectors = outputs.last_hidden_state[:, 0, :] 
    return cls_vectors
x_train = x_train.tolist() if not isinstance(x_train, list) else x_train
x_test = x_test.tolist() if not isinstance(x_test, list) else x_test
x_train = get_bert_embedding(x_train)
x_test = get_bert_embedding(x_test)




x_train dataset length: 2285


In [40]:
'''
y_train = train["SentimentoClassificacao"].values
x_train = train["Msg"].values
print(f"x_train dataset length: {len(x_train)}")

y_test = test["SentimentoClassificacao"].values
x_test = test["Msg"].values
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

'''


x_train dataset length: 2285


In [52]:
'''
nltk.download('punkt_tab')
y_train = train["SentimentoClassificacao"].values
x_train = train["Msg"].values
print(f"x_train dataset length: {len(x_train)}")

y_test = test["SentimentoClassificacao"].values
x_test = test["Msg"].values

import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

train_sentences = [word_tokenize(text.lower()) for text in x_train]
test_sentences = [word_tokenize(text.lower()) for text in x_test]
model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4)
def get_sentence_vector(sentence):
    tokens = word_tokenize(sentence.lower())
    word_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)
x_train = np.array([get_sentence_vector(text) for text in x_train])
x_test = np.array([get_sentence_vector(text) for text in x_test])
'''

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bykon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


x_train dataset length: 2285


In [53]:
'''
y_train = train["SentimentoClassificacao"].values
x_train = train["Msg"].values
print(f"x_train dataset length: {len(x_train)}")

y_test = test["SentimentoClassificacao"].values
x_test = test["Msg"].values

vectorizer = TfidfVectorizer(max_features=100)
x_train = vectorizer.fit_transform(x_train.astype(str))
x_test = vectorizer.transform(x_test.astype(str))

smote = SMOTE(sampling_strategy='auto', random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)
print(f"resized x_train dataset length: {x_train.shape[0]}")
'''

'\ny_train = train["SentimentoClassificacao"].values\nx_train = train["Msg"].values\nprint(f"x_train dataset length: {len(x_train)}")\n\ny_test = test["SentimentoClassificacao"].values\nx_test = test["Msg"].values\n\nvectorizer = TfidfVectorizer(max_features=100)\nx_train = vectorizer.fit_transform(x_train.astype(str))\nx_test = vectorizer.transform(x_test.astype(str))\n\nsmote = SMOTE(sampling_strategy=\'auto\', random_state=42)\nx_train, y_train = smote.fit_resample(x_train, y_train)\nprint(f"resized x_train dataset length: {x_train.shape[0]}")\n'

In [61]:
weights = ['uniform', 'distance']
for k in range(1, 11):
    for w in weights:
        clf = KNeighborsClassifier(n_neighbors=k, weights=w)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
        print(f"Accuracy of KNN for k={k} and w={w}: {accuracy}%")
    print("")


Accuracy of KNN for k=1 and w=uniform: 41.26%
Accuracy of KNN for k=1 and w=distance: 41.26%

Accuracy of KNN for k=2 and w=uniform: 44.67%
Accuracy of KNN for k=2 and w=distance: 41.05%

Accuracy of KNN for k=3 and w=uniform: 45.83%
Accuracy of KNN for k=3 and w=distance: 47.35%

Accuracy of KNN for k=4 and w=uniform: 45.51%
Accuracy of KNN for k=4 and w=distance: 47.82%

Accuracy of KNN for k=5 and w=uniform: 47.66%
Accuracy of KNN for k=5 and w=distance: 47.51%

Accuracy of KNN for k=6 and w=uniform: 46.72%
Accuracy of KNN for k=6 and w=distance: 48.61%

Accuracy of KNN for k=7 and w=uniform: 45.14%
Accuracy of KNN for k=7 and w=distance: 47.93%

Accuracy of KNN for k=8 and w=uniform: 46.88%
Accuracy of KNN for k=8 and w=distance: 48.5%

Accuracy of KNN for k=9 and w=uniform: 46.51%
Accuracy of KNN for k=9 and w=distance: 48.45%

Accuracy of KNN for k=10 and w=uniform: 47.66%
Accuracy of KNN for k=10 and w=distance: 49.45%

Best hyperparameters: k=9 & w=distance.


In [62]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for c in range(1, 11):
    for k in kernels:
        clf = SVC(C=c, kernel=k)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
        print(f"Accuracy of SVM for c={c} and kernel={k}: {accuracy}%")
    print("")

Accuracy of SVM for c=1 and kernel=linear: 43.99%
Accuracy of SVM for c=1 and kernel=poly: 52.23%
Accuracy of SVM for c=1 and kernel=rbf: 52.13%
Accuracy of SVM for c=1 and kernel=sigmoid: 49.19%

Accuracy of SVM for c=2 and kernel=linear: 42.2%
Accuracy of SVM for c=2 and kernel=poly: 52.7%
Accuracy of SVM for c=2 and kernel=rbf: 52.55%
Accuracy of SVM for c=2 and kernel=sigmoid: 49.55%

Accuracy of SVM for c=3 and kernel=linear: 41.52%
Accuracy of SVM for c=3 and kernel=poly: 52.91%
Accuracy of SVM for c=3 and kernel=rbf: 52.76%
Accuracy of SVM for c=3 and kernel=sigmoid: 51.02%

Accuracy of SVM for c=4 and kernel=linear: 39.53%
Accuracy of SVM for c=4 and kernel=poly: 53.02%
Accuracy of SVM for c=4 and kernel=rbf: 52.91%
Accuracy of SVM for c=4 and kernel=sigmoid: 51.6%

Accuracy of SVM for c=5 and kernel=linear: 38.74%
Accuracy of SVM for c=5 and kernel=poly: 53.54%
Accuracy of SVM for c=5 and kernel=rbf: 52.97%
Accuracy of SVM for c=5 and kernel=sigmoid: 51.39%

Accuracy of SVM fo

In [63]:
criterions = ['gini', 'entropy', 'log_loss']

for criterion in criterions:
	for depth in range(1, 11):
		tree = DecisionTreeClassifier(criterion=criterion, max_depth=depth)
		tree.fit(x_train, y_train)
		y_pred = tree.predict(x_test)
		accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
		print(f"Accuracy for criterion={criterion} and depth={depth}: {accuracy}%")
	print("")


Accuracy for criterion=gini and depth=1: 49.13%
Accuracy for criterion=gini and depth=2: 45.56%
Accuracy for criterion=gini and depth=3: 45.2%
Accuracy for criterion=gini and depth=4: 48.98%
Accuracy for criterion=gini and depth=5: 46.67%
Accuracy for criterion=gini and depth=6: 45.3%
Accuracy for criterion=gini and depth=7: 44.09%
Accuracy for criterion=gini and depth=8: 43.52%
Accuracy for criterion=gini and depth=9: 38.43%
Accuracy for criterion=gini and depth=10: 43.04%

Accuracy for criterion=entropy and depth=1: 49.13%
Accuracy for criterion=entropy and depth=2: 51.86%
Accuracy for criterion=entropy and depth=3: 48.35%
Accuracy for criterion=entropy and depth=4: 47.51%
Accuracy for criterion=entropy and depth=5: 50.03%
Accuracy for criterion=entropy and depth=6: 45.41%
Accuracy for criterion=entropy and depth=7: 46.25%
Accuracy for criterion=entropy and depth=8: 44.93%
Accuracy for criterion=entropy and depth=9: 43.15%
Accuracy for criterion=entropy and depth=10: 43.41%

Accuracy

In [64]:
# Random Forest
random_forest = RandomForestClassifier()
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
accuracy_rf = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"Accuracy for Random Forest: {accuracy}%")
print(classification_report(y_test, y_pred))

# Bagging Classifier
bagging = BaggingClassifier()
bagging.fit(x_train, y_train)
y_pred_bagging = bagging.predict(x_test)
accuracy_bagging = round(accuracy_score(y_test, y_pred_bagging) * 100, 2)
print(f"Accuracy for Bagging: {accuracy_bagging}%")
print(classification_report(y_test, y_pred_bagging))

Accuracy for Random Forest: 43.83%
              precision    recall  f1-score   support

          -2       0.62      0.08      0.14       292
          -1       0.39      0.34      0.36       449
           0       0.56      0.87      0.68       936
           1       0.21      0.03      0.05       228

    accuracy                           0.52      1905
   macro avg       0.45      0.33      0.31      1905
weighted avg       0.49      0.52      0.45      1905

Accuracy for Bagging: 43.62%
              precision    recall  f1-score   support

          -2       0.36      0.12      0.18       292
          -1       0.31      0.46      0.37       449
           0       0.54      0.60      0.57       936
           1       0.26      0.10      0.15       228

    accuracy                           0.44      1905
   macro avg       0.37      0.32      0.32      1905
weighted avg       0.42      0.44      0.41      1905



In [65]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)
y_pred = logistic.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy for Logistic: {accuracy}%")
print(classification_report(y_test, y_pred))

perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy for Perceptron: {accuracy}%")
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Logistic: 45.51%
              precision    recall  f1-score   support

          -2       0.45      0.20      0.27       292
          -1       0.31      0.36      0.33       449
           0       0.55      0.66      0.60       936
           1       0.23      0.14      0.17       228

    accuracy                           0.46      1905
   macro avg       0.38      0.34      0.35      1905
weighted avg       0.44      0.46      0.44      1905

Accuracy for Perceptron: 40.94%
              precision    recall  f1-score   support

          -2       0.29      0.66      0.41       292
          -1       0.18      0.07      0.10       449
           0       0.58      0.54      0.56       936
           1       0.24      0.20      0.22       228

    accuracy                           0.41      1905
   macro avg       0.32      0.37      0.32      1905
weighted avg       0.40      0.41      0.39      1905



In [66]:
# TEST THE HYPERPARAMETERS
knn = KNeighborsClassifier(n_neighbors=9, weights="distance")
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of KNN: {accuracy}%")
print(classification_report(y_test, y_pred))

svm = SVC(C=1, kernel="rbf")
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of SVM: {accuracy}%")
print(classification_report(y_test, y_pred))

tree = DecisionTreeClassifier(criterion="log_loss", max_depth=3)
tree.fit(x_train, y_train)
y_pred = tree.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of tree: {accuracy}%")
print(classification_report(y_test, y_pred))

bernoulli = BernoulliNB()
bernoulli.fit(x_train, y_train)
y_pred = bernoulli.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of bernoulliNB: {accuracy}%")
print(classification_report(y_test, y_pred))

multi = MultinomialNB()
multi.fit(x_train, y_train)
y_pred = multi.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
print(f"Accuracy of multinomialNB: {accuracy}%")
print(classification_report(y_test, y_pred))

Accuracy of KNN: 48.45%
              precision    recall  f1-score   support

          -2       0.44      0.12      0.19       292
          -1       0.32      0.37      0.34       449
           0       0.57      0.75      0.65       936
           1       0.30      0.09      0.14       228

    accuracy                           0.48      1905
   macro avg       0.41      0.33      0.33      1905
weighted avg       0.46      0.48      0.44      1905

Accuracy of SVM: 52.13%
              precision    recall  f1-score   support

          -2       0.00      0.00      0.00       292
          -1       0.43      0.19      0.26       449
           0       0.53      0.97      0.69       936
           1       0.00      0.00      0.00       228

    accuracy                           0.52      1905
   macro avg       0.24      0.29      0.24      1905
weighted avg       0.36      0.52      0.40      1905



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of tree: 48.35%
              precision    recall  f1-score   support

          -2       0.00      0.00      0.00       292
          -1       0.30      0.40      0.34       449
           0       0.57      0.79      0.66       936
           1       0.00      0.00      0.00       228

    accuracy                           0.48      1905
   macro avg       0.22      0.30      0.25      1905
weighted avg       0.35      0.48      0.41      1905

Accuracy of bernoulliNB: 42.31%
              precision    recall  f1-score   support

          -2       0.34      0.45      0.39       292
          -1       0.35      0.28      0.31       449
           0       0.72      0.47      0.56       936
           1       0.21      0.50      0.29       228

    accuracy                           0.42      1905
   macro avg       0.40      0.42      0.39      1905
weighted avg       0.51      0.42      0.44      1905



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: Negative values in data passed to MultinomialNB (input X)

In [49]:
# Stacking Classifier (usando Logistic Regression y Random Forest como modelos base)
stacking = StackingClassifier(
    estimators=[
        ('logistic', LogisticRegression()),		# 51.39% accuracy 
        ('svm', SVC(C=1, kernel="rbf")),		# 88% precision for -2
        ("tree", DecisionTreeClassifier(criterion="log_loss", max_depth=3)),	# 42% precision for -1
        ("Bernoulli", BernoulliNB()),			# 75% precision for 0
        ("Multinomial", MultinomialNB())		# 70% precision for 1
    ],
    final_estimator=SVC(C=1, kernel="rbf")
)
stacking.fit(x_train, y_train)
y_pred_stacking = stacking.predict(x_test)
accuracy_stacking = round(accuracy_score(y_test, y_pred_stacking) * 100, 2)
print(f"Accuracy for Stacking: {accuracy_stacking}%")
print(classification_report(y_test, y_pred_stacking))

Accuracy for Stacking: 52.49%
              precision    recall  f1-score   support

          -2       0.93      0.04      0.08       292
          -1       0.45      0.27      0.33       449
           0       0.55      0.93      0.69       936
           1       0.05      0.01      0.01       228

    accuracy                           0.52      1905
   macro avg       0.49      0.31      0.28      1905
weighted avg       0.52      0.52      0.43      1905



In [50]:
x_train_dense = x_train.toarray() if hasattr(x_train, "toarray") else x_train
y_train_dense = y_train.toarray() if hasattr(y_train, "toarray") else y_train
x_test_dense = x_test.toarray() if hasattr(x_test, "toarray") else x_test
y_test_dense = y_test.toarray() if hasattr(y_test, "toarray") else y_test

gauss = GaussianNB()
gauss.fit(x_train_dense, y_train_dense)
y_pred = gauss.predict(x_test_dense)
accuracy = round(accuracy_score(y_test_dense, y_pred)*100, 2)
print(f"Accuracy of gaussianNB: {accuracy}%")
print(classification_report(y_test, y_pred))

Accuracy of gaussianNB: 41.84%
              precision    recall  f1-score   support

          -2       0.29      0.21      0.24       292
          -1       0.32      0.44      0.37       449
           0       0.58      0.47      0.52       936
           1       0.31      0.41      0.35       228

    accuracy                           0.42      1905
   macro avg       0.37      0.38      0.37      1905
weighted avg       0.44      0.42      0.42      1905



As we can see the best classfier is the Decision Tree with a 55.7% of accuracy using TF-IDF