In [25]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from string import punctuation
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Classifiers
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier

In [26]:
# Import data
nltk.download('stopwords')
stop_words = set(stopwords.words('portuguese'))
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = text.translate(str.maketrans('', '', punctuation))  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

test = pd.read_excel("data\\test.xlsx")
test['Msg'] = test['Msg'].apply(clean_text)

train = pd.read_excel("data\\train.xlsx")
train['Msg'] = train['Msg'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\valko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
print(train['SentimentoClassificacao'].value_counts())
print("-------------------")
print(test['SentimentoClassificacao'].value_counts())

SentimentoClassificacao
 0    1114
-1     653
-2     262
 1     256
Name: count, dtype: int64
-------------------
SentimentoClassificacao
 0    936
-1    449
-2    292
 1    228
Name: count, dtype: int64


In [28]:
nan_counts = train.isna().sum()
print(nan_counts)
print("----------------------")
nan_counts = test.isna().sum()
print(nan_counts)

Dialog_ID                  0
SentimentoRegressao        0
SentimentoClassificacao    0
Msg                        0
dtype: int64
----------------------
Dialog_ID                  0
Msg                        0
Anotador1                  6
Anotador2                  2
SentimentoRegressao        0
SentimentoClassificacao    0
dtype: int64


## Representation of text: Bag-Of-Words

In [29]:
y_test = test["SentimentoClassificacao"].values
x_test = test["Msg"].values
y_train = train["SentimentoClassificacao"].values
x_train = train["Msg"].values

vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [30]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)
print(f"resized x_train dataset length: {x_train.shape[0]}")
x_train_2, x_val, y_train_2, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

resized x_train dataset length: 4456


In [31]:
weights = ['uniform', 'distance']
for k in range(1, 11):
    for w in weights:
        clf = KNeighborsClassifier(n_neighbors=k, weights=w)
        clf.fit(x_train_2, y_train_2)
        y_pred = clf.predict(x_val)
        accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
        print(f"Accuracy of KNN for k={k} and w={w}: {accuracy}%")
    print("\n")
print("The best hyperparameters for Bag-of-Words are k=1 & w=uniform with 54.15%")
print("-------------")

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for c in range(1, 21):
    for k in kernels:
        clf = SVC(C=c, kernel=k)
        clf.fit(x_train_2, y_train_2)
        y_pred = clf.predict(x_val)
        accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
        print(f"Accuracy of SVM for c={c} and kernel={k}: {accuracy}%")
    print("\n")
print("The best hyperparameters for Bag-of-Words are c=1 & kernel=rbf with 58.18%")
print("-------------")

criterions = ['gini', 'entropy', 'log_loss']
for criterion in criterions:
	for depth in range(1, 21):
		tree = DecisionTreeClassifier(criterion=criterion, max_depth=depth)
		tree.fit(x_train_2, y_train_2)
		y_pred = tree.predict(x_val)
		accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
		print(f"Accuracy for criterion={criterion} and depth={depth}: {accuracy}%")
	print("\n")
print("The best hyperparameters for Bag-of-Words are criterion=gini & depth=19 with 46.86%")

Accuracy of KNN for k=1 and w=uniform: 54.71%
Accuracy of KNN for k=1 and w=distance: 54.71%


Accuracy of KNN for k=2 and w=uniform: 45.18%
Accuracy of KNN for k=2 and w=distance: 47.09%


Accuracy of KNN for k=3 and w=uniform: 50.11%
Accuracy of KNN for k=3 and w=distance: 51.91%


Accuracy of KNN for k=4 and w=uniform: 49.22%
Accuracy of KNN for k=4 and w=distance: 50.78%


Accuracy of KNN for k=5 and w=uniform: 50.45%
Accuracy of KNN for k=5 and w=distance: 52.91%


Accuracy of KNN for k=6 and w=uniform: 48.43%
Accuracy of KNN for k=6 and w=distance: 51.46%


Accuracy of KNN for k=7 and w=uniform: 45.74%
Accuracy of KNN for k=7 and w=distance: 48.43%


Accuracy of KNN for k=8 and w=uniform: 46.52%
Accuracy of KNN for k=8 and w=distance: 49.89%


Accuracy of KNN for k=9 and w=uniform: 45.18%
Accuracy of KNN for k=9 and w=distance: 49.44%


Accuracy of KNN for k=10 and w=uniform: 46.08%
Accuracy of KNN for k=10 and w=distance: 49.33%


The best hyperparameters for Bag-of-Words are k=

In [32]:
def compare_classifiers(k: int, w: str, c: int, kernel: str, criterion: str, depth: int, Neg_x_values: bool = False):
	knn = KNeighborsClassifier(n_neighbors=k, weights=w)
	knn.fit(x_train, y_train)
	y_pred = knn.predict(x_test)
	accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
	print(f"Accuracy of KNN: {accuracy}%")
	print(classification_report(y_test, y_pred))

	svm = SVC(C=c, kernel=kernel)
	svm.fit(x_train, y_train)
	y_pred = svm.predict(x_test)
	accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
	print(f"Accuracy of SVM: {accuracy}%")
	print(classification_report(y_test, y_pred))

	tree = DecisionTreeClassifier(criterion=criterion, max_depth=depth)
	tree.fit(x_train, y_train)
	y_pred = tree.predict(x_test)
	accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
	print(f"Accuracy of tree: {accuracy}%")
	print(classification_report(y_test, y_pred))

	random_forest = RandomForestClassifier()
	random_forest.fit(x_train, y_train)
	y_pred = random_forest.predict(x_test)
	accuracy_rf = round(accuracy_score(y_test, y_pred) * 100, 2)
	print(f"Accuracy for Random Forest: {accuracy}%")
	print(classification_report(y_test, y_pred))

	bagging = BaggingClassifier()
	bagging.fit(x_train, y_train)
	y_pred_bagging = bagging.predict(x_test)
	accuracy_bagging = round(accuracy_score(y_test, y_pred_bagging) * 100, 2)
	print(f"Accuracy for Bagging: {accuracy_bagging}%")
	print(classification_report(y_test, y_pred_bagging))

	logistic = LogisticRegression()
	logistic.fit(x_train, y_train)
	y_pred = logistic.predict(x_test)
	accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
	print(f"Accuracy for Logistic: {accuracy}%")
	print(classification_report(y_test, y_pred))

	perceptron = Perceptron()
	perceptron.fit(x_train, y_train)
	y_pred = perceptron.predict(x_test)
	accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
	print(f"Accuracy for Perceptron: {accuracy}%")
	print(classification_report(y_test, y_pred))

	bernoulli = BernoulliNB()
	bernoulli.fit(x_train, y_train)
	y_pred = bernoulli.predict(x_test)
	accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
	print(f"Accuracy of bernoulliNB: {accuracy}%")
	print(classification_report(y_test, y_pred))

	if not Neg_x_values:
		multi = MultinomialNB()
		multi.fit(x_train, y_train)
		y_pred = multi.predict(x_test)
		accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
		print(f"Accuracy of multinomialNB: {accuracy}%")
		print(classification_report(y_test, y_pred))

		stacking = StackingClassifier(
			estimators=[
				('logistic', LogisticRegression()),		
				('svm', SVC(C=1, kernel="rbf")),		
				("tree", DecisionTreeClassifier(criterion="gini", max_depth=19)),	
				("Bernoulli", BernoulliNB()),			
				("Multinomial", MultinomialNB())		
			],
			final_estimator=SVC(C=1, kernel="rbf"))
		stacking.fit(x_train, y_train)
		y_pred_stacking = stacking.predict(x_test)
		accuracy_stacking = round(accuracy_score(y_test, y_pred_stacking) * 100, 2)
		print(f"Accuracy for Stacking: {accuracy_stacking}%")
		print(classification_report(y_test, y_pred_stacking))

	x_train_dense = x_train.toarray() if hasattr(x_train, "toarray") else x_train
	y_train_dense = y_train.toarray() if hasattr(y_train, "toarray") else y_train
	x_test_dense = x_test.toarray() if hasattr(x_test, "toarray") else x_test
	y_test_dense = y_test.toarray() if hasattr(y_test, "toarray") else y_test

	gauss = GaussianNB()
	gauss.fit(x_train_dense, y_train_dense)
	y_pred = gauss.predict(x_test_dense)
	accuracy = round(accuracy_score(y_test_dense, y_pred)*100, 2)
	print(f"Accuracy of gaussianNB: {accuracy}%")
	print(classification_report(y_test, y_pred))

In [33]:
print(compare_classifiers(1, "uniform", 1, "rbf", "gini", 19))

Accuracy of KNN: 35.33%
              precision    recall  f1-score   support

          -2       0.27      0.40      0.32       292
          -1       0.30      0.47      0.37       449
           0       0.59      0.33      0.42       936
           1       0.15      0.16      0.15       228

    accuracy                           0.35      1905
   macro avg       0.33      0.34      0.32      1905
weighted avg       0.42      0.35      0.36      1905

Accuracy of SVM: 46.67%
              precision    recall  f1-score   support

          -2       0.35      0.19      0.25       292
          -1       0.39      0.35      0.37       449
           0       0.58      0.65      0.62       936
           1       0.22      0.29      0.25       228

    accuracy                           0.47      1905
   macro avg       0.39      0.37      0.37      1905
weighted avg       0.46      0.47      0.46      1905

Accuracy of tree: 28.29%
              precision    recall  f1-score   support

  

## Representation of text: Word2Vec

In [34]:
y_test = test["SentimentoClassificacao"].values
x_test = test["Msg"].values
y_train = train["SentimentoClassificacao"].values
x_train = train["Msg"].values

nltk.download('punkt_tab')
train_sentences = [word_tokenize(text.lower()) for text in x_train]
test_sentences = [word_tokenize(text.lower()) for text in x_test]
model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4)
def get_sentence_vector(sentence):
    tokens = word_tokenize(sentence.lower())
    word_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)
x_train = np.array([get_sentence_vector(text) for text in x_train])
x_test = np.array([get_sentence_vector(text) for text in x_test])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\valko\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [35]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)
print(f"resized x_train dataset length: {x_train.shape[0]}")
x_train_2, x_val, y_train_2, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

resized x_train dataset length: 4456


In [36]:
weights = ['uniform', 'distance']
for k in range(1, 11):
    for w in weights:
        clf = KNeighborsClassifier(n_neighbors=k, weights=w)
        clf.fit(x_train_2, y_train_2)
        y_pred = clf.predict(x_val)
        accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
        print(f"Accuracy of KNN for k={k} and w={w}: {accuracy}%")
    print("\n")
print("The best hyperparameters for Word2Vec are k=1 & w=distance with 78.03%")
print("-------------")

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for c in range(1, 21):
    for k in kernels:
        clf = SVC(C=c, kernel=k)
        clf.fit(x_train_2, y_train_2)
        y_pred = clf.predict(x_val)
        accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
        print(f"Accuracy of SVM for c={c} and kernel={k}: {accuracy}%")
    print("\n")
print("The best hyperparameters for Word2Vec are c=20 & kernel=rbf with 54.93%")
print("-------------")

criterions = ['gini', 'entropy', 'log_loss']
for criterion in criterions:
	for depth in range(1, 21):
		tree = DecisionTreeClassifier(criterion=criterion, max_depth=depth)
		tree.fit(x_train_2, y_train_2)
		y_pred = tree.predict(x_val)
		accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
		print(f"Accuracy for criterion={criterion} and depth={depth}: {accuracy}%")
	print("\n")
print("The best hyperparameters for Word2Vec are criterion=gini & depth=19 with 65.81%")

Accuracy of KNN for k=1 and w=uniform: 77.91%
Accuracy of KNN for k=1 and w=distance: 77.91%


Accuracy of KNN for k=2 and w=uniform: 71.97%
Accuracy of KNN for k=2 and w=distance: 78.03%


Accuracy of KNN for k=3 and w=uniform: 71.86%
Accuracy of KNN for k=3 and w=distance: 75.34%


Accuracy of KNN for k=4 and w=uniform: 69.06%
Accuracy of KNN for k=4 and w=distance: 75.34%


Accuracy of KNN for k=5 and w=uniform: 68.05%
Accuracy of KNN for k=5 and w=distance: 73.88%


Accuracy of KNN for k=6 and w=uniform: 66.03%
Accuracy of KNN for k=6 and w=distance: 73.65%


Accuracy of KNN for k=7 and w=uniform: 65.7%
Accuracy of KNN for k=7 and w=distance: 72.76%


Accuracy of KNN for k=8 and w=uniform: 64.57%
Accuracy of KNN for k=8 and w=distance: 71.97%


Accuracy of KNN for k=9 and w=uniform: 63.45%
Accuracy of KNN for k=9 and w=distance: 71.3%


Accuracy of KNN for k=10 and w=uniform: 61.21%
Accuracy of KNN for k=10 and w=distance: 70.52%


The best hyperparameters for Word2Vec are k=1 & w=

In [37]:
print(compare_classifiers(2, "distance", 20, "rbf", "gini", 19, True))

Accuracy of KNN: 35.28%
              precision    recall  f1-score   support

          -2       0.25      0.35      0.29       292
          -1       0.31      0.38      0.34       449
           0       0.62      0.35      0.45       936
           1       0.17      0.30      0.22       228

    accuracy                           0.35      1905
   macro avg       0.34      0.34      0.32      1905
weighted avg       0.44      0.35      0.37      1905

Accuracy of SVM: 36.85%
              precision    recall  f1-score   support

          -2       0.29      0.37      0.33       292
          -1       0.28      0.30      0.29       449
           0       0.66      0.36      0.47       936
           1       0.22      0.52      0.31       228

    accuracy                           0.37      1905
   macro avg       0.36      0.39      0.35      1905
weighted avg       0.46      0.37      0.39      1905

Accuracy of tree: 36.01%
              precision    recall  f1-score   support

  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Representation of text: TF-IDF

In [38]:
y_test = test["SentimentoClassificacao"].values
x_test = test["Msg"].values
y_train = train["SentimentoClassificacao"].values
x_train = train["Msg"].values

print(f"x_train dataset length: {len(x_train)}")
vectorizer = TfidfVectorizer(max_features=100)
x_train = vectorizer.fit_transform(x_train.astype(str))
x_test = vectorizer.transform(x_test.astype(str))

x_train dataset length: 2285


In [39]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)
print(f"resized x_train dataset length: {x_train.shape[0]}")
x_train_2, x_val, y_train_2, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

resized x_train dataset length: 4456


In [42]:
weights = ['uniform', 'distance']
for k in range(1, 11):
    for w in weights:
        clf = KNeighborsClassifier(n_neighbors=k, weights=w)
        clf.fit(x_train_2, y_train_2)
        y_pred = clf.predict(x_val)
        accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
        print(f"Accuracy of KNN for k={k} and w={w}: {accuracy}%")
    print("\n")
print("The best hyperparameters for TF-IDF are k=1 & w=distance with 63.0%")
print("-------------")

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for c in range(1, 21):
    for k in kernels:
        clf = SVC(C=c, kernel=k)
        clf.fit(x_train_2, y_train_2)
        y_pred = clf.predict(x_val)
        accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
        print(f"Accuracy of SVM for c={c} and kernel={k}: {accuracy}%")
    print("\n")
print("The best hyperparameters for TF-IDF are c=8 & kernel=rbf with 66.93%")
print("-------------")

criterions = ['gini', 'entropy', 'log_loss']
for criterion in criterions:
	for depth in range(1, 21):
		tree = DecisionTreeClassifier(criterion=criterion, max_depth=depth)
		tree.fit(x_train_2, y_train_2)
		y_pred = tree.predict(x_val)
		accuracy = round(accuracy_score(y_val, y_pred)*100, 2)
		print(f"Accuracy for criterion={criterion} and depth={depth}: {accuracy}%")
	print("\n")
print("The best hyperparameters for TF-IDF are criterion=log_loss & depth=20 with 57.74%")

Accuracy of KNN for k=1 and w=uniform: 63.23%
Accuracy of KNN for k=1 and w=distance: 63.23%


Accuracy of KNN for k=2 and w=uniform: 56.95%
Accuracy of KNN for k=2 and w=distance: 59.19%


Accuracy of KNN for k=3 and w=uniform: 57.51%
Accuracy of KNN for k=3 and w=distance: 59.42%


Accuracy of KNN for k=4 and w=uniform: 55.27%
Accuracy of KNN for k=4 and w=distance: 58.18%


Accuracy of KNN for k=5 and w=uniform: 54.26%
Accuracy of KNN for k=5 and w=distance: 58.18%


Accuracy of KNN for k=6 and w=uniform: 53.81%
Accuracy of KNN for k=6 and w=distance: 58.18%


Accuracy of KNN for k=7 and w=uniform: 54.26%
Accuracy of KNN for k=7 and w=distance: 59.75%


Accuracy of KNN for k=8 and w=uniform: 52.47%
Accuracy of KNN for k=8 and w=distance: 57.85%


Accuracy of KNN for k=9 and w=uniform: 51.91%
Accuracy of KNN for k=9 and w=distance: 56.95%


Accuracy of KNN for k=10 and w=uniform: 51.35%
Accuracy of KNN for k=10 and w=distance: 57.96%


The best hyperparameters for TF-IDF are k=1 & w=

In [43]:
print(compare_classifiers(1, "distance", 8, "rbf", "log_loss", 20))

Accuracy of KNN: 37.27%
              precision    recall  f1-score   support

          -2       0.33      0.27      0.29       292
          -1       0.37      0.38      0.38       449
           0       0.56      0.41      0.47       936
           1       0.15      0.33      0.21       228

    accuracy                           0.37      1905
   macro avg       0.35      0.35      0.34      1905
weighted avg       0.43      0.37      0.39      1905

Accuracy of SVM: 40.84%
              precision    recall  f1-score   support

          -2       0.41      0.32      0.36       292
          -1       0.44      0.33      0.38       449
           0       0.60      0.46      0.52       936
           1       0.17      0.45      0.24       228

    accuracy                           0.41      1905
   macro avg       0.40      0.39      0.38      1905
weighted avg       0.48      0.41      0.43      1905

Accuracy of tree: 29.34%
              precision    recall  f1-score   support

  