In [None]:
import pandas as pd
import numpy as np
import spacy
from string import punctuation
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from collections import Counter

# AI models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Classifiers
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [132]:
# Import data
# Load the annotated test split. A forward slash is used as the path separator
# because it resolves on Windows as well as on POSIX systems, whereas a
# hard-coded backslash only works on Windows.
test = pd.read_excel("data/test.xlsx")
test.head(20)

Unnamed: 0,Dialog_ID,Msg,Anotador1,Anotador2,SentimentoRegressao,SentimentoClassificacao
0,916,Olá ⁦⁦@VodafonePT⁩ podem-me ajudar ? Tenho a o...,-2.0,-2.0,-2.0,-2
1,916,@nrg78 Olá. Pelo print a subscrição terá sido ...,0.0,0.0,0.0,0
2,917,sou o único com problemas no router da vodafon...,-1.0,-1.0,-1.0,-1
3,917,@bennyys17 Para que a situação seja analisada ...,0.0,0.0,0.0,0
4,918,Menos uma? HORA Bolas! 😱⏰ #MudançaDaHora\nQuem...,-1.0,-1.0,-1.0,-1
5,918,@VodafonePT Óptimo serviço prestado hoje: troc...,1.0,1.0,1.0,1
6,918,@freitasdesousa Olá. Agradecemos o seu feedbac...,1.0,1.0,1.0,1
7,919,"@VodafonePT okaaaaay, onde está a VH1? Já não ...",-1.0,-1.0,-1.0,-1
8,919,@DearVRodrigues Olá @DearVRodrigues. Informamo...,0.0,0.0,0.0,0
9,920,"@VodafonePT, não sei que raio fazem aos cartõe...",-2.0,-2.0,-2.0,-2


In [133]:
# Load the training split. The forward-slash path resolves on Windows as well
# as on POSIX systems, whereas a hard-coded backslash only works on Windows.
train = pd.read_excel("data/train.xlsx")
train.head(10)

Unnamed: 0,Dialog_ID,SentimentoRegressao,SentimentoClassificacao,Msg
0,0,-1.0,-1,"@nowoportugal alô, preciso de internet pelo me..."
1,0,0.0,0,Olá @Paulozxl lamentamos a situação que nos re...
2,1,-1.5,-2,@nowoportugal Posso terminar o contrato e muda...
3,1,0.0,0,Olá @mpnm77 lamentamos a situação que se sinta...
4,2,-1.0,-1,"@nowoportugal sem sinal de Nowo, tampoco net"
5,2,0.0,0,"Olá @doradarocha, lamentamos a situação, mas e..."
6,3,-1.0,-1,@nowoportugal têm o serviço de net em baixo e ...
7,3,0.0,0,"Olá @pngantunes, lamentamos a situação, mas ex..."
8,4,0.5,0,Hoje é dia de ver o último concerto de Simone ...
9,4,0.5,1,@IuriPereira09 @RTP1 Certo! Aqui está: \n\nhtt...


In [134]:
# Report the number of missing values per column: train split first, then the
# test split, with a dashed rule printed between the two reports.
for position, frame in enumerate((train, test)):
    if position > 0:
        print("----------------------")
    nan_counts = frame.isna().sum()
    print(nan_counts)

Dialog_ID                  0
SentimentoRegressao        0
SentimentoClassificacao    0
Msg                        0
dtype: int64
----------------------
Dialog_ID                  0
Msg                        0
Anotador1                  6
Anotador2                  2
SentimentoRegressao        0
SentimentoClassificacao    0
dtype: int64


In [135]:
# Targets: the discrete sentiment class of every message, per split.
y_train, y_test = (
    split["SentimentoClassificacao"].values for split in (train, test)
)

# Features: TF-IDF weights over a vocabulary limited to 100 features.
# The vocabulary is fitted on the training messages only and then reused
# unchanged to encode the test messages.
vectorizer = TfidfVectorizer(max_features=100)
x_train = vectorizer.fit_transform(train["Msg"].values.astype(str))
x_test = vectorizer.transform(test["Msg"].values.astype(str))

In [136]:
# Grid search for KNN over the neighbourhood size (1..10) and the vote
# weighting scheme, printing the accuracy of every combination.
# NOTE(review): accuracy is measured on the test split, so the selected
# hyperparameters are tuned on the same data used for the final evaluation;
# consider a validation split or cross-validation on the training data.
weights = ['uniform', 'distance']
best_accuracy, best_k, best_w = -1.0, None, None
for k in range(1, 11):
    for w in weights:
        clf = KNeighborsClassifier(n_neighbors=k, weights=w)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
        print(f"Accuracy of KNN for k={k} and w={w}: {accuracy}%")
        # Strict '>' keeps the first combination encountered when scores tie.
        if accuracy > best_accuracy:
            best_accuracy, best_k, best_w = accuracy, k, w
    print("")

# Derived from the scores above instead of being typed in by hand, so the
# summary cannot drift out of sync with the results.
print(f"Best hyperparameters: k={best_k} & w={best_w}.")

Accuracy of KNN for k=1 and w=uniform: 41.89%
Accuracy of KNN for k=1 and w=distance: 41.89%

Accuracy of KNN for k=2 and w=uniform: 43.41%
Accuracy of KNN for k=2 and w=distance: 43.31%

Accuracy of KNN for k=3 and w=uniform: 45.04%
Accuracy of KNN for k=3 and w=distance: 45.35%

Accuracy of KNN for k=4 and w=uniform: 46.77%
Accuracy of KNN for k=4 and w=distance: 48.35%

Accuracy of KNN for k=5 and w=uniform: 47.77%
Accuracy of KNN for k=5 and w=distance: 48.14%

Accuracy of KNN for k=6 and w=uniform: 49.19%
Accuracy of KNN for k=6 and w=distance: 49.76%

Accuracy of KNN for k=7 and w=uniform: 49.13%
Accuracy of KNN for k=7 and w=distance: 50.03%

Accuracy of KNN for k=8 and w=uniform: 49.13%
Accuracy of KNN for k=8 and w=distance: 49.97%

Accuracy of KNN for k=9 and w=uniform: 48.71%
Accuracy of KNN for k=9 and w=distance: 50.81%

Accuracy of KNN for k=10 and w=uniform: 49.19%
Accuracy of KNN for k=10 and w=distance: 50.24%

Best hyperparameters: k=9 & w=distance.


In [137]:
# Grid search for the SVM over the regularisation strength C (1..10) and the
# kernel type, printing the accuracy of every combination.
# NOTE(review): accuracy is measured on the test split, so the selected
# hyperparameters are tuned on the same data used for the final evaluation;
# consider a validation split or cross-validation on the training data.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
best_accuracy, best_c, best_kernel = -1.0, None, None
for c in range(1, 11):
    for kernel in kernels:
        clf = SVC(C=c, kernel=kernel)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
        print(f"Accuracy of SVM for c={c} and kernel={kernel}: {accuracy}%")
        # Strict '>' keeps the first combination encountered when scores tie.
        if accuracy > best_accuracy:
            best_accuracy, best_c, best_kernel = accuracy, c, kernel
    print("")

# Derived from the scores above instead of being typed in by hand, so the
# summary cannot drift out of sync with the results.
print(f"Best hyperparameters: c={best_c} & kernel={best_kernel}.")

Accuracy of SVM for c=1 and kernel=linear: 51.65%
Accuracy of SVM for c=1 and kernel=poly: 51.97%
Accuracy of SVM for c=1 and kernel=rbf: 52.49%
Accuracy of SVM for c=1 and kernel=sigmoid: 46.46%

Accuracy of SVM for c=2 and kernel=linear: 49.55%
Accuracy of SVM for c=2 and kernel=poly: 50.29%
Accuracy of SVM for c=2 and kernel=rbf: 51.97%
Accuracy of SVM for c=2 and kernel=sigmoid: 45.77%

Accuracy of SVM for c=3 and kernel=linear: 48.77%
Accuracy of SVM for c=3 and kernel=poly: 49.19%
Accuracy of SVM for c=3 and kernel=rbf: 50.92%
Accuracy of SVM for c=3 and kernel=sigmoid: 43.31%

Accuracy of SVM for c=4 and kernel=linear: 48.03%
Accuracy of SVM for c=4 and kernel=poly: 49.29%
Accuracy of SVM for c=4 and kernel=rbf: 50.55%
Accuracy of SVM for c=4 and kernel=sigmoid: 44.88%

Accuracy of SVM for c=5 and kernel=linear: 47.87%
Accuracy of SVM for c=5 and kernel=poly: 49.24%
Accuracy of SVM for c=5 and kernel=rbf: 49.34%
Accuracy of SVM for c=5 and kernel=sigmoid: 41.78%

Accuracy of SVM

In [138]:
# Grid search for the decision tree over the split criterion and the maximum
# depth (1..10), printing the accuracy of every combination.
# NOTE(review): accuracy is measured on the test split, so the selected
# hyperparameters are tuned on the same data used for the final evaluation;
# consider a validation split or cross-validation on the training data.
criterions = ['gini', 'entropy', 'log_loss']

best_accuracy, best_criterion, best_depth = -1.0, None, None
for criterion in criterions:
    for depth in range(1, 11):
        # DecisionTreeClassifier permutes the features randomly at each split,
        # so the seed is fixed to make the scores reproducible across runs.
        tree = DecisionTreeClassifier(
            criterion=criterion, max_depth=depth, random_state=42
        )
        tree.fit(x_train, y_train)
        y_pred = tree.predict(x_test)
        accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
        print(f"Accuracy for criterion={criterion} and depth={depth}: {accuracy}%")
        # Strict '>' keeps the first combination encountered when scores tie.
        # scikit-learn documents 'entropy' and 'log_loss' as the same Shannon
        # information-gain criterion, so a tie between them is expected.
        if accuracy > best_accuracy:
            best_accuracy, best_criterion, best_depth = accuracy, criterion, depth
    print("")

# Derived from the scores above instead of being typed in by hand, so the
# summary cannot drift out of sync with the results.
print(f"Best hyperparameters: criterion={best_criterion} & depth={best_depth}.")

Accuracy for criterion=gini and depth=1: 49.4%
Accuracy for criterion=gini and depth=2: 49.61%
Accuracy for criterion=gini and depth=3: 47.93%
Accuracy for criterion=gini and depth=4: 46.88%
Accuracy for criterion=gini and depth=5: 52.02%
Accuracy for criterion=gini and depth=6: 52.39%
Accuracy for criterion=gini and depth=7: 52.23%
Accuracy for criterion=gini and depth=8: 52.7%
Accuracy for criterion=gini and depth=9: 49.45%
Accuracy for criterion=gini and depth=10: 51.76%

Accuracy for criterion=entropy and depth=1: 49.4%
Accuracy for criterion=entropy and depth=2: 49.76%
Accuracy for criterion=entropy and depth=3: 55.7%
Accuracy for criterion=entropy and depth=4: 52.39%
Accuracy for criterion=entropy and depth=5: 52.49%
Accuracy for criterion=entropy and depth=6: 53.12%
Accuracy for criterion=entropy and depth=7: 53.07%
Accuracy for criterion=entropy and depth=8: 51.02%
Accuracy for criterion=entropy and depth=9: 51.18%
Accuracy for criterion=entropy and depth=10: 50.66%

Accuracy f

In [142]:
# Fit the two linear classifiers with default hyperparameters and report the
# accuracy and the per-class metrics of each on the test split.
logistic = LogisticRegression()
# Perceptron shuffles the training data between epochs; the seed is fixed so
# the reported score is reproducible across runs.
perceptron = Perceptron(random_state=42)

for label, model in (("Logistic", logistic), ("Perceptron", perceptron)):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
    print(f"Accuracy for {label}: {accuracy}%")
    print(classification_report(y_test, y_pred))

Accuracy for Logistic: 51.39%
              precision    recall  f1-score   support

          -2       0.41      0.03      0.06       292
          -1       0.34      0.36      0.35       449
           0       0.58      0.84      0.68       936
           1       0.54      0.09      0.16       228

    accuracy                           0.51      1905
   macro avg       0.47      0.33      0.31      1905
weighted avg       0.49      0.51      0.45      1905

Accuracy for Perceptron: 34.65%
              precision    recall  f1-score   support

          -2       0.27      0.08      0.12       292
          -1       0.23      0.30      0.26       449
           0       0.53      0.50      0.51       936
           1       0.10      0.14      0.12       228

    accuracy                           0.35      1905
   macro avg       0.28      0.26      0.25      1905
weighted avg       0.36      0.35      0.35      1905



In [None]:
# TEST THE HYPERPARAMETERS
# Refit each classifier with the hyperparameters reported by the earlier grid
# searches (Naive Bayes models use their defaults) and print the accuracy and
# the per-class metrics of each on the test split.
knn = KNeighborsClassifier(n_neighbors=9, weights="distance")
svm = SVC(C=1, kernel="rbf")
# DecisionTreeClassifier permutes the features randomly at each split, so the
# seed is fixed to make the reported score reproducible across runs.
tree = DecisionTreeClassifier(criterion="log_loss", max_depth=3, random_state=42)
bernoulli = BernoulliNB()
multi = MultinomialNB()

# (label shown in the report, estimator) pairs, evaluated in this order.
final_models = (
    ("KNN", knn),
    ("SVM", svm),
    ("tree", tree),
    ("bernoulliNB", bernoulli),
    ("multinomialNB", multi),
)

for label, model in final_models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
    print(f"Accuracy of {label}: {accuracy}%")
    print(classification_report(y_test, y_pred))

Accuracy of KNN: 50.6%
Accuracy of SVM: 52.49%
Accuracy of tree: 55.7%
Accuracy of bernoulliNB: 44.93%
Accuracy of multinomialNB: 50.71%


In [None]:
def _densify(data):
    """Return ``data.toarray()`` when the object offers it, else ``data`` itself."""
    if hasattr(data, "toarray"):
        return data.toarray()
    return data


# GaussianNB is fed dense copies of the feature matrices; objects without a
# ``toarray`` method pass through unchanged.
x_train_dense, y_train_dense = _densify(x_train), _densify(y_train)
x_test_dense, y_test_dense = _densify(x_test), _densify(y_test)

gauss = GaussianNB()
gauss.fit(x_train_dense, y_train_dense)
y_pred = gauss.predict(x_test_dense)

# Accuracy as a percentage rounded to two decimals, then per-class metrics.
accuracy = round(100 * accuracy_score(y_test_dense, y_pred), 2)
print(f"Accuracy of gaussianNB: {accuracy}%")
print(classification_report(y_test, y_pred))

Accuracy of gaussianNB: 22.78%


As we can see, the best classifier is the Decision Tree, with an accuracy of 55.7%.