In [76]:
import pandas as pd
import numpy as np
import nltk
import re
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report


In [5]:
#download data
sentence = pd.read_csv("Data-20220517/training_set.txt",sep='\t',header=0)
dev_set = pd.read_csv("Data-20220517/dev_set.txt",sep='\t',header=0)

sentence.head()

Unnamed: 0,sentence,emotion
0,I'm too old to be traded in .,6
1,Mother said you could always tell a lady by he...,8
2,I always said I'd leave off when the time came .,6
3,He'll be safe with me .,2
4,Lay off .,1


In [71]:
def preprocessing(text):
    
    processed_corpus = []
    stop_words = set(stopwords.words("english"))

    # Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Convert to lowercase
    text = text.lower()        

    # Convert to list from string
    text = text.split()

    # Lemmatization
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in stop_words] 
    text = " ".join(text)
    processed_corpus.append(text)
        
    return processed_corpus

In [90]:
#transforme a corpus into a Tfidf matrix (n,m) where n is the number of texts and m the words
TfidfVec = TfidfVectorizer(
            max_df=0.8,
            max_features=10000, 
            ngram_range=(1,3),
            tokenizer=preprocessing
            )

Tfidf_matrix = TfidfVec.fit_transform(sentence.sentence)

#train a model based in the Tfidf_matrix 
clf = GaussianNB()
clf.fit(Tfidf_matrix.toarray(),sentence.emotion.values)

In [95]:
#creat a Tfidf matrix for dev set based on the fit Tfidf matrix by the train
tf_idf_vector_dev = TfidfVec.transform(dev_set.sentence)

#predict for new sentences of the dev set
y_pred = clf.predict(tf_idf_vector_dev.toarray())
Y_test = dev_set.emotion.values

#compare the ground truth and show results
x = dev_set.emotion.unique()
condlist= [x==1,x==2,x==3,x==4,x==5,x==6,x==7,x==8]
choicelist = ['Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']
category_names = list(np.select(condlist, choicelist))

print(classification_report(Y_test, y_pred, target_names=category_names,zero_division=1))

              precision    recall  f1-score   support

        Fear       0.46      0.03      0.05       211
       Trust       0.30      0.02      0.03       170
    Surprise       0.10      0.01      0.02        77
Anticipation       0.16      0.03      0.05       104
       Anger       0.28      0.05      0.09        97
     Disgust       0.33      0.07      0.11        87
     Sadness       0.10      0.93      0.18        96
         Joy       0.25      0.01      0.01       158

    accuracy                           0.11      1000
   macro avg       0.25      0.14      0.07      1000
weighted avg       0.28      0.11      0.06      1000

