In [1]:
import pandas as pd
import numpy as np
import nltk
import re
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [2]:
#download data
sentence = pd.read_csv("Data-20220517/training_set.txt",sep='\t',header=0)
dev_set = pd.read_csv("Data-20220517/dev_set.txt",sep='\t',header=0)

sentence.head()

Unnamed: 0,sentence,emotion
0,I'm too old to be traded in .,6
1,Mother said you could always tell a lady by he...,8
2,I always said I'd leave off when the time came .,6
3,He'll be safe with me .,2
4,Lay off .,1


In [3]:
#define preprocessing function that will be used in the TfidfVectorizer
def preprocessing(text):
    
    processed_corpus = []
    stop_words = set(stopwords.words("english"))

    # Remove punctuations
    text = re.sub('[^a-zA-Z!?]', ' ', text)

    # Convert to lowercase
    text = text.lower()        

    # Convert to list from string
    text = text.split()
    
    # Lemmatization
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in stop_words] 
        
    return text

preprocessing(sentence.sentence[0])

['old', 'traded']

In [4]:
#function that show results for several classifiers 
def Classifier_tester(X,y):
    
    clf_lis =[
              MultinomialNB(),
              RandomForestClassifier(max_depth=5),
              #MultiOutputClassifier(LogisticRegression()),
              GaussianNB(),
              #SVC(kernel='linear'),
              KNeighborsClassifier(n_neighbors=5),
              MLPClassifier(hidden_layer_sizes=(5,5,3))
             ]
    
    #save the list of category names in the order they appear in class. report.
    x = dev_set.emotion.unique()
    condlist= [x==1,x==2,x==3,x==4,x==5,x==6,x==7,x==8]
    choicelist = ['Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']
    category_names = list(np.select(condlist, choicelist))
    
    results = dict()
    for clf in clf_lis:
        print(str(clf))
        #train the clf
        clf.fit(X,y)
     
        #creat a Tfidf matrix for dev set based on the fit Tfidf matrix by the train
        tf_idf_vector_dev = TfidfVec.transform(dev_set.sentence)

        #predict for new sentences of the dev set
        y_pred = clf.predict(tf_idf_vector_dev.toarray())
        Y_test = dev_set.emotion.values
        
        #saves result in dict
        cls_rep = classification_report(y_pred, Y_test, target_names=category_names,zero_division=1,output_dict=True)
        
        #delete the support metric
        cls_rep['weighted avg'].pop("support")
        
        #add the accuracy metric
        cls_rep['weighted avg'].update({'accuracy':cls_rep['accuracy']})
        
        #save the weighted avg metrics in the classifer name key
        results[str(clf)] = cls_rep['weighted avg']
        
    return pd.DataFrame(results).round(2)

In [5]:
#transforme a corpus into a Tfidf matrix (n,m) where n is the number of texts and m the words
TfidfVec = TfidfVectorizer(
            max_df=0.9,
            max_features=6000, 
            ngram_range=(1,3),
            tokenizer=preprocessing
            )

Tfidf_matrix = TfidfVec.fit_transform(sentence.sentence.values)

s = Classifier_tester(Tfidf_matrix.toarray(),sentence.emotion.values)

MultinomialNB()
RandomForestClassifier(max_depth=5)
GaussianNB()
KNeighborsClassifier()
MLPClassifier(hidden_layer_sizes=(5, 5, 3))


In [6]:
s

Unnamed: 0,MultinomialNB(),RandomForestClassifier(max_depth=5),GaussianNB(),KNeighborsClassifier(),"MLPClassifier(hidden_layer_sizes=(5, 5, 3))"
precision,0.57,1.0,0.34,0.33,1.0
recall,0.34,0.21,0.2,0.28,0.21
f1-score,0.39,0.35,0.21,0.29,0.35
accuracy,0.34,0.21,0.2,0.28,0.21
