# 1 - Importing libraries and loading data

In [28]:
# Data manipulation libraries
import pandas as pd
import numpy as np
import json
from pprint import pprint

# Text processing libraries
import emoji
import re
import contractions
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

## Feature Extraction Libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

# Classifier Model libraries
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
# from sklearn.pipeline import Pipeline

# Performance Matrix libraries
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# other
import warnings
warnings.filterwarnings("ignore")

In [29]:
# Importing train, validation and test datasets with preprocessed texts and labels
train_GE = pd.read_csv("./train_clean.csv")
val_GE = pd.read_csv("./val_clean.csv")
test_GE = pd.read_csv("./test_clean.csv")

# Shape validation
print(train_GE.shape)
print(val_GE.shape)
print(test_GE.shape)

(43410, 29)
(5426, 29)
(5427, 29)


In [30]:
# Loading emotion labels
with open("./emotions.txt", "r") as file :
    emotions = file.read().split("\n")

for emo in emotions :
    print(emo)

admiration
amusement
anger
annoyance
approval
caring
confusion
curiosity
desire
disappointment
disapproval
disgust
embarrassment
excitement
fear
gratitude
grief
joy
love
nervousness
optimism
pride
realization
relief
remorse
sadness
surprise
neutral


# 2 - Preprocessing and transformations

## 2.1 - Additional preprocessing

In [31]:
# Download model 
!python -m spacy download en_core_web_sm -q

[38;5;2mâœ” Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [32]:
# Import English using en_core_web_sm.load()
import en_core_web_sm
nlp = en_core_web_sm.load()

In [33]:
# Creating tokenized documents
tokenized_train_GE = list(nlp.pipe(train_GE["Clean_text"]))
tokenized_test_GE = list(nlp.pipe(test_GE["Clean_text"]))

In [34]:
from spacy.lang.en.stop_words import STOP_WORDS

tokenized_train_GE = [
    [token.lemma_ for token in doc if token.lemma_ not in STOP_WORDS]
    for doc in tokenized_train_GE
]

tokenized_test_GE = [
    [token.lemma_ for token in doc if token.lemma_ not in STOP_WORDS]
    for doc in tokenized_test_GE
]

train_GE["Clean_token"] = [" ".join(tokens) for tokens in tokenized_train_GE]
test_GE["Clean_token"] = [" ".join(tokens) for tokens in tokenized_test_GE]


## 2.2 - Create TF-IDF matrix

In [35]:
# TF-IDF vector with 1000 words vocabulary 
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)

# Fitting the vectorizer and transforming train and test data
tfidf_train_GE = vectorizer.fit_transform(train_GE['Clean_token'])
tfidf_test_GE = vectorizer.transform(test_GE['Clean_token'])

# Transforming from generators to arrays
tfidf_train_GE = tfidf_train_GE.toarray()
tfidf_test_GE = tfidf_test_GE.toarray()

# Validating the shape of train and test data
print(tfidf_train_GE.shape)
print(tfidf_test_GE.shape)

(43410, 1000)
(5427, 1000)


## 2.3 - Train and test variables

In [36]:
# Defining train and test variables
x_train =  tfidf_train_GE
y_train = train_GE.loc[:,emotions].values

x_test =  tfidf_test_GE
y_test = test_GE.loc[:,emotions].values

# Shape validation
print("The shape of X_train is : ", x_train.shape)
print("The shape of y_train is : ", y_train.shape)
print()
print("The shape of X_test is : ", x_test.shape)
print("The shape of y_test is : ", y_test.shape)

The shape of X_train is :  (43410, 1000)
The shape of y_train is :  (43410, 28)

The shape of X_test is :  (5427, 1000)
The shape of y_test is :  (5427, 28)


# 3 - Basic Models

In [37]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [38]:
models = {
    "Logistic Regression": OneVsRestClassifier(
        LogisticRegression(
            max_iter=1000,
            class_weight="balanced"
        )
    ),
    "Linear SVM": OneVsRestClassifier(
        LinearSVC()
    ),
    "SGD Classifier": OneVsRestClassifier(
        SGDClassifier(
            loss="log_loss",
            max_iter=1000
        )
    )
}


In [39]:
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="micro", zero_division=0),
        "Recall": recall_score(y_test, y_pred, average="micro", zero_division=0),
        "F1-score": f1_score(y_test, y_pred, average="micro", zero_division=0)
    }



Training Logistic Regression...

Training Linear SVM...

Training SGD Classifier...


In [40]:
for model_name, metrics in results.items():
    print(f"\n===== {model_name} =====")
    for metric, value in metrics.items():
        print(f"{metric:<10}: {value:.4f}")



===== Logistic Regression =====
Accuracy  : 0.0374
Precision : 0.1931
Recall    : 0.7306
F1-score  : 0.3054

===== Linear SVM =====
Accuracy  : 0.3228
Precision : 0.6811
Recall    : 0.3395
F1-score  : 0.4532

===== SGD Classifier =====
Accuracy  : 0.2432
Precision : 0.6963
Recall    : 0.2485
F1-score  : 0.3663


In [41]:
# Retrieving initial text preprocessings
def preprocess_corpus(x):
    
    # Adding a space between words and punctation
    x = re.sub( r'([a-zA-Z\[\]])([,;.!?])', r'\1 \2', x)
    x = re.sub( r'([,;.!?])([a-zA-Z\[\]])', r'\1 \2', x)

    # Demojize
    x = emoji.demojize(x)

    # Expand contraction
    x = contractions.fix(x)

    # Lower
    x = x.lower()

    #correct some acronyms/typos/abbreviations  
    x = re.sub(r"lmao", "laughing my ass off", x)  
    x = re.sub(r"amirite", "am i right", x)
    x = re.sub(r"\b(tho)\b", "though", x)
    x = re.sub(r"\b(ikr)\b", "i know right", x)
    x = re.sub(r"\b(ya|u)\b", "you", x)
    x = re.sub(r"\b(eu)\b", "europe", x)
    x = re.sub(r"\b(da)\b", "the", x)
    x = re.sub(r"\b(dat)\b", "that", x)
    x = re.sub(r"\b(dats)\b", "that is", x)
    x = re.sub(r"\b(cuz)\b", "because", x)
    x = re.sub(r"\b(fkn)\b", "fucking", x)
    x = re.sub(r"\b(tbh)\b", "to be honest", x)
    x = re.sub(r"\b(tbf)\b", "to be fair", x)
    x = re.sub(r"faux pas", "mistake", x)
    x = re.sub(r"\b(btw)\b", "by the way", x)
    x = re.sub(r"\b(bs)\b", "bullshit", x)
    x = re.sub(r"\b(kinda)\b", "kind of", x)
    x = re.sub(r"\b(bruh)\b", "bro", x)
    x = re.sub(r"\b(w/e)\b", "whatever", x)
    x = re.sub(r"\b(w/)\b", "with", x)
    x = re.sub(r"\b(w/o)\b", "without", x)
    x = re.sub(r"\b(doj)\b", "department of justice", x)

    # replace some words with multiple occurences of a letter, example "coooool" turns into --> cool
    x = re.sub(r"\b(j+e{2,}z+e*)\b", "jeez", x)
    x = re.sub(r"\b(co+l+)\b", "cool", x)
    x = re.sub(r"\b(g+o+a+l+)\b", "goal", x)
    x = re.sub(r"\b(s+h+i+t+)\b", "shit", x)
    x = re.sub(r"\b(o+m+g+)\b", "omg", x)
    x = re.sub(r"\b(w+t+f+)\b", "wtf", x)
    x = re.sub(r"\b(w+h+a+t+)\b", "what", x)
    x = re.sub(r"\b(y+e+y+|y+a+y+|y+e+a+h+)\b", "yeah", x)
    x = re.sub(r"\b(w+o+w+)\b", "wow", x)
    x = re.sub(r"\b(w+h+y+)\b", "why", x)
    x = re.sub(r"\b(s+o+)\b", "so", x)
    x = re.sub(r"\b(f)\b", "fuck", x)
    x = re.sub(r"\b(w+h+o+p+s+)\b", "whoops", x)
    x = re.sub(r"\b(ofc)\b", "of course", x)
    x = re.sub(r"\b(the us)\b", "usa", x)
    x = re.sub(r"\b(gf)\b", "girlfriend", x)
    x = re.sub(r"\b(hr)\b", "human ressources", x)
    x = re.sub(r"\b(mh)\b", "mental health", x)
    x = re.sub(r"\b(idk)\b", "i do not know", x)
    x = re.sub(r"\b(gotcha)\b", "i got you", x)
    x = re.sub(r"\b(y+e+p+)\b", "yes", x)
    x = re.sub(r"\b(a*ha+h[ha]*|a*ha +h[ha]*)\b", "haha", x)
    x = re.sub(r"\b(o?l+o+l+[ol]*)\b", "lol", x)
    x = re.sub(r"\b(o*ho+h[ho]*|o*ho +h[ho]*)\b", "ohoh", x)
    x = re.sub(r"\b(o+h+)\b", "oh", x)
    x = re.sub(r"\b(a+h+)\b", "ah", x)
    x = re.sub(r"\b(u+h+)\b", "uh", x)

    # Handling emojis
    x = re.sub(r"<3", " love ", x)
    x = re.sub(r"xd", " smiling_face_with_open_mouth_and_tightly_closed_eyes ", x)
    x = re.sub(r":\)", " smiling_face ", x)
    x = re.sub(r"^_^", " smiling_face ", x)
    x = re.sub(r"\*_\*", " star_struck ", x)
    x = re.sub(r":\(", " frowning_face ", x)
    x = re.sub(r":\^\(", " frowning_face ", x)
    x = re.sub(r";\(", " frowning_face ", x)
    x = re.sub(r":\/",  " confused_face", x)
    x = re.sub(r";\)",  " wink", x)
    x = re.sub(r">__<",  " unamused ", x)
    x = re.sub(r"\b([xo]+x*)\b", " xoxo ", x)
    x = re.sub(r"\b(n+a+h+)\b", "no", x)
    
    # Handling special cases of text
    x = re.sub(r"h a m b e r d e r s", "hamberders", x)
    x = re.sub(r"b e n", "ben", x)
    x = re.sub(r"s a t i r e", "satire", x)
    x = re.sub(r"y i k e s", "yikes", x)
    x = re.sub(r"s p o i l e r", "spoiler", x)
    x = re.sub(r"thankyou", "thank you", x)
    x = re.sub(r"a^r^o^o^o^o^o^o^o^n^d", "around", x)

    # Remove special characters and numbers replace by space + remove double space
    x = re.sub(r"\b([.]{3,})"," dots ", x)
    x = re.sub(r"[^A-Za-z!?_]+"," ", x)
    x = re.sub(r"\b([s])\b *","", x)
    x = re.sub(r" +"," ", x)
    x = x.strip()

    return x     

In [42]:
def predict_samples(text_samples, model):
    # Ensure input is a list
    if isinstance(text_samples, str):
        text_samples = [text_samples]

    # Text preprocessing
    text_samples = pd.Series(text_samples)
    text_samples_clean = text_samples.apply(preprocess_corpus)

    # TF-IDF transformation
    tfidf_text_samples = vectorizer.transform(text_samples_clean)

    # Predictions
    samples_pred_labels = model.predict(tfidf_text_samples)

    samples_pred_labels_df = pd.DataFrame(samples_pred_labels)

    samples_pred_emotions = samples_pred_labels_df.apply(
        lambda x: [emotions[i] for i in range(len(x)) if x[i] == 1],
        axis=1
    )

    # Final result
    return pd.DataFrame({
        "Text": text_samples,
        "Emotions": samples_pred_emotions
    })


In [43]:
for name, model in models.items() :
    samples = [
        "no one cares my guy",
        "I am so happy today",
        "I feel empty and tired"
    ]
    
    result = predict_samples(samples, model)
    
    print(result)
    print()


                     Text                                           Emotions
0     no one cares my guy                               [curiosity, neutral]
1     I am so happy today                           [excitement, joy, pride]
2  I feel empty and tired  [annoyance, caring, disappointment, nervousnes...

                     Text   Emotions
0     no one cares my guy  [neutral]
1     I am so happy today      [joy]
2  I feel empty and tired         []

                     Text   Emotions
0     no one cares my guy  [neutral]
1     I am so happy today      [joy]
2  I feel empty and tired         []

