In [1]:
import pandas as pd

In [2]:
# Read the labelled comments from a CSV in the working directory and preview
# the first rows.
df = pd.read_csv("Emotion_classify_Data.csv")
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(5937, 2)

In [4]:
# Class balance check: how many comments carry each emotion label.
df['Emotion'].value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [5]:
# Encode each emotion label as an integer class id for the classifiers.
df['Emotion_num'] = df['Emotion'].map({
    'joy': 0,
    'fear': 1,
    'anger': 2,
})

# Series.map yields NaN for any label that is missing from the dict above.
# Fail loudly here instead of letting NaN targets reach the split/model cells.
if df['Emotion_num'].isna().any():
    raise ValueError(
        "Emotion labels without a numeric code: "
        f"{sorted(df.loc[df['Emotion_num'].isna(), 'Emotion'].astype(str).unique())}"
    )

df.head()

Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,2
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,0
4,i feel suspicious if there is no one outside l...,fear,1


# Modeling without Preprocessing

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw comments for evaluation. Stratifying on the numeric
# label keeps the class proportions the same in both partitions, and the fixed
# random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Comment'],
    df['Emotion_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df['Emotion_num'],
)

In [7]:
# Number of comments in the training partition.
X_train.shape

(4749,)

In [8]:
# Number of comments in the held-out test partition.
X_test.shape

(1188,)

## 1.) Using Random Forest (trigram)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Trigram-only bag-of-words counts feeding a random forest.
# random_state is pinned because an unseeded forest is rebuilt differently on
# every run, so the printed report could not be reproduced after a kernel
# restart. The seed matches the one used for the train/test split.
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3,3))),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# classification_report returns a pre-formatted multi-line string, so print it.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.26      0.35       400
           1       0.36      0.79      0.50       388
           2       0.54      0.22      0.31       400

    accuracy                           0.42      1188
   macro avg       0.49      0.42      0.39      1188
weighted avg       0.49      0.42      0.39      1188



## 2.) Using MultinomialNB (unigram and bigram)

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [15]:
# Unigram + bigram bag-of-words counts feeding a multinomial Naive Bayes model.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('nb', MultinomialNB()),
])

# fit() returns the fitted pipeline itself, so training and prediction chain.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87       400
           1       0.87      0.83      0.85       388
           2       0.83      0.88      0.85       400

    accuracy                           0.86      1188
   macro avg       0.86      0.86      0.86      1188
weighted avg       0.86      0.86      0.86      1188



## 3.) Using Random Forest (unigram and bigram)

In [16]:
# Unigram + bigram bag-of-words counts feeding a random forest.
# random_state is pinned because an unseeded forest is rebuilt differently on
# every run, which makes the printed report impossible to reproduce.
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.97      0.90       400
           1       0.95      0.88      0.92       388
           2       0.94      0.85      0.89       400

    accuracy                           0.90      1188
   macro avg       0.91      0.90      0.90      1188
weighted avg       0.91      0.90      0.90      1188



## 4.) Using Random Forest (Tf-Idf)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Tf-idf weighted features (TfidfVectorizer defaults) feeding a random forest.
# random_state is pinned because an unseeded forest is rebuilt differently on
# every run, which makes the printed report impossible to reproduce.
clf = Pipeline([
    ('tf_vectorizer', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.91       400
           1       0.90      0.92      0.91       388
           2       0.93      0.86      0.90       400

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



# Use text pre-processing to remove stop words and punctuation, and apply lemmatization

In [19]:
import spacy

In [22]:
# Load the `en_core_web_sm` spaCy pipeline once at module level; preprocess()
# reads this shared `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` reduced to the lemmas of its content tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped; the lemma of every
    remaining token is kept, and the lemmas are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [23]:
# Clean every comment element-wise (stop words and punctuation removed, tokens
# lemmatized) and keep the result next to the raw text.
df['preprocessed_comment'] = df['Comment'].map(preprocess)
df.head()

Unnamed: 0,Comment,Emotion,Emotion_num,preprocessed_comment
0,i seriously hate one subject to death but now ...,fear,1,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,2,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,1,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,0,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,1,feel suspicious outside like rapture happen


# Modeling after preprocessing

In [24]:
# Re-split using the preprocessed text as features. The test_size, random_state
# and stratify arguments are the same as in the split of the raw comments.
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_comment'],
    df['Emotion_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df['Emotion_num'],
)

## 1.) Using Random Forest (unigram and bigram)

In [25]:
# Unigram + bigram bag-of-words counts over the preprocessed text, feeding a
# random forest.
# random_state is pinned because an unseeded forest is rebuilt differently on
# every run, which makes the printed report impossible to reproduce.
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95       400
           1       0.94      0.90      0.92       388
           2       0.92      0.94      0.93       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



## 2.) Using Random Forest (Tf-Idf)

In [26]:
# Tf-idf weighted features over the preprocessed text, feeding a random forest.
# random_state is pinned because an unseeded forest is rebuilt differently on
# every run, which makes the printed report impossible to reproduce.
clf = Pipeline([
    ('tf_vectorizer', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.94       400
           1       0.93      0.91      0.92       388
           2       0.95      0.91      0.93       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188

