In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import spacy

In [2]:
df = pd.read_csv("/kaggle/input/emotions-in-text/Emotion_final.csv")

# Print the shape of dataframe
print(df.shape)

# Print top 5 rows
df.head(5)

(21459, 2)


Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21459 entries, 0 to 21458
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     21459 non-null  object
 1   Emotion  21459 non-null  object
dtypes: object(2)
memory usage: 335.4+ KB


In [4]:
df['Emotion'].value_counts()

Emotion
happy       7029
sadness     6265
anger       2993
fear        2652
love        1641
surprise     879
Name: count, dtype: int64

In [5]:
print(f"{df['Text'][2912]} -> {df['Emotion'][2912]}")

i can t do anything but feel the feelings because the issue has to get resolved to dissipate the emotion but i am powerless to make any resolution because it s not my issue -> happy


In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
txt = df['Text'][3]
txt

'i am ever feeling nostalgic about the fireplace i will know that it is still on the property'

In [8]:
doc = nlp(txt)

In [9]:
for token in doc:
    print(token)

i
am
ever
feeling
nostalgic
about
the
fireplace
i
will
know
that
it
is
still
on
the
property


In [10]:
for token in doc:
    print(f"Word: {token} | -> {token.lemma_}")

Word: i | -> I
Word: am | -> be
Word: ever | -> ever
Word: feeling | -> feel
Word: nostalgic | -> nostalgic
Word: about | -> about
Word: the | -> the
Word: fireplace | -> fireplace
Word: i | -> I
Word: will | -> will
Word: know | -> know
Word: that | -> that
Word: it | -> it
Word: is | -> be
Word: still | -> still
Word: on | -> on
Word: the | -> the
Word: property | -> property


In [11]:
for token in doc:
    if token.is_stop or token.is_punct:
        print(token)

i
am
ever
about
the
i
will
that
it
is
still
on
the


In [12]:
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

In [13]:
print(txt)
procces_txt = preprocess(txt)
print(procces_txt)

i am ever feeling nostalgic about the fireplace i will know that it is still on the property
feel nostalgic fireplace know property


In [14]:
df['preprocessed_text'] = df['Text'].apply(preprocess)

In [15]:
df

Unnamed: 0,Text,Emotion,preprocessed_text
0,i didnt feel humiliated,sadness,not feel humiliate
1,i can go from feeling so hopeless to so damned...,sadness,feel hopeless damned hopeful care awake
2,im grabbing a minute to post i feel greedy wrong,anger,m grab minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,feel nostalgic fireplace know property
4,i am feeling grouchy,anger,feel grouchy
...,...,...,...
21454,Melissa stared at her friend in dism,fear,Melissa stare friend dism
21455,Successive state elections have seen the gover...,fear,successive state election see govern party pum...
21456,Vincent was irritated but not dismay,fear,Vincent irritated dismay
21457,Kendall-Hume turned back to face the dismayed ...,fear,Kendall Hume turn face dismayed coup


In [16]:
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()
df['Emotion_label']=label.fit_transform(df['Emotion'])
df.head(10)

Unnamed: 0,Text,Emotion,preprocessed_text,Emotion_label
0,i didnt feel humiliated,sadness,not feel humiliate,4
1,i can go from feeling so hopeless to so damned...,sadness,feel hopeless damned hopeful care awake,4
2,im grabbing a minute to post i feel greedy wrong,anger,m grab minute post feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplac...,love,feel nostalgic fireplace know property,3
4,i am feeling grouchy,anger,feel grouchy,0
5,ive been feeling a little burdened lately wasn...,sadness,ve feel little burden lately not sure,4
6,ive been taking or milligrams or times recomme...,surprise,ve take milligram time recommend ve fall aslee...,5
7,i feel as confused about life as a teenager or...,fear,feel confused life teenager jade year old man,1
8,i have been with petronas for years i feel tha...,happy,petrona year feel petrona perform huge profit,2
9,i feel romantic too,love,feel romantic,3


In [17]:
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text'], df['Emotion_label'],
                                                    test_size=0.25, random_state=42, stratify=df['Emotion_label'])

In [18]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (16094,)
Shape of X_test:  (5365,)


In [19]:
v = TfidfVectorizer()

X_train_cv = v.fit_transform(X_train)
X_test_cv = v.transform(X_test)

print(v.vocabulary_)



In [20]:
RFC_model = RandomForestClassifier()

RFC_model.fit(X_train_cv, y_train)

In [21]:
y_pred = RFC_model.predict(X_test_cv)

In [22]:
print(accuracy_score(y_test, y_pred))

0.8570363466915191


In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84       748
           1       0.84      0.86      0.85       663
           2       0.85      0.92      0.88      1758
           3       0.80      0.66      0.72       410
           4       0.91      0.87      0.89      1566
           5       0.78      0.70      0.74       220

    accuracy                           0.86      5365
   macro avg       0.84      0.81      0.82      5365
weighted avg       0.86      0.86      0.86      5365



In [24]:
from sklearn.svm import SVC
model_svm = SVC(kernel = 'linear', random_state = 0)
model_svm.fit(X_train_cv, y_train)

In [25]:
y_pred=model_svm.predict(X_test_cv)

In [26]:
print(accuracy_score(y_test, y_pred))

0.8609506057781919


In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85       748
           1       0.82      0.83      0.83       663
           2       0.87      0.91      0.89      1758
           3       0.79      0.68      0.73       410
           4       0.90      0.89      0.89      1566
           5       0.81      0.70      0.75       220

    accuracy                           0.86      5365
   macro avg       0.84      0.81      0.82      5365
weighted avg       0.86      0.86      0.86      5365

