## **Import Needed Libraries**

In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Download required NLTK resources.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on NLTK >= 3.8.2
# (it replaced the pickled 'punkt' models); 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## **Data Representation**

In [3]:
# Dataset location on the Kaggle input mount; change DATA_PATH to run the notebook elsewhere.
DATA_PATH = "/kaggle/input/emotion-dataset/Emotion_classify_Data.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Peek at the first rows to confirm the file parsed into Comment / Emotion columns
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [5]:
# Check row count, column dtypes and non-null counts before preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5937 entries, 0 to 5936
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  5937 non-null   object
 1   Emotion  5937 non-null   object
dtypes: object(2)
memory usage: 92.9+ KB


In [6]:
# Class balance: how many comments carry each emotion label
df['Emotion'].value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

## **Preprocessing**

In [7]:
# Use this utility function to get the preprocessed text data.
#
# The stop-word set and the stemmer are created once and reused across calls:
# building them inside preprocess_text repeated that work for every row the
# function is applied to.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (stop_words, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    """Normalise a raw comment into a space-joined string of stemmed tokens.

    The text is lowercased and split with NLTK's word_tokenize. Tokens that are
    not purely alphanumeric, or that are English stop words, are dropped; the
    remaining tokens are Porter-stemmed.

    Args:
        text: The raw comment string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives the filter).
    """
    stop_words, stemmer = _preprocess_resources()

    tokens = word_tokenize(text.lower())  # Tokenization & Lowercasing
    return ' '.join(
        stemmer.stem(word)
        for word in tokens
        if word.isalnum() and word not in stop_words
    )

In [8]:
# Build the cleaned-text column by running every raw comment through preprocess_text
df['preprocessed_comment'] = [preprocess_text(comment) for comment in df['Comment']]

In [9]:
# Compare the raw comments with their cleaned, stemmed counterparts
df.head()

Unnamed: 0,Comment,Emotion,preprocessed_comment
0,i seriously hate one subject to death but now ...,fear,serious hate one subject death feel reluct drop
1,im so full of life i feel appalled,anger,im full life feel appal
2,i sit here to write i start to dig out my feel...,fear,sit write start dig feel think afraid accept p...
3,ive been really angry with r and i feel like a...,joy,ive realli angri r feel like idiot trust first...
4,i feel suspicious if there is no one outside l...,fear,feel suspici one outsid like raptur happen someth


In [10]:
# Learn the label vocabulary first, then map each emotion string to its integer code
encoder = LabelEncoder().fit(df['Emotion'])
df['emotion_num'] = encoder.transform(df['Emotion'])

In [11]:
# Verify the integer code assigned to each emotion label
df.head()

Unnamed: 0,Comment,Emotion,preprocessed_comment,emotion_num
0,i seriously hate one subject to death but now ...,fear,serious hate one subject death feel reluct drop,1
1,im so full of life i feel appalled,anger,im full life feel appal,0
2,i sit here to write i start to dig out my feel...,fear,sit write start dig feel think afraid accept p...,1
3,ive been really angry with r and i feel like a...,joy,ive realli angri r feel like idiot trust first...,2
4,i feel suspicious if there is no one outside l...,fear,feel suspici one outsid like raptur happen someth,1


In [12]:
# Features are the cleaned comments; the target is the integer-encoded emotion
X, y = df['preprocessed_comment'], df['emotion_num']

In [13]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## **Text Representation**

In [14]:
# Learn the TF-IDF vocabulary and weights from the training split only, then
# project the test split onto that same vocabulary.
v = TfidfVectorizer()
X_train_cv, X_test_cv = v.fit_transform(X_train), v.transform(X_test)

## **Data Modeling and Evaluation**

In [15]:
# Fix the seed so the forest is built identically on every run; 42 matches the
# seed already used for the train/test split. Without it the reported metrics
# change from run to run.
RFC_model = RandomForestClassifier(random_state=42)

RFC_model.fit(X_train_cv, y_train)

In [16]:
# Predict the encoded emotion label for every held-out comment and store it in y_pred
y_pred = RFC_model.predict(X_test_cv)

In [17]:
# Share of held-out comments whose predicted label equals the true label
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9284511784511784


In [18]:
# Print the classification report (per-class precision, recall and F1 on the held-out set)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       400
           1       0.93      0.93      0.93       388
           2       0.94      0.94      0.94       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [19]:
# Unseen example sentence used to sanity-check the trained pipeline end to end
test_text = "I'm looking good and feeling good other than this crappy cold i'm dealing with"

In [20]:
# Clean the sentence with the same preprocessing as the training data; it is
# wrapped in a list because it is passed to v.transform as a collection of documents.
test_text_processed = [preprocess_text(test_text)]
test_text_processed

['look good feel good crappi cold deal']

In [21]:
# Reuse the already-fitted vectorizer (transform only, no refit) on the sample
test_text_vc = v.transform(test_text_processed)

In [22]:
# NOTE(review): this rebinds `test_text` from the input sentence (a str) to the
# array of predicted label codes, so re-running the preprocessing cell after
# this one fails. The following cell reads `test_text[0]`, so renaming the
# result (e.g. to `test_pred`) has to be done in both cells together.
test_text = RFC_model.predict(test_text_vc)

In [23]:
# Take the label names from the fitted encoder instead of a hand-written list,
# so position i is always the emotion that was encoded as i.
classes = encoder.classes_.tolist()
classes[test_text[0]]

'joy'