In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning) 
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [77]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import re
import string 
import collections
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [79]:
# Load the training data from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='training.csv')

In [81]:
# Peek at the first five rows to confirm the file parsed as expected.
df.head(n=5)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [83]:
# Column dtypes and non-null counts; used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [85]:
# Summary statistics for the numeric columns (only `label` is numeric at this point).
df.describe()

Unnamed: 0,label
count,16000.0
mean,1.565937
std,1.50143
min,0.0
25%,0.0
50%,1.0
75%,3.0
max,5.0


In [87]:
# Number of rows in the dataset.
len(df)

16000

In [89]:
# Number of columns in the dataset.
len(df.columns)

2

In [91]:
# Count fully duplicated rows (same text and same label).
print(f"Duplicate entries in the dataset: {df.duplicated().sum()}")

Duplicate entries in the dataset: 1


In [93]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The index is not reset, so the original row labels are preserved.
df = df.drop_duplicates(keep='first')

In [95]:
# Re-check after de-duplication; the count should now be zero.
print(f"Duplicate entries in the dataset: {df.duplicated().sum()}")

Duplicate entries in the dataset: 0


In [97]:
# Integer class id -> emotion name; the position in the list is the class id.
labels_dict = dict(enumerate(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']))

# Add a human-readable label column next to the numeric one.
df['description'] = df['label'].map(labels_dict)
df.head()

Unnamed: 0,text,label,description
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [99]:
# NLTK ships its corpora separately from the package: on a fresh environment
# `stopwords.words` raises LookupError until the corpus has been downloaded.
# Fetch it on demand so the notebook survives Restart & Run All on a new machine.
try:
    stopwords_english = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopwords_english = set(stopwords.words('english'))

# Extra tokens to drop on top of NLTK's English list.
# NOTE(review): preprocess_review removes punctuation before splitting on
# whitespace, so the apostrophe-prefixed entries below can never equal a token;
# only "http" is effective. They are kept so the set's contents are unchanged.
my_stopwords = set(["http", "'s", "n't", "'m", "'re", "'ve"])
stopwords_english.update(my_stopwords)

def preprocess_review(text, stop_words=None):
    """Normalise a raw text string into a space-separated string of kept tokens.

    Steps, in order: lowercase the text, delete runs of digits, delete every
    character that is neither a word character nor whitespace, split on
    whitespace, then discard stopwords and single-character tokens.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, the module-level ``stopwords_english``
        set is used, which is the behaviour existing callers rely on.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if stop_words is None:
        stop_words = stopwords_english

    text = text.lower()  # lowercase

    text = re.sub(r'\d+', '', text)  # remove digits

    # Remove punctuation. `\w` also matches underscores, so those are kept.
    text = re.sub(r'[^\w\s]', '', text)

    tokens = text.split()  # whitespace tokenisation

    # Drop stopwords and single-character tokens.
    clean_tokens = [tok for tok in tokens if tok not in stop_words and len(tok) > 1]

    return ' '.join(clean_tokens)

In [101]:
# Show one example before and after cleaning (the row whose index label is 100).
sample = df.loc[100, 'text']

print(f'ORIGINAL REVIEW:   {sample}\n')

print(f'WITH PROCESSING:    {preprocess_review(sample)}')

ORIGINAL REVIEW:   i wont let me child cry it out because i feel that loving her and lily when she was little was going to be opportunities that only lasted for those short few months

WITH PROCESSING:    wont let child cry feel loving lily little going opportunities lasted short months


In [103]:
# Clean every text entry and store the result in a new column.
df["clean_text"] = df["text"].map(preprocess_review)

In [104]:
# Confirm the clean_text column was added alongside the original text.
df.head(n=5)

Unnamed: 0,text,label,description,clean_text
0,i didnt feel humiliated,0,sadness,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,0,sadness,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,3,anger,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,2,love,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,3,anger,feeling grouchy


In [107]:
# Same preview as the previous cell; the frame has not changed in between.
df.head(n=5)

Unnamed: 0,text,label,description,clean_text
0,i didnt feel humiliated,0,sadness,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,0,sadness,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,3,anger,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,2,love,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,3,anger,feeling grouchy


In [109]:
# The raw text is no longer needed once clean_text exists.
# Re-running this cell without re-running the earlier ones raises KeyError,
# because the column is already gone.
df = df.drop(labels='text', axis='columns')

In [111]:
# Features are the cleaned strings; targets are the emotion names rather than
# the integer ids.
X, y = df['clean_text'], df['description']

# Hold out 30% of the rows for evaluation, with a fixed seed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Candidate classifiers, each behind its own TF-IDF vectoriser so the
# vocabulary is learned only from the data passed to `fit`.
# The step names ('lr', 'svm', 'rf') are part of the interface: parameter
# grids address them as '<step>__<param>'.
models = {
    'Logistic Regression': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('lr', LogisticRegression()),
    ]),
    'SVM': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('svm', SVC(kernel='rbf', C=1.0)),
    ]),
    'Random Forest': Pipeline([
        ('tfidf', TfidfVectorizer()),
        # Seeded so the forest, and therefore its reported accuracy, is
        # reproducible; same seed as the train/test split.
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]),
}

# Fit every candidate on the training split and report its hold-out accuracy.
# The loop variable names are deliberately unchanged: they stay bound in the
# notebook namespace after the loop finishes.
for name, model in models.items():
    score = model.fit(X_train, y_train).score(X_test, y_test)
    print(f'{name} Accuracy: {score:.4f}')

# Tune the SVM pipeline: 3 values of C x 2 kernels, scored by 5-fold
# cross-validated accuracy on the training split only.
grid_params = dict(svm__C=[0.1, 1, 10], svm__kernel=['linear', 'rbf'])
grid_search = GridSearchCV(
    estimator=models['SVM'],
    param_grid=grid_params,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best SVM Parameters:", grid_search.best_params_)
print("Best SVM Score:", grid_search.best_score_)


Logistic Regression Accuracy: 0.8531
SVM Accuracy: 0.8419
Random Forest Accuracy: 0.8844
Best SVM Parameters: {'svm__C': 1, 'svm__kernel': 'linear'}
Best SVM Score: 0.8777576086263


In [116]:
# Persist an explicitly chosen pipeline. The previous version saved `model`,
# the variable left over from the training loop, so the saved estimator depended
# silently on the insertion order of `models`. 'Random Forest' is the entry
# that loop ended on, so the same pipeline is saved as before.
final_model = models['Random Forest']
joblib.dump(final_model, "emotion_model.pkl")

print("Model trained and saved successfully!")

Model trained and saved successfully!
