Project: 2 Text Emotion Classification

Problem Statement:
Classify text into emotions like happy, sad, angry, etc.

In [10]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [11]:
os.chdir(os.getcwd().replace('notebooks', 'data'))

df = pd.read_csv('data.csv')


In [12]:
df.sample(10)


Unnamed: 0.1,Unnamed: 0,text,label
401269,401269,i know and feeling what i felt im scared that ...,4
122875,122875,i feel it is perhaps the most valuable thing i...,1
157901,157901,i cant say i always knew what i wanted to do w...,2
97030,97030,i truly feel and think without being afraid th...,4
320724,320724,i feel distracted ineffective out of shape dir...,3
235030,235030,i feel like there are so many amazing opportun...,1
66128,66128,i make the choices in my relationships to act ...,4
126173,126173,i started feeling anxious wondering where i co...,4
102517,102517,i actually want to cry of the time i am at sch...,0
188554,188554,i feel his laugh when he giggles when he is de...,1


In [13]:
df.columns = ['unnamed', 'text', 'emotion']


In [14]:
df.sample(10)


Unnamed: 0,unnamed,text,emotion
203418,203418,i think each and every single human being in t...,5
182377,182377,i just think its better to cry than just walki...,0
53529,53529,i soon found myself flooded by the familiar fe...,4
65729,65729,i really feel passionate about and it scares m...,1
403709,403709,i remember feeling helpless and sad said dugger,0
208430,208430,i leave this job i will laze around whole day ...,1
38200,38200,i woke up feeling gloomy since i had a bad dream,0
405476,405476,i just feel like nobody is supporting me,2
237397,237397,i feel like my heart is aching,0
42605,42605,i did not feel safe because this man said he k...,1


In [15]:
df.drop(columns=['unnamed'], inplace=True) 


In [16]:
df.sample(10)


Unnamed: 0,text,emotion
161726,i can think of only one way to truly describe ...,3
93807,i kindly asked if she could waive it this one ...,1
39226,i havent had a job and the feeling is really r...,5
310143,i do feel im being tortured not necessarily by...,3
15728,ill need to watch a few youtube tutorials befo...,1
29079,i haven t gotten very far into this book but i...,1
281828,i think i was crying because i missed feeling ...,2
2587,i read about a politician who wants to wield u...,3
372536,i still feel kind of dazed i need to wake up a...,5
42856,i consider a gift to inhabit and others those ...,0


In [17]:
df.isna().sum()


text       0
emotion    0
dtype: int64

In [18]:
df.duplicated().sum()


686

In [19]:
df.dropna(inplace=True)


In [20]:
df.drop_duplicates(inplace=True)


In [21]:
df.shape


(416123, 2)

In [22]:
X, y = df['text'], df['emotion']


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:

import re
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def convert_text(text_series):
    return text_series.apply(lambda text: ' '.join(
        stemmer.stem(word) for word in re.findall(r'\b\w+\b', text.lower())
    ))


In [25]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import SGDClassifier


In [None]:
model = Pipeline([
    ('preprocess', FunctionTransformer(convert_text, validate=False)),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=8000, ngram_range=(1, 2))),
    ('clf', BernoulliNB())
])


In [46]:
model.fit(X_train, y_train)


In [47]:
model.score(X_test, y_test)


0.8819104836287174

In [29]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(),
    'Support Vector Machine' : SVC(),
    'Decision Tree' : DecisionTreeClassifier(),
    'ExtraTreeClassifier' : ExtraTreeClassifier(),
    'SVC' : SVC(),
    'Multinomial Naive Bayes' : MultinomialNB(),
    'Bernoulli Naive Bayes' : BernoulliNB(),
    'AdaBoost' : AdaBoostClassifier(),
    'Gradient Boosting' : GradientBoostingClassifier(),
    
}


for name, current_model in models.items():
    print(f"Checking accuracy for {name}")
    model = Pipeline([
        ('preprocess', FunctionTransformer(convert_text, validate=False)),
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=8000, ngram_range=(1, 2))),
        ('clf', current_model)
    ])

    model.fit(X_train, y_train)
    print(f"Accuracy for {name} is {model.score(X_test, y_test)}")
    print("*"*50)
    print("\n")



Checking accuracy for Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Logistic Regression is 0.8961730249324121
**************************************************


Checking accuracy for Random Forest


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('preprocess', FunctionTransformer(convert_text, validate=False)),
    ('tfidf', TfidfVectorizer()),
    ('clf', BernoulliNB())
])

# Define parameter grid
param_grid = {
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__min_df': [1, 3, 5],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_features': [5000, 8000, 10000],
    'clf__alpha': [0.1, 0.5, 1.0],  # Laplace smoothing
    'clf__binarize': [0.0, 0.5, 1.0],
    'clf__fit_prior': [True, False]
}

# Run GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

# Best score and params
print("Best accuracy:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)


Fitting 5 folds for each of 648 candidates, totalling 3240 fits
