# Emotion Detection Model

Importing dependency

In [55]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

from string import punctuation 
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression

Loading Dataset

In [56]:
#we use pandas to load our dataset
df=pd.read_csv('emotion-dataset.csv')

In [57]:
#check the structure of dataset
df.head()

Unnamed: 0,Emotion,Text
0,neutral,Why ?
1,joy,Sage Act upgrade on my to do list for tommorow.
2,sadness,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...
3,joy,Such an eye ! The true hazel eye-and so brill...
4,joy,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...


In [58]:
#check the shape of the dataset
df.shape

(34792, 2)

In [59]:
# check if the dataset has any missing values
df.isnull().sum()

Emotion    0
Text       0
dtype: int64

In [60]:
#To check the value_count for each emotion - it will give each emotion is containing this much amt of text 
df['Emotion'].value_counts()

joy         11045
sadness      6722
fear         5410
anger        4297
surprise     4062
neutral      2254
disgust       856
shame         146
Name: Emotion, dtype: int64

Pre-Processing the Dataset

To clean the text messages by removing stopwords, numbers, and punctuation. Then convert each word into its base form by using the lemmatization process in the NLTK package.

In [61]:
#data cleaning
stop_words =  stopwords.words('english')
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [62]:
#clean the text
df["cleaned_text"] = df["Text"].apply(text_cleaning)

In [63]:
#to get clean dataset
df

Unnamed: 0,Emotion,Text,cleaned_text
0,neutral,Why ?,Why
1,joy,Sage Act upgrade on my to do list for tommorow.,Sage Act upgrade list tommorow
2,sadness,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,ON THE WAY TO MY HOMEGIRL BABY FUNERAL MAN I H...
3,joy,Such an eye ! The true hazel eye-and so brill...,Such eye The true hazel eye brilliant Regular ...
4,joy,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,Iluvmiasantos ugh babe hugggzzz u babe naamaze...
...,...,...,...
34787,surprise,@MichelGW have you gift! Hope you like it! It'...,MichelGW gift Hope like It hand made wear It k...
34788,joy,The world didnt give it to me..so the world MO...,The world didnt give world MOST DEFINITELY cnt...
34789,anger,A man robbed me today .,A man robbed today
34790,fear,"Youu call it JEALOUSY, I call it of #Losing YO...",Youu call JEALOUSY I call Losing YOU


Model features and labels

In [64]:
#X-features,y-labels
X = df['cleaned_text']
y = df['Emotion']

Dataset splitting

In [65]:
#split Data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

Pipeline approach

In [66]:
# Create a model in pipeline(LogisticRegression Pipeline)
pipe_lr = Pipeline(steps=[
                               ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('lr',LogisticRegression())
                                 ])

In [67]:
#train the model
pipe_lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('pre_processing', TfidfVectorizer(lowercase=False)),
                ('lr', LogisticRegression())])

In [68]:
pipe_lr

Pipeline(steps=[('pre_processing', TfidfVectorizer(lowercase=False)),
                ('lr', LogisticRegression())])

In [69]:
#create a prediction from the test set
y_test = pipe_lr.predict(X_test)

In [70]:
# Check Accuracy
pipe_lr.score(X_test,y_test)

1.0

In [71]:
# Make a Prediction
sample_text = "@llumiasantos ugh babe.. hugggzzz for u.! babe naamazed nga ako e ababe e,despite nega's mas pinaramdam at fil lo ang"
pipe_lr.predict([sample_text])

array(['sadness'], dtype=object)

In [72]:
import os

print(os.getcwd())


C:\Users\sneka


save model pipeline

In [73]:
import pickle
pickle_out = open("EmotionDetection/pipe_lr.pkl","wb")
pickle.dump(pipe_lr,pickle_out)
pickle_out.close()