In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from nltk.corpus import stopwords
import re
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report,r2_score,confusion_matrix


In [2]:
df=pd.read_csv('train.txt',sep=';',header=None,names=['text','Emotions'])
df.head()

Unnamed: 0,text,Emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [3]:
df.isnull().sum()

text        0
Emotions    0
dtype: int64

In [4]:
df.duplicated().sum()

np.int64(1)

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df.head()

Unnamed: 0,text,Emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [7]:
df['Emotions']=df['Emotions'].apply(lambda x:x.lower().strip())

In [8]:
df['Emotions'].value_counts()

Emotions
joy         5361
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [9]:
df['Emotions']=df['Emotions'].replace({'joy':'love'})

In [10]:
df['Emotions'].value_counts()

Emotions
love        6665
sadness     4666
anger       2159
fear        1937
surprise     572
Name: count, dtype: int64

In [11]:
encoder=LabelEncoder()
df['Emotions']=encoder.fit_transform(df['Emotions'])

In [12]:
df['text']=df['text'].apply(lambda x:x.lower())

In [13]:
df['text']=df['text'].apply(lambda x:re.sub(r'[^a-z\s]', '', x))

In [14]:
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [15]:
def clean_text(text):
    words = text.split()
    clean_words = []
    for word in words:
        if word not in stop_words:
            clean_words.append(word)
    return ' '.join(clean_words)


In [16]:
df['text']=df['text'].apply(clean_text)

In [17]:
df['text'][1]

'go feeling hopeless damned hopeful around someone cares awake'

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df['text'],df['Emotions'], test_size=0.2, random_state=42)

In [19]:
X_train.shape

(12799,)

In [20]:
X_test.shape

(3200,)

In [21]:
vectorizer = CountVectorizer()
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)


In [22]:
L_model=LogisticRegression()
L_model.fit(X_train_cv,y_train)
prediction=L_model.predict(X_test_cv)
accuracy = accuracy_score(y_test, prediction)
print(f'Accuracy: {accuracy * 100:.2f}%')
print("Classification Report:")
print(classification_report(y_test, prediction))


Accuracy: 92.50%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89       439
           1       0.87      0.85      0.86       375
           2       0.95      0.97      0.96      1330
           3       0.93      0.94      0.93       950
           4       0.88      0.71      0.79       106

    accuracy                           0.93      3200
   macro avg       0.91      0.87      0.88      3200
weighted avg       0.92      0.93      0.92      3200



In [25]:

import joblib

joblib.dump(L_model, "emotion_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(encoder,'Encoder.pkl')


['Encoder.pkl']