In [1]:
import pandas as pd
from io import StringIO
import string
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

2023-12-30 15:47:54.069135: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
file_path = '/Users/joshuaodugbemi/Downloads/archive/train.txt'
with open(file_path, 'r') as file:
    text_data = file.read()

In [3]:
df = pd.read_csv(StringIO(text_data), delimiter=';', header=None, names=['text', 'emotion'])

In [4]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     16000 non-null  object
 1   emotion  16000 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


In [6]:
df['emotion'].value_counts()

emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [7]:
df2=pd.read_csv('/Users/joshuaodugbemi/Downloads/Emotion_classify_Data.csv')

In [8]:
df2.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [9]:
df2.columns=['text','emotion']

In [10]:
df2.head()

Unnamed: 0,text,emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [11]:
df2['emotion'].value_counts()

emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [12]:
df3=pd.read_csv('/Users/joshuaodugbemi/Downloads/Emotion_final.csv')

In [13]:
df3.columns=['text','emotion']

In [14]:
df3.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [15]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21459 entries, 0 to 21458
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     21459 non-null  object
 1   emotion  21459 non-null  object
dtypes: object(2)
memory usage: 335.4+ KB


In [16]:
df3['emotion'].value_counts()

emotion
happy       7029
sadness     6265
anger       2993
fear        2652
love        1641
surprise     879
Name: count, dtype: int64

In [17]:
df = pd.concat([df, df2], ignore_index=True)

In [18]:
result_df = pd.concat([df, df3], ignore_index=True)

In [19]:
result_df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [20]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43396 entries, 0 to 43395
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     43396 non-null  object
 1   emotion  43396 non-null  object
dtypes: object(2)
memory usage: 678.2+ KB


In [21]:
result_df['emotion'].value_counts()

emotion
sadness     10931
joy          7362
anger        7152
happy        7029
fear         6526
love         2945
surprise     1451
Name: count, dtype: int64

In [22]:
emotion_to_mood = {
    'sadness': 'Sad',
    'joy': 'Happy',
    'anger': 'Angry',
    'fear': 'Fear',
    'happy': 'Happy',
    'love': 'Happy',
    'surprise': 'Excited',
}


In [23]:
result_df['mood'] = result_df['emotion'].map(emotion_to_mood)

In [24]:
result_df.head()

Unnamed: 0,text,emotion,mood
0,i didnt feel humiliated,sadness,Sad
1,i can go from feeling so hopeless to so damned...,sadness,Sad
2,im grabbing a minute to post i feel greedy wrong,anger,Angry
3,i am ever feeling nostalgic about the fireplac...,love,Happy
4,i am feeling grouchy,anger,Angry


In [25]:
result_df['mood'].value_counts()

mood
Happy      17336
Sad        10931
Angry       7152
Fear        6526
Excited     1451
Name: count, dtype: int64

In [26]:
df=result_df.drop('emotion',axis=1)

In [27]:
df.head()

Unnamed: 0,text,mood
0,i didnt feel humiliated,Sad
1,i can go from feeling so hopeless to so damned...,Sad
2,im grabbing a minute to post i feel greedy wrong,Angry
3,i am ever feeling nostalgic about the fireplac...,Happy
4,i am feeling grouchy,Angry


In [28]:
nlp = spacy.load('en_core_web_sm')
stopwords=list(STOP_WORDS)
punc=string.punctuation

In [29]:
def text_cleaner(sentence):
    doc=nlp(sentence)
    
    tokens=[]
    for token in doc:
        if token.lemma_!="-PRON-":
            temp=token.lemma_.lower().strip()
        else:
            temp=token.lower_
        tokens.append(temp)
        
    cleaned_tokens=[]
    for token in tokens:
        if token not in stopwords and token not in punc:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [30]:
x=df['text']
y=df['mood']

In [31]:
x=x.astype('U')

In [32]:
tfidf=TfidfVectorizer(tokenizer=text_cleaner)

In [33]:
RF=RandomForestClassifier()

In [34]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

In [35]:
model=Pipeline([('tfidf',tfidf),('RF',RF)])

In [36]:
model.fit(x_train,y_train)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [37]:
model.score(x_test,y_test)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.9619018357784777

In [38]:
y_pred=model.predict(x_test)

In [39]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       Angry       0.94      0.97      0.95      2133
     Excited       0.94      0.85      0.89       446
        Fear       0.95      0.97      0.96      2016
       Happy       0.97      0.97      0.97      5168
         Sad       0.97      0.95      0.96      3256

    accuracy                           0.96     13019
   macro avg       0.95      0.94      0.95     13019
weighted avg       0.96      0.96      0.96     13019



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [40]:
from joblib import dump

In [41]:
# Assuming 'model' is your scikit-learn model
model_filename = 'RMA-EMO.joblib'
dump(model, model_filename)

['RMA-EMO.joblib']

In [42]:
from joblib import load

# Load the saved model
model = load('RMA-EMO.joblib')

In [43]:
model.predict(['im glad'])

array(['Happy'], dtype=object)