# Data treatment

In [None]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet') 
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [2]:
data = pd.read_csv('../data/emotions.csv')
mapping = { 0: 'sadness',
           1: 'joy',
           2: 'love',
           3: 'anger',
           4: 'fear',
           5: 'surprise'}

data['label'].replace(mapping, inplace=True)
data.sample(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['label'].replace(mapping, inplace=True)


Unnamed: 0,text,label
296391,i feel more comfortable more secure more cultu...,joy
75772,i feel like helia would be suspicious too i dunno,fear
72765,im just having another set of eyes review one ...,fear
48964,im actually glad that ive finally got some com...,surprise
125207,i use the same trick when i m feeling overwhel...,fear
293200,i had to pick one word to describe how i am fe...,fear
310336,i have just been feeling somewhat disheartened...,sadness
198426,i feel it and i am shaken,fear
91497,i feel that when i have a blaze to do some pro...,joy
270678,i fight for him when i feel it is just he said...,surprise


In [3]:
def tokenizer_and_remove_punctuation(row):
  tokens = word_tokenize(row['text'])
  return [word.lower() for word in tokens if word.isalpha()]

data['tokenized'] = data.apply(tokenizer_and_remove_punctuation,axis=1)
data.head()

Unnamed: 0,text,label,tokenized
0,i just feel really helpless and heavy hearted,fear,"[i, just, feel, really, helpless, and, heavy, ..."
1,ive enjoyed being able to slouch about relax a...,sadness,"[ive, enjoyed, being, able, to, slouch, about,..."
2,i gave up my internship with the dmrg and am f...,fear,"[i, gave, up, my, internship, with, the, dmrg,..."
3,i dont know i feel so lost,sadness,"[i, dont, know, i, feel, so, lost]"
4,i am a kindergarten teacher and i am thoroughl...,fear,"[i, am, a, kindergarten, teacher, and, i, am, ..."


In [4]:
# unfortunately pos_tag and lemmatize use different codes for parts of speech
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist


In [5]:
lemmatizer = WordNetLemmatizer()

def lemmatizer_with_pos(row):
  return [lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in row['tokenized']]

data['lemmatized'] = data.apply(lemmatizer_with_pos,axis=1)
data.head()

Unnamed: 0,text,label,tokenized,lemmatized
0,i just feel really helpless and heavy hearted,fear,"[i, just, feel, really, helpless, and, heavy, ...","[i, just, feel, really, helpless, and, heavy, ..."
1,ive enjoyed being able to slouch about relax a...,sadness,"[ive, enjoyed, being, able, to, slouch, about,...","[ive, enjoy, be, able, to, slouch, about, rela..."
2,i gave up my internship with the dmrg and am f...,fear,"[i, gave, up, my, internship, with, the, dmrg,...","[i, give, up, my, internship, with, the, dmrg,..."
3,i dont know i feel so lost,sadness,"[i, dont, know, i, feel, so, lost]","[i, dont, know, i, feel, so, lose]"
4,i am a kindergarten teacher and i am thoroughl...,fear,"[i, am, a, kindergarten, teacher, and, i, am, ...","[i, be, a, kindergarten, teacher, and, i, be, ..."


In [6]:

def remove_sw(row):
  return list(set(row['lemmatized']).difference(stopwords.words()))

data['no_stopwords'] = data.apply(remove_sw,axis=1)
data.head()

Unnamed: 0,text,label,tokenized,lemmatized,no_stopwords
0,i just feel really helpless and heavy hearted,fear,"[i, just, feel, really, helpless, and, heavy, ...","[i, just, feel, really, helpless, and, heavy, ...","[hearted, helpless, feel, heavy]"
1,ive enjoyed being able to slouch about relax a...,sadness,"[ive, enjoyed, being, able, to, slouch, about,...","[ive, enjoy, be, able, to, slouch, about, rela...","[find, unwind, frankly, listless, week, bit, s..."
2,i gave up my internship with the dmrg and am f...,fear,"[i, gave, up, my, internship, with, the, dmrg,...","[i, give, up, my, internship, with, the, dmrg,...","[give, internship, dmrg, distraught, feel]"
3,i dont know i feel so lost,sadness,"[i, dont, know, i, feel, so, lost]","[i, dont, know, i, feel, so, lose]","[feel, lose]"
4,i am a kindergarten teacher and i am thoroughl...,fear,"[i, am, a, kindergarten, teacher, and, i, am, ...","[i, be, a, kindergarten, teacher, and, i, be, ...","[alternative, exam, study, kindergarten, weary..."


In [7]:
def re_blob(row):
  return " ".join(row['no_stopwords'])

data['clean_blob'] = data.apply(re_blob,axis=1)
data.head()

Unnamed: 0,text,label,tokenized,lemmatized,no_stopwords,clean_blob
0,i just feel really helpless and heavy hearted,fear,"[i, just, feel, really, helpless, and, heavy, ...","[i, just, feel, really, helpless, and, heavy, ...","[hearted, helpless, feel, heavy]",hearted helpless feel heavy
1,ive enjoyed being able to slouch about relax a...,sadness,"[ive, enjoyed, being, able, to, slouch, about,...","[ive, enjoy, be, able, to, slouch, about, rela...","[find, unwind, frankly, listless, week, bit, s...",find unwind frankly listless week bit start sl...
2,i gave up my internship with the dmrg and am f...,fear,"[i, gave, up, my, internship, with, the, dmrg,...","[i, give, up, my, internship, with, the, dmrg,...","[give, internship, dmrg, distraught, feel]",give internship dmrg distraught feel
3,i dont know i feel so lost,sadness,"[i, dont, know, i, feel, so, lost]","[i, dont, know, i, feel, so, lose]","[feel, lose]",feel lose
4,i am a kindergarten teacher and i am thoroughl...,fear,"[i, am, a, kindergarten, teacher, and, i, am, ...","[i, be, a, kindergarten, teacher, and, i, be, ...","[alternative, exam, study, kindergarten, weary...",alternative exam study kindergarten weary teac...


In [None]:
# Export bow_vect
bow_vect = CountVectorizer(max_features=10000)
# fit creates one entry for each different word seen
X = bow_vect.fit_transform(data['clean_blob']).toarray()

In [9]:
display(X.shape)
print(data['label'].shape)

(416809, 10000)

(416809,)


In [10]:
from sklearn.model_selection import train_test_split

X = pd.DataFrame(X, columns=bow_vect.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)


In [11]:
from sklearn.ensemble import RandomForestClassifier
import pickle

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")

Accuracy: 0.8457
Precision (weighted): 0.8461
Recall (weighted): 0.8457
F1 Score (weighted): 0.8458


In [14]:
from sklearn.ensemble import RandomForestClassifier
import pickle

model = RandomForestClassifier(
    n_estimators=100,
    max_depth=25,
    n_jobs=-1,
    random_state=42
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")

Accuracy: 0.5369
Precision (weighted): 0.7570
Recall (weighted): 0.5369
F1 Score (weighted): 0.4773


# Model creation

In [13]:
pickle.dump(rf_model, open('model.pkl', 'wb'))
pickle.dump(bow_vect, open('vectorizer.pkl', 'wb'))