# Sentiment Analysis using ML

Link to dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

## Data preprocessing

### Imports

In [26]:
import pandas as pd
import csv
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en import stop_words

The dataset is already split into test, train and val files. However, we want to create our own data splits, so we first merge all three files back together.

In [27]:
path = './archive/'
filenames = ['test.txt', 'train.txt', 'val.txt']

# Combine all the split files into one ';'-delimited csv file.
# The output encoding is pinned to UTF-8, which is what pd.read_csv expects by
# default, so the result does not depend on the platform's default encoding.
with open('all.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile, delimiter=';')
    writer.writerow(['text', 'label'])

    for fname in filenames:
        # NOTE(review): assumes the source files are UTF-8 (or plain ASCII) - confirm
        with open(path + fname, encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                # Skip blank lines (e.g. a stray empty line at the end of a file)
                if not line:
                    continue
                # The label is the field after the LAST semicolon; splitting
                # from the right keeps any ';' inside the text intact instead
                # of failing on unpacking. A line with no ';' at all still
                # raises ValueError, which flags a malformed record.
                text, label = line.rsplit(';', 1)
                writer.writerow([text, label])
            

Read the merged csv file into a DataFrame

In [28]:
# Load the merged ';'-separated file written above and preview the first rows
df = pd.read_csv('all.csv', sep=';')
df.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


Tokenize the text, remove stop words and lemmatize the remaining tokens

In [29]:
def process_text(text):
    """Return the lemmas of all non-stop-word tokens in `text`.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    that spaCy flags as stop words are dropped; the lemma of every remaining
    token is collected in document order.

    Args:
        text: The raw text to process.

    Returns:
        A list of lemma strings, one per kept token.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_)
    return kept_lemmas

# Replace every raw sentence with its list of lemmas (stop words removed).
# NOTE: this overwrites the 'text' column in place, so running the cell a second
# time would feed the lists back into process_text - re-run from the
# read_csv cell instead.
df['text'] = df['text'].map(process_text)
df.head()

Unnamed: 0,text,label
0,"[m, feel, rotten, m, ambitious, right]",sadness
1,"[m, update, blog, feel, shitty]",sadness
2,"[separate, not, want, feel, like, m, ashamed]",sadness
3,"[leave, bouquet, red, yellow, tulip, arm, feel...",joy
4,"[feel, little, vain]",sadness


Separate data into categories

In [36]:
# Distinct emotion labels found in the 'label' column
labels = df['label'].unique()
# Map each label to the Series of 'text' entries of the rows carrying that label
emotion_dict = {
    emotion: df.loc[df['label'] == emotion, 'text']
    for emotion in labels
}

14                 [find, odd, position, feel, supportive]
71               [feel, like, naughty, school, girl, fall]
74       [overseas, cross, border, foreigner, feel, way...
79                           [want, feel, gentle, embrace]
96           [love, neglect, blog, feel, faithful, reader]
                               ...                        
19955    [sincerely, feel, benefit, relationship, roman...
19956           [like, feel, game, m, fond, color, scheme]
19972      [m, feel, little, tender, mash, today, m, good]
19978    [feel, supportive, jrock, ish, school, tommorrow]
19988             [feel, like, get, know, bit, know, like]
Name: text, Length: 1641, dtype: object


In [41]:
# Report how many samples each emotion has
for emotion, texts in emotion_dict.items():
    print(f'{emotion}: {len(texts)}')

sadness: 5797
joy: 6761
fear: 2373
anger: 2709
love: 1641
surprise: 719
