# Sentiment Analysis using ML

Link to dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

## Data preprocessing

### Imports

In [68]:
import pandas as pd
import csv
import spacy
nlp = spacy.load('en_core_web_sm')
import tensorflow as tf
from keras.layers import StringLookup
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

Dataset already split into test, train and val. However, we want to try and do our own data splits. Therefore, merge the data together.

In [69]:
path = './archive/'
filenames = ['test.txt', 'train.txt', 'val.txt']

# Combine all the files into one csv file
with open('all.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=';')
    writer.writerow(['text', 'label'])

    for fname in filenames:
        with open(path + fname) as infile:
            for line in infile:
                # Split the line into text and label using the semicolon
                text, label = line.strip().split(';')
                writer.writerow([text, label])
            

Read csv

In [70]:
df = pd.read_csv('all.csv', delimiter=';')
df.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


Tokenize the words, remove stop words and lemmatize

In [71]:
def process_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_stop and len(token) > 1]
    return lemmas

# Split the text into tokens
df['text'] = df['text'].apply(process_text)
df.head()

Unnamed: 0,text,label
0,"[feel, rotten, ambitious, right]",sadness
1,"[update, blog, feel, shitty]",sadness
2,"[separate, not, want, feel, like, ashamed]",sadness
3,"[leave, bouquet, red, yellow, tulip, arm, feel...",joy
4,"[feel, little, vain]",sadness


Show statistics about the data

In [72]:
print(f'Categories: {df["label"].unique()}')
print('Instances of each category:')
for label in df['label'].unique():
    print(f'{label}: {len(df[df["label"] == label])}')

Categories: ['sadness' 'joy' 'fear' 'anger' 'love' 'surprise']
Instances of each category:
sadness: 5797
joy: 6761
fear: 2373
anger: 2709
love: 1641
surprise: 719


Unnamed: 0,text,label
0,"[feel, rotten, ambitious, right]",0
1,"[update, blog, feel, shitty]",0
2,"[separate, not, want, feel, like, ashamed]",0
3,"[leave, bouquet, red, yellow, tulip, arm, feel...",1
4,"[feel, little, vain]",0


Map labels to numerical

In [None]:
mapping = {
    'sadness': 0,
    'joy': 1,
    'fear': 2,
    'anger': 3,
    'love': 4,
    'surprise': 5
}

df['label'] = df['label'].apply(lambda x: mapping[x])
df.head()

Encode label with one-hot encoding

In [73]:
def to_categorical(label):
    return tf.keras.utils.to_categorical(label, num_classes=6)

df['label'] = df['label'].apply(to_categorical)
df.head()

Unnamed: 0,text,label
0,"[feel, rotten, ambitious, right]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
1,"[update, blog, feel, shitty]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,"[separate, not, want, feel, like, ashamed]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,"[leave, bouquet, red, yellow, tulip, arm, feel...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
4,"[feel, little, vain]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


Use K-Fold to split data

In [None]:
splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)



Vectorize all inputs using multi-hot encoding

In [51]:
# Convert list into ragged tensors
training_data = tf.ragged.constant(df['text'].values)
layer = StringLookup(output_mode="multi_hot")
layer.adapt(training_data)
layer(training_data)

2024-01-10 11:56:48.540733: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1063280000 exceeds 10% of free system memory.


<tf.Tensor: shape=(20000, 13291), dtype=float32, numpy=
array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]], dtype=float32)>