# Sentiment Analysis using ML

Link to dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

## Data preprocessing

### Imports

In [187]:
import pandas as pd
import numpy as np
import os
import csv
import spacy
nlp = spacy.load('en_core_web_sm')
import tensorflow as tf
from tensorflow.python import keras
from keras.models import Sequential
from keras.layers import Dense, TextVectorization
from sklearn.model_selection import StratifiedKFold

The dataset is already split into test, train and val files. However, we want to create our own data splits, so we first merge all three files into a single CSV.

In [181]:
path = './archive/'
filenames = ['test.txt', 'train.txt', 'val.txt']

if not os.path.exists('all.csv'):
    # Combine all the files into one semicolon-delimited csv file.
    # The merge is written to a temporary file and only moved to 'all.csv' once
    # every input has been processed; otherwise a failure half-way through would
    # leave a partial 'all.csv' that the existence check above accepts as complete.
    tmp_name = 'all.csv.tmp'
    with open(tmp_name, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['text', 'label'])

        for fname in filenames:
            with open(os.path.join(path, fname), encoding='utf-8') as infile:
                for line in infile:
                    record = line.strip()
                    if not record:
                        # Blank lines (e.g. a trailing newline) carry no sample
                        continue
                    # The label is the part after the LAST semicolon, so a
                    # semicolon inside the text does not break the unpacking.
                    # A line without any semicolon still raises ValueError.
                    text, label = record.rsplit(';', 1)
                    writer.writerow([text, label])
    os.replace(tmp_name, 'all.csv')
            

Read the merged CSV file into a DataFrame

In [182]:
# Load the merged, semicolon-separated file produced by the previous cell.
df = pd.read_csv('all.csv', sep=';')
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


Remove stop words and lemmatize

In [183]:
def process_text(text):
    """Return `text` reduced to its space-joined lemmas.

    The text is passed through the module-level pipeline `nlp`. Tokens flagged
    as stop words, and tokens whose `len()` is 1 or less, are dropped; the
    `lemma_` of every remaining token is kept in order and the results are
    joined with single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or len(tok) <= 1:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

if not os.path.exists('processed.csv'):
    # Remove stop words and lemmatize every text, then cache the result on disk
    # so the slow pass is skipped on later runs.
    df['text'] = df['text'].apply(process_text)
    df.to_csv('processed.csv', index=False)
else:
    # keep_default_na=False: a text that was reduced to the empty string is
    # stored as an empty field, which read_csv would otherwise load as NaN
    # (likewise for texts such as "null" or "nan"). Disabling the default NA
    # markers makes the cached frame identical to the freshly computed one.
    df = pd.read_csv('processed.csv', keep_default_na=False)

Show statistics about the data

In [184]:
# Report the distinct labels (in order of first appearance) and how many rows
# carry each of them.
categories = df["label"].unique()
print(f'Categories: {categories}')
print('Instances of each category:')
for category in categories:
    n_rows = (df["label"] == category).sum()
    print(f'{category}: {n_rows}')

Categories: ['sadness' 'joy' 'fear' 'anger' 'love' 'surprise']
Instances of each category:
sadness: 5797
joy: 6761
fear: 2373
anger: 2709
love: 1641
surprise: 719


Map the string labels to numerical class ids

In [185]:
# Emotion names listed in the order of their integer ids (sadness -> 0, ..., surprise -> 5).
label_names = ('sadness', 'joy', 'fear', 'anger', 'love', 'surprise')
mapping = {name: idx for idx, name in enumerate(label_names)}

# Replace every label string by its id; a label that is not a key of
# `mapping` raises KeyError.
df['label'] = [mapping[name] for name in df['label']]
df.head()

Unnamed: 0,text,label
0,feel rotten ambitious right,0
1,update blog feel shitty,0
2,separate not want feel like ashamed,0
3,leave bouquet red yellow tulip arm feel slight...,1
4,feel little vain,0


Use stratified K-Fold cross-validation to split the data, then train and evaluate a model on each fold

In [203]:
def to_categorical(label, num_classes=6):
    """One-hot encode `label` with `tf.keras.utils.to_categorical`.

    Args:
        label: Integer class id (or array of ids) to encode.
        num_classes: Length of the one-hot vector. Defaults to 6, the number
            of entries in the notebook's label mapping, so existing
            single-argument calls behave as before.

    Returns:
        Whatever `tf.keras.utils.to_categorical` returns for the given input.
    """
    return tf.keras.utils.to_categorical(label, num_classes=num_classes)
# --- Cross-validation configuration -----------------------------------------
fold_index = []    # (train_indices, test_indices) of every fold
fold_scores = []   # (loss, accuracy) on the held-out part of every fold
splits = 5
sequence_length = 100   # token ids per text after padding/truncation
embedding_dim = 64      # size of the learned word vectors
num_of_classes = len(df['label'].unique())
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)

# Missing texts (NaN) cannot be converted to a string tensor, so treat them as
# empty strings.
texts = df['text'].fillna('')
labels = df['label']

for i, (train_i, test_i) in enumerate(skf.split(texts, labels)):
    print(f'Fold {i}')
    fold_index.append((train_i, test_i))

    # skf.split yields positional indices, so rows are selected with .iloc;
    # plain [] indexing is label-based and only agrees for a default RangeIndex.
    text_train, text_test = texts.iloc[train_i], texts.iloc[test_i]
    label_train, label_test = labels.iloc[train_i], labels.iloc[test_i]

    # One-hot encode all labels of a split in one call and convert to tensors.
    # The vector length follows the number of classes found in the data.
    label_train = tf.convert_to_tensor(
        tf.keras.utils.to_categorical(label_train.to_numpy(), num_classes=num_of_classes))
    label_test = tf.convert_to_tensor(
        tf.keras.utils.to_categorical(label_test.to_numpy(), num_classes=num_of_classes))

    # String tensors that are fed to the model
    training_data = tf.convert_to_tensor(text_train.values.tolist())
    test_data = tf.convert_to_tensor(text_test.values.tolist())

    # Create vectorization layer; the vocabulary is learned from the training
    # part of the fold only, so nothing leaks from the held-out texts.
    vectorize_layer = TextVectorization(output_sequence_length=sequence_length)
    vectorize_layer.adapt(training_data)

    # Create the model.
    # The vectorization layer emits integer vocabulary indices. Their numeric
    # magnitude carries no meaning, so they are looked up in an embedding table
    # and averaged over the sequence before the softmax classifier instead of
    # being passed to the Dense layer as if they were numeric features.
    model = Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)
    model.add(tf.keras.layers.Embedding(
        input_dim=len(vectorize_layer.get_vocabulary()),
        output_dim=embedding_dim))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    model.add(Dense(units=num_of_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(training_data, label_train, epochs=10, batch_size=32)

    # Evaluate the model on the held-out part of the fold
    loss, accuracy = model.evaluate(test_data, label_test)
    print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")
    fold_scores.append((loss, accuracy))

# Summarise the cross-validation run with the average over all folds
mean_loss, mean_accuracy = np.mean(fold_scores, axis=0)
print(f"Mean Test Loss: {mean_loss}, Mean Test Accuracy: {mean_accuracy}")
    

Fold 0
Epoch 1/10
