# Sentiment Analysis using ML

Link to dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

## Data preprocessing

### Imports

In [32]:
import pandas as pd
import numpy as np
import os
import csv
import spacy
nlp = spacy.load('en_core_web_sm')
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from tensorflow.python import keras
from keras.models import Sequential
from keras.layers import Dense, TextVectorization, Dropout
from keras.metrics import AUC, CategoricalAccuracy
from keras import backend as K
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils.class_weight import compute_class_weight

The dataset is already split into test, train and validation files. However, we want to create our own data splits, so we first merge the files into a single dataset.

In [33]:
path = './archive/'
filenames = ['test.txt', 'train.txt', 'val.txt']

if not os.path.exists('all.csv'):
    # Combine all the files into one csv file.
    # The output is written to a temporary file first and only renamed once every
    # input has been processed, so an interrupted or failed run cannot leave a
    # partial 'all.csv' behind that the existence check above would later accept.
    tmp_name = 'all.csv.tmp'
    with open(tmp_name, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['text', 'label'])

        for fname in filenames:
            with open(os.path.join(path, fname), encoding='utf-8') as infile:
                for line_no, line in enumerate(infile, start=1):
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines (e.g. a trailing newline at end of file)
                        continue
                    # The label is whatever follows the LAST semicolon, so a
                    # semicolon inside the text itself does not break the split
                    text, sep, label = line.rpartition(';')
                    if not sep:
                        raise ValueError(f'{fname}:{line_no}: expected "text;label", got {line!r}')
                    writer.writerow([text, label])
    os.replace(tmp_name, 'all.csv')
            

Read csv

In [34]:
# Load the merged, semicolon-separated dataset and preview its first rows
df = pd.read_csv('all.csv', sep=';')
df.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


Remove stop words and lemmatize

In [35]:
def process_text(text):
    """Reduce `text` to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline `nlp`. A token
    contributes its lemma unless it is flagged as a stop word or is at most
    one character long.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop or len(token) <= 1:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

if not os.path.exists('processed.csv'):
    # Remove stop words and lemmatize every text, then cache the result on disk
    df['text'] = df['text'].apply(process_text)
    df.to_csv('processed.csv', index=False)
else:
    df = pd.read_csv('processed.csv')
    # A text that lost all of its tokens during preprocessing is stored as an empty
    # field, which read_csv turns into NaN. Restore the empty string so the cached
    # path yields the same string-only column as the freshly computed path.
    df['text'] = df['text'].fillna('')

Show statistics about the data

In [36]:
# Report the distinct labels (in order of first appearance) and how often each occurs
categories = df["label"].unique()
print(f'Categories: {categories}')
print('Instances of each category:')
for category in categories:
    occurrences = int((df["label"] == category).sum())
    print(f'{category}: {occurrences}')

Categories: ['sadness' 'joy' 'fear' 'anger' 'love' 'surprise']
Instances of each category:
sadness: 5797
joy: 6761
fear: 2373
anger: 2709
love: 1641
surprise: 719


Map labels to numerical

In [37]:
# Integer id assigned to each emotion name
mapping = {
    'sadness': 0,
    'joy': 1,
    'fear': 2,
    'anger': 3,
    'love': 4,
    'surprise': 5
}

# Encode only while the column still holds the emotion names. Once it is numeric the
# cell has already run, and re-running it becomes a no-op instead of raising KeyError.
if not pd.api.types.is_numeric_dtype(df['label']):
    unknown = set(df['label'].unique()) - set(mapping)
    if unknown:
        raise KeyError(f'Labels without a mapping: {sorted(map(str, unknown))}')
    df['label'] = df['label'].map(mapping)
df.head()

Unnamed: 0,text,label
0,feel rotten ambitious right,0
1,update blog feel shitty,0
2,separate not want feel like ashamed,0
3,leave bouquet red yellow tulip arm feel slight...,1
4,feel little vain,0


Helper function to convert an integer label into a one-hot (categorical) vector

In [38]:
def to_categorical(label, num_classes=6):
    """One-hot encode `label` via `tf.keras.utils.to_categorical`.

    Args:
        label: integer class id (or array of ids) to encode.
        num_classes: length of the one-hot vector. Defaults to 6, the number
            of classes this function was previously hard-coded to.

    Returns:
        Whatever `tf.keras.utils.to_categorical` returns for the given input.
    """
    return tf.keras.utils.to_categorical(label, num_classes=num_classes)

Split the data into train and test sets. Apply random oversampling to the training set to reduce class imbalance.

In [39]:
num_of_classes = len(df['label'].unique())
# Split the data into training and testing data.
# The classes are heavily imbalanced, so stratify on the label to keep the class
# proportions of the held-out test set the same as in the full dataset.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1, stratify=df['label'])
train_df = train_df.reset_index(drop=True)

# Randomly oversample the training portion only; the test set keeps its natural
# class distribution. RandomOverSampler expects a 2-D feature array, hence the reshape.
x = train_df['text']
y = train_df['label']
ros = RandomOverSampler(random_state=1)
x_resampled, y_resampled = ros.fit_resample(x.values.reshape(-1, 1), y)
train_df = pd.DataFrame({'text': x_resampled.ravel(), 'label': np.asarray(y_resampled)})

# Prepare the test data
test_df = test_df.reset_index(drop=True)
text_test = test_df['text']
# One-hot encode all test labels in a single vectorized call, sized by the
# number of classes computed above rather than a hard-coded constant
label_test = tf.convert_to_tensor(
    tf.keras.utils.to_categorical(test_df['label'].values, num_classes=num_of_classes)
)

Use a stratified K-fold split and train a logistic regression model on each fold

In [41]:
#########################################################################################
# Code from https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
def recall_m(y_true, y_pred):
    """Recall over all elements of the given tensors.

    Entries of `y_true * y_pred` and of `y_true` are clipped to [0, 1] and
    rounded before being summed; the ratio of the two sums is returned, with
    `K.epsilon()` added to the denominator to avoid division by zero.
    """
    matched = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(matched) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over all elements of the given tensors.

    Entries of `y_true * y_pred` and of `y_pred` are clipped to [0, 1] and
    rounded before being summed; the ratio of the two sums is returned, with
    `K.epsilon()` added to the denominator to avoid division by zero.
    """
    matched = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(matched) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from `precision_m` and `recall_m`.

    Returns 2 * (p * r) / (p + r + K.epsilon()), where p and r are the
    precision and recall of the given tensors.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio
#########################################################################################
splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)
best_accuracy = 0
# NOTE(review): train_df was oversampled before this split, so copies of the same row
# can land in both the training and the validation part of a fold. Validation scores
# are therefore optimistic; oversampling inside each fold would remove that overlap.
for i, (train_i, test_i) in enumerate(skf.split(train_df['text'], train_df['label'])):
    print(f'Fold {i+1}')
    # skf.split yields positional indices, so select rows by position
    text_train, text_validation = train_df['text'].iloc[train_i], train_df['text'].iloc[test_i]
    label_train, label_validation = train_df['label'].iloc[train_i], train_df['label'].iloc[test_i]

    # Encode labels with one-hot encoding (one vectorized call per split) and
    # convert them to tensors
    label_train = tf.convert_to_tensor(
        tf.keras.utils.to_categorical(label_train.values, num_classes=num_of_classes)
    )
    label_validation = tf.convert_to_tensor(
        tf.keras.utils.to_categorical(label_validation.values, num_classes=num_of_classes)
    )

    # Create vectorization layer; the vocabulary and tf-idf weights are fitted on
    # this fold's training texts only
    vectorize_layer = TextVectorization(output_mode='tf-idf')
    training_data = tf.convert_to_tensor(text_train.values.tolist())
    vectorize_layer.adapt(training_data)

    # Create the model.
    # Dropout is applied to the tf-idf features, BEFORE the softmax layer. Placing it
    # after the softmax would zero and rescale the predicted probabilities during
    # training, so the loss would no longer see a valid probability distribution.
    model = Sequential()
    model.add(vectorize_layer)
    model.add(Dropout(0.1))
    model.add(Dense(units=num_of_classes,
                    kernel_regularizer=tf.keras.regularizers.L1L2(0.0001),
                    activation='softmax'))

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer,
                loss='categorical_crossentropy',
                metrics=[CategoricalAccuracy(name='Accuracy'), AUC(multi_label=True, name='AUC'), f1_m, precision_m, recall_m])

    # Train the model
    model.fit(text_train, label_train, batch_size=64, epochs=10, verbose=1)

    # Evaluate the model on validation then test data.
    # evaluate() returns [loss, *metrics] in compile order, so index 1 is 'Accuracy'.
    print('Validation:')
    _, accuracy, *_ = model.evaluate(text_validation, label_validation)

    # The test score is only reported; it must not influence which model is kept
    print('Test:')
    model.evaluate(text_test, label_test)

    # Keep the model with the best validation accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        model.save('logistic_regression_model.keras')
    

Fold 1


KeyboardInterrupt: 