# Sentiment Analysis using ML

Link to dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

## Imports

In [17]:
import pandas as pd
import numpy as np
import os
import csv
import spacy
nlp = spacy.load('en_core_web_sm')
from imblearn.over_sampling import RandomOverSampler, SMOTE
import tensorflow as tf
from tensorflow.python import keras
from keras.models import Sequential
from keras.layers import Dense, TextVectorization, Dropout
from keras.metrics import AUC, CategoricalAccuracy, Precision, Recall
from keras import backend as K
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

## Data preprocessing

The dataset is already split into test, train and val files. However, we want to make our own data splits, so we first merge the files together.

In [2]:
# Directory holding the raw split files, and the files to merge.
path = './archive/'
filenames = ['test.txt', 'train.txt', 'val.txt']

if not os.path.exists('all.csv'):
    # Combine all the files into one csv file.
    # Write to a temporary file first and rename it at the end, so an
    # interrupted run cannot leave a partial 'all.csv' that the existence
    # check above would later treat as complete.
    tmp_name = 'all.csv.tmp'
    with open(tmp_name, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['text', 'label'])

        for fname in filenames:
            with open(os.path.join(path, fname), encoding='utf-8') as infile:
                for line in infile:
                    line = line.strip()
                    if not line:
                        # Skip blank lines (e.g. a trailing newline at end of file).
                        continue
                    # The label is the last ';'-separated field; splitting from the
                    # right keeps any ';' inside the text itself intact (the csv
                    # writer quotes such fields on output).
                    text, label = line.rsplit(';', 1)
                    writer.writerow([text, label])
    os.replace(tmp_name, 'all.csv')
            

Read csv

In [3]:
# Load the merged, semicolon-separated file and preview the first rows.
df = pd.read_csv('all.csv', sep=';')
df.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


Function to remove stop words and lemmatize

In [4]:
def process_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words and tokens of length one or less are dropped; the
    lemma of every remaining token is kept in its original order.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or len(tok) <= 1:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

Remove stop words from the data and lemmatize the remaining tokens

In [5]:
# Lemmatizing the whole dataset is slow, so the result is cached on disk and
# reloaded on later runs.
if not os.path.exists('processed.csv'):
    # Replace each text with its stop-word-free, lemmatized form
    df['text'] = df['text'].apply(process_text)
    df.to_csv('processed.csv', index=False)
else:
    # keep_default_na=False: a text that was reduced to an empty string (or to
    # a word such as "null"/"nan") would otherwise be read back as NaN, making
    # the cached path differ from the freshly processed one and handing
    # non-string values to the vectorizers downstream.
    df = pd.read_csv('processed.csv', keep_default_na=False)

Show statistics about the data

In [6]:
# Report which categories exist and how many rows carry each one,
# in order of first appearance.
print(f'Categories: {df["label"].unique()}')
print('Instances of each category:')
for label in df['label'].unique():
    rows_with_label = int((df['label'] == label).sum())
    print(f'{label}: {rows_with_label}')

Categories: ['sadness' 'joy' 'fear' 'anger' 'love' 'surprise']
Instances of each category:
sadness: 5797
joy: 6761
fear: 2373
anger: 2709
love: 1641
surprise: 719


Map the text labels to numerical class ids

In [7]:
# Emotion name -> integer class id; the id is the emotion's position in this list.
mapping = {
    emotion: class_id
    for class_id, emotion in enumerate(
        ['sadness', 'joy', 'fear', 'anger', 'love', 'surprise']
    )
}

# Look every label up in the mapping. A value that is not a key of `mapping`
# raises KeyError, so running this cell a second time on the already-converted
# column fails.
df['label'] = df['label'].map(lambda name: mapping[name])
df.head()

Unnamed: 0,text,label
0,feel rotten ambitious right,0
1,update blog feel shitty,0
2,separate not want feel like ashamed,0
3,leave bouquet red yellow tulip arm feel slight...,1
4,feel little vain,0


Split the data into train and test sets. Apply random oversampling to the training set to reduce class imbalance

In [8]:
# Number of distinct label values; the Keras model below uses it as the
# width of its output layer.
num_of_classes = len(df['label'].unique())

# Hold out 20% of the rows as test data and renumber both parts from 0.
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=1)
)

# Randomly oversample the training part only. The sampler is given a
# single-column 2-D array, hence the reshape of the text column.
x, y = train_df['text'], train_df['label']
ros = RandomOverSampler(random_state=1)
x_resampled, y_resampled = ros.fit_resample(x.to_numpy().reshape(-1, 1), y)

# Rebuild the training frame from the resampled text and label columns.
# NOTE(review): the K-fold cells further down split this already oversampled
# frame, so copies of the same row can land in both the training and the
# validation part of a fold; validation scores are then likely optimistic
# compared with the untouched test set.
train_df = pd.concat(
    [
        pd.DataFrame(x_resampled, columns=['text']),
        pd.DataFrame(y_resampled, columns=['label']),
    ],
    axis=1,
)

## Naive Bayes

Function to evaluate model

In [23]:
def evaluate(model, text, label):
    """Score ``model`` on ``text`` against the true ``label`` values.

    ``model`` must provide ``predict`` and ``predict_proba``. Precision and
    recall are weighted averages over the classes; AUC is computed one-vs-rest
    from the predicted probabilities.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, auc)`` in which every entry
        is a string formatted with the ``'.5g'`` format spec.
    """
    predicted = model.predict(text)
    raw_scores = (
        accuracy_score(label, predicted),
        precision_score(label, predicted, average='weighted'),
        recall_score(label, predicted, average='weighted'),
        roc_auc_score(label, model.predict_proba(text), multi_class='ovr'),
    )
    return tuple(format(score, '.5g') for score in raw_scores)

Use a stratified K-fold split and train a Naive Bayes model on each fold

In [24]:
# Train and score one Multinomial Naive Bayes model per stratified fold of the
# training frame; every fold's model is also scored on the held-out test set.
splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)
best_accuracy = 0  # not updated by this loop; kept so the notebook's globals are unchanged

# The test labels do not depend on the fold, so fetch them once.
label_test = test_df['label']

for i, (train_i, test_i) in enumerate(skf.split(train_df['text'], train_df['label'])):
    print(f'Fold {i+1}')
    # skf.split yields positional indices, so select rows by position (.iloc)
    # instead of by index label; this stays correct even if train_df does not
    # carry a 0..n-1 index.
    text_train, text_validation = train_df['text'].iloc[train_i], train_df['text'].iloc[test_i]
    label_train, label_validation = train_df['label'].iloc[train_i], train_df['label'].iloc[test_i]

    # Create the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Fit the vocabulary on this fold's training text only, then apply the
    # same transform to the validation and test text
    text_train = vectorizer.fit_transform(text_train)
    text_validation = vectorizer.transform(text_validation)
    text_test = vectorizer.transform(test_df['text'])

    # Create the model and train
    model = MultinomialNB()
    model.fit(text_train, label_train)

    # Evaluate the model on validation then test data
    validation_accuracy, validation_precision, validation_recall, validation_auc = evaluate(model, text_validation, label_validation)

    print(f'Validation accuracy: {validation_accuracy} - Validation precision: {validation_precision} - Validation recall: {validation_recall} - Validation AUC: {validation_auc}')

    test_accuracy, test_precision, test_recall, test_auc = evaluate(model, text_test, label_test)
    print(f'Test accuracy: {test_accuracy} - Test precision: {test_precision} - Test recall: {test_recall} - Test AUC: {test_auc}')

Fold 1
Validation accuracy: 0.90572 - Validation precision: 0.90659 - Validation recall: 0.90572 - Validation AUC: 0.99056
Test accuracy: 0.8085 - Test precision: 0.8284 - Test recall: 0.8085 - Test AUC: 0.96412
Fold 2
Validation accuracy: 0.90557 - Validation precision: 0.90748 - Validation recall: 0.90557 - Validation AUC: 0.99008
Test accuracy: 0.806 - Test precision: 0.82486 - Test recall: 0.806 - Test AUC: 0.9642
Fold 3
Validation accuracy: 0.90049 - Validation precision: 0.90199 - Validation recall: 0.90049 - Validation AUC: 0.98987
Test accuracy: 0.809 - Test precision: 0.82683 - Test recall: 0.809 - Test AUC: 0.96414
Fold 4
Validation accuracy: 0.90771 - Validation precision: 0.9091 - Validation recall: 0.90771 - Validation AUC: 0.99041
Test accuracy: 0.81125 - Test precision: 0.8294 - Test recall: 0.81125 - Test AUC: 0.96441
Fold 5
Validation accuracy: 0.90432 - Validation precision: 0.90589 - Validation recall: 0.90432 - Validation AUC: 0.99034
Test accuracy: 0.80525 - Test p

## Logistic Regression Model

Function to do one-hot encoding for labels

In [20]:
def to_categorical(label, num_classes=6):
    """One-hot encode ``label`` via ``tf.keras.utils.to_categorical``.

    Args:
        label: Integer class id (or array-like of ids) to encode.
        num_classes: Width of the one-hot encoding. Defaults to 6, the value
            that used to be hard-coded, so single-argument calls such as
            ``series.apply(to_categorical)`` behave exactly as before.

    Returns:
        The result of ``tf.keras.utils.to_categorical`` for the given input.
    """
    return tf.keras.utils.to_categorical(label, num_classes=num_classes)

Use a stratified K-fold split and train a logistic regression model on each fold

In [21]:
# Prepare the test data
text_test = test_df['text']
label_test = test_df['label'].apply(to_categorical)
label_test = tf.convert_to_tensor(label_test.values.tolist())

splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)
best_accuracy = 0
for i, (train_i, test_i) in enumerate(skf.split(train_df['text'], train_df['label'])):
    print(f'Fold {i+1}')
    text_train, text_validation = train_df['text'][train_i], train_df['text'][test_i]
    label_train, label_validation = train_df['label'][train_i], train_df['label'][test_i]

    # Encode labels with one-hot encoding
    label_train = label_train.apply(to_categorical)
    label_validation = label_validation.apply(to_categorical)
    
    # Convert labels to tensors
    label_train = tf.convert_to_tensor(label_train.values.tolist())
    label_validation = tf.convert_to_tensor(label_validation.values.tolist())
    
    # Create vectorization layer
    vectorize_layer = TextVectorization(output_mode='tf-idf')
    training_data = tf.convert_to_tensor(text_train.values.tolist())
    vectorize_layer.adapt(training_data)
    
    # Create the model
    model = Sequential()
    model.add(vectorize_layer)
    model.add(Dense(units=num_of_classes,
                    kernel_regularizer=tf.keras.regularizers.L1L2(0.0001),
                    activation='softmax'))
    model.add(Dropout(0.1))
    
    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer,
                loss='categorical_crossentropy',
                metrics=[CategoricalAccuracy(name='Accuracy'), AUC(multi_label=True, name='AUC'), Precision(name='Precision'), Recall(name='Recall')])
    
    # Train the model
    model.fit(text_train, label_train, batch_size=64, epochs=10, verbose=1)
    
    # Evaluate the model on validation then test data
    print('Validation:')
    model.evaluate(text_validation, label_validation)
    
    print('Test:')
    _, accuracy, _, _, _ = model.evaluate(text_test, label_test)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        model.save('logistic_regression_model.keras')
    

2024-01-12 16:23:22.446779: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-12 16:23:22.602071: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-12 16:23:22.602188: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-12 16:23:22.605543: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-12 16:23:22.605655: I external/local_xla/xla/stream_executor

Fold 1


2024-01-12 16:23:25.231871: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: Permission denied


Epoch 1/10


2024-01-12 16:23:49.254576: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f0b08049050 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-12 16:23:49.254631: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3070 Laptop GPU, Compute Capability 8.6
2024-01-12 16:23:49.271223: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-12 16:23:49.759374: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1705076629.863728  479283 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
 70/407 [====>.........................] - ETA: 3s - loss: 1.9947 - Accuracy: 0.8663 - AUC: 0.9069 - Precision: 0.9760 - Recall: 0.8154

KeyboardInterrupt: 

Function to run the model on a given piece of text

In [None]:
def logistic_regression(text):
    """Predict the emotion expressed by ``text`` with the saved Keras model.

    The model is loaded from './logistic_regression_model.keras' on every
    call. ``text`` is cleaned with ``process_text`` and passed to the model
    as a one-element batch; the index of the highest score is looked up in
    the emotion list below.

    Returns:
        One of 'sadness', 'joy', 'fear', 'anger', 'love', 'surprise'.
    """
    # Position in this list == class id the model was trained with.
    emotions = ['sadness', 'joy', 'fear', 'anger', 'love', 'surprise']
    classifier = tf.keras.models.load_model('./logistic_regression_model.keras')
    cleaned = process_text(text)
    scores = classifier.predict([cleaned])
    best_index = np.argmax(scores)
    return emotions[best_index]

logistic_regression("im feeling a little cranky negative after this doctors appointment")



'anger'