# Sentiment Analysis using ML

Link to dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
import csv
import spacy
nlp = spacy.load('en_core_web_sm')
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

2024-01-13 00:31:29.952778: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-13 00:31:29.952841: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-13 00:31:30.009102: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-13 00:31:30.123785: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data preprocessing

The dataset is already split into test, train and validation files. However, we want to create our own data splits, so we first merge the files together.

In [100]:
path = './archive/'
filenames = ['test.txt', 'train.txt', 'val.txt']

if not os.path.exists('all.csv'):
    # Combine all the files into one csv file.
    # The rows are written to a scratch file that is renamed only after every
    # input has been processed: an error part-way through must not leave a
    # truncated 'all.csv' behind, because the existence check above would then
    # treat it as complete on every later run.
    partial_csv = 'all.csv.part'
    # NOTE(review): assumes the source files are UTF-8 (or plain ASCII) — confirm.
    with open(partial_csv, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['text', 'label'])

        for fname in filenames:
            with open(os.path.join(path, fname), encoding='utf-8') as infile:
                for line in infile:
                    line = line.strip()
                    if not line:
                        # Skip blank lines (e.g. a trailing newline at end of file)
                        continue
                    # The label is the part after the LAST semicolon, so a
                    # semicolon inside the text itself does not break the split
                    text, label = line.rsplit(';', 1)
                    writer.writerow([text, label])
    os.replace(partial_csv, 'all.csv')
            

Read csv

In [101]:
# Load the merged, semicolon-separated dataset into a DataFrame
df = pd.read_csv('all.csv', sep=';')
# Preview the first rows as the cell output
df.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


Function to remove stop words and lemmatize

In [102]:
def process_text(text):
    """Return `text` reduced to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline `nlp`. A token
    contributes its lemma only if it is not flagged as a stop word and its
    length (`len(token)`) is greater than one.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Drop stop words and single-character tokens
        if tok.is_stop or len(tok) <= 1:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

Remove stop words from the data and lemmatize it. The processed result is cached in `processed.csv` and reloaded from there on later runs.

In [103]:
if not os.path.exists('processed.csv'):
    # Lemmatize and strip stop words from every text, then cache the result
    # so the spaCy pass does not have to be repeated on later runs
    df['text'] = df['text'].apply(process_text)
    df.to_csv('processed.csv', index=False)
else:
    df = pd.read_csv('processed.csv')
    # A text that process_text reduced to '' is stored as a blank CSV field,
    # which read_csv parses as NaN by default. Restore the empty string so the
    # cached frame matches the freshly computed one and 'text' stays all-string.
    df['text'] = df['text'].fillna('')

Show statistics about the data

In [104]:
# Report which labels occur and how many rows carry each of them,
# in order of first appearance in the data
categories = df['label'].unique()
print(f'Categories: {categories}')
print('Instances of each category:')
for category in categories:
    occurrences = int((df['label'] == category).sum())
    print(f'{category}: {occurrences}')

Categories: ['sadness' 'joy' 'fear' 'anger' 'love' 'surprise']
Instances of each category:
sadness: 5797
joy: 6761
fear: 2373
anger: 2709
love: 1641
surprise: 719


Map the text labels to numerical class ids

In [105]:
# Integer class id for every emotion name, numbered in the order listed here
mapping = {
    emotion: class_id
    for class_id, emotion in enumerate(['sadness', 'joy', 'fear', 'anger', 'love', 'surprise'])
}

# Replace each label by its class id; a label missing from `mapping` raises KeyError
df['label'] = df['label'].apply(mapping.__getitem__)
df.head()

Unnamed: 0,text,label
0,feel rotten ambitious right,0
1,update blog feel shitty,0
2,separate not want feel like ashamed,0
3,leave bouquet red yellow tulip arm feel slight...,1
4,feel little vain,0


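Split the data into train and test sets. Random oversampling to reduce the class imbalance is not done in this cell; it is applied later to the training part of each cross-validation fold (see the Naive Bayes section).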

In [106]:
# Number of distinct class ids present in the data
num_of_classes = df['label'].nunique(dropna=False)
# Hold out 20% of the rows as test data (fixed seed for reproducibility) and
# give both parts a fresh 0..n-1 index
train_df, test_df = (
    subset.reset_index(drop=True)
    for subset in train_test_split(df, test_size=0.2, random_state=1)
)

## Naive Bayes

Use K-Fold split and train naive bayes model

In [116]:
splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)
# NOTE(review): best_accuracy is never updated in this cell — confirm whether it is still needed
best_accuracy = 0
for i, (train_i, test_i) in enumerate(skf.split(train_df['text'], train_df['label'])):
    # skf.split yields positional row indices, so select with .iloc rather than
    # label-based [] indexing (which is only correct while the index is 0..n-1)
    text_train, text_validation = train_df['text'].iloc[train_i], train_df['text'].iloc[test_i]
    label_train, label_validation = train_df['label'].iloc[train_i], train_df['label'].iloc[test_i]

    # Oversample only the training part of the fold; the validation part is left untouched.
    # The texts are reshaped to a single-column 2-D array for the sampler and flattened back.
    ros = RandomOverSampler(random_state=1)
    text_train, label_train = ros.fit_resample(text_train.values.reshape(-1, 1), label_train)
    text_train = text_train.flatten()

    # Create the model and train
    naive_bayes = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
    naive_bayes.fit(text_train, label_train)

    # Evaluate the model on the validation part of this fold
    print(f"Fold {i+1} validation:")
    print(classification_report(label_validation, naive_bayes.predict(text_validation)))
    

Fold 1 validation:
              precision    recall  f1-score   support

           0       0.90      0.82      0.85       930
           1       0.91      0.80      0.85      1084
           2       0.71      0.76      0.74       379
           3       0.74      0.80      0.77       436
           4       0.59      0.77      0.67       257
           5       0.42      0.72      0.53       114

    accuracy                           0.79      3200
   macro avg       0.71      0.78      0.73      3200
weighted avg       0.82      0.79      0.80      3200

Fold 2 validation:
              precision    recall  f1-score   support

           0       0.90      0.80      0.85       930
           1       0.92      0.79      0.85      1084
           2       0.70      0.76      0.73       379
           3       0.79      0.83      0.81       436
           4       0.56      0.80      0.66       257
           5       0.45      0.80      0.57       114

    accuracy                           

## Logistic Regression Model

Use K-Fold split to evaluate the logistic regression model

In [77]:
splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)

# The held-out test data is the same for every fold, so select it once up front
text_test, label_test = test_df['text'], test_df['label']

for i, (train_i, test_i) in enumerate(skf.split(train_df['text'], train_df['label'])):
    print(f'Fold {i+1}')
    # skf.split yields positional row indices, so select with .iloc rather than
    # label-based [] indexing (which is only correct while the index is 0..n-1)
    text_train, text_validation = train_df['text'].iloc[train_i], train_df['text'].iloc[test_i]
    label_train, label_validation = train_df['label'].iloc[train_i], train_df['label'].iloc[test_i]

    # Create the model and train
    logistic_regression = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression(max_iter=1000))])
    logistic_regression.fit(text_train, label_train)

    # Evaluate the model on the validation part of this fold, then on the test data
    print(f"Fold {i+1} validation:")
    print(classification_report(label_validation, logistic_regression.predict(text_validation)))

    print(f"Fold {i+1} test:")
    print(classification_report(label_test, logistic_regression.predict(text_test)))
    

Fold 1
Fold 1 validation:
              precision    recall  f1-score   support

           0       0.93      0.90      0.92      1083
           1       0.94      0.89      0.91      1084
           2       0.96      0.94      0.95      1084
           3       0.95      0.96      0.95      1084
           4       0.92      0.98      0.95      1084
           5       0.96      1.00      0.98      1083

    accuracy                           0.94      6502
   macro avg       0.94      0.94      0.94      6502
weighted avg       0.94      0.94      0.94      6502

Fold 1 test:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91      1146
           1       0.92      0.88      0.90      1343
           2       0.84      0.84      0.84       475
           3       0.88      0.87      0.87       529
           4       0.75      0.84      0.79       356
           5       0.64      0.85      0.73       151

    accuracy                          

Function to run the model on a given piece of text

In [None]:
def logistic_regression(text):
    """Predict the emotion name for a raw piece of text.

    Loads the saved Keras model from './logistic_regression_model.keras' on
    every call, preprocesses `text` with `process_text`, and returns the
    emotion whose position equals the argmax of the model output.
    """
    # Listed in the same order as the class ids in `mapping` (0 = sadness ... 5 = surprise)
    label_names = ('sadness', 'joy', 'fear', 'anger', 'love', 'surprise')
    classifier = tf.keras.models.load_model('./logistic_regression_model.keras')
    # NOTE(review): presumably the model returns one score per class for the
    # single input text — confirm against the cell that trained/saved it
    scores = classifier.predict([process_text(text)])
    return label_names[np.argmax(scores)]

# Try the helper on a sample sentence; the returned emotion name is shown as the cell output
logistic_regression("im feeling a little cranky negative after this doctors appointment")



'anger'