# Sentiment Analysis using ML

Link to dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

## Imports

In [92]:
import pandas as pd
import numpy as np
import os
import csv
from itertools import product
import spacy
nlp = spacy.load('en_core_web_sm')
import joblib
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

## Data preprocessing

Dataset already split into test, train and val. However, we want to try and do our own data splits. Therefore, merge the data together.

In [93]:
path = './archive/'
filenames = ['test.txt', 'train.txt', 'val.txt']

if not os.path.exists('all.csv'):
    # Combine all the files into one csv file
    with open('all.csv', 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['text', 'label'])

        for fname in filenames:
            with open(path + fname) as infile:
                for line in infile:
                    # Split the line into text and label using the semicolon
                    text, label = line.strip().split(';')
                    writer.writerow([text, label])
            

Read csv

In [94]:
df = pd.read_csv('all.csv', delimiter=';')
df.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


Function to remove stop words and lemmatize

In [95]:
def process_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_stop and len(token) > 1]
    return ' '.join(lemmas)

Remove stop words and lemmatize from data

In [96]:
if not os.path.exists('processed.csv'):
    # Split the text into tokens
    df['text'] = df['text'].apply(process_text)
    df.to_csv('processed.csv', index=False)
else:
    df = pd.read_csv('processed.csv')

Show statistics about the data

In [97]:
print(f'Categories: {df["label"].unique()}')
print('Instances of each category:')
for label in df['label'].unique():
    print(f'{label}: {len(df[df["label"] == label])}')

Categories: ['sadness' 'joy' 'fear' 'anger' 'love' 'surprise']
Instances of each category:
sadness: 5797
joy: 6761
fear: 2373
anger: 2709
love: 1641
surprise: 719


Map labels to numerical

In [98]:
mapping = {
    'sadness': 0,
    'joy': 1,
    'fear': 2,
    'anger': 3,
    'love': 4,
    'surprise': 5
}

df['label'] = df['label'].apply(lambda x: mapping[x])
df.head()

Unnamed: 0,text,label
0,feel rotten ambitious right,0
1,update blog feel shitty,0
2,separate not want feel like ashamed,0
3,leave bouquet red yellow tulip arm feel slight...,1
4,feel little vain,0


Split the data into train and test. Do random oversampling to reduce categorical imbalance

In [99]:
num_of_classes = len(df['label'].unique())
# Split the data into training and testing data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

## Shared function for evaluating performance

Function to calculate best combined score

In [220]:
def combined_score(reports):
    total_score = 0
    for report in reports:
        # Calculate the combined score of precision, recall and f1-score
        precision = report['weighted avg']['precision']
        recall = report['weighted avg']['recall']
        f1_score = report['weighted avg']['f1-score']
        accuracy = report['accuracy']
        total_score += (2 * precision + recall + 2 * f1_score + accuracy) / 6
    
    return total_score / len(reports)

Function to evaluate model using K-Fold

In [218]:
def evaluate_kfold(model, train_df):
    splits = 5
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)
    reports = []
    for i, (train_i, test_i) in enumerate(skf.split(train_df['text'], train_df['label'])):
        text_train, text_validation = train_df['text'][train_i], train_df['text'][test_i]
        label_train, label_validation = train_df['label'][train_i], train_df['label'][test_i]
        
        ros = RandomOverSampler(random_state=1)
        text_train, label_train = ros.fit_resample(text_train.values.reshape(-1, 1), label_train)
        text_train = text_train.flatten()
        
        # Create the model and train
        model.fit(text_train, label_train)
        
        # Evaluate the model on validation then test data
        report = classification_report(label_validation, model.predict(text_validation), output_dict=True, zero_division=0)
        reports.append(report)
    return combined_score(reports)

## Naive Bayes

Function to build model based on input

In [110]:
def build_model_nb(max_features, ngram_range, alpha, smooth_df, max_df, norm):
    vectorizer = TfidfVectorizer(max_features=max_features,
                                ngram_range=ngram_range,
                                smooth_idf=smooth_df,
                                max_df=max_df,
                                norm=norm)
    classifier = MultinomialNB(alpha=alpha)
    return Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])

Grid search using K-Fold

In [111]:
param_grid = {
    'tfidf__max_features': [100, 500, 1000, 2000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': [0.1, 0.5, 1, 2],
    'tfidf__smooth_idf': (True, False),
    'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
    'tfidf__norm': ('l1', 'l2', None),
}
param_combinations = list(product(*param_grid.values()))

best_score = 0
best_params = {}

for max_features, ngram_range, alpha, tfidf__smooth_idf, tfidf__max_df, tfidf__norm in param_combinations:
    model = build_model_nb(max_features=max_features, 
                            ngram_range=ngram_range, 
                            alpha=alpha, 
                            smooth_df=tfidf__smooth_idf, 
                            max_df=tfidf__max_df, 
                            norm=tfidf__norm)
    score = evaluate_kfold(model, train_df)

    if score > best_score:
        best_score = score
        best_params = {
            'max_features': max_features,
            'ngram_range': ngram_range,
            'alpha': alpha,
            'smooth_df': tfidf__smooth_idf,
            'max_df': tfidf__max_df,
            'norm': tfidf__norm
        }

print(f'Best score: {best_score}')
print(f'Best params: {best_params}')
with open('naive_bayes_best_parameters', 'w') as f:
    f.write(str(best_params))

Best score: 0.8522843711801421
Best params: {'max_features': None, 'ngram_range': (1, 2), 'alpha': 0.5, 'smooth_df': True, 'max_df': 0.25, 'norm': 'l1'}


In [144]:
text_train, label_train = train_df['text'], train_df['label']
ros = RandomOverSampler(random_state=1)
text_train, label_train = ros.fit_resample(text_train.values.reshape(-1, 1), label_train)
text_train = text_train.flatten()

model = build_model_nb(max_features=None,
                    ngram_range=(1, 2),
                    alpha=0.5,
                    smooth_df=True,
                    max_df=0.25,
                    norm='l2')

# Create the model and train on entire training data
model.fit(text_train, label_train)
joblib.dump(model, 'naive_bayes.pkl')
print(classification_report(test_df['label'], model.predict(test_df['text']), zero_division=0))

              precision    recall  f1-score   support

           0       0.91      0.86      0.89      1146
           1       0.92      0.85      0.89      1343
           2       0.79      0.81      0.80       475
           3       0.84      0.85      0.85       529
           4       0.68      0.79      0.73       356
           5       0.51      0.78      0.62       151

    accuracy                           0.84      4000
   macro avg       0.78      0.83      0.79      4000
weighted avg       0.86      0.84      0.85      4000



Function to predict using Naive Bayes

In [266]:
def naive_bayes(text):
    emotions = ['sadness', 'joy', 'fear', 'anger', 'love', 'surprise']
    # Load the model
    model = joblib.load('naive_bayes.pkl')
    processed_text = process_text(text)
    prediction = model.predict([processed_text])
    return emotions[prediction[0]]

print(naive_bayes('Charles are you fluent in jappanese?'))

sadness


## Logistic Regression Model

Function to build model based on input

In [243]:
def build_model_lr(solver, C, multi_class, tol, max_df, norm):
    vectorizer = TfidfVectorizer(max_features=None,
                                ngram_range=(1, 2),
                                smooth_idf=True,
                                max_df=max_df,
                                norm=norm,)
    classifier = LogisticRegression(solver=solver,
                                    C=C,
                                    multi_class=multi_class,
                                    tol=tol,
                                    max_iter=500)
    return Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])

Grid search using K-Fold

In [221]:
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__smooth_idf': (True, False),
    'tfidf__max_df': [0.25, 0.5, 1.0],
    'tfidf__norm': ['l1', 'l2'],
    'clf__solver': ['newton-cg', 'sag', 'saga'],
    'clf__C': [1, 10],
    'clf__multi_class': ['ovr', 'multinomial'],
    'clf__tol': [0.01, 0.1, 1],
}

param_combinations = list(product(*param_grid.values()))

best_score = 0
best_params = {}

for tfidf__max_df, tfidf__norm, clf__solver, clf__C, clf__multi_class, clf__tol in param_combinations:
    model = build_model_lr(max_df=tfidf__max_df,
                            norm=tfidf__norm,
                            solver=clf__solver,
                            C=clf__C,
                            multi_class=clf__multi_class,
                            tol=clf__tol)
    score = evaluate_kfold(model, train_df)

    if score > best_score:
        best_score = score
        best_params = {
            'max_df': tfidf__max_df,
            'norm': tfidf__norm,
            'solver': clf__solver,
            'C': clf__C,
            'multi_class': clf__multi_class,
            'tol': clf__tol,
        }
        with open('logistic_regression_best_parameters', 'w') as f:
            f.write(str(best_params))

print(f'Best score: {best_score}')
print(f'Best params: {best_params}')


Best score: 0.8921059859048593
Best params: {'max_df': 1.0, 'norm': 'l2', 'solver': 'saga', 'C': 10, 'multi_class': 'ovr', 'tol': 0.01}


In [264]:
text_train, label_train = train_df['text'], train_df['label']
ros = RandomOverSampler(random_state=1)
text_train, label_train = ros.fit_resample(text_train.values.reshape(-1, 1), label_train)
text_train = text_train.flatten()

model = build_model_lr(solver='saga',
                        C=10,
                        multi_class='ovr',
                        tol=0.01,
                        max_df=1.0,
                        norm='l2')
                        
# Create the model and train on entire training data
model.fit(text_train, label_train)
joblib.dump(model, 'logistic_regression.pkl')
print(classification_report(test_df['label'], model.predict(test_df['text']), zero_division=0))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1146
           1       0.93      0.92      0.92      1343
           2       0.87      0.84      0.85       475
           3       0.92      0.90      0.91       529
           4       0.79      0.83      0.81       356
           5       0.68      0.82      0.74       151

    accuracy                           0.90      4000
   macro avg       0.85      0.87      0.86      4000
weighted avg       0.90      0.90      0.90      4000



Function to run the model on a given piece of text

In [265]:
def logistic_regression(text):
    emotions = ['sadness', 'joy', 'fear', 'anger', 'love', 'surprise']
    # Load the model
    model = joblib.load('naive_bayes.pkl')
    processed_text = process_text(text)
    prediction = model.predict([processed_text])
    return emotions[prediction[0]]

logistic_regression("Charles are you fluent in janpanese?")

'sadness'