### Data is cleaned and embedded using MPNet model. 

In [4]:
!pip install imblearn
!pip install nltk
!pip install transformers
!pip install Sentence_Transformers



In [44]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import math
import nltk
import re
from nltk.corpus import stopwords#, PlainTextCorpusReader
from nltk import word_tokenize, ngrams
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import LlamaTokenizer
from sentence_transformers import SentenceTransformer
from datetime import datetime, date, timedelta
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, LlamaForSequenceClassification
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn import metrics
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
)

import warnings
warnings.filterwarnings('ignore')

## Data Partitioning

In [3]:
import os
os.getcwd()

'C:\\Users\\mehdi.sadeghi\\OneDrive - Georgia Institute of Technology\\Gatech\\ISYE 6740\\Project\\archive'

In [4]:
new_directory = 'C:\\Users\\mehdi.sadeghi\\OneDrive - Georgia Institute of Technology\\Gatech\\ISYE 6740\\Project\\Emotions-main\\Emotions-main\\02_Model'

# Change the current working directory
os.chdir(new_directory)

# Verify the current working directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

Current Working Directory: C:\Users\mehdi.sadeghi\OneDrive - Georgia Institute of Technology\Gatech\ISYE 6740\Project\Emotions-main\Emotions-main\02_Model


In [6]:
embd_data=pd.read_csv('./embeddings/mpnet_embed_df_part400000.csv') 
embd_data

Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_761,c_762,c_763,c_764,c_765,c_766,c_767,text_WO_stopwords,text,label
0,0.057199,-0.008160,0.001413,-0.049585,0.032901,0.057294,-0.069347,-0.019316,0.037468,-0.033207,...,-0.011801,-0.000416,-0.056683,-0.003689,-0.009209,-0.009716,-0.033261,feel really helpless heavy hearted,i just feel really helpless and heavy hearted,4
1,-0.057446,0.005956,-0.021202,-0.015104,-0.020960,0.035543,-0.050522,0.016254,0.016209,0.036987,...,-0.014699,-0.043307,-0.025727,-0.002154,0.029399,0.034281,-0.062547,ive enjoy able slouch relax unwind frankly nee...,ive enjoyed being able to slouch about relax a...,0
2,0.018095,-0.022132,-0.009757,-0.014497,0.006242,0.049693,-0.061562,-0.009043,0.054573,-0.015577,...,0.026375,-0.009772,0.015014,0.032342,-0.007213,0.007759,-0.037378,give internship dmrg feel distraught,i gave up my internship with the dmrg and am f...,4
3,0.005479,-0.035476,-0.021814,-0.022414,0.043661,0.003678,-0.041316,-0.009272,-0.022895,-0.023498,...,-0.024940,-0.000361,0.031484,-0.021171,0.040770,-0.025041,0.021277,dont know feel lose,i dont know i feel so lost,0
4,0.021173,0.006722,-0.028587,-0.033321,0.020113,0.015797,0.040073,0.028007,-0.029061,0.014258,...,0.017429,-0.006862,-0.090544,-0.012799,0.030897,0.026404,-0.063194,kindergarten teacher thoroughly weary job take...,i am a kindergarten teacher and i am thoroughl...,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416804,0.020868,0.042881,0.011425,0.014421,0.003465,-0.007785,0.026107,0.026876,-0.014553,-0.017963,...,0.014526,0.046698,0.001502,-0.010543,0.000936,0.010221,-0.015694,feel like tell horny devil find site suit sort...,i feel like telling these horny devils to find...,2
416805,-0.002463,0.040540,-0.000846,-0.065937,0.008002,0.049785,-0.106255,0.024187,0.041116,-0.013932,...,-0.025246,-0.001675,-0.012717,-0.025871,0.030419,-0.013384,-0.035171,begin realize feel agitate restless would thin...,i began to realize that when i was feeling agi...,3
416806,0.039020,0.039674,0.004809,-0.031511,-0.064205,-0.000712,-0.017174,0.001122,-0.006511,0.003020,...,0.001483,-0.002141,0.000917,-0.033828,0.012037,0.009011,-0.009031,feel curious previous early dawn time seek tro...,i feel very curious be why previous early dawn...,5
416807,-0.029445,0.039162,-0.025720,-0.017802,0.080966,0.025269,-0.034207,-0.013629,-0.004662,-0.011820,...,0.066382,0.010981,-0.058766,-0.056127,-0.026503,0.008378,0.038368,feel becuase tyranical nature government el sa...,i feel that becuase of the tyranical nature of...,3


In [7]:
embd_data.shape

(416809, 771)

In [8]:
X = embd_data.drop('label', axis=1)
y = embd_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [12]:
# Saving Train and test data for later use in all modeling efforts
train.to_csv('./train_df.csv')
test.to_csv('./test_df.csv')

# 1- Using Embedded data from MPNet model and LLama Lite pre-trained model as classifier for emotion classification

In [9]:
class EmbeddedDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels
    def __len__(self):
        return len(self.embeddings)
    def __getitem__(self, idx):
        embedding=self.embeddings[idx]
        embedding=torch.tensor(embedding, dtype=torch.float32)
        embedding=embedding.unsqueeze(0)
        return embedding, self.labels[idx]

In [10]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [11]:
print('train: ', len(train), '\ntest: ', len(test)) #, '\nval: ', len(val)

train:  312606 
test:  104203


In [12]:
train['label'] = train['label'].astype('category')
test['label'] = test['label'].astype('category')

In [13]:
train.dtypes

c_0                   float64
c_1                   float64
c_2                   float64
c_3                   float64
c_4                   float64
                       ...   
c_766                 float64
c_767                 float64
text_WO_stopwords      object
text                   object
label                category
Length: 771, dtype: object

## Prepare model input for model training and validation

In [14]:
train_embeddings= np.stack(train.iloc[:,:768].values)
train_labels= train['label'].values
train_dataset= EmbeddedDataset(train_embeddings, train_labels)
train_loader= DataLoader(train_dataset, batch_size=128, shuffle=True)

In [15]:
test_embeddings= np.stack(test.iloc[:,:768].values)
test_labels= test['label'].values
test_dataset= EmbeddedDataset(test_embeddings, test_labels)
test_loader= DataLoader(test_dataset, batch_size=128, shuffle=True)

In [16]:
num_labels= len(train['label'].unique())
print(num_labels)

6


In [27]:
from transformers import LlamaForSequenceClassification, AdamW
model_name='skeskinen/llama-lite-134m'
model=LlamaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
optimizer= AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ', device)
model.to(device)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at skeskinen/llama-lite-134m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device:  cuda


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 768, padding_idx=0)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
          (up_proj): Linear(in_features=768, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  

In [28]:
num_epochs=30
best_test_loss=float('inf')
patience=3
patience_counter=0
best_accuracy=-1
for epoch in range(num_epochs):
    model.train()
    total_loss= 0
    for batch in train_loader:
        embeddings=batch[0].to(device)
        labels=batch[1].to(device)
        optimizer.zero_grad()
        outputs= model(inputs_embeds=embeddings, labels=labels)
        loss= outputs.loss
        total_loss+= loss.item()
        
        loss.backward()
        optimizer.step()
    average_loss= total_loss / len(train_loader)
    print (f'Epoch: {epoch+1}, Loss: {average_loss}')
    
    model.eval()
    test_loss= 0
    correct_predictions= 0
    with torch.no_grad():
        for batch in test_loader:
            embeddings=batch[0].to(device)
            labels=batch[1].to(device)
            
            outputs= model(inputs_embeds=embeddings, labels=labels)
            test_loss+= outputs.loss.item()
            
            logits= outputs.logits
            _, predicted_labels= torch.max(logits, dim=1)
            correct_predictions+= torch.sum(predicted_labels==labels)
    average_test_loss = test_loss/len(test_loader)
    accuracy=correct_predictions/len(test_dataset)
    print(f'Test Loss: {average_test_loss}, Accuracy: {accuracy}')
    
    if accuracy > best_accuracy:
        best_accuracy= accuracy
        patience_counter=0
        torch.save({'model.state_dict()': model.state_dict(), 'optimizer.state_dict()': optimizer.state_dict()}, 'model_unbalanced.pt')
        
        torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, model_path) 
        
        patience_counter=0
    else:
        patience_counter+=1
    if patience_counter>=patience:
        print('Early stopping Triggered')
        break

Epoch: 1, Loss: 0.47116596780176606
Test Loss: 0.6818288690473404, Accuracy: 0.750822901725769
Epoch: 2, Loss: 0.25431708495673394
Test Loss: 0.7496792136891488, Accuracy: 0.7564273476600647
Epoch: 3, Loss: 0.1716895796681544
Test Loss: 0.8472883035251699, Accuracy: 0.7544216513633728
Epoch: 4, Loss: 0.13620078459216822
Test Loss: 0.9292120997525432, Accuracy: 0.7511683702468872
Epoch: 5, Loss: 0.12082830793036509
Test Loss: 0.9896049231839327, Accuracy: 0.7497192621231079
Early stopping Triggered


In [20]:
num_labels

6

In [37]:
# using saved best model
model_name='skeskinen/llama-lite-134m'
model1=LlamaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

optimizer =AdamW(model1.parameters(), lr=2e-5, weight_decay=1e-2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

checkpoint = torch.load('model_unbalanced.pt')
optimizer.load_state_dict(checkpoint['optimizer.state_dict()'])
model1.load_state_dict(checkpoint['model.state_dict()'])
model1.to(device)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at skeskinen/llama-lite-134m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 768, padding_idx=0)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
          (up_proj): Linear(in_features=768, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  

In [38]:
# Check the performance of model trained with unbalanced data on test dataset
test_embeddings= np.stack(test.iloc[:,:768].values)
test_labels= test['label'].values
test_dataset= EmbeddedDataset(test_embeddings, test_labels)
test_loader= DataLoader(test_dataset, batch_size=128, shuffle=True)

In [45]:
def evaluate_model(model, data_loader, device, data_df):
    correct_predictions = 0
    data_out_df_list = []
    
    with torch.no_grad():
        for batch in data_loader:
            embeddings = batch[0].to(device)
            labels = batch[1].to(device)
            
            outputs = model(inputs_embeds=embeddings)
            logits = outputs.logits
            _, predicted_labels = torch.max(logits, dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            
            df = pd.DataFrame(embeddings.cpu().numpy().reshape(embeddings.shape[0], embeddings.shape[2]))
            df.columns = ['_' + str(x) for x in df.columns.values]
            df['Actual'] = labels.cpu()
            df['Pred'] = predicted_labels.cpu()
            
            data_out_df_list.append(df)
        
    accuracy = correct_predictions / len(data_loader.dataset)
    print('Accuracy=', accuracy)
        
    data_df_res = pd.concat(data_out_df_list, ignore_index=True)
        
    print('Accuracy:', accuracy_score(data_df_res['Actual'], data_df_res['Pred']))
    print('Balanced Accuracy:', balanced_accuracy_score(data_df_res['Actual'], data_df_res['Pred']))
    print('F1 score:', f1_score(data_df_res['Actual'], data_df_res['Pred'], average='micro'))
    print('Recall:', metrics.recall_score(data_df_res['Actual'], data_df_res['Pred'], average='micro'))
    print('Precision:', metrics.precision_score(data_df_res['Actual'], data_df_res['Pred'], average='micro'))
    
    return data_df_res

In [55]:
test_df_res = evaluate_model(model1, test_loader, device, test )

Accuracy= 0.7564273581374816
Accuracy: 0.7564273581374816
Balanced Accuracy: 0.7196425923862521
F1 score: 0.7564273581374816
Recall: 0.7564273581374816
Precision: 0.7564273581374816


# 2- Balancing train dataset for potential model improvement and use same LLama pre-trained model for fine-tuning and compare the results

In [47]:
class_counts = train['label'].value_counts()
# Sort the counts from smallest to largest
class_counts_sorted = class_counts.sort_values()
class_counts_sorted

5     11229
2     25915
4     35784
3     42988
0     90890
1    105800
Name: label, dtype: int64

In [48]:
X_train=train.iloc[:,:768]
y_train=train['label']
sm=SMOTE(random_state=25, n_jobs=-1, k_neighbors=5)
X_train_sm, y_train_sm= sm.fit_resample(X_train, y_train)

In [49]:
X_train_sm.shape

(634800, 768)

In [50]:
y_train_sm.value_counts()

0    105800
1    105800
2    105800
3    105800
4    105800
5    105800
Name: label, dtype: int64

In [51]:
train_embeddings= np.stack(X_train_sm.iloc[:,:768].values)
train_labels= y_train_sm.values
train_dataset= EmbeddedDataset(train_embeddings, train_labels)
train_loader= DataLoader(train_dataset, batch_size=128, shuffle=True)

In [52]:
test_embeddings= np.stack(test.iloc[:,:768].values)
test_labels= test['label'].values
test_dataset= EmbeddedDataset(test_embeddings, test_labels)
test_loader= DataLoader(test_dataset, batch_size=128, shuffle=True)

In [53]:
num_labels= len(train['label'].unique())
print(num_labels)

6


In [27]:
from transformers import LlamaForSequenceClassification, AdamW
model_name='skeskinen/llama-lite-134m'
model=LlamaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
optimizer= AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ', device)
model.to(device)

config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at skeskinen/llama-lite-134m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device:  cuda


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 768, padding_idx=0)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
          (up_proj): Linear(in_features=768, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
      (1): LlamaDecoderLayer(
        (self_att

In [35]:
train.shape , test.shape

((312606, 771), (104203, 771))

In [40]:
num_epochs=30
best_test_loss=float('inf')
patience=3
patience_counter=0
best_accuracy=-1
for epoch in range(num_epochs):
    model.train()
    total_loss= 0
    for batch in train_loader:
        embeddings=batch[0].to(device)
        labels=batch[1].to(device)
        optimizer.zero_grad()
        outputs= model(inputs_embeds=embeddings, labels=labels)
        loss= outputs.loss
        total_loss+= loss.item()
        
        loss.backward()
        optimizer.step()
    average_loss= total_loss / len(train_loader)
    print (f'Epoch: {epoch+1}, Loss: {average_loss}')
    
    model.eval()
    test_loss= 0
    correct_predictions= 0
    with torch.no_grad():
        for batch in test_loader:
            embeddings=batch[0].to(device)
            labels=batch[1].to(device)
            
            outputs= model(inputs_embeds=embeddings, labels=labels)
            test_loss+= outputs.loss.item()
            
            logits= outputs.logits
            _, predicted_labels= torch.max(logits, dim=1)
            correct_predictions+= torch.sum(predicted_labels==labels)
    average_test_loss = test_loss/len(test_loader)
    accuracy=correct_predictions/len(test_dataset)
    print(f'Test Loss: {average_test_loss}, Accuracy: {accuracy}')
    
    if accuracy > best_accuracy:
        best_accuracy= accuracy
        patience_counter=0
        torch.save({'model.state_dict()': model.state_dict(), 'optimizer.state_dict()': optimizer.state_dict()}, 'model_balanced.pt')
        patience_counter=0
    else:
        patience_counter+=1
    if patience_counter>=patience:
        print('Early stopping Triggered')
        break

Epoch: 1, Loss: 0.6910204766863348
Test Loss: 0.6035168456884981, Accuracy: 0.767338752746582
Epoch: 2, Loss: 0.4778885061117259
Test Loss: 0.5801429836662269, Accuracy: 0.7747953534126282
Epoch: 3, Loss: 0.31834184839205326
Test Loss: 0.6402754418323376, Accuracy: 0.7651699185371399
Epoch: 4, Loss: 0.20605498998503288
Test Loss: 0.7783686681759138, Accuracy: 0.7620317935943604
Epoch: 5, Loss: 0.15999099656915108
Test Loss: 0.8979340281954572, Accuracy: 0.7577421069145203
Early stopping Triggered


In [54]:
# using saved best model
model_name='skeskinen/llama-lite-134m'
model1=LlamaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

optimizer =AdamW(model1.parameters(), lr=2e-5, weight_decay=1e-2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

checkpoint = torch.load('model_Balanced.pt')
optimizer.load_state_dict(checkpoint['optimizer.state_dict()'])
model1.load_state_dict(checkpoint['model.state_dict()'])
model1.to(device)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at skeskinen/llama-lite-134m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 768, padding_idx=0)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
          (up_proj): Linear(in_features=768, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  

In [46]:
test_df_res = evaluate_model(model1, test_loader, device, test )

Accuracy= 0.7747953513814382
Accuracy: 0.7747953513814382
Balanced Accuracy: 0.7143492474497553
F1 score: 0.7747953513814382
Recall: 0.7747953513814382
Precision: 0.7747953513814382


### Balancing data slightly improved the performance of the model, which cause increase in accuracy from 75.6% on epoch 2 to 77.5% again on epoch 2 when using balanced data.