### Data is cleaned and embedded using MPNet model. 

## Data Partitioning

In [None]:
import pandas as pd

In [8]:
embd_data=pd.read_csv('/domino/datasets/local/SCM_SC/text_mpnet_embed_df.csv')

In [9]:
embd_data.shape

(416809, 771)

In [10]:
X = embd_data.drop('label', axis=1)
y = embd_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [12]:
# Saving Train and test data for later use in all modeling efforts
train.to_csv('/domino/datasets/local/SCM_SC/train_df.csv')
test.to_csv('/domino/datasets/local/SCM_SC/test_df.csv')

In [11]:
train.head()

Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_761,c_762,c_763,c_764,c_765,c_766,c_767,text_WO_stopwords,text,label
342462,0.005049,-0.004521,-0.033797,-0.011138,-0.02482,0.018492,-0.01522,0.012329,-0.017906,0.047792,...,0.021197,0.024708,-0.03928,-0.020336,0.022013,-0.060829,-0.017347,feel im torture slowly today,i feel as if im being tortured very slowly today,4
87990,-0.039426,-0.004666,-0.0227,0.006759,-0.026521,0.036558,-0.009668,-0.055998,0.037705,-0.007174,...,-0.043236,0.02076,0.00637,0.007713,-0.046003,-0.000759,-0.012333,ill feel really bad,ill feel really bad,0
381594,0.02758,0.060547,-0.028398,-0.019655,0.0179,-0.02805,-0.067037,0.006156,-0.047099,0.023724,...,0.000482,0.009941,-0.061392,-0.028997,0.022805,0.026034,-0.006839,feel content whatever news tell know rise chal...,i feel content with whatever news we are told ...,1
208217,-0.002705,0.047667,-0.00158,-0.026343,0.01589,0.024639,-0.091406,0.014512,0.029813,0.026739,...,0.031983,-0.009349,-0.035115,0.008799,0.015763,0.038264,-0.037314,love feel invigorate arrive work,i love feeling invigorated when i arrive at work,1
376976,0.014338,-0.047544,0.025093,0.011402,-0.006114,-0.007457,-0.060139,-0.04103,-0.072453,0.019001,...,-0.019103,0.007613,-0.013725,-0.008387,0.000549,0.007308,0.002215,feel acceptable use first many polish get rece...,i feel is acceptable i used the first of many ...,1


In [1]:
!pip install imblearn
!pip install nltk
!pip install transformers
!pip install Sentence_Transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


# 1- Using Embedded data from MPNet model and LLama Lite pre-trained model as classifier for emotion classification

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import nltk
import re
from nltk.corpus import stopwords#, PlainTextCorpusReader
from nltk import word_tokenize, ngrams
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import LlamaTokenizer
from sentence_transformers import SentenceTransformer
from datetime import datetime, date, timedelta
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score, f1_score
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

In [5]:
class EmbeddedDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels
    def __len__(self):
        return len(self.embeddings)
    def __getitem__(self, idx):
        embedding=self.embeddings[idx]
        embedding=torch.tensor(embedding, dtype=torch.float32)
        embedding=embedding.unsqueeze(0)
        return embedding, self.labels[idx]

In [13]:
import os
current_directory = os.getcwd()
print("Current directory:", current_directory)

Current directory: /mnt/code/notebooks


In [13]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [15]:
print('train: ', len(train), '\ntest: ', len(test)) #, '\nval: ', len(val)

train:  312606 
test:  104203


In [16]:
train['label'] = train['label'].astype('category')
test['label'] = test['label'].astype('category')

In [18]:
train.dtypes

c_0                   float64
c_1                   float64
c_2                   float64
c_3                   float64
c_4                   float64
                       ...   
c_766                 float64
c_767                 float64
text_WO_stopwords      object
text                   object
label                category
Length: 771, dtype: object

## Prepare model input for model training and validation

In [36]:
train_embeddings= np.stack(train.iloc[:,:768].values)
train_labels= train['label'].values
train_dataset= EmbeddedDataset(train_embeddings, train_labels)
train_loader= DataLoader(train_dataset, batch_size=128, shuffle=True)

In [38]:
test_embeddings= np.stack(test.iloc[:,:768].values)
test_labels= test['label'].values
test_dataset= EmbeddedDataset(test_embeddings, test_labels)
test_loader= DataLoader(test_dataset, batch_size=128, shuffle=True)

In [26]:
num_labels= len(train['label'].unique())
print(num_labels)

6


In [39]:
from transformers import LlamaForSequenceClassification, AdamW
model_name='skeskinen/llama-lite-134m'
model=LlamaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
optimizer= AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ', device)
model.to(device)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at skeskinen/llama-lite-134m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device:  cuda


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 768, padding_idx=0)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
          (up_proj): Linear(in_features=768, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
      (1): LlamaDecoderLayer(
        (self_att

In [28]:
num_epochs=30
best_test_loss=float('inf')
patience=3
patience_counter=0
best_accuracy=-1
for epoch in range(num_epochs):
    model.train()
    total_loss= 0
    for batch in train_loader:
        embeddings=batch[0].to(device)
        labels=batch[1].to(device)
        optimizer.zero_grad()
        outputs= model(inputs_embeds=embeddings, labels=labels)
        loss= outputs.loss
        total_loss+= loss.item()
        
        loss.backward()
        optimizer.step()
    average_loss= total_loss / len(train_loader)
    print (f'Epoch: {epoch+1}, Loss: {average_loss}')
    
    model.eval()
    test_loss= 0
    correct_predictions= 0
    with torch.no_grad():
        for batch in test_loader:
            embeddings=batch[0].to(device)
            labels=batch[1].to(device)
            
            outputs= model(inputs_embeds=embeddings, labels=labels)
            test_loss+= outputs.loss.item()
            
            logits= outputs.logits
            _, predicted_labels= torch.max(logits, dim=1)
            correct_predictions+= torch.sum(predicted_labels==labels)
    average_test_loss = test_loss/len(test_loader)
    accuracy=correct_predictions/len(test_dataset)
    print(f'Test Loss: {average_test_loss}, Accuracy: {accuracy}')
    
    if accuracy > best_accuracy:
        best_accuracy= accuracy
        patience_counter=0
        torch.save({'model.state_dict()': model.state_dict(), 'optimizer.state_dict()': optimizer.state_dict()}, 'model.pt')
        patience_counter=0
    else:
        patience_counter+=1
    if patience_counter>=patience:
        print('Early stopping Triggered')
        break

Epoch: 1, Loss: 0.47116596780176606
Test Loss: 0.6818288690473404, Accuracy: 0.750822901725769
Epoch: 2, Loss: 0.25431708495673394
Test Loss: 0.7496792136891488, Accuracy: 0.7564273476600647
Epoch: 3, Loss: 0.1716895796681544
Test Loss: 0.8472883035251699, Accuracy: 0.7544216513633728
Epoch: 4, Loss: 0.13620078459216822
Test Loss: 0.9292120997525432, Accuracy: 0.7511683702468872
Epoch: 5, Loss: 0.12082830793036509
Test Loss: 0.9896049231839327, Accuracy: 0.7497192621231079
Early stopping Triggered


# 2- Balancing train dataset for potential model improvement and use same LLama pre-trained model for fine-tuning and compare the results

In [19]:
class_counts = train['label'].value_counts()
# Sort the counts from smallest to largest
class_counts_sorted = class_counts.sort_values()
class_counts_sorted

5     11229
2     25915
4     35784
3     42988
0     90890
1    105800
Name: label, dtype: int64

In [20]:
X_train=train.iloc[:,:768]
y_train=train['label']
sm=SMOTE(random_state=25, n_jobs=-1, k_neighbors=5)
X_train_sm, y_train_sm= sm.fit_resample(X_train, y_train)

In [21]:
X_train_sm.shape

(634800, 768)

In [22]:
y_train_sm.value_counts()

0    105800
1    105800
2    105800
3    105800
4    105800
5    105800
Name: label, dtype: int64

In [23]:
train_embeddings= np.stack(X_train_sm.iloc[:,:768].values)
train_labels= y_train_sm.values
train_dataset= EmbeddedDataset(train_embeddings, train_labels)
train_loader= DataLoader(train_dataset, batch_size=128, shuffle=True)

In [24]:
test_embeddings= np.stack(test.iloc[:,:768].values)
test_labels= test['label'].values
test_dataset= EmbeddedDataset(test_embeddings, test_labels)
test_loader= DataLoader(test_dataset, batch_size=128, shuffle=True)

In [37]:
num_labels= len(train['label'].unique())
print(num_labels)

6


In [27]:
from transformers import LlamaForSequenceClassification, AdamW
model_name='skeskinen/llama-lite-134m'
model=LlamaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
optimizer= AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ', device)
model.to(device)

config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at skeskinen/llama-lite-134m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device:  cuda


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 768, padding_idx=0)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
          (up_proj): Linear(in_features=768, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
      (1): LlamaDecoderLayer(
        (self_att

In [35]:
train.shape , test.shape

((312606, 771), (104203, 771))

In [40]:
num_epochs=30
best_test_loss=float('inf')
patience=3
patience_counter=0
best_accuracy=-1
for epoch in range(num_epochs):
    model.train()
    total_loss= 0
    for batch in train_loader:
        embeddings=batch[0].to(device)
        labels=batch[1].to(device)
        optimizer.zero_grad()
        outputs= model(inputs_embeds=embeddings, labels=labels)
        loss= outputs.loss
        total_loss+= loss.item()
        
        loss.backward()
        optimizer.step()
    average_loss= total_loss / len(train_loader)
    print (f'Epoch: {epoch+1}, Loss: {average_loss}')
    
    model.eval()
    test_loss= 0
    correct_predictions= 0
    with torch.no_grad():
        for batch in test_loader:
            embeddings=batch[0].to(device)
            labels=batch[1].to(device)
            
            outputs= model(inputs_embeds=embeddings, labels=labels)
            test_loss+= outputs.loss.item()
            
            logits= outputs.logits
            _, predicted_labels= torch.max(logits, dim=1)
            correct_predictions+= torch.sum(predicted_labels==labels)
    average_test_loss = test_loss/len(test_loader)
    accuracy=correct_predictions/len(test_dataset)
    print(f'Test Loss: {average_test_loss}, Accuracy: {accuracy}')
    
    if accuracy > best_accuracy:
        best_accuracy= accuracy
        patience_counter=0
        torch.save({'model.state_dict()': model.state_dict(), 'optimizer.state_dict()': optimizer.state_dict()}, 'model_unbalanced.pt')
        patience_counter=0
    else:
        patience_counter+=1
    if patience_counter>=patience:
        print('Early stopping Triggered')
        break

Epoch: 1, Loss: 0.6910204766863348
Test Loss: 0.6035168456884981, Accuracy: 0.767338752746582
Epoch: 2, Loss: 0.4778885061117259
Test Loss: 0.5801429836662269, Accuracy: 0.7747953534126282
Epoch: 3, Loss: 0.31834184839205326
Test Loss: 0.6402754418323376, Accuracy: 0.7651699185371399
Epoch: 4, Loss: 0.20605498998503288
Test Loss: 0.7783686681759138, Accuracy: 0.7620317935943604
Epoch: 5, Loss: 0.15999099656915108
Test Loss: 0.8979340281954572, Accuracy: 0.7577421069145203
Early stopping Triggered


### Balancing data slightly improved the performance of the model, which cause increase in accuracy from 75.6% on epoch 2 to 77.5% again on epoch 2 when using balanced data.

# 3- Another Method Keras tokenizer and Tensorflow model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from keras.preprocessing import sequence
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, BatchNormalization

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')

2024-04-19 23:55:56.076835: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Loading Train and Test clean data which is partitioned and saved for later uses previously.

In [3]:
# importing data with clean text column ("text_WO_stopwords) and it also have embedded features too , just in case we use it for modeling
train=pd.read_csv('/domino/datasets/local/SCM_SC/train_df.csv')
test=pd.read_csv('/domino/datasets/local/SCM_SC/test_df.csv')

In [4]:
#to remove the first column
train = train.drop(train.columns[0], axis=1)
test = test.drop(test.columns[0], axis=1)
test

Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_761,c_762,c_763,c_764,c_765,c_766,c_767,text_WO_stopwords,text,label
0,0.019394,-0.044919,0.000026,0.017203,0.038961,0.066090,-0.049987,-0.011413,0.036748,0.019023,...,-0.000739,0.003633,-0.109465,-0.013429,0.031202,-0.058474,0.030161,rin dahil bakit ako magtatanim ng sama ng loob...,i rin dahil bakit ako magtatanim ng sama ng lo...,3
1,0.021275,-0.005803,-0.015457,-0.018007,-0.004438,0.047791,-0.081813,0.030755,-0.032801,-0.027810,...,-0.031838,0.027748,-0.027941,0.000003,0.014574,-0.039903,0.025485,still feel bite distraught try get,i am still feeling a bit distraught but i have...,4
2,0.029161,0.051610,-0.010187,0.012589,-0.000941,0.013400,-0.051471,0.022667,-0.054751,-0.011609,...,0.027453,-0.002714,0.045774,-0.027437,-0.005116,0.042347,-0.007992,scoop card gift wrap feel smug part group gift...,i scooped up a card and gift wrap feeling smug...,1
3,0.029598,0.035654,0.024396,0.022355,0.020141,0.012857,-0.051097,-0.053518,-0.021370,-0.034440,...,0.049417,-0.009656,-0.074797,-0.043896,0.000957,0.042334,-0.026798,feel important,i feel that is so important,1
4,0.031208,0.063900,-0.033559,0.057609,-0.021166,-0.019349,-0.044515,-0.010385,-0.010897,0.010456,...,0.049952,-0.008722,0.004873,-0.047218,0.007098,0.015758,-0.045528,tell expectation important day would feel spec...,i told him that my expectation for that very i...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104198,0.027633,0.025179,-0.001323,-0.051901,-0.010680,0.006089,-0.028517,-0.030698,0.053657,-0.050739,...,-0.000989,0.003046,0.042012,0.003837,-0.030643,0.071695,-0.016831,feel dirty remember way use think,i feel dirty just remembering the way i used t...,0
104199,-0.044641,0.050187,-0.019373,-0.018147,0.026715,0.011591,-0.039571,0.012069,0.004874,0.020797,...,-0.016689,-0.003244,-0.024648,-0.020703,0.025322,0.099140,-0.021172,feel bless short years,i feel blessed to have had too short years wit...,1
104200,-0.019972,0.094170,0.015813,-0.039249,0.029252,0.026102,0.013029,0.015672,0.072358,0.025073,...,-0.027735,-0.008830,0.016110,-0.014148,0.018982,0.053207,-0.007986,feel intelligent additional years,i feel intelligent for for an additional years,1
104201,0.049425,0.008892,-0.011874,-0.026436,-0.023583,0.035841,-0.031572,0.064218,-0.094790,0.014435,...,-0.028396,-0.013129,0.038700,0.021196,0.055631,-0.029546,0.021097,im feel bite less wimpy still pastel,im feeling a bit less wimpy about them but sti...,4


In [5]:
train = train.fillna('')
test = test.fillna('')
train.reset_index(drop=True)
test.reset_index(drop=True)

Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_761,c_762,c_763,c_764,c_765,c_766,c_767,text_WO_stopwords,text,label
0,0.019394,-0.044919,0.000026,0.017203,0.038961,0.066090,-0.049987,-0.011413,0.036748,0.019023,...,-0.000739,0.003633,-0.109465,-0.013429,0.031202,-0.058474,0.030161,rin dahil bakit ako magtatanim ng sama ng loob...,i rin dahil bakit ako magtatanim ng sama ng lo...,3
1,0.021275,-0.005803,-0.015457,-0.018007,-0.004438,0.047791,-0.081813,0.030755,-0.032801,-0.027810,...,-0.031838,0.027748,-0.027941,0.000003,0.014574,-0.039903,0.025485,still feel bite distraught try get,i am still feeling a bit distraught but i have...,4
2,0.029161,0.051610,-0.010187,0.012589,-0.000941,0.013400,-0.051471,0.022667,-0.054751,-0.011609,...,0.027453,-0.002714,0.045774,-0.027437,-0.005116,0.042347,-0.007992,scoop card gift wrap feel smug part group gift...,i scooped up a card and gift wrap feeling smug...,1
3,0.029598,0.035654,0.024396,0.022355,0.020141,0.012857,-0.051097,-0.053518,-0.021370,-0.034440,...,0.049417,-0.009656,-0.074797,-0.043896,0.000957,0.042334,-0.026798,feel important,i feel that is so important,1
4,0.031208,0.063900,-0.033559,0.057609,-0.021166,-0.019349,-0.044515,-0.010385,-0.010897,0.010456,...,0.049952,-0.008722,0.004873,-0.047218,0.007098,0.015758,-0.045528,tell expectation important day would feel spec...,i told him that my expectation for that very i...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104198,0.027633,0.025179,-0.001323,-0.051901,-0.010680,0.006089,-0.028517,-0.030698,0.053657,-0.050739,...,-0.000989,0.003046,0.042012,0.003837,-0.030643,0.071695,-0.016831,feel dirty remember way use think,i feel dirty just remembering the way i used t...,0
104199,-0.044641,0.050187,-0.019373,-0.018147,0.026715,0.011591,-0.039571,0.012069,0.004874,0.020797,...,-0.016689,-0.003244,-0.024648,-0.020703,0.025322,0.099140,-0.021172,feel bless short years,i feel blessed to have had too short years wit...,1
104200,-0.019972,0.094170,0.015813,-0.039249,0.029252,0.026102,0.013029,0.015672,0.072358,0.025073,...,-0.027735,-0.008830,0.016110,-0.014148,0.018982,0.053207,-0.007986,feel intelligent additional years,i feel intelligent for for an additional years,1
104201,0.049425,0.008892,-0.011874,-0.026436,-0.023583,0.035841,-0.031572,0.064218,-0.094790,0.014435,...,-0.028396,-0.013129,0.038700,0.021196,0.055631,-0.029546,0.021097,im feel bite less wimpy still pastel,im feeling a bit less wimpy about them but sti...,4


In [6]:
train

Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_761,c_762,c_763,c_764,c_765,c_766,c_767,text_WO_stopwords,text,label
0,0.005049,-0.004521,-0.033797,-0.011138,-0.024820,0.018492,-0.015220,0.012329,-0.017906,0.047792,...,0.021197,0.024708,-0.039280,-0.020336,0.022013,-0.060829,-0.017347,feel im torture slowly today,i feel as if im being tortured very slowly today,4
1,-0.039426,-0.004666,-0.022700,0.006759,-0.026521,0.036558,-0.009668,-0.055998,0.037705,-0.007174,...,-0.043236,0.020760,0.006370,0.007713,-0.046003,-0.000759,-0.012333,ill feel really bad,ill feel really bad,0
2,0.027580,0.060547,-0.028398,-0.019655,0.017900,-0.028050,-0.067037,0.006156,-0.047099,0.023724,...,0.000482,0.009941,-0.061392,-0.028997,0.022805,0.026034,-0.006839,feel content whatever news tell know rise chal...,i feel content with whatever news we are told ...,1
3,-0.002705,0.047667,-0.001580,-0.026343,0.015890,0.024639,-0.091406,0.014512,0.029813,0.026739,...,0.031983,-0.009349,-0.035115,0.008799,0.015763,0.038264,-0.037314,love feel invigorate arrive work,i love feeling invigorated when i arrive at work,1
4,0.014338,-0.047544,0.025093,0.011402,-0.006114,-0.007457,-0.060139,-0.041030,-0.072453,0.019001,...,-0.019103,0.007613,-0.013725,-0.008387,0.000549,0.007308,0.002215,feel acceptable use first many polish get rece...,i feel is acceptable i used the first of many ...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312601,-0.020691,-0.014837,-0.003853,-0.028849,-0.007957,0.065683,-0.099857,-0.012396,0.054042,-0.024364,...,-0.040756,0.004264,-0.043533,0.008517,0.032682,0.011600,-0.009431,feel terrible course fantastic,i was feeling terrible but of course she did f...,0
312602,0.012106,0.056657,-0.035532,-0.017904,-0.007871,0.036242,0.011781,0.000313,0.005871,0.013937,...,0.021886,0.003809,0.042900,-0.027610,0.030307,0.061982,-0.047177,give permission feel vulnerable feel anxious p...,i can give myself permission to feel vulnerabl...,4
312603,-0.013450,-0.008782,0.033715,0.015963,-0.009311,0.008958,-0.031408,-0.024701,0.038739,0.009752,...,0.049028,-0.053441,-0.020124,0.018001,0.023213,0.027379,0.033591,think socially inept feel little intimidate pa...,i don t think i m socially inept but i do feel...,4
312604,0.036642,0.034220,0.017764,-0.011373,0.013963,0.034240,-0.005113,-0.037219,0.024395,0.015452,...,0.022900,0.003701,-0.016553,0.006632,0.018382,-0.059728,0.008445,ive try make feel ugly inside,ive tried being against it and it makes me fee...,0


In [7]:
X_train= train['text_WO_stopwords']
y_train= train['label']

X_test= test['text_WO_stopwords']
y_test= test['label']

In [8]:
train[train['text_WO_stopwords'].isnull()]

Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_761,c_762,c_763,c_764,c_765,c_766,c_767,text_WO_stopwords,text,label


Looks like there are some rows in our dataset which are null after cleaning data, therefore we need to replace those rows with  '' before tokenizing it.

In [26]:
train

Unnamed: 0.1,Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,...,c_761,c_762,c_763,c_764,c_765,c_766,c_767,text_WO_stopwords,text,label
0,342462,0.005049,-0.004521,-0.033797,-0.011138,-0.024820,0.018492,-0.015220,0.012329,-0.017906,...,0.021197,0.024708,-0.039280,-0.020336,0.022013,-0.060829,-0.017347,feel im torture slowly today,i feel as if im being tortured very slowly today,4
1,87990,-0.039426,-0.004666,-0.022700,0.006759,-0.026521,0.036558,-0.009668,-0.055998,0.037705,...,-0.043236,0.020760,0.006370,0.007713,-0.046003,-0.000759,-0.012333,ill feel really bad,ill feel really bad,0
2,381594,0.027580,0.060547,-0.028398,-0.019655,0.017900,-0.028050,-0.067037,0.006156,-0.047099,...,0.000482,0.009941,-0.061392,-0.028997,0.022805,0.026034,-0.006839,feel content whatever news tell know rise chal...,i feel content with whatever news we are told ...,1
3,208217,-0.002705,0.047667,-0.001580,-0.026343,0.015890,0.024639,-0.091406,0.014512,0.029813,...,0.031983,-0.009349,-0.035115,0.008799,0.015763,0.038264,-0.037314,love feel invigorate arrive work,i love feeling invigorated when i arrive at work,1
4,376976,0.014338,-0.047544,0.025093,0.011402,-0.006114,-0.007457,-0.060139,-0.041030,-0.072453,...,-0.019103,0.007613,-0.013725,-0.008387,0.000549,0.007308,0.002215,feel acceptable use first many polish get rece...,i feel is acceptable i used the first of many ...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312601,70499,-0.020691,-0.014837,-0.003853,-0.028849,-0.007957,0.065683,-0.099857,-0.012396,0.054042,...,-0.040756,0.004264,-0.043533,0.008517,0.032682,0.011600,-0.009431,feel terrible course fantastic,i was feeling terrible but of course she did f...,0
312602,132865,0.012106,0.056657,-0.035532,-0.017904,-0.007871,0.036242,0.011781,0.000313,0.005871,...,0.021886,0.003809,0.042900,-0.027610,0.030307,0.061982,-0.047177,give permission feel vulnerable feel anxious p...,i can give myself permission to feel vulnerabl...,4
312603,133735,-0.013450,-0.008782,0.033715,0.015963,-0.009311,0.008958,-0.031408,-0.024701,0.038739,...,0.049028,-0.053441,-0.020124,0.018001,0.023213,0.027379,0.033591,think socially inept feel little intimidate pa...,i don t think i m socially inept but i do feel...,4
312604,403684,0.036642,0.034220,0.017764,-0.011373,0.013963,0.034240,-0.005113,-0.037219,0.024395,...,0.022900,0.003701,-0.016553,0.006632,0.018382,-0.059728,0.008445,ive try make feel ugly inside,ive tried being against it and it makes me fee...,0


The Tokenizer class in TensorFlow's Keras API is used for converting text documents into tokenized sequences, which can then be used for training machine learning models, particularly neural networks. 

In [9]:
tkn = Tokenizer(num_words=500000)
tkn.fit_on_texts(X_train)
tkn.fit_on_texts(X_test)
sq_X_train = tkn.texts_to_sequences(X_train)
sq_X_test = tkn.texts_to_sequences(X_test)

size = max(len(tokens) for tokens in sq_X_train)
X_train_pd = pad_sequences(sq_X_train, padding='post', maxlen=size)
X_test_pd = pad_sequences(sq_X_test, padding='post', maxlen=size)

print("The maximu size of the squence train:", size)

The maximu size of the squence train: 48


In [10]:
X_train_pd.shape

(312606, 48)

In [12]:
input_shape = np.max(X_train_pd) + 1
input_shape

# defining a sequential model 
model = Sequential([
    Embedding(input_dim=input_shape, output_dim=1000,input_shape=(size,)), #, input_length=size
    GRU(units=128),
    BatchNormalization(),
    Dropout(0.5),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=6, activation='softmax') 
])

model.compile(optimizer=Adam(learning_rate=0.001), metrics=['accuracy'], loss=SparseCategoricalCrossentropy())

print(model.summary())

2024-04-20 00:01:16.008056: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 48, 1000)          56534000  
                                                                 
 gru (GRU)                   (None, 128)               433920    
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                        

2024-04-20 00:01:16.281704: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-20 00:01:16.283211: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-20 00:01:16.284319: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model training

In [None]:
model_training_history= model.fit(X_train_pd, y_train, epochs=10, batch_size=64, validation_data=(X_test_pd, y_test),callbacks=[EarlyStopping(patience=5)])

Epoch 1/10


2024-04-20 00:01:49.816350: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-20 00:01:49.818037: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-20 00:01:49.819247: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

 275/4885 [>.............................] - ETA: 1:10:26 - loss: 1.6041 - accuracy: 0.3323

In [None]:
background_color = '#5fa1bc'
plt.style.use('seaborn-dark')
plt.rcParams['axes.facecolor'] = background_color

# Get the epoch with the highest validation accuracy
best_epoch = history.history['val_accuracy'].index(max(history.history['val_accuracy'])) + 1

# Create a new figure
plt.figure(figsize=(10, 6))

# Plot training and validation accuracy
plt.plot(history.history['accuracy'], label='Training Accuracy', color='blue')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='red')
plt.scatter(best_epoch - 1, history.history['val_accuracy'][best_epoch - 1], color='green', label=f'Best Epoch: {best_epoch}')

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()


# 4- Another Method Keras tokenizer and Tensorflow model with balanced data