In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
import os
import shutil
# from typing_extensions import TypeAliasType

try:
    sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
except:
    sys.path.append(os.path.join(os.getcwd(), '../'))

In [None]:
import re
import nltk
from nltk.corpus import sentiwordnet as swn
from collections import Counter
from sklearn import feature_extraction

In [None]:
nltk.download('punkt')
nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
import numpy as np

import pickle
from collections import Counter
from tqdm import tqdm

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer


# from src import data

np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7ba590108030>

In [None]:
# train_file = '/content/drive/MyDrive/NLP_Project/IIITH_Codemixed.txt'
train_file = '/content/drive/MyDrive/nlp_project/HIT-ACL2021-Codemixed-Representation/data/hindi_sentiment/IIITH_Codemixed.txt'
df = pd.read_csv(train_file, sep='\t', header=None, usecols=[1,2])
df.columns = ['text', 'category']

In [None]:
sentences = []
with open(train_file, 'r', encoding='utf-8') as file:
    for line in file:
      parts = line.strip().split('\t')
      if len(parts) > 0:
            sentences.append(parts[1])

In [None]:
capital_words_count = []
for sentence in sentences:
    capital_words = re.findall(r'\b[A-Z]+\b', sentence)
    capital_words_count.append(len(capital_words))

In [None]:
extended_words_count = []
for sentence in sentences:
    extended_words = re.findall(r'\b\w*([a-zA-Z])\1\w*\b', sentence)
    # print(extended_words)
    extended_words_count.append(len(extended_words))

In [None]:
exclamation_at_end = []
for sentence in sentences:
    exclamation_at_end.append(int(sentence.endswith('!')))

In [None]:
repeated_punctuation_count = []

punctuation_pattern = r'(\W)\1+'

for sentence in sentences:
    repeated_punctuation = re.findall(punctuation_pattern, sentence)
    repeated_punctuation_count.append(len(repeated_punctuation))

print(repeated_punctuation_count)

[1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 2, 1, 1, 0, 5, 0, 0, 0, 0, 0, 1, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 5, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 

In [None]:
def get_sentiment_scores(word_list):
    sentiment_scores_pos = sentiment_scores_neg = 0
    for word in word_list:
        synsets = list(swn.senti_synsets(word))
        if synsets:
            pos_score = synsets[0].pos_score()
            neg_score = synsets[0].neg_score()
            sentiment_scores_pos += pos_score
            sentiment_scores_neg += neg_score
    return sentiment_scores_pos,sentiment_scores_neg

In [None]:
pos_sentiment_score = []
neg_sentiment_score = []

punctuation_pattern = r'(\W)\1+'

for sentence in sentences:
    pos_sentiment,negetive_sentiment = get_sentiment_scores(sentence.split(' '))
    # print(pos_sentiment)
    pos_sentiment_score.append((pos_sentiment))
    neg_sentiment_score.append((negetive_sentiment))


print(pos_sentiment_score)
print(neg_sentiment_score)

[0.0, 1.375, 0.0, 0, 0.0, 0, 1.0, 1.0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.375, 0.5, 1.375, 0.0, 0.375, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.75, 0.625, 0.75, 0.5, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.375, 0.0, 0.0, 0, 0.25, 0.375, 0.0, 0.0, 0.5, 0.0, 0.0, 0.75, 0.0, 0.625, 0.0, 0.5, 0.0, 0.0, 0.25, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.375, 0, 0.25, 0.5, 0, 0.0, 0.0, 0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.625, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.375, 0.0, 0.5, 0, 0.625, 0, 0.25, 0.75, 1.125, 0, 1.125, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.5, 0.25, 0.875, 0.375, 1.0, 0.0, 0.5, 0.0, 0.5, 0.875, 0.0, 0.0, 0.0, 1.125, 0.0, 0.0, 0.25, 0.0, 0.25, 0.5, 1.0, 0.0, 0.0, 0.625, 0.0, 0.375, 0.0, 0.25, 0.0, 0.0, 0.125, 0.0, 0.0, 0, 0.0, 0.375, 0.5, 0.5, 0.0, 0.25, 0.0, 0.0, 0.0, 0, 0.875, 0.375, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.25, 0.25, 0.0, 0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.625, 0.0, 0.0, 0.

In [None]:
df['Capital_Words_Count']=capital_words_count
df['Extended_Words_Count']= extended_words_count
df['Exclamation_at_End']= exclamation_at_end
df['Repeated_Punctuation_Count']= repeated_punctuation_count
df['Sentiment_Scores_positive']= pos_sentiment_score
df['Sentiment_Scores_negetive']= neg_sentiment_score

In [None]:
df.head()

Unnamed: 0,text,category,Capital_Words_Count,Extended_Words_Count,Exclamation_at_End,Repeated_Punctuation_Count,Sentiment_Scores_positive,Sentiment_Scores_negetive
0,Ye song nahi hi Ye MODI Ji ka mehnat ka rang h...,Positive,1,0,0,1,0.0,0.25
1,Love u sir love u soo much urs I'ts beautyful ...,Positive,1,1,0,1,1.375,0.125
2,Arae sur jee pahelae hamare bharat ke bachho k...,Neutral,0,4,0,1,0.0,0.0
3,Wah! Jitni sundar geet ke bhao hain utnihi sun...,Positive,0,3,0,0,0.0,0.0
4,Sundar ekdam sahi Gaya Hua gana.chhotisi gudiy...,Positive,0,1,0,0,0.0,0.0


In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(df.text):
    break

test_df = df.iloc[test_index]
kf2 = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, val_index in kf2.split(df.iloc[train_index].text):
    break

val_df = df.iloc[val_index]
train_df = df.iloc[train_index]

In [None]:
train_df.shape, val_df.shape, test_df.shape

((2792, 8), (311, 8), (776, 8))

In [None]:
def clean_tweets(text):
    text = text.lower()
    text = re.sub(r'@\w+','',text)
    text = re.sub(r'http\w+','',text)
    text = re.sub(r'#\w+','',text)
    text = re.sub(r'\d+','',text)
    return text.strip()

In [None]:
train_df.text = train_df.text.apply(lambda x: clean_tweets(x))
val_df.text = val_df.text.apply(lambda x: clean_tweets(x))
test_df.text = test_df.text.apply(lambda x: clean_tweets(x))

print(train_df[:5])

                                                text  category  \
1  love u sir love u soo much urs i'ts beautyful ...  Positive   
2  arae sur jee pahelae hamare bharat ke bachho k...   Neutral   
3  wah! jitni sundar geet ke bhao hain utnihi sun...  Positive   
4  sundar ekdam sahi gaya hua gana.chhotisi gudiy...  Positive   
5                                  wao lata mangekar  Positive   

   Capital_Words_Count  Extended_Words_Count  Exclamation_at_End  \
1                    1                     1                   0   
2                    0                     4                   0   
3                    0                     3                   0   
4                    0                     1                   0   
5                    0                     0                   0   

   Repeated_Punctuation_Count  Sentiment_Scores_positive  \
1                           1                      1.375   
2                           1                      0.000   
3             

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.text = train_df.text.apply(lambda x: clean_tweets(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.text = val_df.text.apply(lambda x: clean_tweets(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.text = test_df.text.apply(lambda x: clean_tweets(x))


In [None]:
print(len(train_df['text']), len(val_df['text']), len(test_df['text']))

2792 311 776


In [None]:
def _get_unique(elems):
    if type(elems[0]) == list:
        corpus = flatten(elems)
    else:
        corpus = elems
    elems, freqs = zip(*Counter(corpus).most_common())
    return list(elems)


def convert_categorical_label_to_int(labels):
    if type(labels[0]) == list:
        uniq_labels = _get_unique(flatten(labels))
    else:
        uniq_labels = _get_unique(labels)


    if type(labels[0]) == list:
        label_to_id = {w:i+1 for i,w in enumerate(uniq_labels)}
    else:
        label_to_id = {w:i for i,w in enumerate(uniq_labels)}

    new_labels = []
    if type(labels[0]) == list:
        for i in labels:
            new_labels.append([label_to_id[j] for j in i])
    else:
        new_labels = [label_to_id[j] for j in labels]

    return new_labels, label_to_id

In [None]:
train_df.head()

Unnamed: 0,text,category,Capital_Words_Count,Extended_Words_Count,Exclamation_at_End,Repeated_Punctuation_Count,Sentiment_Scores_positive,Sentiment_Scores_negetive
1,love u sir love u soo much urs i'ts beautyful ...,Positive,1,1,0,1,1.375,0.125
2,arae sur jee pahelae hamare bharat ke bachho k...,Neutral,0,4,0,1,0.0,0.0
3,wah! jitni sundar geet ke bhao hain utnihi sun...,Positive,0,3,0,0,0.0,0.0
4,sundar ekdam sahi gaya hua gana.chhotisi gudiy...,Positive,0,1,0,0,0.0,0.0
5,wao lata mangekar,Positive,0,0,0,0,0.0,0.0


In [None]:
train_df.category, label2idx = convert_categorical_label_to_int(train_df.category.values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.category, label2idx = convert_categorical_label_to_int(train_df.category.values)


In [None]:
val_df.category, _ = convert_categorical_label_to_int(val_df.category.values)
test_df.category, _ = convert_categorical_label_to_int(test_df.category.values)

print(label2idx)

print(train_df.category[:10])

{'Neutral': 0, 'Positive': 1, 'Negative': 2}
1     1
2     0
3     1
4     1
5     1
6     1
7     1
8     1
9     2
10    0
Name: category, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.category, _ = convert_categorical_label_to_int(val_df.category.values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.category, _ = convert_categorical_label_to_int(test_df.category.values)


In [None]:
idx2label = {i:w for (w, i) in label2idx.items()}
print(idx2label)

{0: 'Neutral', 1: 'Positive', 2: 'Negative'}


### Bangla Emb

In [None]:
vocab,embeddings = [],[]
with open('/content/drive/MyDrive/nlp_project/custom_embedding.txt','rt') as fi:
    full_content = fi.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    vocab.append(i_word)
    embeddings.append(i_embeddings)

In [None]:
vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)

In [None]:
len(vocab_npa), print(vocab_npa[:10])

['<unk>' '<s>' '</s>' 's' '▁the' '▁' '▁to' 'e' '▁i' '▁you']


(7000, None)

In [None]:
embs_npa.shape

(7000, 128)

In [None]:
vocab_npa = np.insert(vocab_npa, 0, '<pad>')
# vocab_npa = np.insert(vocab_npa, 1, '[UNK]')
print(vocab_npa[:10])

pad_emb_npa = np.zeros((1,embs_npa.shape[1]))   #embedding for '<pad>' token.
# unk_emb_npa = np.mean(embs_npa,axis=0,keepdims=True)    #embedding for '<unk>' token.

#insert embeddings for pad and unk tokens at top of embs_npa.
# embs_npa = np.vstack((pad_emb_npa,unk_emb_npa,embs_npa))
embs_npa = np.vstack((pad_emb_npa,embs_npa))
print(embs_npa.shape)

['<pad>' '<unk>' '<s>' '</s>' 's' '▁the' '▁' '▁to' 'e' '▁i']
(7001, 128)


### Continue

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199772 sha256=6a231aed45204ca23ca4d90a7d9d72942b308bc97c44ff824934e5293db6df75
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:
import fasttext

In [None]:
df.text = df.text.apply(lambda x: clean_tweets(x))
df.head()

Unnamed: 0,text,category,Capital_Words_Count,Extended_Words_Count,Exclamation_at_End,Repeated_Punctuation_Count,Sentiment_Scores_positive,Sentiment_Scores_negetive
0,ye song nahi hi ye modi ji ka mehnat ka rang h...,Positive,1,0,0,1,0.0,0.25
1,love u sir love u soo much urs i'ts beautyful ...,Positive,1,1,0,1,1.375,0.125
2,arae sur jee pahelae hamare bharat ke bachho k...,Neutral,0,4,0,1,0.0,0.0
3,wah! jitni sundar geet ke bhao hain utnihi sun...,Positive,0,3,0,0,0.0,0.0
4,sundar ekdam sahi gaya hua gana.chhotisi gudiy...,Positive,0,1,0,0,0.0,0.0


In [None]:
df.text.to_csv(r'data.txt', header=None, index=None, sep=' ', mode='a')

In [None]:
# t = "__label__"+df.category.str.lower()+" "+df.text
# t.to_csv(r'supervised.txt', header=None, index=None, sep=' ', mode='a')

In [None]:
fastText_model = fasttext.train_unsupervised('data.txt', model='skipgram', dim=300, minn=2, maxn=10)

In [None]:
fastText_model['the'].shape

(300,)

In [None]:
# model = fasttext.train_supervised('supervised.txt')

In [None]:
# model.predict("Which baking dish is best to bake a banana bread ?")

In [None]:
fastText_model.get_input_matrix().shape

(2001189, 300)

In [None]:
from gensim.models import FastText
from gensim.utils import simple_preprocess


fastText_model = None
#if not os.path.exists('fastText_model.model'):
tokenized = [simple_preprocess(text) for text in train_df['text']]

embedding_size = 300
fastText_model = FastText(sentences=tokenized, vector_size=embedding_size, window=3, min_count=2, sg=1)

fastText_model.save('fastText_model.model')
"""else:
    fastText_model = FastText.load('fastText_model.model')"""

KeyboardInterrupt: ignored

In [None]:
print(len(fastText_model.wv), len(fastText_model.wv[0]))

2116 300


In [None]:
def text_to_sequence(text, fastText_model, max_seq_len=40, embedding_size=300):
    tokens = simple_preprocess(text)
    vectors = []
    for token in text:
        if token in fastText_model:
            vectors.append(fastText_model[token])
        else:
            vectors.append(np.zeros(embedding_size))

    if len(vectors) < max_seq_len:
        padding = [np.zeros(embedding_size)] * (max_seq_len - len(vectors))
        vectors += padding
    else:
        vectors = vectors[:max_seq_len]

    emb = np.array(vectors)
    seq=  [
        word2idxLower.get(word, word2idxLower.get('<unk>'))
        for word in text
    ]
    if len(seq) < max_seq_len:
        padding = [0] * (max_seq_len - len(seq))
        seq += padding
    else:
        seq = seq[:max_seq_len]
    return emb, seq

def generate_subword_embeddings(df, fastText_model, max_seq_len=40):
    embeddings = []
    embeddings_bng = []
    for text in df['text']:
        emb, emb_bng = text_to_sequence(text, fastText_model, max_seq_len)
        embeddings.append(emb)
        embeddings_bng.append(emb_bng)


    return np.array(embeddings),np.array(embeddings_bng), np.array(df.drop(['text', 'category'],axis=1))

In [None]:
def text_to_sequence(text, fastText_model, max_seq_len=105, embedding_size=300):
    tokens = simple_preprocess(text)
    vectors = []
    for token in tokens:
        if token in fastText_model.wv:
            vectors.append(fastText_model.wv[token])
        else:
            vectors.append(np.zeros(embedding_size))

    if len(vectors) < max_seq_len:
        padding = [np.zeros(embedding_size)] * (max_seq_len - len(vectors))
        vectors += padding
    else:
        vectors = vectors[:max_seq_len]

    emb = np.array(vectors)
    seq=  [
        word2idxLower.get(word, word2idxLower.get('<unk>'))
        for word in tokens
    ]
    if len(seq) < max_seq_len:
        padding = [0] * (max_seq_len - len(seq))
        seq += padding
    else:
        seq = seq[:max_seq_len]
    return emb,seq

def generate_subword_embeddings(df, fastText_model, max_seq_len=105):
    embeddings = []
    embeddings_bng = []
    for text in df['text']:
        emb,emb_bng = text_to_sequence(text, fastText_model, max_seq_len)
        embeddings.append(emb)
        embeddings_bng.append(emb_bng)


    return np.array(embeddings),np.array(embeddings_bng), np.array(df.drop(['text', 'category'],axis=1))

In [None]:
word2idxLower = {
    word: index
    for index, word in enumerate(vocab_npa)
}

# train_df = (
#     train_df
#     .map(lambda x: {
#             'bang_emb': [
#                 word2idxLower.get(simple_preprocess(text), word2idxLower['[UNK]'])
#                 for text in x['text']
#             ]
#         }
#     )
# )
# def pad_sequence(sequence, max_length, padding_value=0):
#     return sequence + [padding_value] * (max_length - len(sequence))

# def map_text_to_embeddings(text):
#     tokens = simple_preprocess(text)
#     seq=  [
#         word2idxLower.get(word, word2idxLower.get('[UNK]'))
#         for word in tokens
#     ]
#     return pad_sequence(seq,105)

# def text_to_sequence(text):
#     tokens = simple_preprocess(text)
#     max_seq_len=105
#     vectors = []
#     for token in tokens:
#         if token in fastText_model.wv:
#             vectors.append(fastText_model.wv[token])
#         else:
#             vectors.append(np.zeros(embedding_size))

#     if len(vectors) < max_seq_len:
#         padding = [np.zeros(embedding_size)] * (max_seq_len - len(vectors))
#         vectors += padding
#     else:
#         vectors = vectors[:max_seq_len]

#     return np.array(vectors)

# Apply the function to each row in the DataFrame
# train_df['bng_emb'] = train_df['text'].apply(map_text_to_embeddings)
# val_df['bng_emb'] = val_df['text'].apply(map_text_to_embeddings)
# test_df['bng_emb'] = test_df['text'].apply(map_text_to_embeddings)

# max_seq_length = train_df['bng_emb'].apply(len).max()

# train_df['emb'] = train_df['text'].apply(text_to_sequence)
# val_df['emb'] = val_df['text'].apply(text_to_sequence)
# test_df['emb'] = test_df['text'].apply(text_to_sequence)

In [None]:
train_embeddings,train_embeddings_bng, train_fn = generate_subword_embeddings(train_df, fastText_model)

In [None]:
train_embeddings.shape

(2792, 40, 300)

In [None]:
train_embeddings_bng

array([[103,  40, 496, ...,   8,  17, 116],
       [ 17,  66,  17, ...,   1, 171,  17],
       [363,  17,  89, ...,  32,  27,  11],
       ...,
       [ 43,  11,   4, ...,   0,   0,   0],
       [  4,  17, 103, ..., 171,   1, 176],
       [  4,  17, 103, ..., 103, 218,   1]])

In [None]:
val_embeddings,val_embeddings_bng, val_fn = generate_subword_embeddings(val_df, fastText_model)
test_embeddings ,test_embeddings_bng, test_fn = generate_subword_embeddings(test_df, fastText_model)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

train_embeddings = torch.Tensor(train_embeddings)
train_embeddings_bng = torch.Tensor(train_embeddings_bng)
train_fn = torch.Tensor(train_fn)
train_labels = torch.Tensor(train_df['category'].values)

val_embeddings = torch.Tensor(val_embeddings)
val_embeddings_bng = torch.Tensor(val_embeddings_bng)
val_fn = torch.Tensor(val_fn)
val_labels = torch.Tensor(val_df['category'].values)

test_embeddings = torch.Tensor(test_embeddings)
test_embeddings_bng = torch.Tensor(test_embeddings_bng)
test_fn = torch.Tensor(test_fn)
test_labels = torch.Tensor(test_df['category'].values)


In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

In [None]:
batch_size = 64
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

train_data = TensorDataset(train_embeddings, train_embeddings_bng , train_fn, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

val_data = TensorDataset(val_embeddings, val_embeddings_bng , val_fn, val_labels)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

test_data = TensorDataset(test_embeddings, test_embeddings_bng , test_fn, test_labels)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
from sklearn.metrics import accuracy_score
import copy

def train_with_early_stopping(model, train_loader=train_loader, val_loader=val_loader, num_epochs=50, patience=10):
    best_val_loss = float('inf')
    best_val_f1 = 0
    current_patience = 0
    best_model = None

    for epoch in tqdm(range(num_epochs)):
        model.train()
        total_loss = 0

        for inputs, inputs_bng, inputs_fn, labels in train_loader:
            inputs,inputs_bng, inputs_fn, labels = inputs.to(device),inputs_bng.to(device), inputs_fn.to(device), labels.to(device)
            optimizer.zero_grad()

            out = model(inputs,inputs_bng, inputs_fn)
            loss = criterion(out, labels.long())
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        mean_loss = total_loss / len(train_loader)

        model.eval()
        val_loss = 0
        val_predictions = []
        val_targets = []
        correct, total = 0, 0

        with torch.no_grad():
            for inputs,inputs_bng, inputs_fn, labels in val_loader:
                inputs,inputs_bng, inputs_fn, labels = inputs.to(device),inputs_bng.to(device), inputs_fn.to(device), labels.to(device)
                out = model(inputs,inputs_bng, inputs_fn)

                loss = criterion(out, labels.long())
                val_loss += loss.item()

                _, predicted = torch.max(out, 1)

                val_predictions.extend(predicted.cpu().long().numpy())
                val_targets.extend(labels.cpu().numpy())

                # print(len(val_targets), val_targets[-10:])
                # print(len(val_predictions), val_predictions[-10:])

        # val_accuracy = accuracy_score(val_targets, val_predictions)
        val_loss /= len(val_loader)
        val_w_f1 = f1_score(val_targets, val_predictions, average='weighted')

        print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {mean_loss}, Validation Loss: {val_loss}, Validation F1: {val_w_f1}')

        if val_w_f1 > best_val_f1:
            best_val_f1 = val_w_f1
            current_patience = 0
            best_model = copy.deepcopy(model)
        else:
            current_patience += 1

        if current_patience >= patience:
            print(f'Stopping after {epoch+1} epochs')
            break

    return best_model

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report

def test_metrics(model, test_loader):
    model.eval()
    test_predictions = []
    test_targets = []
    with torch.no_grad():
        for inputs,inputs_bng, inputs_fn, labels in test_loader:
            inputs,inputs_bng, inputs_fn, labels = inputs.to(device),inputs_bng.to(device), inputs_fn.to(device), labels.to(device)
            out = model(inputs,inputs_bng, inputs_fn)

            _, predicted = torch.max(out, 1)
            test_predictions.extend(predicted.cpu().long().numpy())
            test_targets.extend(labels.cpu().numpy())

    print(len(test_targets), len(test_predictions))

    test_acc = accuracy_score(test_targets, test_predictions)
    test_w_f1 = f1_score(test_targets, test_predictions, average='weighted')
    test_macro_f1 = f1_score(test_targets, test_predictions, average='macro')
    class_report = classification_report(test_targets, test_predictions)

    print('Accuracy:', test_acc)
    print('F1-Weighted:', test_w_f1)
    print('F1-Macro:', test_macro_f1)
    print('Classification Report:', class_report, sep='\n')

In [None]:
from sklearn.metrics import accuracy_score
import copy

def maori_train_with_early_stopping(model, train_loader=train_loader, val_loader=val_loader, num_epochs=50, patience=10):
    best_val_loss = float('inf')
    best_val_f1 = 0
    current_patience = 0
    best_model = None

    for epoch in tqdm(range(num_epochs)):
        model.train()
        total_loss = 0

        for inputs,inputs_bng, inputs_fn, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            out = model(inputs)
            loss = criterion(out, labels.long())
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        mean_loss = total_loss / len(train_loader)

        model.eval()
        val_loss = 0
        val_predictions = []
        val_targets = []
        correct, total = 0, 0

        with torch.no_grad():
            for inputs,inputs_bng, inputs_fn, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                out = model(inputs)

                loss = criterion(out, labels.long())
                val_loss += loss.item()

                _, predicted = torch.max(out, 1)

                val_predictions.extend(predicted.cpu().long().numpy())
                val_targets.extend(labels.cpu().numpy())

                # print(len(val_targets), val_targets[-10:])
                # print(len(val_predictions), val_predictions[-10:])

        # val_accuracy = accuracy_score(val_targets, val_predictions)
        val_loss /= len(val_loader)
        val_w_f1 = f1_score(val_targets, val_predictions, average='weighted')

        print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {mean_loss}, Validation Loss: {val_loss}, Validation F1: {val_w_f1}')

        if val_w_f1 > best_val_f1:
            best_val_f1 = val_w_f1
            current_patience = 0
            best_model = copy.deepcopy(model)
        else:
            current_patience += 1

        if current_patience >= patience:
            print(f'Stopping after {epoch+1} epochs')
            break

    return best_model

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report

def maori_test_metrics(model, test_loader):
    model.eval()
    test_predictions = []
    test_targets = []
    with torch.no_grad():
        for inputs,inputs_bng, inputs_fn, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            out = model(inputs)

            _, predicted = torch.max(out, 1)
            test_predictions.extend(predicted.cpu().long().numpy())
            test_targets.extend(labels.cpu().numpy())

    print(len(test_targets), len(test_predictions))

    test_acc = accuracy_score(test_targets, test_predictions)
    test_w_f1 = f1_score(test_targets, test_predictions, average='weighted')
    test_macro_f1 = f1_score(test_targets, test_predictions, average='macro')
    class_report = classification_report(test_targets, test_predictions)

    print('Accuracy:', test_acc)
    print('F1-Weighted:', test_w_f1)
    print('F1-Macro:', test_macro_f1)
    print('Classification Report:', class_report, sep='\n')

In [None]:
print(model)

Maori(
  (embedding): Embedding(7002, 128)
  (lstm): LSTM(300, 12, batch_first=True, bidirectional=True)
  (drop): Dropout(p=0.0, inplace=False)
  (linear): Linear(in_features=24, out_features=3, bias=True)
  (softmax): Softmax(dim=0)
)


In [None]:
maori_train_with_early_stopping(model, num_epochs=50)

  2%|▏         | 1/50 [00:00<00:24,  2.00it/s]

Epoch [1/50], Train Loss: 1.065773897821253, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


  4%|▍         | 2/50 [00:00<00:23,  2.06it/s]

Epoch [2/50], Train Loss: 1.0659996953877535, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


  6%|▌         | 3/50 [00:01<00:22,  2.10it/s]

Epoch [3/50], Train Loss: 1.0659996910528704, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


  8%|▊         | 4/50 [00:01<00:21,  2.13it/s]

Epoch [4/50], Train Loss: 1.0661010438745673, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


 10%|█         | 5/50 [00:02<00:21,  2.12it/s]

Epoch [5/50], Train Loss: 1.0661010427908464, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


 12%|█▏        | 6/50 [00:02<00:20,  2.14it/s]

Epoch [6/50], Train Loss: 1.0659996997226369, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


 14%|█▍        | 7/50 [00:03<00:19,  2.16it/s]

Epoch [7/50], Train Loss: 1.065999687801708, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


 16%|█▌        | 8/50 [00:03<00:19,  2.16it/s]

Epoch [8/50], Train Loss: 1.0661010482094504, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


 18%|█▊        | 9/50 [00:04<00:18,  2.16it/s]

Epoch [9/50], Train Loss: 1.0659996943040329, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


 20%|██        | 10/50 [00:04<00:18,  2.16it/s]

Epoch [10/50], Train Loss: 1.0654467636888678, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894


 20%|██        | 10/50 [00:05<00:20,  1.95it/s]

Epoch [11/50], Train Loss: 1.0657739086584612, Validation Loss: 1.060996515410287, Validation F1: 0.35065572620800894
Stopping after 11 epochs





Maori(
  (embedding): Embedding(7002, 128)
  (lstm): LSTM(300, 12, batch_first=True, bidirectional=True)
  (drop): Dropout(p=0.0, inplace=False)
  (linear): Linear(in_features=24, out_features=3, bias=True)
  (softmax): Softmax(dim=0)
)

In [None]:
maori_test_metrics(model, val_loader)

194 194
Accuracy: 0.5154639175257731
F1-Weighted: 0.35065572620800894
F1-Macro: 0.22675736961451246
Classification Report:
              precision    recall  f1-score   support

         0.0       0.52      1.00      0.68       100
         1.0       0.00      0.00      0.00        51
         2.0       0.00      0.00      0.00        43

    accuracy                           0.52       194
   macro avg       0.17      0.33      0.23       194
weighted avg       0.27      0.52      0.35       194



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
maori_test_metrics(model, test_loader)

194 194
Accuracy: 0.5567010309278351
F1-Weighted: 0.39817027377619985
F1-Macro: 0.23841059602649006
Classification Report:
              precision    recall  f1-score   support

         0.0       0.56      1.00      0.72       108
         1.0       0.00      0.00      0.00        47
         2.0       0.00      0.00      0.00        39

    accuracy                           0.56       194
   macro avg       0.19      0.33      0.24       194
weighted avg       0.31      0.56      0.40       194



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
class Maori(nn.Module): #Ignore
    def __init__(self, input_size=300, hidden_size=128, out_size=3, dropout_prob=0.5):
        super(Maori, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float())
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float(), freeze=True)
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(dropout_prob)
        self.linear = nn.Linear(2*hidden_size, out_size)
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        out = self.drop(x)
        out, _ = self.lstm(x)
        # out = self.drop(out)

        out = self.linear(out[:, -1, :])

        return out

model = Maori(out_size=len(idx2label), hidden_size=12, dropout_prob=0.)
model = model.to(device)
print(model)

Maori(
  (embedding): Embedding(7002, 128)
  (lstm): LSTM(300, 12, batch_first=True, bidirectional=True)
  (drop): Dropout(p=0.0, inplace=False)
  (linear): Linear(in_features=24, out_features=3, bias=True)
  (softmax): Softmax(dim=0)
)


In [None]:
model(batch[0].to(device))

torch.Size([32, 3])

In [None]:
class SelfAttention(nn.Module):
    def __init__(self, input_dim):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        scores = torch.bmm(queries, keys.transpose(1, 2)) / (self.input_dim ** 0.5)
        attention = self.softmax(scores)
        weighted = torch.bmm(attention, values)
        return weighted


In [None]:
class MukherjeeV1(nn.Module):
    def __init__(self, config):
        super(MukherjeeV1, self).__init__()

        # Word Embeddings
        self.lstm_word = nn.LSTM(config['word_input_size'], config['lstm_word_hidden'], num_layers=2, batch_first=True, bidirectional=True)
        # self.dropout_word = nn.Dropout(config["drop_word"])
        self.attention_word = SelfAttention(config['lstm_word_hidden']*2)
        # self.multihead_attn_word = nn.MultiheadAttention(config['lstm_word_hidden']*2, config['num_heads'])

        # Char Embeddings
        # self.lstm_char = nn.LSTM(config['input_size'], config['lstm_char_hidden'], batch_first=True, bidirectional=True)
        # self.dropout_char = nn.Dropout(config["drop_char"])

        #Bangla Embeddings
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float(), freeze=True)
        self.lstm = nn.LSTM(input_size = config['subword_input_size'], hidden_size = config['lstm_subword_hidden'], num_layers=4, bidirectional=True, batch_first=True)
        # self.linear = nn.Linear(config['hidden_dim'] * 2,  config['num_heads'])
        # self.dropout = nn.Dropout(config['drop_char'])
        self.attention_subword = SelfAttention(config['lstm_subword_hidden']*2)
        # self.multihead_attn_subword = nn.MultiheadAttention(config['hidden_dim']*2, config['num_heads'])

        # Dense Layers
        self.attention_both = SelfAttention(config['lstm_word_hidden']*2+config['lstm_subword_hidden']*2)
        # self.multihead_attn_both = nn.MultiheadAttention(config['lstm_word_hidden']*2+config['hidden_dim']*2, config['num_heads'])

        # self.dropout_fc1 = nn.Dropout(config["drop_fc"])
        # self.dropout_fc2 = nn.Dropout(config["drop_fc"])
        self.fc1 = nn.Linear(config['fc1_dim'], config['fc2_dim'])
        self.fc2 = nn.Linear(config['fc2_dim'], config['fc3_dim'])
        self.fc3 = nn.Linear(config['fc3_dim'], config['n_classes'])
        self.relu = nn.ReLU()

    def forward(self, x, x_bng, fn):
        # Word Embeddings
        out, (hidden,_) = self.lstm_word(x)
        # out = self.dropout_word(out)
        out_word = self.attention_word(out)
        # out_word, _ = self.multihead_attn_word(out, out, out)

        # Char Embeddings
        # out, (hidden,_) = self.lstm_char(x)
        # out_char = self.dropout_char(out)
        out_subword = self.embedding(x_bng.long())
        # print(x[0])
        # print(embedded[0])
        # print(embedded.shape)
        out_subword, (hidden, _) = self.lstm(out_subword)
        # embedded = self.dropout(embedded)
        out_subword = self.attention_subword(out_subword)
        # out, _ = self.multihead_attn_subword(embedded, embedded, embedded)
        # outputs = self.linear(outputs)
        # embedded =  outputs.transpose(1, 2)

        # print(x[0])
        # print(embedded[0])



        # Concatenate Word and Char Embeddings
        # print(out_word.shape)
        # print(embedded.shape)
        out = torch.cat((out_word, out_subword), dim=2)
        # print(out.shape)
        # print(out[:, -1, :].shape)
        # expected_input_size = config['lstm_word_hidden']*2 + config['hidden_dim']*2
        # print(f"Expected input size of fully connected layer: {expected_input_size}")
        # out = out_word

        # Attention
        out = self.attention_both(out)
        # out = self.multihead_attn_both(out,out,out)
        # print(out.shape)
        # print(out[:, -1, :].shape)
        # print(fn.shape)
        out = torch.cat((out[:, -1, :], fn), dim=1)
        # print(out.shape)
        # x = torch.cat((x, any_caps.unsqueeze(2)), dim=2)
        # Dense Layers
        out = self.relu(self.fc1(out))
        # out = self.dropout_fc1(out)
        out = self.relu(self.fc2(out))
        # out = self.dropout_fc2(out)
        out = self.relu(self.fc3(out))
        return out

In [None]:
config={'word_input_size':300,
        'subword_input_size':128,
        'lstm_word_hidden':300,
        'lstm_subword_hidden':128,
        # 'lstm_char_hidden':128,
        'fc1_dim':300*2+128*2+train_fn.shape[1],
        'fc2_dim':512,
        'fc3_dim':128,
        'n_classes':len(idx2label),
        'drop_subword':0.2,
        'drop_word':0.4,
        'drop_fc':0.4,
        # 'num_heads':8,
        # 'embedding_dim' :128,
        # 'hidden_dim' : 256,
        }

model = MukherjeeV1(config)
model = model.to(device)
print(model)

MukherjeeV1(
  (lstm_word): LSTM(300, 300, num_layers=2, batch_first=True, bidirectional=True)
  (attention_word): SelfAttention(
    (query): Linear(in_features=600, out_features=600, bias=True)
    (key): Linear(in_features=600, out_features=600, bias=True)
    (value): Linear(in_features=600, out_features=600, bias=True)
    (softmax): Softmax(dim=2)
  )
  (embedding): Embedding(7001, 128)
  (lstm): LSTM(128, 128, num_layers=4, batch_first=True, bidirectional=True)
  (attention_subword): SelfAttention(
    (query): Linear(in_features=256, out_features=256, bias=True)
    (key): Linear(in_features=256, out_features=256, bias=True)
    (value): Linear(in_features=256, out_features=256, bias=True)
    (softmax): Softmax(dim=2)
  )
  (attention_both): SelfAttention(
    (query): Linear(in_features=856, out_features=856, bias=True)
    (key): Linear(in_features=856, out_features=856, bias=True)
    (value): Linear(in_features=856, out_features=856, bias=True)
    (softmax): Softmax(dim=2

In [None]:
import torch.optim as optim

# class_weights = [1.0,5.0,5.0]
# criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adamax(model.parameters())

In [None]:
best_model = train_with_early_stopping(model, num_epochs=50)

  2%|▏         | 1/50 [00:02<02:16,  2.78s/it]

Epoch [1/50], Train Loss: 1.0409886864098636, Validation Loss: 0.9885019779205322, Validation F1: 0.3459122939043579


  4%|▍         | 2/50 [00:05<02:13,  2.78s/it]

Epoch [2/50], Train Loss: 0.9620972722768784, Validation Loss: 0.949260675907135, Validation F1: 0.3459122939043579


  6%|▌         | 3/50 [00:08<02:10,  2.78s/it]

Epoch [3/50], Train Loss: 0.924645181406628, Validation Loss: 0.9481417775154114, Validation F1: 0.3459122939043579


  8%|▊         | 4/50 [00:11<02:07,  2.77s/it]

Epoch [4/50], Train Loss: 0.8915377272800966, Validation Loss: 0.9743558287620544, Validation F1: 0.3459122939043579


 10%|█         | 5/50 [00:13<02:05,  2.79s/it]

Epoch [5/50], Train Loss: 0.8652920953252099, Validation Loss: 0.9198076248168945, Validation F1: 0.5503222868498677


 12%|█▏        | 6/50 [00:16<02:03,  2.81s/it]

Epoch [6/50], Train Loss: 0.7987961836836555, Validation Loss: 0.9201187372207642, Validation F1: 0.56455475088653


 14%|█▍        | 7/50 [00:19<02:01,  2.83s/it]

Epoch [7/50], Train Loss: 0.7619060386310924, Validation Loss: 0.9878057658672332, Validation F1: 0.5566524013939022


 16%|█▌        | 8/50 [00:22<01:58,  2.83s/it]

Epoch [8/50], Train Loss: 0.7164708281105215, Validation Loss: 0.9031160414218903, Validation F1: 0.5760344063642018


 18%|█▊        | 9/50 [00:25<01:55,  2.82s/it]

Epoch [9/50], Train Loss: 0.6658594350923192, Validation Loss: 1.04158273935318, Validation F1: 0.5456627291525774


 20%|██        | 10/50 [00:28<01:52,  2.81s/it]

Epoch [10/50], Train Loss: 0.6420487084171989, Validation Loss: 0.9452392935752869, Validation F1: 0.5279031517181086


 22%|██▏       | 11/50 [00:30<01:49,  2.80s/it]

Epoch [11/50], Train Loss: 0.6020404574545947, Validation Loss: 1.0265158712863922, Validation F1: 0.5243269977560268


 24%|██▍       | 12/50 [00:33<01:46,  2.79s/it]

Epoch [12/50], Train Loss: 0.5476078682325103, Validation Loss: 1.201272976398468, Validation F1: 0.5520326136885623


 26%|██▌       | 13/50 [00:36<01:43,  2.79s/it]

Epoch [13/50], Train Loss: 0.5403898947618224, Validation Loss: 1.2153971374034882, Validation F1: 0.5899248185797413


 28%|██▊       | 14/50 [00:39<01:40,  2.79s/it]

Epoch [14/50], Train Loss: 0.4694282893430103, Validation Loss: 1.1040640115737914, Validation F1: 0.6005811854119022


 30%|███       | 15/50 [00:41<01:37,  2.78s/it]

Epoch [15/50], Train Loss: 0.4214337776330384, Validation Loss: 1.2181549072265625, Validation F1: 0.586568744338237


 32%|███▏      | 16/50 [00:44<01:34,  2.77s/it]

Epoch [16/50], Train Loss: 0.37404588068073447, Validation Loss: 1.738259506225586, Validation F1: 0.582070739974335


 34%|███▍      | 17/50 [00:47<01:31,  2.77s/it]

Epoch [17/50], Train Loss: 0.3428217463872649, Validation Loss: 1.343940508365631, Validation F1: 0.5953760062885959


 36%|███▌      | 18/50 [00:50<01:28,  2.76s/it]

Epoch [18/50], Train Loss: 0.31241255693814973, Validation Loss: 1.5404184699058532, Validation F1: 0.6353633479248303


 38%|███▊      | 19/50 [00:52<01:25,  2.76s/it]

Epoch [19/50], Train Loss: 0.2538599561561238, Validation Loss: 1.6463610351085662, Validation F1: 0.623336378694711


 40%|████      | 20/50 [00:55<01:22,  2.76s/it]

Epoch [20/50], Train Loss: 0.2507292553782463, Validation Loss: 2.037303149700165, Validation F1: 0.5753555908754631


 42%|████▏     | 21/50 [00:58<01:19,  2.76s/it]

Epoch [21/50], Train Loss: 0.23604725335131993, Validation Loss: 1.6930459976196288, Validation F1: 0.5870454287048815


 44%|████▍     | 22/50 [01:01<01:17,  2.75s/it]

Epoch [22/50], Train Loss: 0.19409146667881447, Validation Loss: 1.7017766892910005, Validation F1: 0.5955664110808804


 46%|████▌     | 23/50 [01:03<01:14,  2.75s/it]

Epoch [23/50], Train Loss: 0.16308119867674328, Validation Loss: 2.271748161315918, Validation F1: 0.5972193424730821


 48%|████▊     | 24/50 [01:06<01:11,  2.76s/it]

Epoch [24/50], Train Loss: 0.11171280673112381, Validation Loss: 2.83335394859314, Validation F1: 0.6035173552574014


 50%|█████     | 25/50 [01:09<01:09,  2.76s/it]

Epoch [25/50], Train Loss: 0.13244913450696252, Validation Loss: 2.5762812376022337, Validation F1: 0.6004081119733229


 52%|█████▏    | 26/50 [01:12<01:06,  2.76s/it]

Epoch [26/50], Train Loss: 0.09426060731692071, Validation Loss: 2.5214628100395204, Validation F1: 0.5909414071549663


 54%|█████▍    | 27/50 [01:15<01:03,  2.76s/it]

Epoch [27/50], Train Loss: 0.1107005953619426, Validation Loss: 2.2050605177879334, Validation F1: 0.6056771604646883


 54%|█████▍    | 27/50 [01:17<01:06,  2.88s/it]

Epoch [28/50], Train Loss: 0.09504386971027336, Validation Loss: 2.44401193857193, Validation F1: 0.6229067289157211
Stopping after 28 epochs





In [None]:
test_metrics(best_model, test_loader)

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


776 776
Accuracy: 0.8079896907216495
F1-Weighted: 0.8081453284750719
F1-Macro: 0.7788914508851073
Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.86      0.85       395
         1.0       0.80      0.78      0.79       265
         2.0       0.68      0.71      0.69       116

    accuracy                           0.81       776
   macro avg       0.78      0.78      0.78       776
weighted avg       0.81      0.81      0.81       776



In [None]:
test_metrics(best_model, val_loader)

311 311
Accuracy: 0.6366559485530546
F1-Weighted: 0.6353633479248303
F1-Macro: 0.5940871795485457
Classification Report:
              precision    recall  f1-score   support

         0.0       0.70      0.71      0.70       159
         1.0       0.60      0.60      0.60       106
         2.0       0.50      0.46      0.48        46

    accuracy                           0.64       311
   macro avg       0.60      0.59      0.59       311
weighted avg       0.63      0.64      0.64       311



  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


In [None]:
test_metrics(best_model, train_loader)

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


2792 2792
Accuracy: 0.9151146131805158
F1-Weighted: 0.9159956118997861
F1-Macro: 0.888767564974542
Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.94      0.95      1496
         1.0       0.91      0.89      0.90       927
         2.0       0.77      0.86      0.81       369

    accuracy                           0.92      2792
   macro avg       0.88      0.90      0.89      2792
weighted avg       0.92      0.92      0.92      2792



In [None]:
torch.save(best_model.state_dict(), '/content/drive/MyDrive/nlp_project/MukherjeeV2.pt')

### Trials

In [None]:
# best_model, test_loader
test_predictions = []
test_targets = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        out = best_model(inputs)

        _, predicted = torch.max(out, 1)
        test_predictions.extend(predicted.cpu().long().numpy())
        test_targets.extend(labels.cpu().numpy())

print(len(test_targets), len(test_predictions))

test_acc = accuracy_score(test_targets, test_predictions)
test_w_f1 = f1_score(test_targets, test_predictions, average='weighted')
test_macro_f1 = f1_score(test_targets, test_predictions, average='macro')
class_report = classification_report(test_targets, test_predictions)

print('Accuracy:', test_acc)
print('F1-Weighted:', test_w_f1)
print('F1-Macro:', test_macro_f1)
print('Classification Report:', class_report, sep='\n')

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


3491 3491
Accuracy: 0.5124606130048697
F1-Weighted: 0.39482327239999104
F1-Macro: 0.27662668511280347
Classification Report:
              precision    recall  f1-score   support

         0.0       0.52      0.95      0.67      1766
         1.0       0.43      0.10      0.16      1218
         2.0       0.00      0.00      0.00       507

    accuracy                           0.51      3491
   macro avg       0.32      0.35      0.28      3491
weighted avg       0.41      0.51      0.39      3491



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
set(test_predictions)

{0}

In [None]:
batch = next(iter(train_loader))

In [None]:
batch[-1]

tensor([0., 1., 2., 1., 0., 0., 0., 0., 0., 2., 0., 1., 0., 1., 1., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 1., 0.])

In [None]:
batch[0].shape

torch.Size([32, 105, 300])

In [None]:
model(batch[0].to(device), batch[1].to(device), batch[2].to(device))

torch.Size([64, 40, 856])
torch.Size([64, 856])
torch.Size([64, 6])


tensor([[0.0040, 0.0768, 0.0788],
        [0.0000, 0.0321, 0.0752],
        [0.0000, 0.0217, 0.0738],
        [0.0000, 0.0259, 0.0745],
        [0.0000, 0.0260, 0.0746],
        [0.0000, 0.0255, 0.0755],
        [0.0000, 0.0253, 0.0797],
        [0.0016, 0.0674, 0.0769],
        [0.0000, 0.0369, 0.0741],
        [0.0000, 0.0258, 0.0761],
        [0.0000, 0.0258, 0.0744],
        [0.0000, 0.0346, 0.0757],
        [0.0000, 0.0219, 0.0671],
        [0.0000, 0.0388, 0.0693],
        [0.0000, 0.0210, 0.0800],
        [0.0000, 0.0257, 0.0744],
        [0.0000, 0.0220, 0.0720],
        [0.0000, 0.0323, 0.0751],
        [0.0000, 0.0261, 0.0745],
        [0.0000, 0.0321, 0.0751],
        [0.0000, 0.0217, 0.0737],
        [0.0000, 0.0216, 0.0737],
        [0.0000, 0.0321, 0.0751],
        [0.0000, 0.0462, 0.0763],
        [0.0000, 0.0214, 0.0744],
        [0.0000, 0.0321, 0.0703],
        [0.0000, 0.0531, 0.0758],
        [0.0000, 0.0216, 0.0738],
        [0.0000, 0.0217, 0.0738],
        [0.000