In [None]:
!pip install indic-nlp-library



In [None]:
# Mount Google Drive at /content/drive; every data and checkpoint path in this
# notebook is rooted under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import matplotlib.pyplot as plt

import torch
from torch import nn,tensor
from torch.utils.data import Dataset,DataLoader
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm
import re
import numpy as np
import pandas as pd
import pickle
from indicnlp.tokenize import indic_tokenize
import seaborn as sns
import copy

# Every artefact lives under one Drive folder, so build each path from shared prefixes.
_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/INLP Project'
_REVIEWS_DIR = _PROJECT_DIR + '/product_reviews/hi'
_EMBEDDINGS_DIR = _PROJECT_DIR + '/train embeddings'

# Hindi product-review CSV splits. `syn_path` is the "updated" test file
# (judging by the rows displayed further down, a synonym-substituted copy of the test set).
train_path = _REVIEWS_DIR + '/hi-train.csv'
test_path = _REVIEWS_DIR + '/hi-test.csv'
syn_path = _REVIEWS_DIR + '/hi-test-updated.csv'
val_path = _REVIEWS_DIR + '/hi-valid.csv'

# Pretrained forward/backward model checkpoints and the pickled vocabulary.
forward_model_path = _EMBEDDINGS_DIR + '/pretrained_forward_model'
backward_model_path = _EMBEDDINGS_DIR + '/pretrained_backward_model'
word_to_ix_path = _EMBEDDINGS_DIR + '/word_to_ix.pkl'

In [None]:

# The files are read with header=None (no header row is expected), so the two
# columns are named here: the label first, the review text second.
train_df, test_df, syn_df = (
    pd.read_csv(csv_path, header=None, names=['Label', 'Text'])
    for csv_path in (train_path, test_path, syn_path)
)
test_df.head()



Unnamed: 0,Label,Text
0,neutral,"मगर , इस तरफ कोई ध्यान नहीं दिया जा रहा ।"
1,negative,गेम कई बार मुश्किल मालूम देता है ।
2,negative,मल्टीप्लेयर में छोटी स्टोरीलाइन है ।
3,positive,हाई - एंड एचपी नोटबुक्स की लुक्स हमेशा बेहतरीन...
4,positive,कैमरे बनाने वाली कंपनी निकोन अपनी 1 सीरीज कैमर...


In [None]:
# Preview the "updated" test file so its wording can be compared with test_df.
syn_df.head()


Unnamed: 0,Label,Text
0,neutral,"मगर , इस तरफ कोई नज़र नहीं दी जा रही ।"
1,negative,गेम कई बार कठिन मालूम देता है ।
2,negative,मल्टीप्लेयर में लघु स्टोरीलाइन है ।
3,positive,हाई - एंड एचपी नोटबुक्स की लुक्स हमेशा उत्तम ह...
4,positive,कैमरे बनाने वाली कंपनी निकोन अपनी 1 सीरीज कैमर...


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training split only, then apply that
# same mapping to every split so class ids agree across all three frames.
label_encoder = LabelEncoder().fit(train_df['Label'])
for _frame in (train_df, test_df, syn_df):
    _frame['Label'] = label_encoder.transform(_frame['Label'])

test_df.head()


Unnamed: 0,Label,Text
0,1,"मगर , इस तरफ कोई ध्यान नहीं दिया जा रहा ।"
1,0,गेम कई बार मुश्किल मालूम देता है ।
2,0,मल्टीप्लेयर में छोटी स्टोरीलाइन है ।
3,2,हाई - एंड एचपी नोटबुक्स की लुक्स हमेशा बेहतरीन...
4,2,कैमरे बनाने वाली कंपनी निकोन अपनी 1 सीरीज कैमर...


In [None]:
# Same preview after label encoding: the Label column now holds integer class ids.
syn_df.head()


Unnamed: 0,Label,Text
0,1,"मगर , इस तरफ कोई नज़र नहीं दी जा रही ।"
1,0,गेम कई बार कठिन मालूम देता है ।
2,0,मल्टीप्लेयर में लघु स्टोरीलाइन है ।
3,2,हाई - एंड एचपी नोटबुक्स की लुक्स हमेशा उत्तम ह...
4,2,कैमरे बनाने वाली कंपनी निकोन अपनी 1 सीरीज कैमर...


In [None]:
class ElmoProcessor(torch.nn.Module):
    """Single-direction, two-layer LSTM language model on top of a supplied embedding.

    Token ids are embedded with ``wt_mat``, passed through two stacked
    unidirectional LSTMs, and projected to ``vocab_size`` scores per position.

    Args:
        vocab_size: Output width of the final linear projection.
        embedding_dim: Input feature size of the first LSTM; it has to match the
            width of whatever ``wt_mat`` returns.
        hidden_dim: Hidden size shared by both LSTMs.
        wt_mat: Callable embedding exposing ``.to(device)`` (presumably an
            ``nn.Embedding`` -- confirm against the pretraining notebook).

    Note:
        Relies on the notebook-level global ``device``, which must be defined
        before an instance is created or called.
    """

    def __init__(self,vocab_size,embedding_dim,hidden_dim,wt_mat):
        super(ElmoProcessor,self).__init__()
        self.lstmf1=nn.LSTM(embedding_dim,hidden_dim,num_layers=1,bidirectional=False,batch_first=True)
        self.lstmf2=nn.LSTM(hidden_dim,hidden_size=hidden_dim,bidirectional=False,batch_first=True)
        self.embedding=wt_mat.to(device)
        self.linear=nn.Linear(hidden_dim,vocab_size)

    def forward(self,embeddingdata):
        """Run the language model over a batch of token ids.

        Args:
            embeddingdata: Tensor of token ids accepted by ``self.embedding``
                (batch-first, since both LSTMs use ``batch_first=True``).

        Returns:
            Tuple ``(output, o1f1, o2f2)``: the vocabulary scores, the first
            LSTM's per-step outputs, and the second LSTM's per-step outputs.
        """
        # The original wrapped the embedding output in torch.tensor(...), which
        # copy-constructs a tensor (emitting a UserWarning) and cuts it out of the
        # autograd graph. detach() keeps that "no gradient into the embedding"
        # behaviour without the warning or the extra copy.
        emb=self.embedding(embeddingdata.to(device)).detach()

        o1f1,_=self.lstmf1(emb)
        o2f2,_=self.lstmf2(o1f1)

        output=self.linear(o2f2)
        return (output,o1f1,o2f2)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [None]:
# # not run

# wt_mat=torch.zeros(size=(90337,300))
# embeddings=torch.tensor(wt_mat,dtype=torch.float)
# forward_model= torch.load(forward_model_path,map_location=torch.device('cpu'))
# backward_model= torch.load(backward_model_path,map_location=torch.device('cpu'))

In [None]:
# Load the token -> index mapping that DatafetchClassify uses to encode sentences
# (it is looked up by word and must contain a '<PAD>' entry).
# NOTE(security): pickle.load can execute arbitrary code; only open files you trust.
with open(word_to_ix_path, 'rb') as file:

    word_to_ix = pickle.load(file)


In [None]:
# # not run
# mxlen=[]
# for column in train_df[train_df.columns[1]]:
#     mxlen.append(len(column.split(' ')))
# max(mxlen)

In [None]:

mxlen = 30  # every sentence is truncated / left-padded to exactly this many tokens


class DatafetchClassify(Dataset):
    """Fixed-length, index-encoded sentences paired with their class labels.

    Args:
        split: DataFrame whose first column holds the (already numeric) label and
            whose second column holds the sentence text.

    Each sentence is tokenised with ``indic_tokenize.trivial_tokenize(..., 'hi')``,
    truncated to ``mxlen`` tokens, mapped through the notebook-level ``word_to_ix``
    (unknown words fall back to index 1 -- presumably the unknown-word id, confirm
    against the vocabulary), and left-padded with ``word_to_ix['<PAD>']``.

    Rows whose text is not a string, is shorter than two characters, or starts
    with ``'='`` are dropped. Their labels are dropped too, so ``sequence[i]`` and
    ``labels[i]`` always describe the same row.

    Both tensors are placed on the notebook-level ``device``.
    """

    def __init__(self,split:pd.DataFrame):
        sequence=[]
        labels=[]
        kept=[]  # one flag per input row: was its text encoded?
        pad_ix=word_to_ix['<PAD>']
        for x in tqdm(split[split.columns[1]],desc='Generate next word prediction data'):
            # Non-string cells (e.g. NaN from an empty CSV field) would crash len().
            if not isinstance(x,str) or len(x)<2 or x[0]=='=':
                kept.append(False)
                continue
            words=indic_tokenize.trivial_tokenize(x,'hi') # alternative: x.strip().split(' ')
            indices=[word_to_ix.get(word, 1) for word in words[:mxlen]]
            sequence.append((mxlen-len(indices))*[pad_ix]+indices)
            kept.append(True)
        # Take only the labels of rows kept above; appending every label would
        # shift the labels against the sequences as soon as one row is skipped.
        for keep,x in tqdm(zip(kept,split[split.columns[0]]),total=len(kept),desc="Generate label"):
            if keep:
                labels.append(x)
        # Tensor.to() returns a new tensor, so the result has to be stored.
        self.sequence=tensor(sequence).to(device)
        self.labels=tensor(labels).to(device)

    def __len__(self)->int:
        """Number of encoded sentences."""
        return len(self.sequence)

    def __getitem__(self,index:int):
        """Return the ``(token_ids, label)`` pair at ``index``."""
        return self.sequence[index],self.labels[index]

In [None]:
# # not run
# classifier_dataset=DatafetchClassify(train_df)

In [None]:
# Mini-batch size. In the cells shown, only the commented-out training DataLoader
# refers to it; the evaluation loaders pass batch_size=100 explicitly.
batch_size=64

In [None]:
# # not run
# classifer_dataloader=DataLoader(classifier_dataset,batch_size,shuffle=True)
# data,labels=next(iter(classifer_dataloader))


In [None]:
class ElmoClassifier(torch.nn.Module):
    """Sentence classifier: frozen pretrained embedding -> BiLSTM -> linear layer.

    Args:
        no_classes: Number of output classes.
        forward_model: Pretrained module exposing an ``embedding`` attribute; its
            embedding is the only part used in ``forward``.
        backward_model: Pretrained module kept on the instance but not used in
            ``forward`` as written.
        embedding_dim: The BiLSTM is built with ``input_size=embedding_dim*2``.
            NOTE(review): ``forward`` feeds the raw embedding output into the
            BiLSTM, so ``embedding_dim*2`` must equal the embedding width --
            confirm the value used when the saved classifier was built.
        hidden_size: Hidden size of each BiLSTM direction (default 50, the value
            that was previously hard-coded). The linear layer takes
            ``2*hidden_size`` inputs.
    """

    def __init__(self,no_classes,forward_model,backward_model,embedding_dim,hidden_size=50):
        super(ElmoClassifier,self).__init__()
        self.forward_model=forward_model
        self.embedding = forward_model.embedding
        self.backward_model=backward_model
        # `module.requires_grad = False` only sets a plain attribute and freezes
        # nothing; requires_grad_(False) turns off gradients on every parameter.
        # This also freezes self.embedding, which belongs to forward_model.
        self.forward_model.requires_grad_(False)
        self.backward_model.requires_grad_(False)
        self.linear1=nn.Linear(2*hidden_size,no_classes)
        self.bilstm=nn.LSTM(input_size=embedding_dim*2,hidden_size=hidden_size,bidirectional=True,batch_first=True)

    def forward(self,embdata):
        """Classify a batch of token-id sequences.

        Args:
            embdata: Token ids accepted by ``self.embedding`` (batch-first).

        Returns:
            Unnormalised class scores, one row per sequence.
        """
        e = self.embedding(embdata)
        bio,_=self.bilstm(e)

        # Classify from the BiLSTM output at the last time step.
        y=self.linear1(bio[:,-1,:])

        return y

In [None]:
# # not run
# classifier=ElmoClassifier(3,forward_model,backward_model,300)

In [None]:
# # not run

# for epoch in range(5):
#     train_loss=0
#     classifier.train()
#     for batch in tqdm(classifer_dataloader,desc="Training"):
#         x_train,y_train=batch
#         optimizer.zero_grad()

#         x_train.to(device)
#         y_train.to(device)
#         output=classifier(x_train)
#         loss=loss_fn(output.to(device),y_train.to(device))
#         loss.backward()
#         optimizer.step()
#         train_loss +=loss.item()
#     print(train_loss)

In [None]:
# Encode the original test split into padded token-id tensors plus labels.
classifier_eval=DatafetchClassify(test_df)

Generate next word prediction data: 100%|██████████| 523/523 [00:00<00:00, 13641.67it/s]
Generate label: 100%|██████████| 523/523 [00:00<00:00, 952919.63it/s]


In [None]:
# Shuffling is left off so that row i of this loader lines up with row i of the
# synonym loader built later; the comparison metric pairs rows by position.
classifer_dataloader_eval=DataLoader(classifier_eval,batch_size=100,shuffle=None)
# Pull one batch as a quick smoke test; `data`/`labels` are not read again in the cells shown.
data,labels=next(iter(classifer_dataloader_eval))


In [None]:
# Pickled classifier module (per its file name, the one trained on the original embeddings).
classifier_path = '/content/drive/MyDrive/Colab Notebooks/INLP Project/product_reviews/product_review_classifier_original_embeddings'

# Deserialise onto the CPU first, then move the module to the device chosen for this
# session. Unpickling a whole module needs the ElmoProcessor / ElmoClassifier classes
# to be defined already, and should only be done with files you trust.
classifier = torch.load(classifier_path, map_location=torch.device('cpu')).to(device)

# Loss and optimizer objects for this classifier (Adam, learning rate 1e-3).
optimizer=torch.optim.Adam(classifier.parameters(),lr=1e-3)
loss_fn=nn.CrossEntropyLoss()

In [None]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,f1_score,recall_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Score the ORIGINAL test sentences with the loaded classifier.
# Leaves behind: `output` (raw class scores of the scored batch), `y_true`, `y_pred`.
y_true=[]
y_pred=[]
classifier.eval()          # inference mode for any train/eval-dependent layers
with torch.no_grad():      # no autograd graph is needed for evaluation
    for batch in tqdm(classifer_dataloader_eval,desc="Evaluation"):
        x_train,y_train=batch
        # Tensor.to() returns a new tensor; without rebinding, the batch never
        # reaches the device the classifier lives on.
        x_train=x_train.to(device)
        output=classifier(x_train)

        _,preds=torch.max(output,dim=1)
        y_true.extend(y_train.cpu().detach().numpy())
        y_pred.extend(preds.cpu().detach().numpy())
        # NOTE(review): only the first batch (100 rows) is scored. Later cells read
        # `output` from this single batch, so the break keeps `output` and `y_true`
        # aligned. Accumulate outputs across batches to cover the whole test set.
        break

Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]


In [None]:
# Sanity check: one row of class scores per sentence in the scored batch.
output.shape

torch.Size([100, 3])

In [None]:
# Number of gold labels collected by the evaluation loop.
len(y_true)

100

In [None]:
# Turn the raw class scores into per-row probabilities. torch.softmax replaces the
# hand-written exp()/sum(), which can overflow to inf (and then NaN) on large scores.
# NOTE: this rebinds `output`; re-running the cell applies softmax a second time.
output = torch.softmax(output, dim=1)

In [None]:
# Snapshot the original-sentence probabilities before `output` is overwritten by the
# synonym run. detach().clone() stores an independent copy with no autograd history,
# so metrics computed from it are plain values.
y_org_sentence = output.detach().clone()
y_org_sentence

tensor([[0.4083, 0.4585, 0.1332],
        [0.5764, 0.2777, 0.1460],
        [0.3282, 0.4043, 0.2675],
        [0.0078, 0.0189, 0.9733],
        [0.0102, 0.0191, 0.9706],
        [0.0165, 0.0375, 0.9459],
        [0.0130, 0.8316, 0.1554],
        [0.2215, 0.6028, 0.1757],
        [0.4015, 0.1508, 0.4478],
        [0.0101, 0.9355, 0.0544],
        [0.1718, 0.7618, 0.0664],
        [0.0344, 0.8765, 0.0891],
        [0.0051, 0.1156, 0.8793],
        [0.0150, 0.7273, 0.2577],
        [0.1271, 0.4363, 0.4366],
        [0.2478, 0.4530, 0.2992],
        [0.3255, 0.2577, 0.4168],
        [0.0142, 0.9256, 0.0602],
        [0.5761, 0.2773, 0.1466],
        [0.0084, 0.9148, 0.0768],
        [0.6028, 0.2980, 0.0991],
        [0.0659, 0.8128, 0.1213],
        [0.1043, 0.7807, 0.1150],
        [0.3522, 0.4484, 0.1993],
        [0.0484, 0.6056, 0.3460],
        [0.3374, 0.4496, 0.2130],
        [0.4545, 0.4412, 0.1043],
        [0.5032, 0.2511, 0.2457],
        [0.1410, 0.6800, 0.1790],
        [0.631

In [None]:
# accuracy_score(y_true,y_pred)

In [None]:
# Encode the "updated" test split the same way as the original test split.
classifier_eval_syn =DatafetchClassify(syn_df)

Generate next word prediction data: 100%|██████████| 523/523 [00:00<00:00, 20363.16it/s]
Generate label: 100%|██████████| 523/523 [00:00<00:00, 924018.95it/s]


In [None]:
# Same batch size and no shuffling as the original-sentence loader, so batches
# from the two loaders pair up row for row.
classifer_dataloader_eval_syn =DataLoader(classifier_eval_syn,batch_size=100,shuffle=None)
# Pull one batch as a quick smoke test; `data`/`labels` are not read again in the cells shown.
data,labels=next(iter(classifer_dataloader_eval_syn))


In [None]:
# classifier_path = '/content/drive/MyDrive/Colab Notebooks/INLP Project/product_reviews/product_review_classifier_final'
# classifier = torch.load(classifier_path, map_location=torch.device('cpu'))

# classifier.to(device)
# loss_fn=nn.CrossEntropyLoss()
# optimizer=torch.optim.Adam(classifier.parameters(),1e-3)

In [None]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,f1_score,recall_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Score the "updated" test sentences with the same classifier.
# Overwrites `output`, `y_true` and `y_pred` from the original-sentence run.
y_true=[]
y_pred=[]
classifier.eval()          # inference mode for any train/eval-dependent layers
with torch.no_grad():      # no autograd graph is needed for evaluation
    for batch in tqdm(classifer_dataloader_eval_syn,desc="Evaluation"):
        x_train,y_train=batch
        # Tensor.to() returns a new tensor; without rebinding, the batch never
        # reaches the device the classifier lives on.
        x_train=x_train.to(device)
        output=classifier(x_train)

        _,preds=torch.max(output,dim=1)
        y_true.extend(y_train.cpu().detach().numpy())
        y_pred.extend(preds.cpu().detach().numpy())
        # NOTE(review): only the first batch (100 rows) is scored. Later cells read
        # `output` from this single batch, so the break keeps `output` and `y_true`
        # aligned. Accumulate outputs across batches to cover the whole test set.
        break

Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]


In [None]:
# accuracy_score(y_true,y_pred)

In [None]:
# Sanity check: one row of class scores per sentence in the scored batch.
output.shape

torch.Size([100, 3])

In [None]:
# Turn the raw class scores into per-row probabilities. torch.softmax replaces the
# hand-written exp()/sum(), which can overflow to inf (and then NaN) on large scores.
# NOTE: this rebinds `output`; re-running the cell applies softmax a second time.
output = torch.softmax(output, dim=1)

In [None]:
# Snapshot the probabilities from the synonym run. detach().clone() stores an
# independent copy with no autograd history, so metrics computed from it are plain values.
y_syn_sentence = output.detach().clone()
y_syn_sentence

tensor([[0.4046, 0.4767, 0.1187],
        [0.5324, 0.2998, 0.1677],
        [0.2092, 0.6226, 0.1682],
        [0.0075, 0.0220, 0.9705],
        [0.0224, 0.1271, 0.8505],
        [0.0109, 0.0420, 0.9471],
        [0.0132, 0.8268, 0.1600],
        [0.1565, 0.6666, 0.1770],
        [0.3846, 0.1245, 0.4910],
        [0.0086, 0.9419, 0.0495],
        [0.1552, 0.7799, 0.0648],
        [0.0253, 0.8733, 0.1014],
        [0.0068, 0.2466, 0.7466],
        [0.0192, 0.8775, 0.1033],
        [0.0533, 0.1475, 0.7992],
        [0.3620, 0.3829, 0.2551],
        [0.0369, 0.2176, 0.7455],
        [0.0206, 0.9010, 0.0784],
        [0.5275, 0.3274, 0.1452],
        [0.0065, 0.9066, 0.0869],
        [0.6028, 0.2992, 0.0980],
        [0.0633, 0.8159, 0.1208],
        [0.1096, 0.7756, 0.1147],
        [0.2751, 0.6019, 0.1230],
        [0.0576, 0.6678, 0.2745],
        [0.3447, 0.4387, 0.2167],
        [0.4239, 0.4707, 0.1054],
        [0.5137, 0.2594, 0.2269],
        [0.1793, 0.5459, 0.2748],
        [0.596

In [None]:
import statistics
def evaluate_embeddings(y_org_sentence, y_syn_sentence, y_true):
    """Measure how much the gold-class probability moves between two runs.

    For every example ``i`` the absolute difference
    ``|y_org_sentence[i][y_true[i]] - y_syn_sentence[i][y_true[i]]|`` is taken.

    Args:
        y_org_sentence: Per-example class probabilities for the original
            sentences, indexable as ``[example][class]``.
        y_syn_sentence: Same layout, for the rewritten sentences.
        y_true: Gold class index of each example; its length decides how many
            examples are compared.

    Returns:
        ``(metric_mean, metric_median)`` over the per-example differences.

    Raises:
        statistics.StatisticsError: If ``y_true`` is empty.
    """
    differences = [
        abs(y_org_sentence[idx][label] - y_syn_sentence[idx][label])
        for idx, label in enumerate(y_true)
    ]
    if not differences:
        raise statistics.StatisticsError(
            "evaluate_embeddings needs at least one labelled example"
        )
    # Divide by the number of compared examples. The previous code divided by the
    # length of the notebook-global `y_pred`, which is only right by coincidence.
    metric_mean = sum(differences) / len(differences)
    metric_median = statistics.median(differences)
    return metric_mean, metric_median
# Compare gold-class probabilities between the original and the synonym run.
# `y_true` here is the list filled by the most recent evaluation loop.
metric_mean, metric_median = evaluate_embeddings(y_org_sentence, y_syn_sentence, y_true)

In [None]:
# Mean absolute change in gold-class probability over the compared sentences.
metric_mean

tensor(0.0621, grad_fn=<DivBackward0>)

In [None]:
# Median absolute change in gold-class probability over the compared sentences.
metric_median

tensor(0.0229, grad_fn=<DivBackward0>)