# Make embeddings for Bengali language
This notebook handles the embedding process.

### Input:
    - Pre-processed training dataframe.

### Output:
    - The trained weights of the embedding layer

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn.init import xavier_normal_

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f30b0316f90>

## Load data

In [2]:
# train data
train_df = pd.read_csv('save/bengali_train_preprocessed.csv')
train_sentences = [[int(s) for s in text.split()] for text in train_df['sentence']]
train_labels = train_df['hate'].to_numpy()

# test data
test_df = pd.read_csv('save/bengali_test_preprocessed.csv')
test_sentences = [[int(s) for s in text.split()] for text in test_df['sentence']]
test_labels = train_df['hate'].to_numpy()

# word <-> convertion
with open('save/word_to_int_dict.json', 'r') as f:
    word_to_int = json.load(f)
with open('save/int_to_word_dict.json', 'r') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}

# word-counter
with open('save/word_counter.json', 'r') as f:
    word_counter = json.load(f)
    
vocab_size = len(word_to_int)
total_words = sum(word_counter.values())

### Constants and Hyper-parameters

In [3]:
model_save_path = 'save/bengali_GloVe.pt'

window_size = 5
embedding_size = 300
weighting_alpha = 3/4
max_cooc = 25
learning_rate = 0.02
lr_decay = lambda epoch: max(0.05, 0.8**epoch)
batch_size = 256
epochs = 30

## Co-occurence matrix

In [4]:
cooccurence = np.zeros((total_words, total_words))
for sentence in train_sentences:
    for i, word in enumerate(sentence):
        for j, context_word in enumerate(sentence[i-window_size:i+window_size+1]):
            if i != j:
                cooccurence[word, context_word] += 1
                cooccurence[context_word, word] += 1


## Train word-embedding

### Model

In [5]:
class GloVe(Module):
    def __init__(self, cooccurence_matrix, weighting_alpha, max_cooc):
        super(GloVe, self).__init__()
        self.cooccurence_matrix = cooccurence_matrix
        self.alpha = weighting_alpha
        self.max_cooc = max_cooc
        
        self.center_embed = nn.Embedding(vocab_size, embedding_size)
        self.center_embed.weight = xavier_normal_(self.center_embed.weight)
        
        self.context_embed = nn.Embedding(vocab_size, embedding_size)
        self.context_embed.weight = xavier_normal_(self.context_embed.weight)

        self.center_bias = nn.Embedding(vocab_size, 1)
        self.center_bias.weight = xavier_normal_(self.center_bias.weight)

        self.context_bias = nn.Embedding(vocab_size, 1)
        self.context_bias.weight = xavier_normal_(self.context_bias.weight)

    def forward(self, center_ids, context_ids):
        batch_size = len(center_ids)

        cooc_counts = np.array([self.cooccurence_matrix[w1, w2] + 1 for w1, w2 in zip(center_ids, context_ids)])
        fX = (cooc_counts.clip(max=self.max_cooc) / self.max_cooc) ** self.alpha

        cooc_counts = torch.from_numpy(cooc_counts).to(device)
        fX = torch.from_numpy(fX).to(device)

        center_embed = self.center_embed(center_ids)
        center_bias = self.center_bias(center_ids)
        context_embed = self.context_embed(context_ids)
        context_bias = self.context_bias(context_ids)

        return (fX * torch.pow(
            ((center_embed * context_embed).sum(1) + center_bias + context_bias).squeeze(1) - torch.log(cooc_counts), 2
        )).sum()
    
    def to_embed(self, center_id):
        return self.center_embed(center_id)
    
word2vec = GloVe(cooccurence, weighting_alpha, max_cooc)
torch.save(word2vec.state_dict(), model_save_path)

display(word2vec.parameters)

<bound method Module.parameters of GloVe(
  (center_embed): Embedding(15983, 300)
  (context_embed): Embedding(15983, 300)
  (center_bias): Embedding(15983, 1)
  (context_bias): Embedding(15983, 1)
)>

### Optimizer and Learning-rate scheduler

In [6]:
optimizer = optim.Adam(word2vec.parameters(), lr=learning_rate)
scheduler = LambdaLR(optimizer, lr_lambda=lr_decay)

### Dataset

In [7]:
class W2VDataset(Dataset):
    def __init__(self, co_occurrence):
        self.data = [(i, j) for i, j in np.argwhere(co_occurrence > 0)]
#         for i in range(total_words):
#             for j in range(total_words):
#                 if co_occurrence[i, j] > 0:
#                     self.data.append((i, j))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

train_dataset = W2VDataset(cooccurence)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

### Learning parameters

In [8]:
# load initial weights
word2vec.load_state_dict(torch.load(model_save_path, map_location=torch.device(device)))
word2vec = word2vec.to(device)

early_stop = 5
history_losses = []
for epoch in range(1, epochs+1):
    
    
    losses = 0.
    cnt = 0
    
    word2vec.train()
    for center_words, context_words in tqdm(train_loader):
        optimizer.zero_grad()
        loss = word2vec(center_words.to(device), context_words.to(device))
        loss.backward()
        optimizer.step()
        losses += loss
        cnt += len(center_words)

    scheduler.step()
    
    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} over {cnt} training points.')
    
    if epoch % 10 == 0:
        # save embedding
        embedding_weights = word2vec.center_embed.state_dict()
        embedding_weights['weight']
        torch.save(embedding_weights, f'save/embedding_weights_{epoch}_epoch_{embedding_size}_dim_{window_size}_wsize_GloVe.pt')
    
    history_losses.append(epoch_loss)
    if len(history_losses) > early_stop and min(history_losses[-early_stop:]) >= min(history_losses[:-early_stop]):
        print(f'Early stopping: training loss does not decrease after {early_stop} epochs')
        break

print("Training finished")

100%|██████████| 1152/1152 [00:35<00:00, 32.57it/s]
  0%|          | 4/1152 [00:00<00:34, 33.70it/s]

Epoch  1: training loss: 2155.1007 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.44it/s]
  0%|          | 4/1152 [00:00<00:35, 32.58it/s]

Epoch  2: training loss: 23834.2383 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.36it/s]
  0%|          | 4/1152 [00:00<00:35, 32.50it/s]

Epoch  3: training loss: 11014.8421 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.38it/s]
  0%|          | 4/1152 [00:00<00:35, 32.30it/s]

Epoch  4: training loss: 1407.1752 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.26it/s]
  0%|          | 4/1152 [00:00<00:35, 32.71it/s]

Epoch  5: training loss: 238.9333 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.30it/s]
  0%|          | 4/1152 [00:00<00:35, 32.55it/s]

Epoch  6: training loss: 121.6824 over 294868 training points.


100%|██████████| 1152/1152 [00:36<00:00, 31.98it/s]
  0%|          | 4/1152 [00:00<00:34, 33.58it/s]

Epoch  7: training loss: 137.4192 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.30it/s]
  0%|          | 4/1152 [00:00<00:34, 33.05it/s]

Epoch  8: training loss: 216.7639 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.33it/s]
  0%|          | 4/1152 [00:00<00:34, 33.36it/s]

Epoch  9: training loss: 290.6060 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.31it/s]
  0%|          | 4/1152 [00:00<00:36, 31.65it/s]

Epoch 10: training loss: 216.1969 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.16it/s]
  0%|          | 4/1152 [00:00<00:35, 32.51it/s]

Epoch 11: training loss: 89.8237 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.28it/s]
  0%|          | 4/1152 [00:00<00:36, 31.68it/s]

Epoch 12: training loss: 37.3532 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.26it/s]
  0%|          | 4/1152 [00:00<00:34, 33.35it/s]

Epoch 13: training loss: 24.1251 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.30it/s]
  0%|          | 3/1152 [00:00<00:39, 29.23it/s]

Epoch 14: training loss: 20.5832 over 294868 training points.


100%|██████████| 1152/1152 [00:34<00:00, 33.27it/s]
  0%|          | 4/1152 [00:00<00:33, 34.15it/s]

Epoch 15: training loss: 17.1792 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.55it/s]
  0%|          | 4/1152 [00:00<00:35, 32.70it/s]

Epoch 16: training loss: 18.3076 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.20it/s]
  0%|          | 4/1152 [00:00<00:35, 32.14it/s]

Epoch 17: training loss: 23.1825 over 294868 training points.


100%|██████████| 1152/1152 [00:35<00:00, 32.09it/s]
  0%|          | 3/1152 [00:00<00:39, 28.89it/s]

Epoch 18: training loss: 23.8008 over 294868 training points.


100%|██████████| 1152/1152 [00:36<00:00, 31.22it/s]
  0%|          | 4/1152 [00:00<00:36, 31.88it/s]

Epoch 19: training loss: 20.9828 over 294868 training points.


100%|██████████| 1152/1152 [00:36<00:00, 31.22it/s]


Epoch 20: training loss: 19.6005 over 294868 training points.
Early stopping: training loss does not decrease after 5 epochs
Training finished
