# Make embeddings for Bengali language
This notebook handles the embedding process.

### Input:
    - Pre-processed training dataframe.

### Output:
    - The trained weights of the embedding layer

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f2b48382f70>

## Load data

In [2]:
# train data
train_df = pd.read_csv('save/bengali_train_preprocessed.csv')
train_sentences = [[int(s) for s in text.split()] for text in train_df['sentence']]
train_labels = train_df['hate'].to_numpy()

# test data
test_df = pd.read_csv('save/bengali_test_preprocessed.csv')
test_sentences = [[int(s) for s in text.split()] for text in test_df['sentence']]
test_labels = train_df['hate'].to_numpy()

# word <-> convertion
with open('save/word_to_int_dict.json', 'r') as f:
    word_to_int = json.load(f)
with open('save/int_to_word_dict.json', 'r') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}

# word-counter
with open('save/word_counter.json', 'r') as f:
    word_counter = json.load(f)
    
vocab_size = len(word_to_int)
total_words = sum(word_counter.values())

### Constants and Hyper-parameters

In [3]:
model_save_path = 'save/bengali_word2vec_neg.pt'

window_size = 5
embedding_size = 300
neg_sample_factor = 10
noise_dist_alpha = 3/4
learning_rate = 0.02
lr_decay = lambda epoch: max(0.05, 0.9**epoch)
batch_size = 256
epochs = 100

## skip-gram

In [4]:
# sampling probability of pair (center, context)
# def sampling_prob(word):
#     z = word_counter[word] / total_words
#     p_keep = ((z/0.000005)**0.5 + 1) * (0.000005/z)
#     return p_keep

def sampling_prob(word):
    z = word_counter[word]
    return 1. / (z ** 0.5)

In [5]:
# noise distribution
noisy_words = [iw for iw in int_to_word]
noisy_dist = np.array([(word_counter[int_to_word[iw]]/total_words)**noise_dist_alpha for iw in noisy_words])
noisy_dist = noisy_dist / noisy_dist.sum()

# noisy word generator
def get_noise_word(batch_size, neg_factor):
    noise_list = np.random.choice(noisy_words, batch_size*neg_factor, p=noisy_dist)
    noise_list = noise_list.reshape((batch_size, neg_factor))
    return torch.from_numpy(noise_list)

In [6]:
def get_target_context(sentence: list(str())):
    for i, word in enumerate(sentence):
        for j, context_word in enumerate(sentence[i-window_size:i+window_size+1]):
            if j != i and random.random() < sampling_prob(int_to_word[context_word]):
                    yield (torch.tensor(word, dtype=torch.long), 
                           torch.tensor(context_word, dtype=torch.long)
                          )

## Train word-embedding

### Model

In [7]:
class Word2Vec(Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.center_embed = nn.Embedding(vocab_size, embedding_size)        
        self.context_embed = nn.Embedding(vocab_size, embedding_size)
        
        init_range = (2 / (vocab_size + embedding_size)) ** 0.5
        self.center_embed.weight.data.uniform_(-init_range, init_range)
        self.context_embed.weight.data.uniform_(-init_range, init_range)
        
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, center_ids, context_ids, negative_samples):
        # center_ids, context_ids: [batch_size]
        # negatve_samples: [batch_size, neg_sample_factor]
        
        # center_embed, context_embed: [batch_size, embedding_size]
        center_embed = self.center_embed(center_ids)
        context_embed = self.context_embed(context_ids)
        
        # pos_dot: [batch_size]
        pos_dot = (center_embed * context_embed).sum(axis=1)
        
        # pos_loss: [batch_size]
        pos_loss = self.log_sigmoid(pos_dot)
        
        # negative_embed: [batch_size, neg_sample_factor, embedding_size]
        negative_embed = self.context_embed(negative_samples)
        
        # negs_dot: [batch_size, neg_sample_factor]
        negs_dot = torch.bmm(negative_embed, center_embed.unsqueeze(2)).squeeze(2) * (-1)
        
        # neg_dot: [batch_size]
        neg_dot = negs_dot.sum(axis=1)
        
        # neg_loss: [batch_size]
        neg_loss = self.log_sigmoid(neg_dot)
        
        loss = -(pos_loss + neg_loss).sum()
        return loss, -pos_loss.sum(), -neg_loss.sum()
    
    def to_embed(self, center_id):
        return self.center_embed(center_id)
    
word2vec = Word2Vec()
torch.save(word2vec.state_dict(), model_save_path)

display(word2vec.parameters)

<bound method Module.parameters of Word2Vec(
  (center_embed): Embedding(15983, 300)
  (context_embed): Embedding(15983, 300)
  (log_sigmoid): LogSigmoid()
)>

### Optimizer and Learning-rate scheduler

In [8]:
optimizer = optim.Adam(word2vec.parameters(), lr=learning_rate)
scheduler = LambdaLR(optimizer, lr_lambda=lr_decay)

### Dataset

In [9]:
class W2VDataset(Dataset):
    def __init__(self, sentences):
        self.data = []
        for sentence in sentences:
            for data_point in get_target_context(sentence):
                self.data.append(data_point)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

### Learning parameters

In [10]:
# load initial weights
word2vec.load_state_dict(torch.load(model_save_path, map_location=torch.device(device)))
word2vec = word2vec.to(device)

early_stop = 5
history_losses = []
for epoch in range(1, epochs+1):
    train_dataset = W2VDataset(train_sentences)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    losses, pos_losses, neg_losses = 0., 0., 0.
    cnt = 0
    
    word2vec.train()
    for center_words, context_words in tqdm(train_loader):
        negative_samples = get_noise_word(len(center_words), neg_sample_factor)
        optimizer.zero_grad()
        loss, pos_loss, neg_loss = word2vec(center_words.to(device), context_words.to(device), negative_samples.to(device))
        loss.backward()
        optimizer.step()
        losses += loss
        cnt += len(center_words)
        pos_losses += pos_loss
        neg_losses += neg_loss

    scheduler.step()
    
    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} (pos: {pos_losses/cnt:.4f}, neg: {neg_losses/(cnt*neg_sample_factor):.4f}) over {cnt} training points.')
    
    if epoch % 10 == 0:
        # save embedding
        embedding_weights = word2vec.center_embed.state_dict()
        embedding_weights['weight']
        torch.save(embedding_weights, f'save/embedding_weights_{epoch}_epoch_{embedding_size}_dim_{window_size}_wsize_{neg_sample_factor}_negfac.pt')
    
    history_losses.append(epoch_loss)
    if len(history_losses) > early_stop and min(history_losses[-early_stop:]) >= min(history_losses[:-early_stop]):
        print(f'Early stopping: training loss does not decrease after {early_stop} epochs')
        break

print("Training finished")

100%|██████████| 575/575 [00:17<00:00, 33.79it/s]


Epoch  1: training loss: 1.9029 (pos: 1.3237, neg: 0.0579) over 147076 training points.


100%|██████████| 575/575 [00:16<00:00, 34.54it/s]


Epoch  2: training loss: 1.9085 (pos: 1.2469, neg: 0.0662) over 147101 training points.


100%|██████████| 575/575 [00:16<00:00, 34.99it/s]


Epoch  3: training loss: 1.4240 (pos: 0.9547, neg: 0.0469) over 147169 training points.


100%|██████████| 575/575 [00:16<00:00, 35.06it/s]


Epoch  4: training loss: 1.0501 (pos: 0.7215, neg: 0.0329) over 146987 training points.


100%|██████████| 574/574 [00:16<00:00, 35.09it/s]


Epoch  5: training loss: 0.7672 (pos: 0.5416, neg: 0.0226) over 146900 training points.


100%|██████████| 576/576 [00:16<00:00, 35.14it/s]


Epoch  6: training loss: 0.6063 (pos: 0.4248, neg: 0.0181) over 147293 training points.


100%|██████████| 574/574 [00:16<00:00, 35.07it/s]


Epoch  7: training loss: 0.4753 (pos: 0.3374, neg: 0.0138) over 146755 training points.


100%|██████████| 576/576 [00:16<00:00, 35.07it/s]


Epoch  8: training loss: 0.3835 (pos: 0.2675, neg: 0.0116) over 147362 training points.


100%|██████████| 574/574 [00:16<00:00, 35.04it/s]


Epoch  9: training loss: 0.2920 (pos: 0.2074, neg: 0.0085) over 146936 training points.


100%|██████████| 575/575 [00:16<00:00, 35.06it/s]


Epoch 10: training loss: 0.2390 (pos: 0.1673, neg: 0.0072) over 146985 training points.


100%|██████████| 575/575 [00:16<00:00, 35.05it/s]


Epoch 11: training loss: 0.1966 (pos: 0.1359, neg: 0.0061) over 147022 training points.


100%|██████████| 573/573 [00:17<00:00, 33.64it/s]


Epoch 12: training loss: 0.1608 (pos: 0.1091, neg: 0.0052) over 146617 training points.


100%|██████████| 576/576 [00:17<00:00, 32.21it/s]


Epoch 13: training loss: 0.1343 (pos: 0.0932, neg: 0.0041) over 147277 training points.


100%|██████████| 576/576 [00:17<00:00, 33.75it/s]


Epoch 14: training loss: 0.1068 (pos: 0.0708, neg: 0.0036) over 147218 training points.


100%|██████████| 575/575 [00:16<00:00, 35.29it/s]


Epoch 15: training loss: 0.0977 (pos: 0.0626, neg: 0.0035) over 147105 training points.


100%|██████████| 573/573 [00:16<00:00, 35.36it/s]


Epoch 16: training loss: 0.0732 (pos: 0.0492, neg: 0.0024) over 146686 training points.


100%|██████████| 576/576 [00:16<00:00, 35.39it/s]


Epoch 17: training loss: 0.0686 (pos: 0.0439, neg: 0.0025) over 147224 training points.


100%|██████████| 575/575 [00:16<00:00, 34.08it/s]


Epoch 18: training loss: 0.0573 (pos: 0.0346, neg: 0.0023) over 147091 training points.


100%|██████████| 576/576 [00:16<00:00, 34.14it/s]


Epoch 19: training loss: 0.0533 (pos: 0.0327, neg: 0.0021) over 147218 training points.


100%|██████████| 573/573 [00:16<00:00, 34.07it/s]


Epoch 20: training loss: 0.0449 (pos: 0.0264, neg: 0.0019) over 146671 training points.


100%|██████████| 575/575 [00:16<00:00, 34.06it/s]


Epoch 21: training loss: 0.0387 (pos: 0.0234, neg: 0.0015) over 147129 training points.


100%|██████████| 574/574 [00:16<00:00, 34.12it/s]


Epoch 22: training loss: 0.0356 (pos: 0.0202, neg: 0.0015) over 146705 training points.


100%|██████████| 575/575 [00:16<00:00, 34.00it/s]


Epoch 23: training loss: 0.0314 (pos: 0.0176, neg: 0.0014) over 147101 training points.


100%|██████████| 575/575 [00:16<00:00, 34.05it/s]


Epoch 24: training loss: 0.0302 (pos: 0.0155, neg: 0.0015) over 147162 training points.


100%|██████████| 574/574 [00:16<00:00, 34.03it/s]


Epoch 25: training loss: 0.0247 (pos: 0.0140, neg: 0.0011) over 146889 training points.


100%|██████████| 577/577 [00:16<00:00, 34.03it/s]


Epoch 26: training loss: 0.0223 (pos: 0.0120, neg: 0.0010) over 147553 training points.


100%|██████████| 575/575 [00:16<00:00, 34.09it/s]


Epoch 27: training loss: 0.0187 (pos: 0.0094, neg: 0.0009) over 146972 training points.


100%|██████████| 576/576 [00:16<00:00, 35.22it/s]


Epoch 28: training loss: 0.0199 (pos: 0.0084, neg: 0.0011) over 147214 training points.


100%|██████████| 573/573 [00:16<00:00, 35.21it/s]


Epoch 29: training loss: 0.0173 (pos: 0.0085, neg: 0.0009) over 146491 training points.


100%|██████████| 574/574 [00:16<00:00, 35.23it/s]


Epoch 30: training loss: 0.0162 (pos: 0.0074, neg: 0.0009) over 146759 training points.


100%|██████████| 575/575 [00:16<00:00, 35.20it/s]


Epoch 31: training loss: 0.0161 (pos: 0.0074, neg: 0.0009) over 146955 training points.


100%|██████████| 575/575 [00:16<00:00, 35.23it/s]


Epoch 32: training loss: 0.0173 (pos: 0.0074, neg: 0.0010) over 147144 training points.


100%|██████████| 576/576 [00:16<00:00, 35.16it/s]


Epoch 33: training loss: 0.0131 (pos: 0.0060, neg: 0.0007) over 147236 training points.


100%|██████████| 575/575 [00:16<00:00, 35.21it/s]


Epoch 34: training loss: 0.0131 (pos: 0.0056, neg: 0.0008) over 146993 training points.


100%|██████████| 576/576 [00:16<00:00, 35.19it/s]


Epoch 35: training loss: 0.0130 (pos: 0.0050, neg: 0.0008) over 147313 training points.


100%|██████████| 574/574 [00:16<00:00, 35.17it/s]


Epoch 36: training loss: 0.0116 (pos: 0.0049, neg: 0.0007) over 146774 training points.


100%|██████████| 575/575 [00:16<00:00, 35.18it/s]


Epoch 37: training loss: 0.0116 (pos: 0.0042, neg: 0.0007) over 147094 training points.


100%|██████████| 573/573 [00:16<00:00, 35.27it/s]


Epoch 38: training loss: 0.0108 (pos: 0.0042, neg: 0.0007) over 146614 training points.


100%|██████████| 574/574 [00:16<00:00, 35.19it/s]


Epoch 39: training loss: 0.0108 (pos: 0.0043, neg: 0.0007) over 146907 training points.


100%|██████████| 575/575 [00:16<00:00, 35.24it/s]


Epoch 40: training loss: 0.0095 (pos: 0.0038, neg: 0.0006) over 147049 training points.


100%|██████████| 573/573 [00:16<00:00, 35.14it/s]


Epoch 41: training loss: 0.0097 (pos: 0.0039, neg: 0.0006) over 146622 training points.


100%|██████████| 575/575 [00:16<00:00, 35.22it/s]


Epoch 42: training loss: 0.0101 (pos: 0.0033, neg: 0.0007) over 147174 training points.


100%|██████████| 575/575 [00:16<00:00, 34.57it/s]


Epoch 43: training loss: 0.0103 (pos: 0.0035, neg: 0.0007) over 147049 training points.


100%|██████████| 576/576 [00:16<00:00, 34.03it/s]


Epoch 44: training loss: 0.0105 (pos: 0.0030, neg: 0.0007) over 147245 training points.


100%|██████████| 576/576 [00:16<00:00, 34.06it/s]


Epoch 45: training loss: 0.0101 (pos: 0.0028, neg: 0.0007) over 147313 training points.
Early stopping: training loss does not decrease after 5 epochs
Training finished
