# Make embeddings for Bengali language
This notebook handles the embedding process.

### Input:
    - Pre-processed training dataframe.

### Output:
    - The trained weights of the embedding layer

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f3697c7cf90>

## Load data

In [2]:
# train data
train_df = pd.read_csv('save/bengali_train_preprocessed.csv')
train_sentences = [[int(s) for s in text.split()] for text in train_df['sentence']]
train_labels = train_df['hate'].to_numpy()

# test data
test_df = pd.read_csv('save/bengali_test_preprocessed.csv')
test_sentences = [[int(s) for s in text.split()] for text in test_df['sentence']]
test_labels = train_df['hate'].to_numpy()

# word <-> convertion
with open('save/word_to_int_dict.json', 'r') as f:
    word_to_int = json.load(f)
with open('save/int_to_word_dict.json', 'r') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}

# word-counter
with open('save/word_counter.json', 'r') as f:
    word_counter = json.load(f)
    
vocab_size = len(word_to_int)
total_words = sum(word_counter.values())

### Constants and Hyper-parameters

In [3]:
model_save_path = 'save/bengali_word2vec_neg.pt'

window_size = 5
embedding_size = 300
neg_sample_factor = 10
noise_dist_alpha = 3/4
learning_rate = 0.02
lr_decay = lambda epoch: max(0.05, 0.8**epoch)
batch_size = 256
epochs = 100

## skip-gram

In [4]:
# sampling probability of pair (center, context)
def sampling_prob(word):
    z = word_counter[word] / total_words
    p_keep = ((z/0.001)**0.5 + 1) * (0.001/z)
    return p_keep

In [5]:
# noise distribution
noisy_words = [iw for iw in int_to_word]
noisy_dist = np.array([(word_counter[int_to_word[iw]]/total_words)**noise_dist_alpha for iw in noisy_words])
noisy_dist = noisy_dist / noisy_dist.sum()

# noisy word generator
def get_noise_word(batch_size, neg_factor):
    noise_list = np.random.choice(noisy_words, batch_size*neg_factor, p=noisy_dist)
    noise_list = noise_list.reshape((batch_size, neg_factor))
    return torch.from_numpy(noise_list)

In [6]:
def get_target_context(sentence: list(str())):
    for i, word in enumerate(sentence):
        for j, context_word in enumerate(sentence[i-window_size:i+window_size+1]):
            if j != i and random.random() < sampling_prob(int_to_word[context_word]):
                    yield (torch.tensor(word, dtype=torch.long), 
                           torch.tensor(context_word, dtype=torch.long)
                          )

## Train word-embedding

### Model

In [7]:
class Word2Vec(Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.center_embed = nn.Embedding(vocab_size, embedding_size)        
        self.context_embed = nn.Embedding(vocab_size, embedding_size)
        
        init_range = (2 / (vocab_size + embedding_size)) ** 0.5
        self.center_embed.weight.data.uniform_(-init_range, init_range)
        self.context_embed.weight.data.uniform_(-init_range, init_range)
        
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, center_ids, context_ids, negative_samples):
        # center_ids, context_ids: [batch_size]
        # negatve_samples: [batch_size, neg_sample_factor]
        
        # center_embed, context_embed: [batch_size, embedding_size]
        center_embed = self.center_embed(center_ids)
        context_embed = self.context_embed(context_ids)
        
        # pos_dot: [batch_size]
        pos_dot = (center_embed * context_embed).sum(axis=1)
        
        # pos_loss: [batch_size]
        pos_loss = self.log_sigmoid(pos_dot)
        
        # negative_embed: [batch_size, neg_sample_factor, embedding_size]
        negative_embed = self.context_embed(negative_samples)
        
        # negs_dot: [batch_size, neg_sample_factor]
        negs_dot = torch.bmm(negative_embed, center_embed.unsqueeze(2)).squeeze(2) * (-1)
        
        # neg_dot: [batch_size]
        neg_dot = negs_dot.sum(axis=1)
        
        # neg_loss: [batch_size]
        neg_loss = self.log_sigmoid(neg_dot)
        
        loss = -(pos_loss + neg_loss).sum()
        return loss, -pos_loss.sum(), -neg_loss.sum()
    
    def to_embed(self, center_id):
        return self.center_embed(center_id)
    
word2vec = Word2Vec()
torch.save(word2vec.state_dict(), model_save_path)

display(word2vec.parameters)

<bound method Module.parameters of Word2Vec(
  (center_embed): Embedding(15983, 300)
  (context_embed): Embedding(15983, 300)
  (log_sigmoid): LogSigmoid()
)>

### Optimizer and Learning-rate scheduler

In [8]:
optimizer = optim.Adam(word2vec.parameters(), lr=learning_rate)
scheduler = LambdaLR(optimizer, lr_lambda=lr_decay)

### Dataset

In [9]:
class W2VDataset(Dataset):
    def __init__(self, sentences):
        self.data = []
        for sentence in sentences:
            for data_point in get_target_context(sentence):
                self.data.append(data_point)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

### Learning parameters

In [10]:
# load initial weights
word2vec.load_state_dict(torch.load(model_save_path, map_location=torch.device(device)))
word2vec = word2vec.to(device)

early_stop = 5
history_losses = []
for epoch in range(1, epochs+1):
    train_dataset = W2VDataset(train_sentences)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    losses, pos_losses, neg_losses = 0., 0., 0.
    cnt = 0
    
    word2vec.train()
    for center_words, context_words in tqdm(train_loader):
        negative_samples = get_noise_word(len(center_words), neg_sample_factor)
        optimizer.zero_grad()
        loss, pos_loss, neg_loss = word2vec(center_words.to(device), context_words.to(device), negative_samples.to(device))
        loss.backward()
        optimizer.step()
        losses += loss
        cnt += len(center_words)
        pos_losses += pos_loss
        neg_losses += neg_loss

    scheduler.step()
    
    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} (pos: {pos_losses/cnt:.4f}, neg: {neg_losses/(cnt*neg_sample_factor):.4f}) over {cnt} training points.')
    
    if epoch % 10 == 0:
        # save embedding
        embedding_weights = word2vec.center_embed.state_dict()
        embedding_weights['weight']
        torch.save(embedding_weights, f'save/embedding_weights_{epoch}_epoch_{embedding_size}_dim_{window_size}_wsize_{neg_sample_factor}_negfac.pt')
    
    history_losses.append(epoch_loss)
    if len(history_losses) > early_stop and min(history_losses[-early_stop:]) >= min(history_losses[:-early_stop]):
        print(f'Early stopping: training loss does not decrease after {early_stop} epochs')
        break

print("Training finished")

100%|██████████| 1191/1191 [00:33<00:00, 35.84it/s]


Epoch  1: training loss: 2.7034 (pos: 1.8798, neg: 0.0824) over 304879 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.82it/s]


Epoch  2: training loss: 1.4383 (pos: 0.9493, neg: 0.0489) over 304872 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.81it/s]


Epoch  3: training loss: 0.5686 (pos: 0.3518, neg: 0.0217) over 304778 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.84it/s]


Epoch  4: training loss: 0.2789 (pos: 0.1735, neg: 0.0105) over 304707 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.82it/s]


Epoch  5: training loss: 0.1573 (pos: 0.0926, neg: 0.0065) over 304866 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.79it/s]


Epoch  6: training loss: 0.0956 (pos: 0.0545, neg: 0.0041) over 304753 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.84it/s]


Epoch  7: training loss: 0.0631 (pos: 0.0340, neg: 0.0029) over 304848 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.78it/s]


Epoch  8: training loss: 0.0429 (pos: 0.0208, neg: 0.0022) over 304856 training points.


100%|██████████| 1192/1192 [00:33<00:00, 35.78it/s]


Epoch  9: training loss: 0.0273 (pos: 0.0107, neg: 0.0017) over 304937 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.74it/s]


Epoch 10: training loss: 0.0200 (pos: 0.0077, neg: 0.0012) over 304896 training points.


100%|██████████| 1192/1192 [00:33<00:00, 35.78it/s]


Epoch 11: training loss: 0.0149 (pos: 0.0048, neg: 0.0010) over 304902 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.80it/s]


Epoch 12: training loss: 0.0121 (pos: 0.0032, neg: 0.0009) over 304852 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.76it/s]


Epoch 13: training loss: 0.0109 (pos: 0.0024, neg: 0.0008) over 304827 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.72it/s]


Epoch 14: training loss: 0.0079 (pos: 0.0018, neg: 0.0006) over 304841 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.75it/s]


Epoch 15: training loss: 0.0092 (pos: 0.0013, neg: 0.0008) over 304843 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.72it/s]


Epoch 16: training loss: 0.0076 (pos: 0.0016, neg: 0.0006) over 304821 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.70it/s]


Epoch 17: training loss: 0.0074 (pos: 0.0012, neg: 0.0006) over 304775 training points.


100%|██████████| 1192/1192 [00:33<00:00, 35.69it/s]


Epoch 18: training loss: 0.0084 (pos: 0.0014, neg: 0.0007) over 304961 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.72it/s]


Epoch 19: training loss: 0.0078 (pos: 0.0012, neg: 0.0007) over 304715 training points.


100%|██████████| 1192/1192 [00:33<00:00, 35.70it/s]


Epoch 20: training loss: 0.0074 (pos: 0.0012, neg: 0.0006) over 304927 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.70it/s]


Epoch 21: training loss: 0.0072 (pos: 0.0011, neg: 0.0006) over 304841 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.68it/s]


Epoch 22: training loss: 0.0079 (pos: 0.0011, neg: 0.0007) over 304659 training points.


100%|██████████| 1192/1192 [00:33<00:00, 35.69it/s]


Epoch 23: training loss: 0.0070 (pos: 0.0012, neg: 0.0006) over 304923 training points.


100%|██████████| 1192/1192 [00:33<00:00, 35.67it/s]


Epoch 24: training loss: 0.0088 (pos: 0.0012, neg: 0.0008) over 304923 training points.


100%|██████████| 1192/1192 [00:33<00:00, 35.67it/s]


Epoch 25: training loss: 0.0075 (pos: 0.0011, neg: 0.0006) over 304957 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.66it/s]


Epoch 26: training loss: 0.0085 (pos: 0.0011, neg: 0.0007) over 304798 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.66it/s]


Epoch 27: training loss: 0.0078 (pos: 0.0012, neg: 0.0007) over 304780 training points.


100%|██████████| 1191/1191 [00:33<00:00, 35.68it/s]


Epoch 28: training loss: 0.0077 (pos: 0.0011, neg: 0.0007) over 304743 training points.
Early stopping: training loss does not decrease after 5 epochs
Training finished
