# Make embeddings for Bengali language
This notebook handles the embedding process.

### Input:
    - Pre-processed training dataframe.

### Output:
    - The trained weights of the embedding layer.

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Load data

In [2]:
ben_train_df = pd.read_csv('save/bengali_hatespeech_embed_train_preprocessed.csv')
# remove empty texts
ben_train_df = ben_train_df[ben_train_df.sentence.str.len() > 0]

# load mapping {word -> id} and {id -> word}
with open('save/word_to_int_dict.json') as f:
    word_to_int = json.load(f)
with open('save/int_to_word_dict.json') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}
with open('save/word_counter.json') as f:
    word_counter = json.load(f)

# get vocab_size
vocab_size = len(word_to_int)
print(f'vocab_size: {vocab_size}')

# get total occurences
total_words = sum(word_counter.values())
print(f'total word occurences: {total_words}')

# extract sentences and labels
train_sentences = [[word_to_int[w] for w in text.split()] for text in ben_train_df['sentence']]
train_labels = ben_train_df['hate'].to_numpy()

vocab_size: 55189
total word occurences: 303627


### Constants and Hyper-parameters

In [3]:
model_save_path = 'save/word2vec_neg.pt'

window_size = 5
embedding_size = 300
neg_sample_factor = 10
noise_dist_alpha = 3/4
learning_rate = 0.01
lr_decay = lambda epoch: max(0.05, 0.9**epoch)
batch_size = 256
epochs = 100

## skip-gram

In [4]:
# sampling probability of pair (center, context)
def sampling_prob(word):
    z = word_counter[word] / total_words
    p_keep = ((z/0.000001)**0.5 + 1) * (0.000001/z)
    return p_keep

In [5]:
# noise distribution
noisy_words = [iw for iw in int_to_word]
noisy_dist = np.array([(word_counter[int_to_word[iw]]/total_words)**noise_dist_alpha for iw in noisy_words])
noisy_dist = noisy_dist / noisy_dist.sum()

# noisy word generator
def get_noise_word(batch_size, neg_factor):
    noise_list = np.random.choice(noisy_words, batch_size*neg_factor, p=noisy_dist)
    noise_list = noise_list.reshape((batch_size, neg_factor))
    return torch.from_numpy(noise_list)

In [6]:
def get_target_context(sentence: list(str())):
    for i, word in enumerate(sentence):
        for j, context_word in enumerate(sentence[i-window_size:i+window_size+1]):
            if j != i and random.random() < sampling_prob(int_to_word[context_word]):
                    yield (torch.tensor(word, dtype=torch.long), 
                           torch.tensor(context_word, dtype=torch.long)
                          )

## Train word-embedding

### Model

In [7]:
class Word2Vec(Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.center_embed = nn.Embedding(vocab_size, embedding_size)        
        self.context_embed = nn.Embedding(vocab_size, embedding_size)
        
        init_range = (2 / (vocab_size + embedding_size)) ** 0.5
        self.center_embed.weight.data.uniform_(-init_range, init_range)
        self.context_embed.weight.data.uniform_(-init_range, init_range)
        
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, center_ids, context_ids, negative_samples):
        # center_ids, context_ids: [batch_size]
        # negatve_samples: [batch_size, neg_sample_factor]
        
        # center_embed, context_embed: [batch_size, embedding_size]
        center_embed = self.center_embed(center_ids)
        context_embed = self.context_embed(context_ids)
        
        # pos_dot: [batch_size]
        pos_dot = (center_embed * context_embed).sum(axis=1)
        
        # pos_loss: [batch_size]
        pos_loss = self.log_sigmoid(pos_dot)
        
        # negative_embed: [batch_size, neg_sample_factor, embedding_size]
        negative_embed = self.context_embed(negative_samples)
        
        # negs_dot: [batch_size, neg_sample_factor]
        negs_dot = torch.bmm(negative_embed, center_embed.unsqueeze(2)).squeeze(2) * (-1)
        
        # neg_dot: [batch_size]
        neg_dot = negs_dot.sum(axis=1)
        
        # neg_loss: [batch_size]
        neg_loss = self.log_sigmoid(neg_dot)
        
        loss = -(pos_loss + neg_loss).sum()
        return loss, -pos_loss.sum(), -neg_loss.sum()
    
    def to_embed(self, center_id):
        return self.center_embed(center_id)
    
word2vec = Word2Vec()
torch.save(word2vec.state_dict(), model_save_path)

display(word2vec.parameters)

<bound method Module.parameters of Word2Vec(
  (center_embed): Embedding(55189, 300)
  (context_embed): Embedding(55189, 300)
  (log_sigmoid): LogSigmoid()
)>

### Optimizer and Learning-rate scheduler

In [8]:
optimizer = optim.Adam(word2vec.parameters(), lr=learning_rate)
scheduler = LambdaLR(optimizer, lr_lambda=lr_decay)

### Dataset

In [9]:
class W2VDataset(Dataset):
    def __init__(self, sentences):
        self.data = []
        for sentence in sentences:
            for data_point in get_target_context(sentence):
                self.data.append(data_point)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

### Learning parameters

In [10]:
# load initial weights
word2vec.load_state_dict(torch.load(model_save_path, map_location=torch.device(device)))
word2vec = word2vec.to(device)

early_stop = 5
history_losses = []
for epoch in range(1, epochs+1):
    train_dataset = W2VDataset(train_sentences)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    losses, pos_losses, neg_losses = 0., 0., 0.
    cnt = 0
    
    word2vec.train()
    for center_words, context_words in tqdm(train_loader):
        negative_samples = get_noise_word(len(center_words), neg_sample_factor)
        optimizer.zero_grad()
        loss, pos_loss, neg_loss = word2vec(center_words.to(device), context_words.to(device), negative_samples.to(device))
        loss.backward()
        optimizer.step()
        losses += loss
        cnt += len(center_words)
        pos_losses += pos_loss
        neg_losses += neg_loss

    scheduler.step()
    
    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} (pos: {pos_losses/cnt:.4f}, neg: {neg_losses/(cnt*neg_sample_factor):.4f}) over {cnt} training points.')
    
    if epoch % 10 == 0:
        # save embedding
        embedding_weights = word2vec.center_embed.state_dict()
        torch.save(embedding_weights, f'save/embedding_checkpoints/{epoch}_epoch_{embedding_size}_dim_{window_size}_wsize_{neg_sample_factor}_negfac.pt')
    
    history_losses.append(epoch_loss)
    if len(history_losses) > early_stop and min(history_losses[-early_stop:]) >= min(history_losses[:-early_stop]):
        print(f'Early stopping: training loss does not decrease after {early_stop} epochs')
        break

print("Training finished")

100%|██████████| 1655/1655 [02:32<00:00, 10.86it/s]


Epoch  1: training loss: 1.4153 (pos: 1.0731, neg: 0.0342) over 423646 training points.


100%|██████████| 1659/1659 [02:35<00:00, 10.66it/s]


Epoch  2: training loss: 1.3822 (pos: 1.0008, neg: 0.0381) over 424456 training points.


100%|██████████| 1655/1655 [02:29<00:00, 11.09it/s]


Epoch  3: training loss: 1.1948 (pos: 0.8611, neg: 0.0334) over 423492 training points.


100%|██████████| 1657/1657 [02:30<00:00, 10.99it/s]


Epoch  4: training loss: 1.0362 (pos: 0.7579, neg: 0.0278) over 424101 training points.


100%|██████████| 1656/1656 [02:29<00:00, 11.08it/s]


Epoch  5: training loss: 0.8926 (pos: 0.6614, neg: 0.0231) over 423722 training points.


100%|██████████| 1655/1655 [02:29<00:00, 11.10it/s]


Epoch  6: training loss: 0.7600 (pos: 0.5698, neg: 0.0190) over 423638 training points.


100%|██████████| 1655/1655 [02:29<00:00, 11.10it/s]


Epoch  7: training loss: 0.6720 (pos: 0.5051, neg: 0.0167) over 423650 training points.


100%|██████████| 1657/1657 [02:29<00:00, 11.08it/s]


Epoch  8: training loss: 0.5805 (pos: 0.4381, neg: 0.0142) over 424006 training points.


100%|██████████| 1659/1659 [02:29<00:00, 11.10it/s]


Epoch  9: training loss: 0.5125 (pos: 0.3889, neg: 0.0124) over 424493 training points.


100%|██████████| 1657/1657 [02:31<00:00, 10.95it/s]


Epoch 10: training loss: 0.4412 (pos: 0.3370, neg: 0.0104) over 424143 training points.


100%|██████████| 1658/1658 [02:33<00:00, 10.79it/s]


Epoch 11: training loss: 0.3961 (pos: 0.3005, neg: 0.0096) over 424298 training points.


100%|██████████| 1658/1658 [02:34<00:00, 10.75it/s]


Epoch 12: training loss: 0.3495 (pos: 0.2659, neg: 0.0084) over 424312 training points.


100%|██████████| 1660/1660 [02:29<00:00, 11.13it/s]


Epoch 13: training loss: 0.3109 (pos: 0.2376, neg: 0.0073) over 424761 training points.


100%|██████████| 1657/1657 [02:28<00:00, 11.13it/s]


Epoch 14: training loss: 0.2770 (pos: 0.2096, neg: 0.0067) over 424184 training points.


100%|██████████| 1654/1654 [02:28<00:00, 11.11it/s]


Epoch 15: training loss: 0.2478 (pos: 0.1904, neg: 0.0057) over 423365 training points.


100%|██████████| 1657/1657 [02:28<00:00, 11.13it/s]


Epoch 16: training loss: 0.2288 (pos: 0.1734, neg: 0.0055) over 423943 training points.


100%|██████████| 1657/1657 [02:28<00:00, 11.13it/s]


Epoch 17: training loss: 0.2106 (pos: 0.1575, neg: 0.0053) over 424037 training points.


100%|██████████| 1658/1658 [02:29<00:00, 11.13it/s]


Epoch 18: training loss: 0.1916 (pos: 0.1451, neg: 0.0047) over 424346 training points.


100%|██████████| 1659/1659 [02:29<00:00, 11.13it/s]


Epoch 19: training loss: 0.1740 (pos: 0.1307, neg: 0.0043) over 424684 training points.


100%|██████████| 1657/1657 [02:28<00:00, 11.12it/s]


Epoch 20: training loss: 0.1636 (pos: 0.1232, neg: 0.0040) over 423990 training points.


100%|██████████| 1656/1656 [02:28<00:00, 11.12it/s]


Epoch 21: training loss: 0.1526 (pos: 0.1144, neg: 0.0038) over 423867 training points.


100%|██████████| 1654/1654 [02:28<00:00, 11.12it/s]


Epoch 22: training loss: 0.1422 (pos: 0.1051, neg: 0.0037) over 423407 training points.


100%|██████████| 1658/1658 [02:29<00:00, 11.12it/s]


Epoch 23: training loss: 0.1346 (pos: 0.1002, neg: 0.0034) over 424275 training points.


100%|██████████| 1655/1655 [02:28<00:00, 11.12it/s]


Epoch 24: training loss: 0.1257 (pos: 0.0925, neg: 0.0033) over 423551 training points.


100%|██████████| 1658/1658 [02:29<00:00, 11.12it/s]


Epoch 25: training loss: 0.1250 (pos: 0.0915, neg: 0.0033) over 424368 training points.


100%|██████████| 1655/1655 [02:31<00:00, 10.91it/s]


Epoch 26: training loss: 0.1176 (pos: 0.0858, neg: 0.0032) over 423597 training points.


100%|██████████| 1657/1657 [02:29<00:00, 11.11it/s]


Epoch 27: training loss: 0.1129 (pos: 0.0823, neg: 0.0031) over 424031 training points.


100%|██████████| 1655/1655 [02:28<00:00, 11.12it/s]


Epoch 28: training loss: 0.1065 (pos: 0.0772, neg: 0.0029) over 423584 training points.


100%|██████████| 1657/1657 [02:29<00:00, 11.11it/s]


Epoch 29: training loss: 0.1058 (pos: 0.0766, neg: 0.0029) over 424118 training points.


100%|██████████| 1653/1653 [02:28<00:00, 11.11it/s]


Epoch 30: training loss: 0.1012 (pos: 0.0726, neg: 0.0029) over 422942 training points.


100%|██████████| 1657/1657 [02:29<00:00, 11.11it/s]


Epoch 31: training loss: 0.0986 (pos: 0.0705, neg: 0.0028) over 424124 training points.


100%|██████████| 1658/1658 [02:29<00:00, 11.11it/s]


Epoch 32: training loss: 0.0972 (pos: 0.0691, neg: 0.0028) over 424195 training points.


100%|██████████| 1655/1655 [02:29<00:00, 11.10it/s]


Epoch 33: training loss: 0.0931 (pos: 0.0663, neg: 0.0027) over 423653 training points.


100%|██████████| 1657/1657 [02:29<00:00, 11.11it/s]


Epoch 34: training loss: 0.0924 (pos: 0.0657, neg: 0.0027) over 423965 training points.


100%|██████████| 1653/1653 [02:28<00:00, 11.10it/s]


Epoch 35: training loss: 0.0905 (pos: 0.0636, neg: 0.0027) over 423072 training points.


100%|██████████| 1656/1656 [02:29<00:00, 11.10it/s]


Epoch 36: training loss: 0.0869 (pos: 0.0604, neg: 0.0026) over 423695 training points.


100%|██████████| 1656/1656 [02:29<00:00, 11.09it/s]


Epoch 37: training loss: 0.0841 (pos: 0.0581, neg: 0.0026) over 423726 training points.


100%|██████████| 1655/1655 [02:29<00:00, 11.10it/s]


Epoch 38: training loss: 0.0838 (pos: 0.0584, neg: 0.0025) over 423604 training points.


100%|██████████| 1656/1656 [02:29<00:00, 11.10it/s]


Epoch 39: training loss: 0.0817 (pos: 0.0563, neg: 0.0025) over 423916 training points.


100%|██████████| 1656/1656 [02:38<00:00, 10.43it/s]


Epoch 40: training loss: 0.0818 (pos: 0.0549, neg: 0.0027) over 423761 training points.


100%|██████████| 1658/1658 [02:46<00:00,  9.95it/s]


Epoch 41: training loss: 0.0774 (pos: 0.0531, neg: 0.0024) over 424335 training points.


100%|██████████| 1658/1658 [02:46<00:00,  9.95it/s]


Epoch 42: training loss: 0.0769 (pos: 0.0523, neg: 0.0025) over 424311 training points.


100%|██████████| 1659/1659 [02:44<00:00, 10.08it/s]


Epoch 43: training loss: 0.0763 (pos: 0.0507, neg: 0.0026) over 424469 training points.


100%|██████████| 1654/1654 [02:46<00:00,  9.96it/s]


Epoch 44: training loss: 0.0740 (pos: 0.0501, neg: 0.0024) over 423238 training points.


100%|██████████| 1658/1658 [02:48<00:00,  9.86it/s]


Epoch 45: training loss: 0.0739 (pos: 0.0498, neg: 0.0024) over 424334 training points.


100%|██████████| 1660/1660 [02:46<00:00,  9.96it/s]


Epoch 46: training loss: 0.0712 (pos: 0.0475, neg: 0.0024) over 424811 training points.


100%|██████████| 1655/1655 [02:44<00:00, 10.06it/s]


Epoch 47: training loss: 0.0716 (pos: 0.0464, neg: 0.0025) over 423462 training points.


100%|██████████| 1657/1657 [02:41<00:00, 10.25it/s]


Epoch 48: training loss: 0.0684 (pos: 0.0456, neg: 0.0023) over 424173 training points.


100%|██████████| 1657/1657 [02:37<00:00, 10.50it/s]


Epoch 49: training loss: 0.0682 (pos: 0.0450, neg: 0.0023) over 424049 training points.


100%|██████████| 1654/1654 [02:42<00:00, 10.18it/s]


Epoch 50: training loss: 0.0668 (pos: 0.0432, neg: 0.0024) over 423348 training points.


100%|██████████| 1654/1654 [02:41<00:00, 10.21it/s]


Epoch 51: training loss: 0.0661 (pos: 0.0429, neg: 0.0023) over 423348 training points.


100%|██████████| 1659/1659 [02:35<00:00, 10.64it/s]


Epoch 52: training loss: 0.0651 (pos: 0.0428, neg: 0.0022) over 424615 training points.


100%|██████████| 1656/1656 [02:26<00:00, 11.30it/s]


Epoch 53: training loss: 0.0647 (pos: 0.0428, neg: 0.0022) over 423733 training points.


100%|██████████| 1655/1655 [02:38<00:00, 10.47it/s]


Epoch 54: training loss: 0.0622 (pos: 0.0401, neg: 0.0022) over 423503 training points.


100%|██████████| 1657/1657 [02:31<00:00, 10.91it/s]


Epoch 55: training loss: 0.0621 (pos: 0.0405, neg: 0.0022) over 424188 training points.


100%|██████████| 1659/1659 [02:25<00:00, 11.36it/s]


Epoch 56: training loss: 0.0605 (pos: 0.0383, neg: 0.0022) over 424521 training points.


100%|██████████| 1658/1658 [02:26<00:00, 11.33it/s]


Epoch 57: training loss: 0.0608 (pos: 0.0382, neg: 0.0023) over 424370 training points.


100%|██████████| 1654/1654 [02:25<00:00, 11.36it/s]


Epoch 58: training loss: 0.0593 (pos: 0.0373, neg: 0.0022) over 423347 training points.


100%|██████████| 1658/1658 [02:30<00:00, 11.02it/s]


Epoch 59: training loss: 0.0583 (pos: 0.0365, neg: 0.0022) over 424217 training points.


100%|██████████| 1656/1656 [02:31<00:00, 10.93it/s]


Epoch 60: training loss: 0.0597 (pos: 0.0365, neg: 0.0023) over 423752 training points.


100%|██████████| 1657/1657 [02:31<00:00, 10.96it/s]


Epoch 61: training loss: 0.0577 (pos: 0.0363, neg: 0.0021) over 423957 training points.


100%|██████████| 1656/1656 [02:30<00:00, 10.99it/s]


Epoch 62: training loss: 0.0555 (pos: 0.0344, neg: 0.0021) over 423823 training points.


100%|██████████| 1658/1658 [02:30<00:00, 11.03it/s]


Epoch 63: training loss: 0.0565 (pos: 0.0344, neg: 0.0022) over 424219 training points.


100%|██████████| 1657/1657 [02:29<00:00, 11.11it/s]


Epoch 64: training loss: 0.0552 (pos: 0.0340, neg: 0.0021) over 423997 training points.


100%|██████████| 1657/1657 [02:27<00:00, 11.21it/s]


Epoch 65: training loss: 0.0548 (pos: 0.0334, neg: 0.0021) over 424068 training points.


100%|██████████| 1657/1657 [02:26<00:00, 11.35it/s]


Epoch 66: training loss: 0.0541 (pos: 0.0323, neg: 0.0022) over 424068 training points.


100%|██████████| 1657/1657 [02:25<00:00, 11.36it/s]


Epoch 67: training loss: 0.0555 (pos: 0.0324, neg: 0.0023) over 423976 training points.


100%|██████████| 1653/1653 [02:31<00:00, 10.92it/s]


Epoch 68: training loss: 0.0531 (pos: 0.0316, neg: 0.0021) over 423019 training points.


100%|██████████| 1661/1661 [02:27<00:00, 11.23it/s]


Epoch 69: training loss: 0.0511 (pos: 0.0311, neg: 0.0020) over 425115 training points.


100%|██████████| 1655/1655 [02:25<00:00, 11.36it/s]


Epoch 70: training loss: 0.0528 (pos: 0.0312, neg: 0.0022) over 423429 training points.


100%|██████████| 1654/1654 [02:25<00:00, 11.36it/s]


Epoch 71: training loss: 0.0503 (pos: 0.0302, neg: 0.0020) over 423411 training points.


100%|██████████| 1658/1658 [02:25<00:00, 11.36it/s]


Epoch 72: training loss: 0.0510 (pos: 0.0300, neg: 0.0021) over 424399 training points.


100%|██████████| 1655/1655 [02:25<00:00, 11.36it/s]


Epoch 73: training loss: 0.0500 (pos: 0.0292, neg: 0.0021) over 423532 training points.


100%|██████████| 1656/1656 [02:25<00:00, 11.36it/s]


Epoch 74: training loss: 0.0514 (pos: 0.0292, neg: 0.0022) over 423832 training points.


100%|██████████| 1660/1660 [02:26<00:00, 11.36it/s]


Epoch 75: training loss: 0.0497 (pos: 0.0295, neg: 0.0020) over 424906 training points.


100%|██████████| 1657/1657 [02:25<00:00, 11.36it/s]


Epoch 76: training loss: 0.0491 (pos: 0.0282, neg: 0.0021) over 424138 training points.


100%|██████████| 1657/1657 [02:25<00:00, 11.36it/s]


Epoch 77: training loss: 0.0474 (pos: 0.0279, neg: 0.0020) over 423947 training points.


100%|██████████| 1657/1657 [02:25<00:00, 11.36it/s]


Epoch 78: training loss: 0.0492 (pos: 0.0282, neg: 0.0021) over 424022 training points.


100%|██████████| 1658/1658 [02:25<00:00, 11.36it/s]


Epoch 79: training loss: 0.0494 (pos: 0.0276, neg: 0.0022) over 424316 training points.


100%|██████████| 1656/1656 [02:25<00:00, 11.36it/s]


Epoch 80: training loss: 0.0478 (pos: 0.0275, neg: 0.0020) over 423789 training points.


100%|██████████| 1656/1656 [02:29<00:00, 11.07it/s]


Epoch 81: training loss: 0.0483 (pos: 0.0269, neg: 0.0021) over 423737 training points.


100%|██████████| 1659/1659 [02:27<00:00, 11.28it/s]


Epoch 82: training loss: 0.0476 (pos: 0.0264, neg: 0.0021) over 424563 training points.
Early stopping: training loss does not decrease after 5 epochs
Training finished


In [11]:
# save embedding weights
embedding_weights = word2vec.center_embed.state_dict()
torch.save(embedding_weights, f'save/big_embedding_weights_{window_size}_wsize_{neg_sample_factor}_negfac.pt')