# Make embeddings for Hindilanguage
This notebook handles the embedding process.

### Input:
    - Pre-processed training dataframe.

### Output:
    - The trained weights of the embedding layer.

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.init import xavier_uniform_
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

device = 'cuda'

import random

torch.manual_seed(123)
torch.cuda.manual_seed(234)
np.random.seed(345)
random.seed(456)
torch.manual_seed(567)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Load data

In [2]:
# train data
train_df = pd.read_csv('save/hindi_train_preprocessed.csv')
train_sentences = [[int(s) for s in text.split()] for text in train_df['sentence']]
train_df['hate'] = (train_df['task_1'] == 'HOF').astype(int)
train_labels = train_df['hate'].to_numpy()

# test data
test_df = pd.read_csv('save/hindi_test_preprocessed.csv')
test_sentences = [[int(s) for s in text.split()] for text in test_df['sentence']]
test_df['hate'] = (test_df['task_1'] == 'HOF').astype(int)
test_labels = train_df['hate'].to_numpy()

# word <-> convertion
with open('save/hindi_word_to_int_dict.json', 'r') as f:
    word_to_int = json.load(f)
with open('save/hindi_int_to_word_dict.json', 'r') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}

# word-counter
with open('save/hindi_word_counter.json', 'r') as f:
    word_counter = json.load(f)
    
vocab_size = len(word_to_int)
total_words = sum(word_counter.values())

### Constants and Hyper-parameters

In [3]:
model_save_path = 'save/hindi_word2vec_neg.pt'

window_size = 5
embedding_size = 300
neg_sample_factor = 10
noise_dist_alpha = 3/4
learning_rate = 0.01
lr_decay = lambda epoch: max(0.03, 0.9**epoch)
batch_size = 256
epochs = 100

## skip-gram

In [4]:
# sampling probability of pair (center, context)
def sampling_prob(word):
    z = word_counter[word] / total_words
    p_keep = ((z/0.000001)**0.5 + 1) * (0.000001/z)
    return p_keep

In [5]:
# noise distribution
noisy_words = [iw for iw in int_to_word]
noisy_dist = np.array([(word_counter[int_to_word[iw]]/total_words)**noise_dist_alpha for iw in noisy_words])
noisy_dist = noisy_dist / noisy_dist.sum()

# noisy word generator
def get_noise_word(batch_size, neg_factor):
    noise_list = np.random.choice(noisy_words, batch_size*neg_factor, p=noisy_dist)
    noise_list = noise_list.reshape((batch_size, neg_factor))
    return torch.from_numpy(noise_list)

In [6]:
def get_target_context(sentence: list(str())):
    for i, word in enumerate(sentence):
        for j, context_word in enumerate(sentence[i-window_size:i+window_size+1]):
            if j != i and random.random() < sampling_prob(int_to_word[context_word]):
                    yield (torch.tensor(word, dtype=torch.long), 
                           torch.tensor(context_word, dtype=torch.long)
                          )

## Train word-embedding

### Model

In [7]:
class Word2Vec(Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.center_embed = nn.Embedding(vocab_size, embedding_size)
        self.context_embed = nn.Embedding(vocab_size, embedding_size)  
        
        # xavier initialization
        init_range = (2 / (vocab_size + embedding_size)) ** 0.5
        self.center_embed.weight.data.uniform_(-init_range, init_range)
        self.context_embed.weight.data.uniform_(-init_range, init_range)
        
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, center_ids, context_ids, negative_samples):
        # center_ids, context_ids: [batch_size]
        # negatve_samples: [batch_size, neg_sample_factor]
        
        # center_embed, context_embed: [batch_size, embedding_size]
        center_embed = self.center_embed(center_ids)
        context_embed = self.context_embed(context_ids)
        
        # pos_dot: [batch_size]
        pos_dot = (center_embed * context_embed).sum(axis=1)
        
        # pos_loss: [batch_size]
        pos_loss = self.log_sigmoid(pos_dot)
        
        # negative_embed: [batch_size, neg_sample_factor, embedding_size]
        negative_embed = self.context_embed(negative_samples)
        
        # negs_dot: [batch_size, neg_sample_factor]
        negs_dot = torch.bmm(negative_embed, center_embed.unsqueeze(2)).squeeze(2) * (-1)
        
        # neg_dot: [batch_size]
        neg_dot = negs_dot.sum(axis=1)
        
        # neg_loss: [batch_size]
        neg_loss = self.log_sigmoid(neg_dot)
        
        loss = -(pos_loss + neg_loss).sum()
        return loss, -pos_loss.sum(), -neg_loss.sum()
    
    def to_embed(self, center_id):
        return self.center_embed(center_id)
    
word2vec = Word2Vec()
torch.save(word2vec.state_dict(), model_save_path)

display(word2vec.parameters)

<bound method Module.parameters of Word2Vec(
  (center_embed): Embedding(19379, 300)
  (context_embed): Embedding(19379, 300)
  (log_sigmoid): LogSigmoid()
)>

### Optimizer and Learning-rate scheduler

In [8]:
optimizer = optim.Adam(word2vec.parameters(), lr=learning_rate)
scheduler = LambdaLR(optimizer, lr_lambda=lr_decay)

### Dataset

In [9]:
class W2VDataset(Dataset):
    def __init__(self, sentences):
        self.data = []
        for sentence in sentences:
            for data_point in get_target_context(sentence):
                self.data.append(data_point)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

### Learning parameters

In [10]:
# load initial weights
word2vec.load_state_dict(torch.load(model_save_path, map_location=torch.device(device)))
word2vec = word2vec.to(device)

early_stop = 5
history_losses = []
for epoch in range(1, epochs+1):
    train_dataset = W2VDataset(train_sentences)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    losses, pos_losses, neg_losses = 0., 0., 0.
    cnt = 0
    
    word2vec.train()
    for center_words, context_words in tqdm(train_loader):
        negative_samples = get_noise_word(len(center_words), neg_sample_factor)
        optimizer.zero_grad()
        loss, pos_loss, neg_loss = word2vec(center_words.to(device), context_words.to(device), negative_samples.to(device))
        loss.backward()
        optimizer.step()
        losses += loss
        cnt += len(center_words)
        pos_losses += pos_loss
        neg_losses += neg_loss

    scheduler.step()
    
    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} (pos: {pos_losses/cnt:.4f}, neg: {neg_losses/(cnt*neg_sample_factor):.4f}) over {cnt} training points.')
    
    if epoch % 10 == 0:
        # save embedding
        embedding_weights = word2vec.center_embed.state_dict()
        torch.save(embedding_weights, f'save/hindi_embeddings/{epoch}_epoch_{embedding_size}_dim_{window_size}_wsize_{neg_sample_factor}_negfac.pt')
    
    history_losses.append(epoch_loss)
    if len(history_losses) > early_stop and min(history_losses[-early_stop:]) >= min(history_losses[:-early_stop]):
        print(f'Early stopping: training loss does not decrease after {early_stop} epochs')
        break

print("Training finished")

100%|██████████| 300/300 [00:10<00:00, 28.91it/s]


Epoch  1: training loss: 1.0749 (pos: 0.8331, neg: 0.0242) over 76758 training points.


100%|██████████| 300/300 [00:10<00:00, 28.81it/s]


Epoch  2: training loss: 0.9061 (pos: 0.7368, neg: 0.0169) over 76647 training points.


100%|██████████| 299/299 [00:10<00:00, 28.14it/s]


Epoch  3: training loss: 0.7679 (pos: 0.6047, neg: 0.0163) over 76467 training points.


100%|██████████| 298/298 [00:11<00:00, 26.96it/s]


Epoch  4: training loss: 0.6528 (pos: 0.5145, neg: 0.0138) over 76178 training points.


100%|██████████| 298/298 [00:12<00:00, 23.89it/s]


Epoch  5: training loss: 0.5556 (pos: 0.4356, neg: 0.0120) over 76075 training points.


100%|██████████| 299/299 [00:11<00:00, 25.23it/s]


Epoch  6: training loss: 0.4835 (pos: 0.3786, neg: 0.0105) over 76480 training points.


100%|██████████| 299/299 [00:09<00:00, 30.24it/s]


Epoch  7: training loss: 0.4102 (pos: 0.3301, neg: 0.0080) over 76492 training points.


100%|██████████| 299/299 [00:10<00:00, 28.64it/s]


Epoch  8: training loss: 0.3606 (pos: 0.2875, neg: 0.0073) over 76495 training points.


100%|██████████| 300/300 [00:10<00:00, 28.97it/s]


Epoch  9: training loss: 0.3150 (pos: 0.2536, neg: 0.0061) over 76664 training points.


100%|██████████| 298/298 [00:10<00:00, 28.92it/s]


Epoch 10: training loss: 0.2806 (pos: 0.2274, neg: 0.0053) over 76287 training points.


100%|██████████| 299/299 [00:10<00:00, 28.92it/s]


Epoch 11: training loss: 0.2480 (pos: 0.2017, neg: 0.0046) over 76393 training points.


100%|██████████| 299/299 [00:10<00:00, 28.85it/s]


Epoch 12: training loss: 0.2246 (pos: 0.1826, neg: 0.0042) over 76482 training points.


100%|██████████| 300/300 [00:10<00:00, 28.92it/s]


Epoch 13: training loss: 0.2088 (pos: 0.1697, neg: 0.0039) over 76550 training points.


100%|██████████| 299/299 [00:10<00:00, 28.96it/s]


Epoch 14: training loss: 0.1915 (pos: 0.1545, neg: 0.0037) over 76533 training points.


100%|██████████| 298/298 [00:10<00:00, 28.89it/s]


Epoch 15: training loss: 0.1692 (pos: 0.1386, neg: 0.0031) over 76268 training points.


100%|██████████| 299/299 [00:10<00:00, 28.88it/s]


Epoch 16: training loss: 0.1659 (pos: 0.1344, neg: 0.0031) over 76450 training points.


100%|██████████| 299/299 [00:10<00:00, 28.91it/s]


Epoch 17: training loss: 0.1523 (pos: 0.1245, neg: 0.0028) over 76294 training points.


100%|██████████| 298/298 [00:10<00:00, 28.67it/s]


Epoch 18: training loss: 0.1442 (pos: 0.1167, neg: 0.0028) over 76154 training points.


100%|██████████| 298/298 [00:10<00:00, 28.91it/s]


Epoch 19: training loss: 0.1390 (pos: 0.1123, neg: 0.0027) over 76072 training points.


100%|██████████| 298/298 [00:10<00:00, 28.91it/s]


Epoch 20: training loss: 0.1278 (pos: 0.1041, neg: 0.0024) over 76086 training points.


100%|██████████| 299/299 [00:10<00:00, 28.86it/s]


Epoch 21: training loss: 0.1225 (pos: 0.0986, neg: 0.0024) over 76418 training points.


100%|██████████| 299/299 [00:10<00:00, 28.90it/s]


Epoch 22: training loss: 0.1170 (pos: 0.0941, neg: 0.0023) over 76401 training points.


100%|██████████| 298/298 [00:10<00:00, 28.91it/s]


Epoch 23: training loss: 0.1133 (pos: 0.0905, neg: 0.0023) over 76066 training points.


100%|██████████| 300/300 [00:10<00:00, 28.76it/s]


Epoch 24: training loss: 0.1087 (pos: 0.0877, neg: 0.0021) over 76659 training points.


100%|██████████| 300/300 [00:10<00:00, 28.70it/s]


Epoch 25: training loss: 0.1057 (pos: 0.0845, neg: 0.0021) over 76788 training points.


100%|██████████| 299/299 [00:10<00:00, 28.90it/s]


Epoch 26: training loss: 0.0997 (pos: 0.0798, neg: 0.0020) over 76364 training points.


100%|██████████| 297/297 [00:10<00:00, 28.70it/s]


Epoch 27: training loss: 0.1028 (pos: 0.0807, neg: 0.0022) over 75989 training points.


100%|██████████| 298/298 [00:10<00:00, 29.00it/s]


Epoch 28: training loss: 0.0985 (pos: 0.0781, neg: 0.0020) over 76230 training points.


100%|██████████| 298/298 [00:10<00:00, 28.91it/s]


Epoch 29: training loss: 0.0945 (pos: 0.0755, neg: 0.0019) over 76148 training points.


100%|██████████| 299/299 [00:10<00:00, 28.99it/s]


Epoch 30: training loss: 0.0896 (pos: 0.0733, neg: 0.0016) over 76355 training points.


100%|██████████| 300/300 [00:10<00:00, 28.93it/s]


Epoch 31: training loss: 0.0894 (pos: 0.0715, neg: 0.0018) over 76745 training points.


100%|██████████| 299/299 [00:10<00:00, 28.94it/s]


Epoch 32: training loss: 0.0915 (pos: 0.0734, neg: 0.0018) over 76315 training points.


100%|██████████| 300/300 [00:10<00:00, 28.95it/s]


Epoch 33: training loss: 0.0865 (pos: 0.0701, neg: 0.0016) over 76603 training points.


100%|██████████| 300/300 [00:10<00:00, 28.93it/s]


Epoch 34: training loss: 0.0826 (pos: 0.0650, neg: 0.0018) over 76570 training points.


100%|██████████| 299/299 [00:10<00:00, 28.89it/s]


Epoch 35: training loss: 0.0852 (pos: 0.0667, neg: 0.0019) over 76378 training points.


100%|██████████| 299/299 [00:10<00:00, 28.94it/s]


Epoch 36: training loss: 0.0841 (pos: 0.0660, neg: 0.0018) over 76503 training points.


100%|██████████| 300/300 [00:10<00:00, 28.92it/s]


Epoch 37: training loss: 0.0838 (pos: 0.0657, neg: 0.0018) over 76599 training points.


100%|██████████| 298/298 [00:10<00:00, 28.93it/s]


Epoch 38: training loss: 0.0809 (pos: 0.0632, neg: 0.0018) over 76228 training points.


100%|██████████| 298/298 [00:10<00:00, 28.61it/s]


Epoch 39: training loss: 0.0832 (pos: 0.0648, neg: 0.0018) over 76228 training points.


100%|██████████| 298/298 [00:10<00:00, 28.91it/s]


Epoch 40: training loss: 0.0824 (pos: 0.0641, neg: 0.0018) over 76093 training points.


100%|██████████| 299/299 [00:10<00:00, 28.99it/s]


Epoch 41: training loss: 0.0807 (pos: 0.0635, neg: 0.0017) over 76325 training points.


100%|██████████| 297/297 [00:10<00:00, 28.95it/s]


Epoch 42: training loss: 0.0809 (pos: 0.0631, neg: 0.0018) over 75987 training points.


100%|██████████| 297/297 [00:10<00:00, 28.95it/s]


Epoch 43: training loss: 0.0798 (pos: 0.0606, neg: 0.0019) over 75993 training points.


100%|██████████| 299/299 [00:10<00:00, 28.89it/s]


Epoch 44: training loss: 0.0785 (pos: 0.0615, neg: 0.0017) over 76385 training points.


100%|██████████| 300/300 [00:10<00:00, 28.96it/s]


Epoch 45: training loss: 0.0763 (pos: 0.0597, neg: 0.0017) over 76798 training points.


100%|██████████| 298/298 [00:09<00:00, 30.05it/s]


Epoch 46: training loss: 0.0756 (pos: 0.0602, neg: 0.0015) over 76132 training points.


100%|██████████| 299/299 [00:09<00:00, 30.01it/s]


Epoch 47: training loss: 0.0750 (pos: 0.0590, neg: 0.0016) over 76477 training points.


100%|██████████| 300/300 [00:09<00:00, 30.04it/s]


Epoch 48: training loss: 0.0724 (pos: 0.0564, neg: 0.0016) over 76762 training points.


100%|██████████| 297/297 [00:10<00:00, 28.83it/s]


Epoch 49: training loss: 0.0710 (pos: 0.0555, neg: 0.0015) over 75785 training points.


100%|██████████| 298/298 [00:11<00:00, 25.87it/s]


Epoch 50: training loss: 0.0735 (pos: 0.0575, neg: 0.0016) over 76047 training points.


100%|██████████| 299/299 [00:10<00:00, 29.42it/s]


Epoch 51: training loss: 0.0725 (pos: 0.0557, neg: 0.0017) over 76463 training points.


100%|██████████| 298/298 [00:11<00:00, 26.69it/s]


Epoch 52: training loss: 0.0697 (pos: 0.0541, neg: 0.0016) over 76199 training points.


100%|██████████| 299/299 [00:10<00:00, 28.88it/s]


Epoch 53: training loss: 0.0716 (pos: 0.0552, neg: 0.0016) over 76501 training points.


100%|██████████| 297/297 [00:10<00:00, 28.62it/s]


Epoch 54: training loss: 0.0701 (pos: 0.0540, neg: 0.0016) over 75875 training points.


100%|██████████| 299/299 [00:10<00:00, 28.29it/s]


Epoch 55: training loss: 0.0700 (pos: 0.0532, neg: 0.0017) over 76290 training points.


100%|██████████| 297/297 [00:11<00:00, 25.21it/s]


Epoch 56: training loss: 0.0698 (pos: 0.0531, neg: 0.0017) over 75916 training points.


100%|██████████| 298/298 [00:12<00:00, 24.48it/s]


Epoch 57: training loss: 0.0679 (pos: 0.0518, neg: 0.0016) over 76144 training points.


100%|██████████| 298/298 [00:11<00:00, 25.55it/s]


Epoch 58: training loss: 0.0658 (pos: 0.0503, neg: 0.0016) over 76276 training points.


100%|██████████| 299/299 [00:11<00:00, 27.08it/s]


Epoch 59: training loss: 0.0653 (pos: 0.0491, neg: 0.0016) over 76420 training points.


100%|██████████| 298/298 [00:11<00:00, 26.41it/s]


Epoch 60: training loss: 0.0655 (pos: 0.0510, neg: 0.0014) over 76100 training points.


100%|██████████| 299/299 [00:11<00:00, 25.10it/s]


Epoch 61: training loss: 0.0660 (pos: 0.0504, neg: 0.0016) over 76394 training points.


100%|██████████| 298/298 [00:12<00:00, 23.63it/s]


Epoch 62: training loss: 0.0649 (pos: 0.0488, neg: 0.0016) over 76263 training points.


100%|██████████| 300/300 [00:13<00:00, 23.06it/s]


Epoch 63: training loss: 0.0642 (pos: 0.0498, neg: 0.0014) over 76795 training points.


100%|██████████| 298/298 [00:12<00:00, 24.71it/s]


Epoch 64: training loss: 0.0620 (pos: 0.0484, neg: 0.0014) over 76252 training points.


100%|██████████| 298/298 [00:11<00:00, 26.84it/s]


Epoch 65: training loss: 0.0639 (pos: 0.0481, neg: 0.0016) over 76272 training points.


100%|██████████| 297/297 [00:10<00:00, 28.21it/s]


Epoch 66: training loss: 0.0626 (pos: 0.0467, neg: 0.0016) over 75972 training points.


100%|██████████| 299/299 [00:11<00:00, 27.03it/s]


Epoch 67: training loss: 0.0623 (pos: 0.0475, neg: 0.0015) over 76373 training points.


100%|██████████| 300/300 [00:12<00:00, 23.56it/s]


Epoch 68: training loss: 0.0626 (pos: 0.0466, neg: 0.0016) over 76593 training points.


100%|██████████| 299/299 [00:12<00:00, 24.35it/s]

Epoch 69: training loss: 0.0631 (pos: 0.0466, neg: 0.0017) over 76386 training points.
Early stopping: training loss does not decrease after 5 epochs
Training finished





In [11]:
# save embedding weights
embedding_weights = word2vec.center_embed.state_dict()
torch.save(embedding_weights, f'save/hindi_embedding_weights_{window_size}_wsize_{neg_sample_factor}_negfac.pt')