# Task 2: Create Bengali word embeddings

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7fd21167ef70>

## Load data

In [2]:
ben_train_df = pd.read_csv('../hindi_bengali/save/bengali_hatespeech_sample_train_preprocessed.csv')
ben_test_df = pd.read_csv('../hindi_bengali/save/bengali_hatespeech_sample_test_preprocessed.csv')

display(ben_train_df.head())

Unnamed: 0,sentence,hate,category
0,নটির পুতেরে জুতা বাইরে,1,religion
1,কুত্তাকে গণধোলাই দেওয়া জানোয়ারটা যেই হাত দিয...,1,"Meme, TikTok and others"
2,তোর বউয়ের ভোদ চোদ জাইয়া,1,religion
3,কুত্তার বাচ্চা কোথায় পাবো,1,religion
4,সালিকে জতাপিটা,1,crime


In [3]:
# train data:
# remove empty texts
ben_train_df = ben_train_df[ben_train_df.sentence.str.len() > 0]
# extract sentences and labels
train_sentences = [text.split() for text in ben_train_df['sentence']]
train_labels = ben_train_df['hate'].to_numpy()

# test data:
# remove empty texts
ben_test_df = ben_test_df[ben_test_df.sentence.str.len() > 0]
# extract sentences and labels
test_sentences = [text.split() for text in ben_test_df['sentence']]
test_labels = ben_test_df['hate'].to_numpy()

### Print out data/statistics

In [4]:
print('Train data:')
print(train_sentences[:3])
print(train_labels)
print()
print('Test data:')
print(test_sentences[:3])
print(test_labels)

Train data:
[['নটির', 'পুতেরে', 'জুতা', 'বাইরে'], ['কুত্তাকে', 'গণধোলাই', 'দেওয়া', 'জানোয়ারটা', 'যেই', 'হাত', 'দিয়ে', 'মারছে', 'হাতটা', 'ভেঙ্গে', 'দিন'], ['তোর', 'বউয়ের', 'ভোদ', 'চোদ', 'জাইয়া']]
[1 1 1 ... 1 1 1]

Test data:
[['শালা', 'তাহেরি', 'ওরে', 'বাশ', 'দেয়া', 'হোউক'], ['খানকির', 'বাচ্চা', 'তোরে', 'এনাকোন্ডা', 'মারা', 'খা'], ['ওরে', 'পুলিশের', 'হাতে', 'দেয়ার', 'সবাই', 'মিলে', 'পিটিয়ে', 'আধমরা', 'করলো']]
[1 1 0 ... 0 0 0]


## Prepare vocab set

In [5]:
# load mapping {word -> id} and {id -> word}
with open('../hindi_bengali/save/bengali_word_to_int_dict.json') as f:
    word_to_int = json.load(f)
with open('../hindi_bengali/save/bengali_int_to_word_dict.json') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}
with open('../hindi_bengali/save/bengali_word_counter.json') as f:
    word_counter = json.load(f)

# get vocab_size
vocab_size = len(word_to_int)
print(f'vocab_size: {vocab_size}')

# get total occurences
total_words = sum(word_counter.values())
print(f'total word occurences: {total_words}')

vocab_size: 15231
total word occurences: 47897


## Define sub-sampling

In [6]:
def sampling_prob(word):
    z = word_counter[word] / total_words
    p_keep = ((z/0.000001)**0.5 + 1) * (0.000001/z)
    return p_keep

## skip-gram

In [7]:
def get_target_context(sentence: list(str())):
    for i, word in enumerate(sentence):
        for j, context_word in enumerate(sentence[i-window_size:i+window_size+1]):
            if j != i and random.random() < sampling_prob(context_word):
                yield (torch.tensor(word_to_int[word], dtype=torch.long).unsqueeze(0), 
                       torch.tensor(word_to_int[context_word], dtype=torch.long).unsqueeze(0))

# Train word-embeddings

## hyper-parameters

In [8]:
window_size = 10
embedding_size = 300
learning_rate = 0.01
batch_size = 256
epochs = 100

## Model

In [9]:
class Word2Vec(Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.fc = nn.Linear(embedding_size, vocab_size)

    def forward(self, word_id):
        out = self.embed(word_id)
        out = self.fc(out)
        return out.squeeze(1)
    
    def to_embed(self, word_id):
        return self.embed(word_id)
    
word2vec = Word2Vec()
save_path = './save/bengali_word2vec.pt'
torch.save(word2vec.state_dict(), save_path)

display(word2vec.parameters)

<bound method Module.parameters of Word2Vec(
  (embed): Embedding(15231, 300)
  (fc): Linear(in_features=300, out_features=15231, bias=True)
)>

## Loss function and Optimizer

In [10]:
optimizer = optim.Adam(word2vec.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

## Dataset

In [11]:
class W2VDataset(Dataset):
    def __init__(self, sentences):
        self.data = []
        for sentence in sentences:
            for data_point in get_target_context(sentence):
                self.data.append(data_point)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

## Learning parameters

In [12]:
# load initial weights
word2vec.load_state_dict(torch.load(save_path))
word2vec = word2vec.to(device)

# training
early_stop = 5
list_loss = []
for epoch in range(1, epochs+1):
    train_dataset = W2VDataset(train_sentences)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    losses = 0.
    cnt = 0
    word2vec.train()
    for words, context_words in tqdm(train_loader):
        optimizer.zero_grad()
        pred = word2vec(words.to(device))
        loss = criterion(pred, context_words.squeeze(1).to(device))
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(words)
        cnt += len(words)

    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} over {cnt} training points.')
    
    if epoch % 10 == 0:
        # save check-point embedding
        embedding_weights = word2vec.embed.state_dict()
        torch.save(embedding_weights, f'./save/embedding_checkpoints/embedding_weights_{epoch}_epoch_{embedding_size}_dim_{window_size}_wsize.pt')
    
    list_loss.append(epoch_loss)
    if len(list_loss) > early_stop and min(list_loss[-early_stop:]) > min(list_loss[:-early_stop]):
        print('Training loss is not reducing anymore, terminate.')
        break

print("Training finished")

100%|██████████| 189/189 [00:07<00:00, 26.58it/s]


Epoch  1: training loss: 11.4566 over 48272 training points.


100%|██████████| 189/189 [00:07<00:00, 25.64it/s]


Epoch  2: training loss: 11.8863 over 48203 training points.


100%|██████████| 189/189 [00:07<00:00, 26.62it/s]


Epoch  3: training loss: 11.1064 over 48340 training points.


100%|██████████| 189/189 [00:07<00:00, 26.60it/s]


Epoch  4: training loss: 10.3077 over 48176 training points.


100%|██████████| 190/190 [00:07<00:00, 25.87it/s]


Epoch  5: training loss: 9.6554 over 48460 training points.


100%|██████████| 189/189 [00:07<00:00, 24.94it/s]


Epoch  6: training loss: 9.1091 over 48295 training points.


100%|██████████| 188/188 [00:07<00:00, 26.55it/s]


Epoch  7: training loss: 8.6079 over 48092 training points.


100%|██████████| 190/190 [00:07<00:00, 26.22it/s]


Epoch  8: training loss: 8.2016 over 48636 training points.


100%|██████████| 188/188 [00:07<00:00, 26.26it/s]


Epoch  9: training loss: 7.8915 over 48115 training points.


100%|██████████| 189/189 [00:07<00:00, 26.67it/s]


Epoch 10: training loss: 7.6719 over 48176 training points.


100%|██████████| 191/191 [00:07<00:00, 26.57it/s]


Epoch 11: training loss: 7.4268 over 48866 training points.


100%|██████████| 189/189 [00:07<00:00, 26.48it/s]


Epoch 12: training loss: 7.1998 over 48287 training points.


100%|██████████| 189/189 [00:07<00:00, 26.47it/s]


Epoch 13: training loss: 7.0284 over 48365 training points.


100%|██████████| 189/189 [00:07<00:00, 26.43it/s]


Epoch 14: training loss: 6.9158 over 48207 training points.


100%|██████████| 189/189 [00:07<00:00, 25.10it/s]


Epoch 15: training loss: 6.8156 over 48246 training points.


100%|██████████| 189/189 [00:08<00:00, 23.58it/s]


Epoch 16: training loss: 6.7128 over 48161 training points.


100%|██████████| 189/189 [00:06<00:00, 27.00it/s]


Epoch 17: training loss: 6.6588 over 48306 training points.


100%|██████████| 190/190 [00:07<00:00, 25.83it/s]


Epoch 18: training loss: 6.5802 over 48451 training points.


100%|██████████| 189/189 [00:08<00:00, 23.03it/s]


Epoch 19: training loss: 6.5313 over 48298 training points.


100%|██████████| 190/190 [00:07<00:00, 24.69it/s]


Epoch 20: training loss: 6.4891 over 48569 training points.


100%|██████████| 190/190 [00:07<00:00, 26.57it/s]


Epoch 21: training loss: 6.4452 over 48422 training points.


100%|██████████| 190/190 [00:07<00:00, 25.87it/s]


Epoch 22: training loss: 6.4043 over 48435 training points.


100%|██████████| 188/188 [00:08<00:00, 22.73it/s]


Epoch 23: training loss: 6.4088 over 48078 training points.


100%|██████████| 189/189 [00:08<00:00, 23.25it/s]


Epoch 24: training loss: 6.3528 over 48280 training points.


100%|██████████| 189/189 [00:07<00:00, 25.39it/s]


Epoch 25: training loss: 6.3388 over 48235 training points.


100%|██████████| 189/189 [00:08<00:00, 23.38it/s]


Epoch 26: training loss: 6.2872 over 48223 training points.


100%|██████████| 189/189 [00:08<00:00, 21.72it/s]


Epoch 27: training loss: 6.3087 over 48209 training points.


100%|██████████| 190/190 [00:07<00:00, 26.12it/s]


Epoch 28: training loss: 6.3142 over 48449 training points.


100%|██████████| 190/190 [00:07<00:00, 26.03it/s]


Epoch 29: training loss: 6.2885 over 48474 training points.


100%|██████████| 188/188 [00:06<00:00, 27.06it/s]


Epoch 30: training loss: 6.2698 over 47912 training points.


100%|██████████| 188/188 [00:07<00:00, 26.60it/s]


Epoch 31: training loss: 6.2961 over 48110 training points.


100%|██████████| 190/190 [00:06<00:00, 27.15it/s]


Epoch 32: training loss: 6.2532 over 48404 training points.


100%|██████████| 189/189 [00:07<00:00, 26.08it/s]


Epoch 33: training loss: 6.2551 over 48375 training points.


100%|██████████| 189/189 [00:07<00:00, 26.99it/s]


Epoch 34: training loss: 6.2243 over 48165 training points.


100%|██████████| 189/189 [00:07<00:00, 26.69it/s]


Epoch 35: training loss: 6.2329 over 48196 training points.


100%|██████████| 189/189 [00:07<00:00, 24.05it/s]


Epoch 36: training loss: 6.2220 over 48256 training points.


100%|██████████| 189/189 [00:07<00:00, 23.95it/s]


Epoch 37: training loss: 6.1858 over 48341 training points.


100%|██████████| 188/188 [00:08<00:00, 22.89it/s]


Epoch 38: training loss: 6.1941 over 48085 training points.


100%|██████████| 189/189 [00:07<00:00, 25.19it/s]


Epoch 39: training loss: 6.1836 over 48300 training points.


100%|██████████| 189/189 [00:07<00:00, 26.43it/s]


Epoch 40: training loss: 6.1701 over 48322 training points.


100%|██████████| 190/190 [00:07<00:00, 24.93it/s]


Epoch 41: training loss: 6.1664 over 48393 training points.


100%|██████████| 190/190 [00:07<00:00, 24.02it/s]


Epoch 42: training loss: 6.1791 over 48499 training points.


100%|██████████| 189/189 [00:08<00:00, 22.88it/s]


Epoch 43: training loss: 6.1519 over 48159 training points.


100%|██████████| 189/189 [00:07<00:00, 23.90it/s]


Epoch 44: training loss: 6.1852 over 48179 training points.


100%|██████████| 190/190 [00:07<00:00, 26.23it/s]


Epoch 45: training loss: 6.1731 over 48602 training points.


100%|██████████| 189/189 [00:07<00:00, 26.86it/s]


Epoch 46: training loss: 6.1379 over 48158 training points.


100%|██████████| 188/188 [00:07<00:00, 24.87it/s]


Epoch 47: training loss: 6.1223 over 47879 training points.


100%|██████████| 190/190 [00:09<00:00, 20.50it/s]


Epoch 48: training loss: 6.1480 over 48639 training points.


100%|██████████| 189/189 [00:09<00:00, 20.49it/s]


Epoch 49: training loss: 6.1131 over 48338 training points.


100%|██████████| 188/188 [00:09<00:00, 20.16it/s]


Epoch 50: training loss: 6.1025 over 48024 training points.


100%|██████████| 189/189 [00:09<00:00, 20.27it/s]


Epoch 51: training loss: 6.0861 over 48158 training points.


100%|██████████| 189/189 [00:09<00:00, 20.92it/s]


Epoch 52: training loss: 6.0991 over 48249 training points.


100%|██████████| 191/191 [00:07<00:00, 25.66it/s]


Epoch 53: training loss: 6.0605 over 48650 training points.


100%|██████████| 190/190 [00:08<00:00, 23.35it/s]


Epoch 54: training loss: 6.0879 over 48523 training points.


100%|██████████| 190/190 [00:07<00:00, 25.06it/s]


Epoch 55: training loss: 6.0725 over 48535 training points.


100%|██████████| 189/189 [00:07<00:00, 25.53it/s]


Epoch 56: training loss: 6.0364 over 48341 training points.


100%|██████████| 190/190 [00:07<00:00, 26.05it/s]


Epoch 57: training loss: 6.0447 over 48439 training points.


100%|██████████| 190/190 [00:07<00:00, 25.09it/s]


Epoch 58: training loss: 6.0343 over 48508 training points.


100%|██████████| 189/189 [00:07<00:00, 25.87it/s]


Epoch 59: training loss: 6.0369 over 48357 training points.


100%|██████████| 190/190 [00:07<00:00, 26.12it/s]


Epoch 60: training loss: 6.0029 over 48583 training points.


100%|██████████| 190/190 [00:07<00:00, 24.54it/s]


Epoch 61: training loss: 6.0093 over 48448 training points.


100%|██████████| 189/189 [00:08<00:00, 22.42it/s]


Epoch 62: training loss: 6.0064 over 48297 training points.


100%|██████████| 189/189 [00:07<00:00, 24.84it/s]


Epoch 63: training loss: 5.9840 over 48236 training points.


100%|██████████| 189/189 [00:07<00:00, 26.81it/s]


Epoch 64: training loss: 5.9687 over 48131 training points.


100%|██████████| 189/189 [00:09<00:00, 20.31it/s]


Epoch 65: training loss: 5.9838 over 48220 training points.


100%|██████████| 188/188 [00:09<00:00, 20.20it/s]


Epoch 66: training loss: 5.9916 over 47966 training points.


100%|██████████| 188/188 [00:09<00:00, 20.25it/s]


Epoch 67: training loss: 5.9807 over 48051 training points.


100%|██████████| 190/190 [00:08<00:00, 21.83it/s]


Epoch 68: training loss: 5.9505 over 48455 training points.


100%|██████████| 189/189 [00:08<00:00, 23.00it/s]


Epoch 69: training loss: 5.9525 over 48232 training points.


100%|██████████| 189/189 [00:08<00:00, 22.96it/s]


Epoch 70: training loss: 5.9638 over 48326 training points.


100%|██████████| 189/189 [00:06<00:00, 27.50it/s]


Epoch 71: training loss: 5.9363 over 48260 training points.


100%|██████████| 189/189 [00:07<00:00, 26.63it/s]


Epoch 72: training loss: 5.9267 over 48155 training points.


100%|██████████| 189/189 [00:08<00:00, 22.80it/s]


Epoch 73: training loss: 5.9347 over 48136 training points.


100%|██████████| 189/189 [00:07<00:00, 25.22it/s]


Epoch 74: training loss: 5.9229 over 48135 training points.


100%|██████████| 189/189 [00:07<00:00, 24.13it/s]


Epoch 75: training loss: 5.9251 over 48187 training points.


100%|██████████| 189/189 [00:06<00:00, 27.10it/s]


Epoch 76: training loss: 5.9082 over 48192 training points.


100%|██████████| 189/189 [00:06<00:00, 27.32it/s]


Epoch 77: training loss: 5.9103 over 48327 training points.


100%|██████████| 189/189 [00:06<00:00, 27.40it/s]


Epoch 78: training loss: 5.9092 over 48249 training points.


100%|██████████| 189/189 [00:06<00:00, 27.35it/s]


Epoch 79: training loss: 5.8933 over 48180 training points.


100%|██████████| 188/188 [00:06<00:00, 27.20it/s]


Epoch 80: training loss: 5.8983 over 48085 training points.


100%|██████████| 189/189 [00:06<00:00, 27.28it/s]


Epoch 81: training loss: 5.8465 over 48318 training points.


100%|██████████| 188/188 [00:06<00:00, 27.37it/s]


Epoch 82: training loss: 5.8601 over 48033 training points.


100%|██████████| 188/188 [00:06<00:00, 27.28it/s]


Epoch 83: training loss: 5.8441 over 47963 training points.


100%|██████████| 189/189 [00:06<00:00, 27.22it/s]


Epoch 84: training loss: 5.8565 over 48355 training points.


100%|██████████| 189/189 [00:06<00:00, 27.31it/s]


Epoch 85: training loss: 5.8392 over 48184 training points.


100%|██████████| 188/188 [00:06<00:00, 27.31it/s]


Epoch 86: training loss: 5.8473 over 48115 training points.


100%|██████████| 189/189 [00:06<00:00, 27.18it/s]


Epoch 87: training loss: 5.8343 over 48190 training points.


100%|██████████| 188/188 [00:07<00:00, 24.64it/s]


Epoch 88: training loss: 5.8437 over 48019 training points.


100%|██████████| 188/188 [00:06<00:00, 27.30it/s]


Epoch 89: training loss: 5.8369 over 47967 training points.


100%|██████████| 191/191 [00:06<00:00, 27.39it/s]


Epoch 90: training loss: 5.8188 over 48699 training points.


100%|██████████| 189/189 [00:07<00:00, 25.62it/s]


Epoch 91: training loss: 5.8273 over 48146 training points.


100%|██████████| 189/189 [00:07<00:00, 24.34it/s]


Epoch 92: training loss: 5.8221 over 48283 training points.


100%|██████████| 190/190 [00:07<00:00, 26.06it/s]


Epoch 93: training loss: 5.8367 over 48536 training points.


100%|██████████| 189/189 [00:07<00:00, 26.75it/s]


Epoch 94: training loss: 5.7975 over 48278 training points.


100%|██████████| 189/189 [00:07<00:00, 25.78it/s]


Epoch 95: training loss: 5.8186 over 48217 training points.


100%|██████████| 189/189 [00:07<00:00, 25.55it/s]


Epoch 96: training loss: 5.7939 over 48220 training points.


100%|██████████| 189/189 [00:07<00:00, 26.32it/s]


Epoch 97: training loss: 5.7859 over 48172 training points.


100%|██████████| 188/188 [00:07<00:00, 26.24it/s]


Epoch 98: training loss: 5.7816 over 47894 training points.


100%|██████████| 188/188 [00:07<00:00, 26.36it/s]


Epoch 99: training loss: 5.8034 over 48014 training points.


100%|██████████| 189/189 [00:07<00:00, 26.00it/s]

Epoch 100: training loss: 5.8147 over 48297 training points.
Training finished





In [13]:
# save the word-embedding layer weights
embedding_weights = word2vec.embed.state_dict()
torch.save(embedding_weights, f'save/embedding_weights.pt')