# Task 2: Create Bengali word embeddings

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f5b8c5f2f50>

## Load data

In [2]:
ben_train_df = pd.read_csv('../../data/bengali_hatespeech_sample_train_preprocessed.csv')
ben_test_df = pd.read_csv('../../data/bengali_hatespeech_sample_test_preprocessed.csv')

display(ben_train_df.head())

Unnamed: 0,sentence,hate,category
0,ভায়েরা আপনাদের ধন্যোবাদ এগিয়ে জাও পাসে আছি ভাই,0,religion
1,নাউজুবিল্লাহ নাউজুবিল্লাহ,0,"Meme, TikTok and others"
2,দুইজন অপরাধ সরকারি চাকরি হিসেবে দুইজনকে বাংলাদ...,0,crime
3,উড়িয়ে ই মারলো পেরেরা,0,sports
4,পুরুষ এক জাত অনেকসময় বোঝেনা সময় বুঝতে চায়না...,0,entertainment


In [3]:
# train data:
# remove empty texts
ben_train_df = ben_train_df[ben_train_df.sentence.str.len() > 0]
# extract sentences and labels
train_sentences = [text.split() for text in ben_train_df['sentence']]
train_labels = ben_train_df['hate'].to_numpy()

# test data:
# remove empty texts
ben_test_df = ben_test_df[ben_test_df.sentence.str.len() > 0]
# extract sentences and labels
test_sentences = [text.split() for text in ben_test_df['sentence']]
test_labels = ben_test_df['hate'].to_numpy()

### Print out data/statistics

In [4]:
print('Train data:')
print(train_sentences[:3])
print(train_labels)
print()
print('Test data:')
print(test_sentences[:3])
print(test_labels)

Train data:
[['ভায়েরা', 'আপনাদের', 'ধন্যোবাদ', 'এগিয়ে', 'জাও', 'পাসে', 'আছি', 'ভাই'], ['নাউজুবিল্লাহ', 'নাউজুবিল্লাহ'], ['দুইজন', 'অপরাধ', 'সরকারি', 'চাকরি', 'হিসেবে', 'দুইজনকে', 'বাংলাদেশ', 'বের', 'দেওয়া', 'আপনারা', 'কমেন্ট', 'জানাবেন']]
[0 0 0 ... 0 0 0]

Test data:
[['লেখাটি', 'ফুটবল', 'বুঝেই', 'লেখা।'], ['ভাই', 'কথা', 'শুনে', 'কান্না', 'আসলো।'], ['খানকি', 'নাইকা']]
[0 0 1 ... 1 1 1]


## Prepare vocab set

In [5]:
# vocab_size and word->id and id->word
flattened_words = [word for sentence in train_sentences for word in sentence]
V = list(set(flattened_words))
vocab_size = len(V)
print(f'vocab_size: {vocab_size}')

word_to_int = {}
int_to_word = {}
for i, word in enumerate(V):
    word_to_int[word] = i
    int_to_word[i] = word

vocab_size: 16005


## Define sub-sampling

In [6]:
word_counter = Counter(flattened_words)
def sampling_prob(word):
    z = word_counter[word] / len(flattened_words)
    p_keep = ((z/0.000001)**0.5 + 1) * (0.000001/z)
    return p_keep

## skip-gram

In [7]:
def get_target_context(sentence: list(str())):
    for i, word in enumerate(sentence):
        for j, context_word in enumerate(sentence[i-window_size:i+window_size+1]):
            if j != i and random.random() < sampling_prob(context_word):
                yield (torch.tensor(word_to_int[word], dtype=torch.long).unsqueeze(0), 
                       torch.tensor(word_to_int[context_word], dtype=torch.long).unsqueeze(0))

# Train word-embeddings

## hyper-parameters

In [8]:
window_size = 10
embedding_size = 300
learning_rate = 0.01
batch_size = 256
epochs = 100

## Model

In [9]:
class Word2Vec(Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.fc = nn.Linear(embedding_size, vocab_size)

    def forward(self, word_id):
        out = self.embed(word_id)
        out = self.fc(out)
        return out.squeeze(1)
    
    def to_embed(self, word_id):
        return self.embed(word_id)
    
word2vec = Word2Vec()
save_path = './save/bengali_word2vec.pt'
torch.save(word2vec.state_dict(), save_path)

display(word2vec.parameters)

<bound method Module.parameters of Word2Vec(
  (embed): Embedding(16005, 300)
  (fc): Linear(in_features=300, out_features=16005, bias=True)
)>

## Loss function and Optimizer

In [10]:
optimizer = optim.Adam(word2vec.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

## Dataset

In [11]:
class W2VDataset(Dataset):
    def __init__(self, sentences):
        self.data = []
        for sentence in sentences:
            for data_point in get_target_context(sentence):
                self.data.append(data_point)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

## Learning parameters

In [12]:
# load initial weights
word2vec.load_state_dict(torch.load(save_path))
word2vec = word2vec.to(device)

# training
early_stop = 5
list_loss = []
for epoch in range(1, epochs+1):
    train_dataset = W2VDataset(train_sentences)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    losses = 0.
    cnt = 0
    word2vec.train()
    for words, context_words in tqdm(train_loader):
        optimizer.zero_grad()
        pred = word2vec(words.to(device))
        loss = criterion(pred, context_words.squeeze(1).to(device))
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(words)
        cnt += len(words)

    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} over {cnt} training points.')
    
    if epoch % 10 == 0:
        # save check-point embedding
        embedding_weights = word2vec.embed.state_dict()
        torch.save(embedding_weights, f'./save/embedding_weights_{epoch}_epoch_{embedding_size}_dim_{window_size}_wsize.pt')
    
    list_loss.append(epoch_loss)
    if len(list_loss) > early_stop and min(list_loss[-early_stop:]) > min(list_loss[:-early_stop]):
        print('Training loss is not reducing anymore, terminate.')
        break

print("Training finished")

100%|██████████| 220/220 [00:08<00:00, 24.86it/s]


Epoch  1: training loss: 11.7544 over 56252 training points.


100%|██████████| 219/219 [00:08<00:00, 25.38it/s]


Epoch  2: training loss: 11.9434 over 55850 training points.


100%|██████████| 218/218 [00:08<00:00, 25.36it/s]


Epoch  3: training loss: 11.0959 over 55753 training points.


100%|██████████| 221/221 [00:08<00:00, 25.43it/s]


Epoch  4: training loss: 10.2133 over 56336 training points.


100%|██████████| 219/219 [00:08<00:00, 25.34it/s]


Epoch  5: training loss: 9.5631 over 55953 training points.


100%|██████████| 221/221 [00:08<00:00, 25.35it/s]


Epoch  6: training loss: 9.0013 over 56430 training points.


100%|██████████| 221/221 [00:08<00:00, 25.24it/s]


Epoch  7: training loss: 8.4746 over 56521 training points.


100%|██████████| 220/220 [00:08<00:00, 25.27it/s]


Epoch  8: training loss: 8.1018 over 56228 training points.


100%|██████████| 220/220 [00:08<00:00, 25.37it/s]


Epoch  9: training loss: 7.7314 over 56131 training points.


100%|██████████| 221/221 [00:08<00:00, 26.21it/s]


Epoch 10: training loss: 7.5002 over 56541 training points.


100%|██████████| 218/218 [00:08<00:00, 25.29it/s]


Epoch 11: training loss: 7.3127 over 55667 training points.


100%|██████████| 220/220 [00:08<00:00, 25.30it/s]


Epoch 12: training loss: 7.1680 over 56083 training points.


100%|██████████| 220/220 [00:08<00:00, 25.42it/s]


Epoch 13: training loss: 7.0023 over 56299 training points.


100%|██████████| 219/219 [00:08<00:00, 25.95it/s]


Epoch 14: training loss: 6.8729 over 55934 training points.


100%|██████████| 220/220 [00:08<00:00, 25.66it/s]


Epoch 15: training loss: 6.7824 over 56065 training points.


100%|██████████| 220/220 [00:09<00:00, 24.00it/s]


Epoch 16: training loss: 6.6935 over 56188 training points.


100%|██████████| 220/220 [00:08<00:00, 26.17it/s]


Epoch 17: training loss: 6.6349 over 56219 training points.


100%|██████████| 221/221 [00:08<00:00, 26.00it/s]


Epoch 18: training loss: 6.5973 over 56333 training points.


100%|██████████| 221/221 [00:08<00:00, 25.93it/s]


Epoch 19: training loss: 6.5703 over 56329 training points.


100%|██████████| 220/220 [00:08<00:00, 25.89it/s]


Epoch 20: training loss: 6.5368 over 56102 training points.


100%|██████████| 221/221 [00:08<00:00, 25.21it/s]


Epoch 21: training loss: 6.5176 over 56348 training points.


100%|██████████| 220/220 [00:08<00:00, 25.03it/s]


Epoch 22: training loss: 6.4816 over 56151 training points.


100%|██████████| 220/220 [00:08<00:00, 24.58it/s]


Epoch 23: training loss: 6.4714 over 56105 training points.


100%|██████████| 219/219 [00:08<00:00, 24.94it/s]


Epoch 24: training loss: 6.4556 over 55940 training points.


100%|██████████| 219/219 [00:08<00:00, 25.75it/s]


Epoch 25: training loss: 6.4255 over 55834 training points.


100%|██████████| 222/222 [00:08<00:00, 25.94it/s]


Epoch 26: training loss: 6.4240 over 56647 training points.


100%|██████████| 219/219 [00:08<00:00, 25.96it/s]


Epoch 27: training loss: 6.4141 over 55965 training points.


100%|██████████| 220/220 [00:08<00:00, 25.87it/s]


Epoch 28: training loss: 6.3992 over 56219 training points.


100%|██████████| 221/221 [00:08<00:00, 25.97it/s]


Epoch 29: training loss: 6.4169 over 56329 training points.


100%|██████████| 219/219 [00:08<00:00, 25.55it/s]


Epoch 30: training loss: 6.3981 over 55969 training points.


100%|██████████| 219/219 [00:09<00:00, 24.01it/s]


Epoch 31: training loss: 6.3737 over 56051 training points.


100%|██████████| 220/220 [00:09<00:00, 24.29it/s]


Epoch 32: training loss: 6.3541 over 56206 training points.


100%|██████████| 220/220 [00:08<00:00, 26.19it/s]


Epoch 33: training loss: 6.3556 over 56241 training points.


100%|██████████| 220/220 [00:08<00:00, 25.04it/s]


Epoch 34: training loss: 6.3414 over 56318 training points.


100%|██████████| 221/221 [00:09<00:00, 24.52it/s]


Epoch 35: training loss: 6.3527 over 56366 training points.


100%|██████████| 221/221 [00:08<00:00, 25.16it/s]


Epoch 36: training loss: 6.3390 over 56506 training points.


100%|██████████| 219/219 [00:08<00:00, 25.05it/s]


Epoch 37: training loss: 6.3073 over 55998 training points.


100%|██████████| 220/220 [00:08<00:00, 25.30it/s]


Epoch 38: training loss: 6.2961 over 56117 training points.


100%|██████████| 220/220 [00:08<00:00, 25.55it/s]


Epoch 39: training loss: 6.2966 over 56317 training points.


100%|██████████| 220/220 [00:08<00:00, 25.18it/s]


Epoch 40: training loss: 6.2815 over 56210 training points.


100%|██████████| 220/220 [00:08<00:00, 25.01it/s]


Epoch 41: training loss: 6.2657 over 56134 training points.


100%|██████████| 219/219 [00:08<00:00, 24.63it/s]


Epoch 42: training loss: 6.2573 over 56061 training points.


100%|██████████| 221/221 [00:08<00:00, 25.27it/s]


Epoch 43: training loss: 6.2626 over 56455 training points.


100%|██████████| 220/220 [00:08<00:00, 24.54it/s]


Epoch 44: training loss: 6.2442 over 56206 training points.


100%|██████████| 220/220 [00:09<00:00, 23.63it/s]


Epoch 45: training loss: 6.2522 over 56106 training points.


100%|██████████| 219/219 [00:08<00:00, 24.88it/s]


Epoch 46: training loss: 6.2390 over 56014 training points.


100%|██████████| 221/221 [00:08<00:00, 25.58it/s]


Epoch 47: training loss: 6.2229 over 56403 training points.


100%|██████████| 221/221 [00:08<00:00, 25.79it/s]


Epoch 48: training loss: 6.2237 over 56416 training points.


100%|██████████| 221/221 [00:08<00:00, 25.70it/s]


Epoch 49: training loss: 6.2083 over 56330 training points.


100%|██████████| 221/221 [00:08<00:00, 25.78it/s]


Epoch 50: training loss: 6.1987 over 56405 training points.


100%|██████████| 220/220 [00:08<00:00, 25.79it/s]


Epoch 51: training loss: 6.1865 over 56289 training points.


100%|██████████| 221/221 [00:08<00:00, 25.80it/s]


Epoch 52: training loss: 6.1849 over 56430 training points.


100%|██████████| 221/221 [00:08<00:00, 25.86it/s]


Epoch 53: training loss: 6.1705 over 56473 training points.


100%|██████████| 219/219 [00:08<00:00, 25.24it/s]


Epoch 54: training loss: 6.1473 over 56037 training points.


100%|██████████| 221/221 [00:08<00:00, 24.61it/s]


Epoch 55: training loss: 6.1483 over 56386 training points.


100%|██████████| 219/219 [00:09<00:00, 24.19it/s]


Epoch 56: training loss: 6.1378 over 56021 training points.


100%|██████████| 219/219 [00:08<00:00, 24.82it/s]


Epoch 57: training loss: 6.1169 over 55847 training points.


100%|██████████| 221/221 [00:08<00:00, 24.66it/s]


Epoch 58: training loss: 6.1078 over 56454 training points.


100%|██████████| 219/219 [00:08<00:00, 24.91it/s]


Epoch 59: training loss: 6.1230 over 55961 training points.


100%|██████████| 220/220 [00:08<00:00, 24.79it/s]


Epoch 60: training loss: 6.1102 over 56249 training points.


100%|██████████| 220/220 [00:08<00:00, 25.55it/s]


Epoch 61: training loss: 6.1024 over 56232 training points.


100%|██████████| 219/219 [00:08<00:00, 24.49it/s]


Epoch 62: training loss: 6.0826 over 55964 training points.


100%|██████████| 220/220 [00:08<00:00, 24.65it/s]


Epoch 63: training loss: 6.0712 over 56303 training points.


100%|██████████| 220/220 [00:09<00:00, 23.88it/s]


Epoch 64: training loss: 6.0621 over 56201 training points.


100%|██████████| 220/220 [00:09<00:00, 23.49it/s]


Epoch 65: training loss: 6.0620 over 56301 training points.


100%|██████████| 219/219 [00:08<00:00, 25.73it/s]


Epoch 66: training loss: 6.0529 over 55978 training points.


100%|██████████| 219/219 [00:08<00:00, 24.99it/s]


Epoch 67: training loss: 6.0484 over 55926 training points.


100%|██████████| 220/220 [00:08<00:00, 25.54it/s]


Epoch 68: training loss: 6.0378 over 56149 training points.


100%|██████████| 220/220 [00:08<00:00, 26.14it/s]


Epoch 69: training loss: 6.0278 over 56292 training points.


100%|██████████| 220/220 [00:08<00:00, 26.12it/s]


Epoch 70: training loss: 6.0249 over 56213 training points.


100%|██████████| 219/219 [00:08<00:00, 26.15it/s]


Epoch 71: training loss: 6.0055 over 55881 training points.


100%|██████████| 220/220 [00:08<00:00, 26.13it/s]


Epoch 72: training loss: 6.0323 over 56130 training points.


100%|██████████| 221/221 [00:08<00:00, 26.16it/s]


Epoch 73: training loss: 6.0235 over 56391 training points.


100%|██████████| 219/219 [00:08<00:00, 26.07it/s]


Epoch 74: training loss: 6.0112 over 55898 training points.


100%|██████████| 219/219 [00:08<00:00, 26.08it/s]


Epoch 75: training loss: 6.0018 over 56046 training points.


100%|██████████| 221/221 [00:08<00:00, 25.49it/s]


Epoch 76: training loss: 5.9782 over 56412 training points.


100%|██████████| 220/220 [00:08<00:00, 25.32it/s]


Epoch 77: training loss: 5.9903 over 56195 training points.


100%|██████████| 219/219 [00:08<00:00, 26.05it/s]


Epoch 78: training loss: 5.9813 over 55997 training points.


100%|██████████| 219/219 [00:08<00:00, 26.08it/s]


Epoch 79: training loss: 5.9635 over 55854 training points.


100%|██████████| 219/219 [00:08<00:00, 26.09it/s]


Epoch 80: training loss: 5.9598 over 56007 training points.


100%|██████████| 220/220 [00:08<00:00, 25.56it/s]


Epoch 81: training loss: 5.9592 over 56220 training points.


100%|██████████| 219/219 [00:08<00:00, 26.01it/s]


Epoch 82: training loss: 5.9610 over 56015 training points.


100%|██████████| 221/221 [00:08<00:00, 26.10it/s]


Epoch 83: training loss: 5.9527 over 56512 training points.


100%|██████████| 220/220 [00:08<00:00, 24.98it/s]


Epoch 84: training loss: 5.9382 over 56277 training points.


100%|██████████| 218/218 [00:08<00:00, 25.17it/s]


Epoch 85: training loss: 5.9470 over 55771 training points.


100%|██████████| 219/219 [00:08<00:00, 25.38it/s]


Epoch 86: training loss: 5.9273 over 55976 training points.


100%|██████████| 220/220 [00:08<00:00, 26.15it/s]


Epoch 87: training loss: 5.9258 over 56130 training points.


100%|██████████| 220/220 [00:08<00:00, 26.06it/s]


Epoch 88: training loss: 5.9266 over 56268 training points.


100%|██████████| 219/219 [00:08<00:00, 26.11it/s]


Epoch 89: training loss: 5.9356 over 56037 training points.


100%|██████████| 219/219 [00:08<00:00, 26.10it/s]


Epoch 90: training loss: 5.9173 over 55940 training points.


100%|██████████| 221/221 [00:08<00:00, 26.10it/s]


Epoch 91: training loss: 5.9153 over 56395 training points.


100%|██████████| 218/218 [00:08<00:00, 26.12it/s]


Epoch 92: training loss: 5.9235 over 55767 training points.


100%|██████████| 221/221 [00:08<00:00, 26.06it/s]


Epoch 93: training loss: 5.9109 over 56482 training points.


100%|██████████| 220/220 [00:08<00:00, 24.82it/s]


Epoch 94: training loss: 5.8865 over 56189 training points.


100%|██████████| 220/220 [00:08<00:00, 26.14it/s]


Epoch 95: training loss: 5.8995 over 56128 training points.


100%|██████████| 220/220 [00:08<00:00, 26.04it/s]


Epoch 96: training loss: 5.8949 over 56106 training points.


100%|██████████| 220/220 [00:08<00:00, 26.08it/s]


Epoch 97: training loss: 5.8887 over 56220 training points.


100%|██████████| 219/219 [00:08<00:00, 25.45it/s]


Epoch 98: training loss: 5.8834 over 56032 training points.


100%|██████████| 219/219 [00:08<00:00, 25.27it/s]


Epoch 99: training loss: 5.8877 over 55812 training points.


100%|██████████| 220/220 [00:09<00:00, 22.31it/s]


Epoch 100: training loss: 5.8871 over 56124 training points.
Training finished


In [13]:
# save the word-embedding layer weights
embedding_weights = word2vec.embed.state_dict()
torch.save(embedding_weights, f'save/embedding_weights.pt')

# save dicts for transformation word <-> int
with open(f'save/word_to_int_dict.json', 'w') as f:
    json.dump(word_to_int, f)
with open(f'save/int_to_word_dict.json', 'w') as f:
    json.dump(int_to_word, f)