# Task 2: Create Bengali word embeddings

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords

device = 'cuda'

import random

torch.manual_seed(123)
torch.cuda.manual_seed(234)
np.random.seed(345)
random.seed(456)
torch.manual_seed(567)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Load data

In [2]:
ben_train_df = pd.read_csv('../hindi_bengali/save/bengali_hatespeech_sample_train_preprocessed.csv')
ben_test_df = pd.read_csv('../hindi_bengali/save/bengali_hatespeech_sample_test_preprocessed.csv')

display(ben_train_df.head())

Unnamed: 0,sentence,hate,category
0,সমকামী হুজুর,0,religion
1,ছাএলীগ সালা দের নিসিদ্দ হক,1,politics
2,কাওয়া গদি ছারলে বুজবে জুতা কেমনে খায়,0,politics
3,কাউয়া কাদের বড় মাগীখোর ভিডিও পিক দেখলে বুঝা লু...,1,politics
4,অপু ভালো কথা ছোট করবেনা,0,"Meme, TikTok and others"


In [3]:
# train data:
# remove empty texts
ben_train_df = ben_train_df[ben_train_df.sentence.str.len() > 0]
# extract sentences and labels
train_sentences = [text.split() for text in ben_train_df['sentence']]
train_labels = ben_train_df['hate'].to_numpy()

# test data:
# remove empty texts
ben_test_df = ben_test_df[ben_test_df.sentence.str.len() > 0]
# extract sentences and labels
test_sentences = [text.split() for text in ben_test_df['sentence']]
test_labels = ben_test_df['hate'].to_numpy()

### Print out data/statistics

In [4]:
print('Train data:')
print(train_sentences[:3])
print(train_labels)
print()
print('Test data:')
print(test_sentences[:3])
print(test_labels)

Train data:
[['সমকামী', 'হুজুর'], ['ছাএলীগ', 'সালা', 'দের', 'নিসিদ্দ', 'হক'], ['কাওয়া', 'গদি', 'ছারলে', 'বুজবে', 'জুতা', 'কেমনে', 'খায়']]
[0 1 0 ... 1 0 0]

Test data:
[['মহিলাকে', 'রিমানডে'], ['তুর', 'রিপাতকে', 'মন', 'চাইছিল', 'ছেড়ে', 'গেলি', 'সাথে', 'মিত্যে', 'অভিনয়', 'করলি', 'দুনিয়া', 'উঠালি', 'তুর', 'নরকেও', 'ঠাঁই', 'হবেনা', 'তুই', 'আকাশের', 'মিতুর', 'মত', 'করলি', 'তুর', 'মত', 'বিশ্বাস', 'ঘাতকনীর', 'ফাসি', 'হউক', 'রিফাত', 'তোকে', 'ফেলে', 'যেত', 'হয়তবা', 'বেঁচে', 'যেত', 'সরল', 'ভালবাসাকে', 'হত্যা', 'করলি', 'তুই', 'নারী', 'জাতের', 'কলংক', 'তুকে', 'দেখলে', 'বুঝা', 'আসলে', 'তোর', 'পরিক্ষলপনা', 'তোর', 'চোখ', 'মুখ', 'সাক্ষী', 'তুইয়ি', 'জরিত', 'নারী', 'জাতের', 'কলংক'], ['হুমায়ুন', 'আজাদ', 'এতো', 'বড়', 'ক্রাক', 'মাতাল']]
[0 0 0 ... 0 0 1]


## Prepare vocab set

In [5]:
# load mapping {word -> id} and {id -> word}
with open('../hindi_bengali/save/bengali_word_to_int_dict.json') as f:
    word_to_int = json.load(f)
with open('../hindi_bengali/save/bengali_int_to_word_dict.json') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}
with open('../hindi_bengali/save/bengali_word_counter.json') as f:
    word_counter = json.load(f)

# get vocab_size
vocab_size = len(word_to_int)
print(f'vocab_size: {vocab_size}')

# get total occurences
total_words = sum(word_counter.values())
print(f'total word occurences: {total_words}')

vocab_size: 15231
total word occurences: 47897


## Define sub-sampling

In [6]:
def sampling_prob(word):
    z = word_counter[word] / total_words
    p_keep = ((z/0.000001)**0.5 + 1) * (0.000001/z)
    return p_keep

## skip-gram

In [7]:
def get_target_context(sentence: list(str())):
    for i, word in enumerate(sentence):
        for j, context_word in enumerate(sentence[i-window_size:i+window_size+1]):
            if j != i and random.random() < sampling_prob(context_word):
                yield (torch.tensor(word_to_int[word], dtype=torch.long).unsqueeze(0), 
                       torch.tensor(word_to_int[context_word], dtype=torch.long).unsqueeze(0))

# Train word-embeddings

## hyper-parameters

In [8]:
window_size = 10
embedding_size = 300
learning_rate = 0.01
batch_size = 256
epochs = 100

## Model

In [9]:
class Word2Vec(Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.fc = nn.Linear(embedding_size, vocab_size)

    def forward(self, word_id):
        out = self.embed(word_id)
        out = self.fc(out)
        return out.squeeze(1)
    
    def to_embed(self, word_id):
        return self.embed(word_id)
    
word2vec = Word2Vec()
save_path = './save/bengali_word2vec.pt'
torch.save(word2vec.state_dict(), save_path)

display(word2vec.parameters)

<bound method Module.parameters of Word2Vec(
  (embed): Embedding(15231, 300)
  (fc): Linear(in_features=300, out_features=15231, bias=True)
)>

## Loss function and Optimizer

In [10]:
optimizer = optim.Adam(word2vec.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

## Dataset

In [11]:
class W2VDataset(Dataset):
    def __init__(self, sentences):
        self.data = []
        for sentence in sentences:
            for data_point in get_target_context(sentence):
                self.data.append(data_point)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

## Learning parameters

In [12]:
# load initial weights
word2vec.load_state_dict(torch.load(save_path))
word2vec = word2vec.to(device)

# training
early_stop = 5
list_loss = []
for epoch in range(1, epochs+1):
    train_dataset = W2VDataset(train_sentences)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    losses = 0.
    cnt = 0
    word2vec.train()
    for words, context_words in tqdm(train_loader):
        optimizer.zero_grad()
        pred = word2vec(words.to(device))
        loss = criterion(pred, context_words.squeeze(1).to(device))
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(words)
        cnt += len(words)

    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} over {cnt} training points.')
    
    if epoch % 10 == 0:
        # save check-point embedding
        embedding_weights = word2vec.embed.state_dict()
        torch.save(embedding_weights, f'./save/embedding_checkpoints/embedding_weights_{epoch}_epoch_{embedding_size}_dim_{window_size}_wsize.pt')
    
    list_loss.append(epoch_loss)
    if len(list_loss) > early_stop and min(list_loss[-early_stop:]) > min(list_loss[:-early_stop]):
        print('Training loss is not reducing anymore, terminate.')
        break

print("Training finished")

100%|██████████| 189/189 [00:07<00:00, 25.10it/s]


Epoch  1: training loss: 11.4425 over 48288 training points.


100%|██████████| 189/189 [00:07<00:00, 24.88it/s]


Epoch  2: training loss: 11.8694 over 48217 training points.


100%|██████████| 189/189 [00:07<00:00, 26.30it/s]


Epoch  3: training loss: 11.0750 over 48236 training points.


100%|██████████| 188/188 [00:07<00:00, 26.55it/s]


Epoch  4: training loss: 10.2768 over 48011 training points.


100%|██████████| 190/190 [00:07<00:00, 24.92it/s]


Epoch  5: training loss: 9.6702 over 48511 training points.


100%|██████████| 190/190 [00:07<00:00, 26.13it/s]


Epoch  6: training loss: 9.1224 over 48489 training points.


100%|██████████| 187/187 [00:07<00:00, 25.72it/s]


Epoch  7: training loss: 8.6211 over 47752 training points.


100%|██████████| 188/188 [00:07<00:00, 24.99it/s]


Epoch  8: training loss: 8.2317 over 48060 training points.


100%|██████████| 190/190 [00:07<00:00, 24.50it/s]


Epoch  9: training loss: 7.8803 over 48414 training points.


100%|██████████| 188/188 [00:07<00:00, 24.14it/s]


Epoch 10: training loss: 7.6423 over 47920 training points.


100%|██████████| 189/189 [00:08<00:00, 22.25it/s]


Epoch 11: training loss: 7.4274 over 48332 training points.


100%|██████████| 189/189 [00:07<00:00, 24.49it/s]


Epoch 12: training loss: 7.2204 over 48356 training points.


100%|██████████| 189/189 [00:07<00:00, 24.96it/s]


Epoch 13: training loss: 7.0650 over 48333 training points.


100%|██████████| 190/190 [00:08<00:00, 23.71it/s]


Epoch 14: training loss: 6.9168 over 48577 training points.


100%|██████████| 188/188 [00:07<00:00, 25.52it/s]


Epoch 15: training loss: 6.8048 over 48061 training points.


100%|██████████| 188/188 [00:07<00:00, 25.17it/s]


Epoch 16: training loss: 6.6956 over 48070 training points.


100%|██████████| 189/189 [00:08<00:00, 22.59it/s]


Epoch 17: training loss: 6.6607 over 48332 training points.


100%|██████████| 189/189 [00:08<00:00, 21.11it/s]


Epoch 18: training loss: 6.5704 over 48176 training points.


100%|██████████| 190/190 [00:07<00:00, 26.97it/s]


Epoch 19: training loss: 6.5090 over 48385 training points.


100%|██████████| 189/189 [00:06<00:00, 27.13it/s]


Epoch 20: training loss: 6.4737 over 48175 training points.


100%|██████████| 189/189 [00:07<00:00, 26.56it/s]


Epoch 21: training loss: 6.4427 over 48317 training points.


100%|██████████| 189/189 [00:06<00:00, 27.03it/s]


Epoch 22: training loss: 6.4022 over 48320 training points.


100%|██████████| 188/188 [00:06<00:00, 27.22it/s]


Epoch 23: training loss: 6.3542 over 48054 training points.


100%|██████████| 189/189 [00:06<00:00, 27.12it/s]


Epoch 24: training loss: 6.3504 over 48180 training points.


100%|██████████| 189/189 [00:06<00:00, 27.19it/s]


Epoch 25: training loss: 6.3608 over 48272 training points.


100%|██████████| 187/187 [00:07<00:00, 25.15it/s]


Epoch 26: training loss: 6.3152 over 47655 training points.


100%|██████████| 189/189 [00:07<00:00, 25.77it/s]


Epoch 27: training loss: 6.2793 over 48315 training points.


100%|██████████| 189/189 [00:07<00:00, 26.86it/s]


Epoch 28: training loss: 6.2858 over 48274 training points.


100%|██████████| 189/189 [00:07<00:00, 25.89it/s]


Epoch 29: training loss: 6.2624 over 48204 training points.


100%|██████████| 189/189 [00:08<00:00, 22.48it/s]


Epoch 30: training loss: 6.2874 over 48321 training points.


100%|██████████| 189/189 [00:07<00:00, 23.64it/s]


Epoch 31: training loss: 6.2703 over 48382 training points.


100%|██████████| 189/189 [00:07<00:00, 23.75it/s]


Epoch 32: training loss: 6.2254 over 48147 training points.


100%|██████████| 189/189 [00:08<00:00, 21.52it/s]


Epoch 33: training loss: 6.2328 over 48257 training points.


100%|██████████| 189/189 [00:07<00:00, 24.13it/s]


Epoch 34: training loss: 6.2382 over 48350 training points.


100%|██████████| 190/190 [00:07<00:00, 26.32it/s]


Epoch 35: training loss: 6.2311 over 48540 training points.


100%|██████████| 189/189 [00:06<00:00, 27.31it/s]


Epoch 36: training loss: 6.1963 over 48196 training points.


100%|██████████| 191/191 [00:06<00:00, 27.38it/s]


Epoch 37: training loss: 6.2214 over 48680 training points.


100%|██████████| 189/189 [00:06<00:00, 27.14it/s]


Epoch 38: training loss: 6.2161 over 48190 training points.


100%|██████████| 190/190 [00:06<00:00, 27.38it/s]


Epoch 39: training loss: 6.1992 over 48429 training points.


100%|██████████| 189/189 [00:06<00:00, 27.31it/s]


Epoch 40: training loss: 6.1859 over 48285 training points.


100%|██████████| 189/189 [00:07<00:00, 26.71it/s]


Epoch 41: training loss: 6.1813 over 48263 training points.


100%|██████████| 189/189 [00:07<00:00, 26.79it/s]


Epoch 42: training loss: 6.1691 over 48224 training points.


100%|██████████| 189/189 [00:07<00:00, 26.34it/s]


Epoch 43: training loss: 6.1550 over 48382 training points.


100%|██████████| 189/189 [00:06<00:00, 27.25it/s]


Epoch 44: training loss: 6.1481 over 48252 training points.


100%|██████████| 189/189 [00:06<00:00, 27.13it/s]


Epoch 45: training loss: 6.1235 over 48171 training points.


100%|██████████| 191/191 [00:07<00:00, 27.21it/s]


Epoch 46: training loss: 6.1589 over 48718 training points.


100%|██████████| 189/189 [00:06<00:00, 27.32it/s]


Epoch 47: training loss: 6.1263 over 48224 training points.


100%|██████████| 189/189 [00:07<00:00, 26.58it/s]


Epoch 48: training loss: 6.1427 over 48307 training points.


100%|██████████| 188/188 [00:06<00:00, 27.00it/s]


Epoch 49: training loss: 6.1222 over 48122 training points.


100%|██████████| 188/188 [00:06<00:00, 27.30it/s]


Epoch 50: training loss: 6.0867 over 48003 training points.


100%|██████████| 190/190 [00:07<00:00, 26.96it/s]


Epoch 51: training loss: 6.0838 over 48455 training points.


100%|██████████| 189/189 [00:06<00:00, 27.23it/s]


Epoch 52: training loss: 6.0894 over 48344 training points.


100%|██████████| 190/190 [00:06<00:00, 27.18it/s]


Epoch 53: training loss: 6.0743 over 48411 training points.


100%|██████████| 189/189 [00:07<00:00, 26.70it/s]


Epoch 54: training loss: 6.0762 over 48161 training points.


100%|██████████| 190/190 [00:06<00:00, 27.27it/s]


Epoch 55: training loss: 6.1006 over 48433 training points.


100%|██████████| 189/189 [00:07<00:00, 26.99it/s]


Epoch 56: training loss: 6.0812 over 48184 training points.


100%|██████████| 188/188 [00:07<00:00, 26.83it/s]


Epoch 57: training loss: 6.0648 over 48007 training points.


100%|██████████| 188/188 [00:06<00:00, 26.94it/s]


Epoch 58: training loss: 6.0328 over 48009 training points.


100%|██████████| 188/188 [00:06<00:00, 27.16it/s]


Epoch 59: training loss: 6.0410 over 48067 training points.


100%|██████████| 189/189 [00:06<00:00, 27.21it/s]


Epoch 60: training loss: 6.0114 over 48133 training points.


100%|██████████| 189/189 [00:07<00:00, 24.55it/s]


Epoch 61: training loss: 6.0194 over 48237 training points.


100%|██████████| 189/189 [00:07<00:00, 26.44it/s]


Epoch 62: training loss: 6.0054 over 48216 training points.


100%|██████████| 188/188 [00:07<00:00, 26.18it/s]


Epoch 63: training loss: 5.9992 over 48006 training points.


100%|██████████| 189/189 [00:06<00:00, 27.02it/s]


Epoch 64: training loss: 5.9937 over 48203 training points.


100%|██████████| 188/188 [00:07<00:00, 26.08it/s]


Epoch 65: training loss: 5.9732 over 48117 training points.


100%|██████████| 188/188 [00:07<00:00, 26.58it/s]


Epoch 66: training loss: 5.9665 over 47930 training points.


100%|██████████| 189/189 [00:07<00:00, 26.94it/s]


Epoch 67: training loss: 5.9680 over 48212 training points.


100%|██████████| 188/188 [00:07<00:00, 26.78it/s]


Epoch 68: training loss: 5.9758 over 47874 training points.


100%|██████████| 189/189 [00:07<00:00, 26.24it/s]


Epoch 69: training loss: 5.9258 over 48357 training points.


100%|██████████| 189/189 [00:07<00:00, 26.50it/s]


Epoch 70: training loss: 5.9284 over 48230 training points.


100%|██████████| 191/191 [00:07<00:00, 26.71it/s]


Epoch 71: training loss: 5.9395 over 48658 training points.


100%|██████████| 189/189 [00:07<00:00, 26.44it/s]


Epoch 72: training loss: 5.9324 over 48227 training points.


100%|██████████| 188/188 [00:07<00:00, 26.76it/s]


Epoch 73: training loss: 5.9371 over 48021 training points.


100%|██████████| 188/188 [00:07<00:00, 26.57it/s]


Epoch 74: training loss: 5.9116 over 48069 training points.


100%|██████████| 189/189 [00:07<00:00, 26.79it/s]


Epoch 75: training loss: 5.8915 over 48198 training points.


100%|██████████| 190/190 [00:07<00:00, 26.91it/s]


Epoch 76: training loss: 5.9151 over 48475 training points.


100%|██████████| 189/189 [00:06<00:00, 27.27it/s]


Epoch 77: training loss: 5.9046 over 48154 training points.


100%|██████████| 190/190 [00:06<00:00, 27.16it/s]


Epoch 78: training loss: 5.8847 over 48467 training points.


100%|██████████| 189/189 [00:07<00:00, 24.57it/s]


Epoch 79: training loss: 5.8829 over 48206 training points.


100%|██████████| 190/190 [00:07<00:00, 26.27it/s]


Epoch 80: training loss: 5.8779 over 48399 training points.


100%|██████████| 189/189 [00:07<00:00, 26.27it/s]


Epoch 81: training loss: 5.8581 over 48347 training points.


100%|██████████| 189/189 [00:07<00:00, 26.33it/s]


Epoch 82: training loss: 5.8490 over 48233 training points.


100%|██████████| 190/190 [00:07<00:00, 26.25it/s]


Epoch 83: training loss: 5.8502 over 48404 training points.


100%|██████████| 187/187 [00:07<00:00, 26.26it/s]


Epoch 84: training loss: 5.8517 over 47753 training points.


100%|██████████| 190/190 [00:07<00:00, 26.21it/s]


Epoch 85: training loss: 5.8605 over 48509 training points.


100%|██████████| 188/188 [00:07<00:00, 26.28it/s]


Epoch 86: training loss: 5.8323 over 48122 training points.


100%|██████████| 189/189 [00:07<00:00, 26.05it/s]


Epoch 87: training loss: 5.8291 over 48232 training points.


100%|██████████| 190/190 [00:07<00:00, 26.14it/s]


Epoch 88: training loss: 5.8107 over 48475 training points.


100%|██████████| 189/189 [00:07<00:00, 26.20it/s]


Epoch 89: training loss: 5.8305 over 48240 training points.


100%|██████████| 189/189 [00:07<00:00, 26.09it/s]


Epoch 90: training loss: 5.8120 over 48214 training points.


100%|██████████| 188/188 [00:07<00:00, 26.15it/s]


Epoch 91: training loss: 5.8013 over 48120 training points.


100%|██████████| 190/190 [00:07<00:00, 23.83it/s]


Epoch 92: training loss: 5.8022 over 48425 training points.


100%|██████████| 189/189 [00:08<00:00, 22.60it/s]


Epoch 93: training loss: 5.8256 over 48314 training points.


100%|██████████| 189/189 [00:07<00:00, 25.16it/s]


Epoch 94: training loss: 5.7991 over 48242 training points.


100%|██████████| 189/189 [00:07<00:00, 24.36it/s]


Epoch 95: training loss: 5.7982 over 48135 training points.


100%|██████████| 190/190 [00:07<00:00, 23.77it/s]


Epoch 96: training loss: 5.8091 over 48421 training points.


100%|██████████| 188/188 [00:07<00:00, 24.95it/s]


Epoch 97: training loss: 5.7993 over 48024 training points.


100%|██████████| 188/188 [00:07<00:00, 24.51it/s]


Epoch 98: training loss: 5.7839 over 47957 training points.


100%|██████████| 189/189 [00:07<00:00, 25.72it/s]


Epoch 99: training loss: 5.7883 over 48148 training points.


100%|██████████| 190/190 [00:07<00:00, 24.99it/s]

Epoch 100: training loss: 5.8138 over 48617 training points.
Training finished





In [13]:
# save the word-embedding layer weights
embedding_weights = word2vec.embed.state_dict()
torch.save(embedding_weights, f'save/embedding_weights.pt')