## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cpu'

import random

torch.manual_seed(123)
torch.cuda.manual_seed(234)
np.random.seed(345)
random.seed(456)
torch.manual_seed(567)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Load data

In [2]:
ben_train_df = pd.read_csv('save/bengali_hatespeech_sample_train_preprocessed.csv')
ben_test_df = pd.read_csv('save/bengali_hatespeech_sample_test_preprocessed.csv')

display(ben_train_df.head())

Unnamed: 0,sentence,hate,category
0,সমকামী হুজুর,0,religion
1,ছাএলীগ সালা দের নিসিদ্দ হক,1,politics
2,কাওয়া গদি ছারলে বুজবে জুতা কেমনে খায়,0,politics
3,কাউয়া কাদের বড় মাগীখোর ভিডিও পিক দেখলে বুঝা লু...,1,politics
4,অপু ভালো কথা ছোট করবেনা,0,"Meme, TikTok and others"


In [3]:
# remove empty texts
ben_train_df = ben_train_df[ben_train_df.sentence.str.len() > 0]
# extract sentences and labels
train_sentences = [text.split() for text in ben_train_df['sentence']]
train_labels = ben_train_df['hate'].to_numpy()

# remove empty texts
ben_test_df = ben_test_df[ben_test_df.sentence.str.len() > 0]
# extract sentences and labels
test_sentences = [text.split() for text in ben_test_df['sentence']]
test_labels = ben_test_df['hate'].to_numpy()

print('Train data:')
print(train_sentences[:3])
print(train_labels)
print()
print('Test data:')
print(test_sentences[:3])
print(test_labels)

Train data:
[['সমকামী', 'হুজুর'], ['ছাএলীগ', 'সালা', 'দের', 'নিসিদ্দ', 'হক'], ['কাওয়া', 'গদি', 'ছারলে', 'বুজবে', 'জুতা', 'কেমনে', 'খায়']]
[0 1 0 ... 1 0 0]

Test data:
[['মহিলাকে', 'রিমানডে'], ['তুর', 'রিপাতকে', 'মন', 'চাইছিল', 'ছেড়ে', 'গেলি', 'সাথে', 'মিত্যে', 'অভিনয়', 'করলি', 'দুনিয়া', 'উঠালি', 'তুর', 'নরকেও', 'ঠাঁই', 'হবেনা', 'তুই', 'আকাশের', 'মিতুর', 'মত', 'করলি', 'তুর', 'মত', 'বিশ্বাস', 'ঘাতকনীর', 'ফাসি', 'হউক', 'রিফাত', 'তোকে', 'ফেলে', 'যেত', 'হয়তবা', 'বেঁচে', 'যেত', 'সরল', 'ভালবাসাকে', 'হত্যা', 'করলি', 'তুই', 'নারী', 'জাতের', 'কলংক', 'তুকে', 'দেখলে', 'বুঝা', 'আসলে', 'তোর', 'পরিক্ষলপনা', 'তোর', 'চোখ', 'মুখ', 'সাক্ষী', 'তুইয়ি', 'জরিত', 'নারী', 'জাতের', 'কলংক'], ['হুমায়ুন', 'আজাদ', 'এতো', 'বড়', 'ক্রাক', 'মাতাল']]
[0 0 0 ... 0 0 1]


## Prepare vocab set

In [4]:
# load mapping {word -> id} and {id -> word}
with open('save/bengali_word_to_int_dict.json') as f:
    word_to_int = json.load(f)
with open('save/bengali_int_to_word_dict.json') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}

# get vocab_size
vocab_size = len(word_to_int)
print(f'vocab_size: {vocab_size}')

vocab_size: 15231


In [5]:
train_sentences = [[word_to_int[word] for word in sentence] for sentence in train_sentences]
test_sentences = [[word_to_int[word] for word in sentence if word in word_to_int] for sentence in test_sentences]

## Hyper-parameters

In [6]:
embedding_path = 'save/embedding_weights.pt'
embedding_size = 300
att_dim = 150
learning_rate = 1e-4
batch_size = 32
epochs = 30

## Build datasets

In [7]:
class HOFDataset(Dataset):
    def __init__(self, sentences, labels):
        self.data = []
        for sentence, label in zip(sentences, labels):
            if len(sentence):
                self.data.append(
                    (torch.tensor(sentence, dtype=torch.long), 
                     torch.tensor(label, dtype=torch.float))
                )
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    
def preprocess_batch(batch):
    texts, labels = list(zip(*batch))
    seq_lens = torch.tensor([len(text) for text in texts], dtype=torch.long)
    texts = pad_sequence(texts, padding_value=0)
    labels = torch.tensor(labels).unsqueeze(1)

    seq_lens, sorted_idx = seq_lens.sort(descending=True)
    texts = texts[:,sorted_idx]
    labels = labels[sorted_idx]
    return texts, seq_lens, labels

train_dataset = HOFDataset(train_sentences, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, 
                          shuffle=True, collate_fn=preprocess_batch)

test_dataset = HOFDataset(test_sentences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, 
                         shuffle=False, collate_fn=preprocess_batch)

## Network architecture

In [8]:
def mask_seq(seq_lens):
    mask = torch.zeros((len(seq_lens), max(seq_lens))).bool()
    for i, seq_len in enumerate(seq_lens):
        mask[i, seq_len:] = True
    return mask

In [9]:
# get hindi_vocab_size
with open('../../Task_1/save/word_to_int_dict.json') as f:
    hindi_word_to_int = json.load(f)
hindi_vocab_size = len(hindi_word_to_int)

# define classifier
class Classifier(Module):
    def __init__(self):
        super(Classifier, self).__init__()

        self.embed = nn.Embedding(hindi_vocab_size, embedding_size)
        
        self.attention = nn.MultiheadAttention(embed_dim=embedding_size,
                                               num_heads=10,
                                               dropout=0.5,)
        self.attention.requires_grad = False # fix all layers except embedding.

        self.fc = nn.Linear(embedding_size, 1)
        self.fc.requires_grad = False # fix all layers except embedding.

    def forward(self, inp, seq_lens):
        out = self.embed(inp)
        pad_mask = mask_seq(seq_lens)
        att_out, _ = self.attention(out, out, out, key_padding_mask=pad_mask)
        out = F.layer_norm(out + att_out, (out.size(2), ))
        out = self.fc(out).squeeze(2)
        pred = torch.zeros((out.size(1), 1))
        for i, seq_len in enumerate(seq_lens):
            pred[i, 0] = out[:seq_len, i].mean()
        return pred

# load pre-trained hindi classifier
hindi_clf = Classifier().to(device)
hindi_model_weight_path = '../hindi_hindi/save/hindi_clf.pt'
hindi_clf.load_state_dict(torch.load(hindi_model_weight_path, map_location=torch.device(device)))
print('Hindi classifier:')
print(hindi_clf.eval())

# replace the embedding layer to make it a bengali classifier
bengali_embed = nn.Embedding(vocab_size, embedding_size)
bengali_clf = hindi_clf
bengali_clf._modules['embed'] = bengali_embed
print('Bengali classifier:')
print(bengali_clf.eval())

Hindi classifier:
Classifier(
  (embed): Embedding(20402, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
)
Bengali classifier:
Classifier(
  (embed): Embedding(15231, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


## Loss function and optimizer

In [10]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(bengali_clf.parameters(), lr=learning_rate)

## Measure performance on test data

In [11]:
def predict_test():
    losses = 0
    acc_cnt = 0
    cnt = 0
    preds = []
    true_labels = []
    for texts, seq_lens, labels in test_loader:
        pred = bengali_clf(texts.to(device), seq_lens).detach().to('cpu')
        loss = criterion(pred, labels)
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred > 0) == (labels > 0)).item()
        preds.extend(pred.view(-1))
        true_labels.extend(labels.view(-1))
        cnt += texts.size(1)
    
    preds = np.array(preds) > 0
    macro_f1 = f1_score(true_labels, preds, average='macro')
    weighted_f1 = f1_score(true_labels, preds, average='weighted')
    return losses / cnt, acc_cnt / cnt, macro_f1, weighted_f1

## Training

In [12]:
list_test_acc = []
early_stop = 5

for epoch in range(1, epochs + 1):
    losses = 0.
    acc_cnt = 0
    cnt = 0
    bengali_clf.train()
    for texts, seq_lens, labels in tqdm(train_loader):
        optimizer.zero_grad()
        pred = bengali_clf(texts.to(device), seq_lens)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred.to('cpu') > 0) == (labels > 0)).item()
        cnt += texts.size(1)

    epoch_loss = losses / cnt
    epoch_acc = acc_cnt / cnt
    test_loss, test_acc, test_macro_f1, test_weighted_f1 = predict_test()
    print(f'Epoch {epoch:2}: Train loss: {epoch_loss:.4f}, acc: {epoch_acc:.4f}. '
        f'Test loss: {test_loss:.4f}, acc: {test_acc:.4f}, '
        f'macro_f1: {test_macro_f1:.4f}, weighted_f1: {test_weighted_f1:.4f}',
        flush=True)

    list_test_acc.append(test_acc)
    if len(list_test_acc) > early_stop and max(list_test_acc[-early_stop:]) <= max(list_test_acc[:-early_stop]):
        print(f'Early stopping: test accuracy does not increase after {early_stop} epochs')
        break

100%|██████████| 146/146 [00:11<00:00, 12.56it/s]


Epoch  1: Train loss: 1.1500, acc: 0.5812. Test loss: 0.8553, acc: 0.6209, macro_f1: 0.6193, weighted_f1: 0.6175


100%|██████████| 146/146 [00:11<00:00, 12.25it/s]


Epoch  2: Train loss: 1.0646, acc: 0.6808. Test loss: 0.8179, acc: 0.6364, macro_f1: 0.6358, weighted_f1: 0.6348


100%|██████████| 146/146 [00:13<00:00, 11.22it/s]


Epoch  3: Train loss: 0.9888, acc: 0.7177. Test loss: 0.8067, acc: 0.6651, macro_f1: 0.6651, weighted_f1: 0.6649


100%|██████████| 146/146 [00:12<00:00, 11.43it/s]


Epoch  4: Train loss: 0.8955, acc: 0.7516. Test loss: 0.8345, acc: 0.6791, macro_f1: 0.6791, weighted_f1: 0.6792


100%|██████████| 146/146 [00:12<00:00, 11.31it/s]


Epoch  5: Train loss: 0.8612, acc: 0.7713. Test loss: 0.8405, acc: 0.6767, macro_f1: 0.6767, weighted_f1: 0.6770


100%|██████████| 146/146 [00:14<00:00, 10.00it/s]


Epoch  6: Train loss: 0.7838, acc: 0.7962. Test loss: 0.8344, acc: 0.6899, macro_f1: 0.6898, weighted_f1: 0.6902


100%|██████████| 146/146 [00:13<00:00, 10.70it/s]


Epoch  7: Train loss: 0.7298, acc: 0.8153. Test loss: 0.8495, acc: 0.6946, macro_f1: 0.6945, weighted_f1: 0.6949


100%|██████████| 146/146 [00:11<00:00, 12.51it/s]


Epoch  8: Train loss: 0.6725, acc: 0.8327. Test loss: 0.8845, acc: 0.6992, macro_f1: 0.6987, weighted_f1: 0.6996


100%|██████████| 146/146 [00:11<00:00, 12.44it/s]


Epoch  9: Train loss: 0.6380, acc: 0.8556. Test loss: 0.8895, acc: 0.7054, macro_f1: 0.7053, weighted_f1: 0.7057


100%|██████████| 146/146 [00:11<00:00, 12.61it/s]


Epoch 10: Train loss: 0.5720, acc: 0.8698. Test loss: 0.9649, acc: 0.7085, macro_f1: 0.7083, weighted_f1: 0.7089


100%|██████████| 146/146 [00:11<00:00, 12.63it/s]


Epoch 11: Train loss: 0.5317, acc: 0.8880. Test loss: 1.0052, acc: 0.7023, macro_f1: 0.7021, weighted_f1: 0.7027


100%|██████████| 146/146 [00:12<00:00, 12.15it/s]


Epoch 12: Train loss: 0.4825, acc: 0.8985. Test loss: 0.9923, acc: 0.7054, macro_f1: 0.7054, weighted_f1: 0.7056


100%|██████████| 146/146 [00:12<00:00, 11.85it/s]


Epoch 13: Train loss: 0.4160, acc: 0.9168. Test loss: 1.0584, acc: 0.7101, macro_f1: 0.7099, weighted_f1: 0.7104


100%|██████████| 146/146 [00:11<00:00, 12.35it/s]


Epoch 14: Train loss: 0.3330, acc: 0.9277. Test loss: 1.0871, acc: 0.7078, macro_f1: 0.7077, weighted_f1: 0.7080


100%|██████████| 146/146 [00:13<00:00, 11.04it/s]


Epoch 15: Train loss: 0.3236, acc: 0.9434. Test loss: 1.0962, acc: 0.7171, macro_f1: 0.7166, weighted_f1: 0.7174


100%|██████████| 146/146 [00:12<00:00, 11.36it/s]


Epoch 16: Train loss: 0.2632, acc: 0.9502. Test loss: 1.2154, acc: 0.7171, macro_f1: 0.7168, weighted_f1: 0.7174


100%|██████████| 146/146 [00:12<00:00, 11.49it/s]


Epoch 17: Train loss: 0.2257, acc: 0.9610. Test loss: 1.2940, acc: 0.7163, macro_f1: 0.7161, weighted_f1: 0.7166


100%|██████████| 146/146 [00:13<00:00, 11.21it/s]


Epoch 18: Train loss: 0.2057, acc: 0.9695. Test loss: 1.3737, acc: 0.7302, macro_f1: 0.7300, weighted_f1: 0.7306


100%|██████████| 146/146 [00:12<00:00, 11.44it/s]


Epoch 19: Train loss: 0.2035, acc: 0.9715. Test loss: 1.3253, acc: 0.7233, macro_f1: 0.7231, weighted_f1: 0.7235


100%|██████████| 146/146 [00:12<00:00, 11.81it/s]


Epoch 20: Train loss: 0.1493, acc: 0.9760. Test loss: 1.4247, acc: 0.7271, macro_f1: 0.7269, weighted_f1: 0.7275


100%|██████████| 146/146 [00:12<00:00, 11.57it/s]


Epoch 21: Train loss: 0.1468, acc: 0.9779. Test loss: 1.4180, acc: 0.7341, macro_f1: 0.7338, weighted_f1: 0.7344


100%|██████████| 146/146 [00:13<00:00, 10.99it/s]


Epoch 22: Train loss: 0.1282, acc: 0.9850. Test loss: 1.5971, acc: 0.7171, macro_f1: 0.7169, weighted_f1: 0.7174


100%|██████████| 146/146 [00:12<00:00, 11.48it/s]


Epoch 23: Train loss: 0.1165, acc: 0.9837. Test loss: 1.6503, acc: 0.7209, macro_f1: 0.7207, weighted_f1: 0.7213


100%|██████████| 146/146 [00:12<00:00, 11.39it/s]


Epoch 24: Train loss: 0.1024, acc: 0.9867. Test loss: 1.6572, acc: 0.7248, macro_f1: 0.7246, weighted_f1: 0.7251


100%|██████████| 146/146 [00:12<00:00, 11.56it/s]


Epoch 25: Train loss: 0.0920, acc: 0.9884. Test loss: 1.6331, acc: 0.7202, macro_f1: 0.7199, weighted_f1: 0.7205


100%|██████████| 146/146 [00:12<00:00, 11.67it/s]


Epoch 26: Train loss: 0.0791, acc: 0.9895. Test loss: 1.6353, acc: 0.7194, macro_f1: 0.7192, weighted_f1: 0.7197
Early stopping: test accuracy does not increase after 5 epochs


In [13]:
# save the word-embedding layer weights
embedding_weights = bengali_clf.embed.state_dict()
torch.save(embedding_weights, f'save/bengali_embedding_weights_.pt')