## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cpu'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f7b691cff90>

## Load data

In [2]:
ben_train_df = pd.read_csv('../../data/bengali_hatespeech_sample_train_preprocessed.csv')
ben_test_df = pd.read_csv('../../data/bengali_hatespeech_sample_test_preprocessed.csv')

display(ben_train_df.head())

Unnamed: 0,sentence,hate,category
0,ভায়েরা আপনাদের ধন্যোবাদ এগিয়ে জাও পাসে আছি ভাই,0,religion
1,নাউজুবিল্লাহ নাউজুবিল্লাহ,0,"Meme, TikTok and others"
2,দুইজন অপরাধ সরকারি চাকরি হিসেবে দুইজনকে বাংলাদ...,0,crime
3,উড়িয়ে ই মারলো পেরেরা,0,sports
4,পুরুষ এক জাত অনেকসময় বোঝেনা সময় বুঝতে চায়না...,0,entertainment


In [3]:
# remove empty texts
ben_train_df = ben_train_df[ben_train_df.sentence.str.len() > 0]
# extract sentences and labels
train_sentences = [text.split() for text in ben_train_df['sentence']]
train_labels = ben_train_df['hate'].to_numpy()

# remove empty texts
ben_test_df = ben_test_df[ben_test_df.sentence.str.len() > 0]
# extract sentences and labels
test_sentences = [text.split() for text in ben_test_df['sentence']]
test_labels = ben_test_df['hate'].to_numpy()

print('Train data:')
print(train_sentences[:3])
print(train_labels)
print()
print('Test data:')
print(test_sentences[:3])
print(test_labels)

Train data:
[['ভায়েরা', 'আপনাদের', 'ধন্যোবাদ', 'এগিয়ে', 'জাও', 'পাসে', 'আছি', 'ভাই'], ['নাউজুবিল্লাহ', 'নাউজুবিল্লাহ'], ['দুইজন', 'অপরাধ', 'সরকারি', 'চাকরি', 'হিসেবে', 'দুইজনকে', 'বাংলাদেশ', 'বের', 'দেওয়া', 'আপনারা', 'কমেন্ট', 'জানাবেন']]
[0 0 0 ... 0 0 0]

Test data:
[['লেখাটি', 'ফুটবল', 'বুঝেই', 'লেখা।'], ['ভাই', 'কথা', 'শুনে', 'কান্না', 'আসলো।'], ['খানকি', 'নাইকা']]
[0 0 1 ... 1 1 1]


## Prepare vocab set

In [4]:
# load mapping {word -> id} and {id -> word}
with open('save/word_to_int_dict.json') as f:
    word_to_int = json.load(f)
with open('save/int_to_word_dict.json') as f:
    int_to_word = json.load(f)

# get vocab_size
vocab_size = len(word_to_int)
print(f'vocab_size: {vocab_size}')

vocab_size: 16005


In [5]:
train_sentences = [[word_to_int[word] for word in sentence] for sentence in train_sentences]
test_sentences = [[word_to_int[word] for word in sentence if word in word_to_int] for sentence in test_sentences]

## Hyper-parameters

In [6]:
embedding_path = 'save/embedding_weights.pt'
embedding_size = 300
att_dim = 150
learning_rate = 1e-4
batch_size = 32
epochs = 30

## Build datasets

In [7]:
class HOFDataset(Dataset):
    def __init__(self, sentences, labels):
        self.data = []
        for sentence, label in zip(sentences, labels):
            if len(sentence):
                self.data.append(
                    (torch.tensor(sentence, dtype=torch.long), 
                     torch.tensor(label, dtype=torch.float))
                )
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    
def preprocess_batch(batch):
    texts, labels = list(zip(*batch))
    seq_lens = torch.tensor([len(text) for text in texts], dtype=torch.long)
    texts = pad_sequence(texts, padding_value=0)
    labels = torch.tensor(labels).unsqueeze(1)

    seq_lens, sorted_idx = seq_lens.sort(descending=True)
    texts = texts[:,sorted_idx]
    labels = labels[sorted_idx]
    return texts, seq_lens, labels

train_dataset = HOFDataset(train_sentences, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, 
                          shuffle=True, collate_fn=preprocess_batch)

test_dataset = HOFDataset(test_sentences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, 
                         shuffle=False, collate_fn=preprocess_batch)

## Network architecture

In [8]:
def mask_seq(seq_lens):
    mask = torch.zeros((len(seq_lens), max(seq_lens))).bool()
    for i, seq_len in enumerate(seq_lens):
        mask[i, seq_len:] = True
    return mask

In [9]:
# get hindi_vocab_size
with open('../../Task_1/save/word_to_int_dict.json') as f:
    hindi_word_to_int = json.load(f)
hindi_vocab_size = len(hindi_word_to_int)

# define classifier
class Classifier(Module):
    def __init__(self):
        super(Classifier, self).__init__()

        self.embed = nn.Embedding(hindi_vocab_size, embedding_size)
        
        self.attention = nn.MultiheadAttention(embed_dim=embedding_size,
                                               num_heads=10,
                                               dropout=0.5,)
        self.attention.requires_grad = False # fix all layers except embedding.

        self.fc = nn.Linear(embedding_size, 1)
        self.fc.requires_grad = False # fix all layers except embedding.

    def forward(self, inp, seq_lens):
        out = self.embed(inp)
        pad_mask = mask_seq(seq_lens)
        att_out, _ = self.attention(out, out, out, key_padding_mask=pad_mask)
        out = F.layer_norm(out + att_out, (out.size(2), ))
        out = self.fc(out).squeeze(2)
        pred = torch.zeros((out.size(1), 1))
        for i, seq_len in enumerate(seq_lens):
            pred[i, 0] = out[:seq_len, i].mean()
        return pred

# load pre-trained hindi classifier
hindi_clf = Classifier().to(device)
hindi_model_weight_path = '../hindi_hindi/save/hindi_clf.pt'
hindi_clf.load_state_dict(torch.load(hindi_model_weight_path, map_location=torch.device(device)))
print('Hindi classifier:')
print(hindi_clf.eval())

# replace the embedding layer to make it a bengali classifier
bengali_embed = nn.Embedding(vocab_size, embedding_size)
bengali_clf = hindi_clf
bengali_clf._modules['embed'] = bengali_embed
print('Bengali classifier:')
print(bengali_clf.eval())

Hindi classifier
Classifier(
  (embed): Embedding(20402, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
)
Bengali classifier
Classifier(
  (embed): Embedding(16005, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


## Loss function and optimizer

In [10]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(bengali_clf.parameters(), lr=learning_rate)

## Measure performance on test data

In [11]:
def predict_test():
    losses = 0
    acc_cnt = 0
    cnt = 0
    preds = []
    true_labels = []
    for texts, seq_lens, labels in test_loader:
        pred = bengali_clf(texts.to(device), seq_lens).detach().to('cpu')
        loss = criterion(pred, labels)
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred > 0) == (labels > 0)).item()
        preds.extend(pred.view(-1))
        true_labels.extend(labels.view(-1))
        cnt += texts.size(1)
    
    preds = np.array(preds) > 0
    macro_f1 = f1_score(true_labels, preds, average='macro')
    weighted_f1 = f1_score(true_labels, preds, average='weighted')
    return losses / cnt, acc_cnt / cnt, macro_f1, weighted_f1

## Training

In [12]:
list_test_acc = []
early_stop = 5

for epoch in range(1, epochs + 1):
    losses = 0.
    acc_cnt = 0
    cnt = 0
    bengali_clf.train()
    for texts, seq_lens, labels in tqdm(train_loader):
        optimizer.zero_grad()
        pred = bengali_clf(texts.to(device), seq_lens)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred.to('cpu') > 0) == (labels > 0)).item()
        cnt += texts.size(1)

    epoch_loss = losses / cnt
    epoch_acc = acc_cnt / cnt
    test_loss, test_acc, test_macro_f1, test_weighted_f1 = predict_test()
    print(f'Epoch {epoch:2}: Train loss: {epoch_loss:.4f}, acc: {epoch_acc:.4f}. '
        f'Test loss: {test_loss:.4f}, acc: {test_acc:.4f}, '
        f'macro_f1: {test_macro_f1:.4f}, weighted_f1: {test_weighted_f1:.4f}',
        flush=True)

    list_test_acc.append(test_acc)
    if len(list_test_acc) > early_stop and max(list_test_acc[-early_stop:]) <= max(list_test_acc[:-early_stop]):
        print(f'Early stopping: test accuracy does not increase after {early_stop} epochs')
        break

100%|██████████| 146/146 [00:13<00:00, 10.61it/s]


Epoch  1: Train loss: 1.5486, acc: 0.5793. Test loss: 0.8961, acc: 0.6206, macro_f1: 0.6201, weighted_f1: 0.6191


100%|██████████| 146/146 [00:13<00:00, 10.54it/s]


Epoch  2: Train loss: 1.3249, acc: 0.6644. Test loss: 0.8336, acc: 0.6553, macro_f1: 0.6552, weighted_f1: 0.6548


100%|██████████| 146/146 [00:14<00:00,  9.82it/s]


Epoch  3: Train loss: 1.1857, acc: 0.7063. Test loss: 0.8147, acc: 0.6723, macro_f1: 0.6723, weighted_f1: 0.6723


100%|██████████| 146/146 [00:15<00:00,  9.55it/s]


Epoch  4: Train loss: 1.1552, acc: 0.7325. Test loss: 0.8060, acc: 0.6971, macro_f1: 0.6970, weighted_f1: 0.6973


100%|██████████| 146/146 [00:14<00:00,  9.78it/s]


Epoch  5: Train loss: 1.0559, acc: 0.7552. Test loss: 0.7990, acc: 0.7002, macro_f1: 0.6999, weighted_f1: 0.7005


100%|██████████| 146/146 [00:15<00:00,  9.64it/s]


Epoch  6: Train loss: 1.0046, acc: 0.7760. Test loss: 0.8048, acc: 0.7009, macro_f1: 0.7009, weighted_f1: 0.7012


100%|██████████| 146/146 [00:15<00:00,  9.64it/s]


Epoch  7: Train loss: 0.9330, acc: 0.7968. Test loss: 0.8201, acc: 0.7117, macro_f1: 0.7116, weighted_f1: 0.7121


100%|██████████| 146/146 [00:15<00:00,  9.49it/s]


Epoch  8: Train loss: 0.8813, acc: 0.8176. Test loss: 0.8292, acc: 0.7110, macro_f1: 0.7109, weighted_f1: 0.7113


100%|██████████| 146/146 [00:15<00:00,  9.61it/s]


Epoch  9: Train loss: 0.8410, acc: 0.8350. Test loss: 0.8628, acc: 0.7195, macro_f1: 0.7192, weighted_f1: 0.7198


100%|██████████| 146/146 [00:14<00:00,  9.86it/s]


Epoch 10: Train loss: 0.7128, acc: 0.8507. Test loss: 0.8848, acc: 0.7141, macro_f1: 0.7139, weighted_f1: 0.7144


100%|██████████| 146/146 [00:14<00:00,  9.75it/s]


Epoch 11: Train loss: 0.6705, acc: 0.8685. Test loss: 0.8744, acc: 0.7372, macro_f1: 0.7368, weighted_f1: 0.7376


100%|██████████| 146/146 [00:15<00:00,  9.47it/s]


Epoch 12: Train loss: 0.6088, acc: 0.8902. Test loss: 0.9307, acc: 0.7349, macro_f1: 0.7346, weighted_f1: 0.7353


100%|██████████| 146/146 [00:14<00:00,  9.78it/s]


Epoch 13: Train loss: 0.5165, acc: 0.9024. Test loss: 0.9813, acc: 0.7249, macro_f1: 0.7248, weighted_f1: 0.7252


100%|██████████| 146/146 [00:15<00:00,  9.64it/s]


Epoch 14: Train loss: 0.4561, acc: 0.9191. Test loss: 1.0004, acc: 0.7311, macro_f1: 0.7309, weighted_f1: 0.7314


100%|██████████| 146/146 [00:14<00:00,  9.89it/s]


Epoch 15: Train loss: 0.4054, acc: 0.9333. Test loss: 1.0575, acc: 0.7388, macro_f1: 0.7387, weighted_f1: 0.7390


100%|██████████| 146/146 [00:14<00:00, 10.17it/s]


Epoch 16: Train loss: 0.3348, acc: 0.9446. Test loss: 1.1214, acc: 0.7342, macro_f1: 0.7338, weighted_f1: 0.7345


100%|██████████| 146/146 [00:15<00:00,  9.65it/s]


Epoch 17: Train loss: 0.2978, acc: 0.9547. Test loss: 1.2018, acc: 0.7280, macro_f1: 0.7278, weighted_f1: 0.7283


100%|██████████| 146/146 [00:14<00:00,  9.83it/s]


Epoch 18: Train loss: 0.2586, acc: 0.9605. Test loss: 1.2930, acc: 0.7326, macro_f1: 0.7325, weighted_f1: 0.7328


100%|██████████| 146/146 [00:14<00:00,  9.87it/s]


Epoch 19: Train loss: 0.2239, acc: 0.9678. Test loss: 1.3448, acc: 0.7233, macro_f1: 0.7233, weighted_f1: 0.7235


100%|██████████| 146/146 [00:14<00:00,  9.76it/s]


Epoch 20: Train loss: 0.2226, acc: 0.9725. Test loss: 1.3909, acc: 0.7334, macro_f1: 0.7332, weighted_f1: 0.7337
Early stopping: test accuracy does not increase after 5 epochs
