## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cpu'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f0a67cdcf70>

## Load data

In [2]:
ben_train_df = pd.read_csv('../../data/bengali_hatespeech_sample_train_preprocessed.csv')
ben_test_df = pd.read_csv('../../data/bengali_hatespeech_sample_test_preprocessed.csv')

display(ben_train_df.head())

Unnamed: 0,sentence,hate,category
0,নটির পুতেরে জুতা বাইরে,1,religion
1,কুত্তাকে গণধোলাই দেওয়া জানোয়ারটা যেই হাত দিয...,1,"Meme, TikTok and others"
2,তোর বউয়ের ভোদ চোদ জাইয়া,1,religion
3,কুত্তার বাচ্চা কোথায় পাবো,1,religion
4,সালিকে জতাপিটা,1,crime


In [3]:
# remove empty texts
ben_train_df = ben_train_df[ben_train_df.sentence.str.len() > 0]
# extract sentences and labels
train_sentences = [text.split() for text in ben_train_df['sentence']]
train_labels = ben_train_df['hate'].to_numpy()

# remove empty texts
ben_test_df = ben_test_df[ben_test_df.sentence.str.len() > 0]
# extract sentences and labels
test_sentences = [text.split() for text in ben_test_df['sentence']]
test_labels = ben_test_df['hate'].to_numpy()

print('Train data:')
print(train_sentences[:3])
print(train_labels)
print()
print('Test data:')
print(test_sentences[:3])
print(test_labels)

Train data:
[['নটির', 'পুতেরে', 'জুতা', 'বাইরে'], ['কুত্তাকে', 'গণধোলাই', 'দেওয়া', 'জানোয়ারটা', 'যেই', 'হাত', 'দিয়ে', 'মারছে', 'হাতটা', 'ভেঙ্গে', 'দিন'], ['তোর', 'বউয়ের', 'ভোদ', 'চোদ', 'জাইয়া']]
[1 1 1 ... 1 1 1]

Test data:
[['শালা', 'তাহেরি', 'ওরে', 'বাশ', 'দেয়া', 'হোউক'], ['খানকির', 'বাচ্চা', 'তোরে', 'এনাকোন্ডা', 'মারা', 'খা'], ['ওরে', 'পুলিশের', 'হাতে', 'দেয়ার', 'সবাই', 'মিলে', 'পিটিয়ে', 'আধমরা', 'করলো']]
[1 1 0 ... 0 0 0]


## Prepare vocab set

In [4]:
# load mapping {word -> id} and {id -> word}
with open('save/word_to_int_dict.json') as f:
    word_to_int = json.load(f)
with open('save/int_to_word_dict.json') as f:
    int_to_word = json.load(f)

# get vocab_size
vocab_size = len(word_to_int)
print(f'vocab_size: {vocab_size}')

vocab_size: 15231


In [5]:
train_sentences = [[word_to_int[word] for word in sentence] for sentence in train_sentences]
test_sentences = [[word_to_int[word] for word in sentence if word in word_to_int] for sentence in test_sentences]

## Hyper-parameters

In [6]:
embedding_path = 'save/embedding_weights.pt'
embedding_size = 300
att_dim = 150
learning_rate = 1e-4
batch_size = 32
epochs = 30

## Build datasets

In [7]:
class HOFDataset(Dataset):
    def __init__(self, sentences, labels):
        self.data = []
        for sentence, label in zip(sentences, labels):
            if len(sentence):
                self.data.append(
                    (torch.tensor(sentence, dtype=torch.long), 
                     torch.tensor(label, dtype=torch.float))
                )
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    
def preprocess_batch(batch):
    texts, labels = list(zip(*batch))
    seq_lens = torch.tensor([len(text) for text in texts], dtype=torch.long)
    texts = pad_sequence(texts, padding_value=0)
    labels = torch.tensor(labels).unsqueeze(1)

    seq_lens, sorted_idx = seq_lens.sort(descending=True)
    texts = texts[:,sorted_idx]
    labels = labels[sorted_idx]
    return texts, seq_lens, labels

train_dataset = HOFDataset(train_sentences, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, 
                          shuffle=True, collate_fn=preprocess_batch)

test_dataset = HOFDataset(test_sentences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, 
                         shuffle=False, collate_fn=preprocess_batch)

## Network architecture

In [8]:
def mask_seq(seq_lens):
    mask = torch.zeros((len(seq_lens), max(seq_lens))).bool()
    for i, seq_len in enumerate(seq_lens):
        mask[i, seq_len:] = True
    return mask

In [9]:
# get hindi_vocab_size
with open('../../Task_1/save/word_to_int_dict.json') as f:
    hindi_word_to_int = json.load(f)
hindi_vocab_size = len(hindi_word_to_int)

# define classifier
class Classifier(Module):
    def __init__(self):
        super(Classifier, self).__init__()

        self.embed = nn.Embedding(hindi_vocab_size, embedding_size)
        
        self.attention = nn.MultiheadAttention(embed_dim=embedding_size,
                                               num_heads=10,
                                               dropout=0.5,)
        self.attention.requires_grad = False # fix all layers except embedding.

        self.fc = nn.Linear(embedding_size, 1)
        self.fc.requires_grad = False # fix all layers except embedding.

    def forward(self, inp, seq_lens):
        out = self.embed(inp)
        pad_mask = mask_seq(seq_lens)
        att_out, _ = self.attention(out, out, out, key_padding_mask=pad_mask)
        out = F.layer_norm(out + att_out, (out.size(2), ))
        out = self.fc(out).squeeze(2)
        pred = torch.zeros((out.size(1), 1))
        for i, seq_len in enumerate(seq_lens):
            pred[i, 0] = out[:seq_len, i].mean()
        return pred

# load pre-trained hindi classifier
hindi_clf = Classifier().to(device)
hindi_model_weight_path = '../hindi_hindi/save/hindi_clf.pt'
hindi_clf.load_state_dict(torch.load(hindi_model_weight_path, map_location=torch.device(device)))
print('Hindi classifier:')
print(hindi_clf.eval())

# replace the embedding layer to make it a bengali classifier
bengali_embed = nn.Embedding(vocab_size, embedding_size)
bengali_clf = hindi_clf
bengali_clf._modules['embed'] = bengali_embed
print('Bengali classifier:')
print(bengali_clf.eval())

Hindi classifier:
Classifier(
  (embed): Embedding(20402, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
)
Bengali classifier:
Classifier(
  (embed): Embedding(15231, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


## Loss function and optimizer

In [10]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(bengali_clf.parameters(), lr=learning_rate)

## Measure performance on test data

In [11]:
def predict_test():
    losses = 0
    acc_cnt = 0
    cnt = 0
    preds = []
    true_labels = []
    for texts, seq_lens, labels in test_loader:
        pred = bengali_clf(texts.to(device), seq_lens).detach().to('cpu')
        loss = criterion(pred, labels)
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred > 0) == (labels > 0)).item()
        preds.extend(pred.view(-1))
        true_labels.extend(labels.view(-1))
        cnt += texts.size(1)
    
    preds = np.array(preds) > 0
    macro_f1 = f1_score(true_labels, preds, average='macro')
    weighted_f1 = f1_score(true_labels, preds, average='weighted')
    return losses / cnt, acc_cnt / cnt, macro_f1, weighted_f1

## Training

In [12]:
list_test_acc = []
early_stop = 5

for epoch in range(1, epochs + 1):
    losses = 0.
    acc_cnt = 0
    cnt = 0
    bengali_clf.train()
    for texts, seq_lens, labels in tqdm(train_loader):
        optimizer.zero_grad()
        pred = bengali_clf(texts.to(device), seq_lens)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred.to('cpu') > 0) == (labels > 0)).item()
        cnt += texts.size(1)

    epoch_loss = losses / cnt
    epoch_acc = acc_cnt / cnt
    test_loss, test_acc, test_macro_f1, test_weighted_f1 = predict_test()
    print(f'Epoch {epoch:2}: Train loss: {epoch_loss:.4f}, acc: {epoch_acc:.4f}. '
        f'Test loss: {test_loss:.4f}, acc: {test_acc:.4f}, '
        f'macro_f1: {test_macro_f1:.4f}, weighted_f1: {test_weighted_f1:.4f}',
        flush=True)

    list_test_acc.append(test_acc)
    if len(list_test_acc) > early_stop and max(list_test_acc[-early_stop:]) <= max(list_test_acc[:-early_stop]):
        print(f'Early stopping: test accuracy does not increase after {early_stop} epochs')
        break

100%|██████████| 146/146 [00:11<00:00, 12.87it/s]


Epoch  1: Train loss: 1.3250, acc: 0.5602. Test loss: 1.0254, acc: 0.5760, macro_f1: 0.5755, weighted_f1: 0.5746


100%|██████████| 146/146 [00:10<00:00, 13.80it/s]


Epoch  2: Train loss: 1.0921, acc: 0.6524. Test loss: 0.9476, acc: 0.6318, macro_f1: 0.6318, weighted_f1: 0.6318


100%|██████████| 146/146 [00:11<00:00, 13.19it/s]


Epoch  3: Train loss: 0.9834, acc: 0.7005. Test loss: 0.9267, acc: 0.6566, macro_f1: 0.6565, weighted_f1: 0.6569


100%|██████████| 146/146 [00:11<00:00, 12.26it/s]


Epoch  4: Train loss: 0.9590, acc: 0.7322. Test loss: 0.9247, acc: 0.6729, macro_f1: 0.6727, weighted_f1: 0.6732


100%|██████████| 146/146 [00:11<00:00, 12.26it/s]


Epoch  5: Train loss: 0.9052, acc: 0.7509. Test loss: 0.9250, acc: 0.6729, macro_f1: 0.6726, weighted_f1: 0.6733


100%|██████████| 146/146 [00:12<00:00, 11.60it/s]


Epoch  6: Train loss: 0.8550, acc: 0.7762. Test loss: 0.9225, acc: 0.6891, macro_f1: 0.6888, weighted_f1: 0.6895


100%|██████████| 146/146 [00:11<00:00, 12.36it/s]


Epoch  7: Train loss: 0.7664, acc: 0.8024. Test loss: 0.9401, acc: 0.6891, macro_f1: 0.6889, weighted_f1: 0.6895


100%|██████████| 146/146 [00:11<00:00, 12.42it/s]


Epoch  8: Train loss: 0.7237, acc: 0.8168. Test loss: 0.9640, acc: 0.6891, macro_f1: 0.6885, weighted_f1: 0.6895


100%|██████████| 146/146 [00:11<00:00, 12.26it/s]


Epoch  9: Train loss: 0.6654, acc: 0.8357. Test loss: 0.9822, acc: 0.7008, macro_f1: 0.7002, weighted_f1: 0.7011


100%|██████████| 146/146 [00:12<00:00, 12.07it/s]


Epoch 10: Train loss: 0.5989, acc: 0.8496. Test loss: 1.0055, acc: 0.7023, macro_f1: 0.7017, weighted_f1: 0.7027


100%|██████████| 146/146 [00:12<00:00, 11.51it/s]


Epoch 11: Train loss: 0.5363, acc: 0.8777. Test loss: 1.0618, acc: 0.7078, macro_f1: 0.7071, weighted_f1: 0.7081


100%|██████████| 146/146 [00:12<00:00, 11.40it/s]


Epoch 12: Train loss: 0.5181, acc: 0.8869. Test loss: 1.1205, acc: 0.7132, macro_f1: 0.7127, weighted_f1: 0.7135


100%|██████████| 146/146 [00:12<00:00, 11.77it/s]


Epoch 13: Train loss: 0.4574, acc: 0.9060. Test loss: 1.1254, acc: 0.7078, macro_f1: 0.7076, weighted_f1: 0.7081


100%|██████████| 146/146 [00:12<00:00, 11.50it/s]


Epoch 14: Train loss: 0.3954, acc: 0.9174. Test loss: 1.1780, acc: 0.7140, macro_f1: 0.7136, weighted_f1: 0.7143


100%|██████████| 146/146 [00:12<00:00, 11.98it/s]


Epoch 15: Train loss: 0.3419, acc: 0.9354. Test loss: 1.2493, acc: 0.7186, macro_f1: 0.7183, weighted_f1: 0.7190


100%|██████████| 146/146 [00:12<00:00, 12.09it/s]


Epoch 16: Train loss: 0.2905, acc: 0.9436. Test loss: 1.3527, acc: 0.7078, macro_f1: 0.7074, weighted_f1: 0.7081


100%|██████████| 146/146 [00:11<00:00, 12.17it/s]


Epoch 17: Train loss: 0.2729, acc: 0.9517. Test loss: 1.3546, acc: 0.7225, macro_f1: 0.7221, weighted_f1: 0.7228


100%|██████████| 146/146 [00:12<00:00, 12.00it/s]


Epoch 18: Train loss: 0.2319, acc: 0.9635. Test loss: 1.5104, acc: 0.7132, macro_f1: 0.7128, weighted_f1: 0.7135


100%|██████████| 146/146 [00:11<00:00, 12.59it/s]


Epoch 19: Train loss: 0.1912, acc: 0.9702. Test loss: 1.5965, acc: 0.7155, macro_f1: 0.7151, weighted_f1: 0.7159


100%|██████████| 146/146 [00:11<00:00, 12.35it/s]


Epoch 20: Train loss: 0.1876, acc: 0.9706. Test loss: 1.5824, acc: 0.7202, macro_f1: 0.7200, weighted_f1: 0.7205


100%|██████████| 146/146 [00:11<00:00, 12.22it/s]


Epoch 21: Train loss: 0.1406, acc: 0.9803. Test loss: 1.6849, acc: 0.7186, macro_f1: 0.7185, weighted_f1: 0.7189


100%|██████████| 146/146 [00:12<00:00, 12.13it/s]


Epoch 22: Train loss: 0.1290, acc: 0.9794. Test loss: 1.7217, acc: 0.7233, macro_f1: 0.7229, weighted_f1: 0.7236


100%|██████████| 146/146 [00:12<00:00, 12.06it/s]


Epoch 23: Train loss: 0.1022, acc: 0.9863. Test loss: 1.6927, acc: 0.7140, macro_f1: 0.7135, weighted_f1: 0.7143


100%|██████████| 146/146 [00:11<00:00, 12.17it/s]


Epoch 24: Train loss: 0.0894, acc: 0.9876. Test loss: 1.8645, acc: 0.7194, macro_f1: 0.7192, weighted_f1: 0.7197


100%|██████████| 146/146 [00:11<00:00, 12.34it/s]


Epoch 25: Train loss: 0.0866, acc: 0.9893. Test loss: 2.0079, acc: 0.7194, macro_f1: 0.7190, weighted_f1: 0.7197


100%|██████████| 146/146 [00:11<00:00, 12.28it/s]


Epoch 26: Train loss: 0.0827, acc: 0.9884. Test loss: 1.9627, acc: 0.7186, macro_f1: 0.7183, weighted_f1: 0.7190


100%|██████████| 146/146 [00:12<00:00, 12.15it/s]


Epoch 27: Train loss: 0.0571, acc: 0.9942. Test loss: 1.9611, acc: 0.7217, macro_f1: 0.7214, weighted_f1: 0.7221
Early stopping: test accuracy does not increase after 5 epochs


In [13]:
# save the word-embedding layer weights
embedding_weights = bengali_clf.embed.state_dict()
torch.save(embedding_weights, f'save/bengali_embedding_weights_.pt')