# Task 2c: Train a Bengali hatespeech classifier from scratch
(Preprocessing and training word-embedding are done in '../hindi_bengali/')

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f7ce85bcf90>

## Load data

In [2]:
ben_train_df = pd.read_csv('../hindi_bengali/save/bengali_hatespeech_sample_train_preprocessed.csv')
ben_test_df = pd.read_csv('../hindi_bengali/save/bengali_hatespeech_sample_test_preprocessed.csv')

display(ben_train_df.head())

Unnamed: 0,sentence,hate,category
0,নটির পুতেরে জুতা বাইরে,1,religion
1,কুত্তাকে গণধোলাই দেওয়া জানোয়ারটা যেই হাত দিয...,1,"Meme, TikTok and others"
2,তোর বউয়ের ভোদ চোদ জাইয়া,1,religion
3,কুত্তার বাচ্চা কোথায় পাবো,1,religion
4,সালিকে জতাপিটা,1,crime


In [3]:
# remove empty texts
ben_train_df = ben_train_df[ben_train_df.sentence.str.len() > 0]
# extract sentences and labels
train_sentences = [text.split() for text in ben_train_df['sentence']]
train_labels = ben_train_df['hate'].to_numpy()

# remove empty texts
ben_test_df = ben_test_df[ben_test_df.sentence.str.len() > 0]
# extract sentences and labels
test_sentences = [text.split() for text in ben_test_df['sentence']]
test_labels = ben_test_df['hate'].to_numpy()

print('Train data:')
print(train_sentences[:3])
print(train_labels)
print()
print('Test data:')
print(test_sentences[:3])
print(test_labels)

Train data:
[['নটির', 'পুতেরে', 'জুতা', 'বাইরে'], ['কুত্তাকে', 'গণধোলাই', 'দেওয়া', 'জানোয়ারটা', 'যেই', 'হাত', 'দিয়ে', 'মারছে', 'হাতটা', 'ভেঙ্গে', 'দিন'], ['তোর', 'বউয়ের', 'ভোদ', 'চোদ', 'জাইয়া']]
[1 1 1 ... 1 1 1]

Test data:
[['শালা', 'তাহেরি', 'ওরে', 'বাশ', 'দেয়া', 'হোউক'], ['খানকির', 'বাচ্চা', 'তোরে', 'এনাকোন্ডা', 'মারা', 'খা'], ['ওরে', 'পুলিশের', 'হাতে', 'দেয়ার', 'সবাই', 'মিলে', 'পিটিয়ে', 'আধমরা', 'করলো']]
[1 1 0 ... 0 0 0]


## Prepare vocab set

In [4]:
# load mapping {word -> id} and {id -> word}
with open('../hindi_bengali/save/bengali_word_to_int_dict.json') as f:
    word_to_int = json.load(f)
with open('../hindi_bengali/save/bengali_int_to_word_dict.json') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}

# get vocab_size
vocab_size = len(word_to_int)
print(f'vocab_size: {vocab_size}')

vocab_size: 15231


In [5]:
train_sentences = [[word_to_int[word] for word in sentence] for sentence in train_sentences]
test_sentences = [[word_to_int[word] for word in sentence if word in word_to_int] for sentence in test_sentences]

## Hyper-parameters

In [6]:
embedding_path = 'save/embedding_weights.pt'
embedding_size = 300
att_dim = 150
learning_rate = 1e-4
batch_size = 32
epochs = 30

## Build datasets

In [7]:
class HOFDataset(Dataset):
    def __init__(self, sentences, labels):
        self.data = []
        for sentence, label in zip(sentences, labels):
            if len(sentence):
                self.data.append(
                    (torch.tensor(sentence, dtype=torch.long), 
                     torch.tensor(label, dtype=torch.float))
                )
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    
def preprocess_batch(batch):
    texts, labels = list(zip(*batch))
    seq_lens = torch.tensor([len(text) for text in texts], dtype=torch.long)
    texts = pad_sequence(texts, padding_value=0)
    labels = torch.tensor(labels).unsqueeze(1)

    seq_lens, sorted_idx = seq_lens.sort(descending=True)
    texts = texts[:,sorted_idx]
    labels = labels[sorted_idx]
    return texts, seq_lens, labels

train_dataset = HOFDataset(train_sentences, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, 
                          shuffle=True, collate_fn=preprocess_batch)

test_dataset = HOFDataset(test_sentences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, 
                         shuffle=False, collate_fn=preprocess_batch)

## Network architecture

In [8]:
def mask_seq(seq_lens):
    mask = torch.zeros((len(seq_lens), max(seq_lens))).bool()
    for i, seq_len in enumerate(seq_lens):
        mask[i, seq_len:] = True
    return mask.to(device)

In [9]:
# define classifier
class Classifier(Module):
    def __init__(self):
        super(Classifier, self).__init__()

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.embed.load_state_dict(torch.load(embedding_path, map_location=torch.device(device)))
        self.embed.requires_grad = False
        
        self.attention = nn.MultiheadAttention(embed_dim=embedding_size,
                                               num_heads=10,
                                               dropout=0.5,)

        self.fc = nn.Linear(embedding_size, 1)

    def forward(self, inp, seq_lens):
        out = self.embed(inp)
        pad_mask = mask_seq(seq_lens)
        att_out, _ = self.attention(out, out, out, key_padding_mask=pad_mask)
        out = F.layer_norm(out + att_out, (out.size(2), ))
        out = self.fc(out).squeeze(2)
        pred = torch.zeros((out.size(1), 1))
        for i, seq_len in enumerate(seq_lens):
            pred[i, 0] = out[:seq_len, i].mean()
        return pred

# replace the embedding layer to make it a bengali classifier
bengali_clf = Classifier().to(device)
print('Bengali classifier:')
print(bengali_clf.eval())

Bengali classifier:
Classifier(
  (embed): Embedding(15231, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


## Loss function and optimizer

In [10]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(bengali_clf.parameters(), lr=learning_rate)

## Measure performance on test data

In [11]:
def predict_test():
    losses = 0
    acc_cnt = 0
    cnt = 0
    preds = []
    true_labels = []
    for texts, seq_lens, labels in test_loader:
        pred = bengali_clf(texts.to(device), seq_lens).detach().to('cpu')
        loss = criterion(pred, labels)
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred > 0) == (labels > 0)).item()
        preds.extend(pred.view(-1))
        true_labels.extend(labels.view(-1))
        cnt += texts.size(1)
    
    preds = np.array(preds) > 0
    macro_f1 = f1_score(true_labels, preds, average='macro')
    weighted_f1 = f1_score(true_labels, preds, average='weighted')
    return losses / cnt, acc_cnt / cnt, macro_f1, weighted_f1

## Training

In [12]:
list_test_acc = []
early_stop = 5

for epoch in range(1, epochs + 1):
    losses = 0.
    acc_cnt = 0
    cnt = 0
    bengali_clf.train()
    for texts, seq_lens, labels in tqdm(train_loader):
        optimizer.zero_grad()
        pred = bengali_clf(texts.to(device), seq_lens)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred.to('cpu') > 0) == (labels > 0)).item()
        cnt += texts.size(1)

    epoch_loss = losses / cnt
    epoch_acc = acc_cnt / cnt
    test_loss, test_acc, test_macro_f1, test_weighted_f1 = predict_test()
    print(f'Epoch {epoch:2}: Train loss: {epoch_loss:.4f}, acc: {epoch_acc:.4f}. '
        f'Test loss: {test_loss:.4f}, acc: {test_acc:.4f}, '
        f'macro_f1: {test_macro_f1:.4f}, weighted_f1: {test_weighted_f1:.4f}',
        flush=True)

    list_test_acc.append(test_acc)
    if len(list_test_acc) > early_stop and max(list_test_acc[-early_stop:]) <= max(list_test_acc[:-early_stop]):
        print(f'Early stopping: test accuracy does not increase after {early_stop} epochs')
        break

100%|██████████| 146/146 [00:05<00:00, 28.63it/s]


Epoch  1: Train loss: 1.0091, acc: 0.7091. Test loss: 0.7543, acc: 0.7643, macro_f1: 0.7643, weighted_f1: 0.7646


100%|██████████| 146/146 [00:05<00:00, 28.40it/s]


Epoch  2: Train loss: 0.8223, acc: 0.7809. Test loss: 0.7199, acc: 0.7798, macro_f1: 0.7790, weighted_f1: 0.7800


100%|██████████| 146/146 [00:05<00:00, 28.06it/s]


Epoch  3: Train loss: 0.7656, acc: 0.8024. Test loss: 0.7092, acc: 0.7806, macro_f1: 0.7796, weighted_f1: 0.7807


100%|██████████| 146/146 [00:05<00:00, 28.60it/s]


Epoch  4: Train loss: 0.7049, acc: 0.8254. Test loss: 0.6998, acc: 0.7853, macro_f1: 0.7840, weighted_f1: 0.7852


100%|██████████| 146/146 [00:05<00:00, 27.79it/s]


Epoch  5: Train loss: 0.6315, acc: 0.8442. Test loss: 0.7142, acc: 0.7868, macro_f1: 0.7863, weighted_f1: 0.7871


100%|██████████| 146/146 [00:05<00:00, 28.49it/s]


Epoch  6: Train loss: 0.5679, acc: 0.8663. Test loss: 0.7172, acc: 0.7891, macro_f1: 0.7881, weighted_f1: 0.7891


100%|██████████| 146/146 [00:05<00:00, 28.41it/s]


Epoch  7: Train loss: 0.5112, acc: 0.8841. Test loss: 0.7592, acc: 0.7876, macro_f1: 0.7868, weighted_f1: 0.7877


100%|██████████| 146/146 [00:05<00:00, 28.77it/s]


Epoch  8: Train loss: 0.4115, acc: 0.9052. Test loss: 0.7916, acc: 0.7915, macro_f1: 0.7908, weighted_f1: 0.7916


100%|██████████| 146/146 [00:05<00:00, 28.69it/s]


Epoch  9: Train loss: 0.3439, acc: 0.9228. Test loss: 0.8417, acc: 0.7868, macro_f1: 0.7862, weighted_f1: 0.7870


100%|██████████| 146/146 [00:05<00:00, 28.54it/s]


Epoch 10: Train loss: 0.2959, acc: 0.9412. Test loss: 0.8903, acc: 0.7806, macro_f1: 0.7796, weighted_f1: 0.7807


100%|██████████| 146/146 [00:05<00:00, 28.67it/s]


Epoch 11: Train loss: 0.2370, acc: 0.9522. Test loss: 0.9610, acc: 0.7829, macro_f1: 0.7825, weighted_f1: 0.7832


100%|██████████| 146/146 [00:05<00:00, 28.40it/s]


Epoch 12: Train loss: 0.2036, acc: 0.9618. Test loss: 1.0404, acc: 0.7798, macro_f1: 0.7793, weighted_f1: 0.7801


100%|██████████| 146/146 [00:05<00:00, 28.63it/s]


Epoch 13: Train loss: 0.1558, acc: 0.9725. Test loss: 1.1223, acc: 0.7682, macro_f1: 0.7678, weighted_f1: 0.7685
Early stopping: test accuracy does not increase after 5 epochs
