# Used the trained embeddings to classify hate-speeches
This notebook creates a neural classifier.

### Input:
    - Word-embeddings.
    - Training data.

### Output:
    - A binary classifier.
    - Evaluation on test data.

## Import libraries

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cpu'

import random

random.seed(26)
np.random.seed(62)
# torch.manual_seed(2021)

## Load data

In [2]:
# train data
train_df = pd.read_csv('save/bengali_train_preprocessed.csv')
train_sentences = [[int(s) for s in text.split()] for text in train_df['sentence']]
train_labels = train_df['hate'].to_numpy()

# test data
test_df = pd.read_csv('save/bengali_test_preprocessed.csv')
test_sentences = [[int(s) for s in text.split()] for text in test_df['sentence']]
test_labels = test_df['hate'].to_numpy()

# word <-> convertion
with open('save/word_to_int_dict.json', 'r') as f:
    word_to_int = json.load(f)
with open('save/int_to_word_dict.json', 'r') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}

# word-counter
with open('save/word_counter.json', 'r') as f:
    word_counter = json.load(f)
    
vocab_size = len(word_to_int)
print('vocab size:', vocab_size)

vocab size: 15983


In [3]:
print('sample data:')
print('train:')
print(train_sentences[:2])
print(train_labels[:2])
print('test:')
print(test_sentences[:2])
print(test_labels[:2])

sample data:
train:
[[12431, 11321, 507, 13590, 993, 8990, 7341, 7078], [9604, 9604]]
[0 0]
test:
[[5664, 10661, 3793, 6014], [7078, 7570, 3439, 15021]]
[0 0]


## Hyper-parameters

In [4]:
embedding_path = 'save/embedding_weights_30_epoch_300_dim_5_wsize_5_negfac.pt'
embedding_size = 300
att_dim = 150
# lstm_dim = 100
# lstm_bidirectional = True
learning_rate = 1e-4
batch_size = 32
epochs = 50

## Build datasets

In [5]:
class HOFDataset(Dataset):
    def __init__(self, sentences, labels):
        self.data = []
        for sentence, label in zip(sentences, labels):
            if len(sentence):
                self.data.append(
                    (torch.tensor(sentence, dtype=torch.long), 
                     torch.tensor(label, dtype=torch.float))
                )
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    
def preprocess_batch(batch):
    texts, labels = list(zip(*batch))
    seq_lens = torch.tensor([len(text) for text in texts], dtype=torch.long)
    texts = pad_sequence(texts, padding_value=0)
    labels = torch.tensor(labels).unsqueeze(1)

    seq_lens, sorted_idx = seq_lens.sort(descending=True)
    texts = texts[:,sorted_idx]
    labels = labels[sorted_idx]
    return texts, seq_lens, labels

train_dataset = HOFDataset(train_sentences, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, 
                          shuffle=True, collate_fn=preprocess_batch)

test_dataset = HOFDataset(test_sentences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, 
                         shuffle=False, collate_fn=preprocess_batch)

## Network architecture

In [6]:
def mask_seq(seq_lens):
    mask = torch.zeros((len(seq_lens), max(seq_lens))).bool()
    for i, seq_len in enumerate(seq_lens):
        mask[i, seq_len:] = True
    return mask

class Classifier(Module):
    def __init__(self):
        super(Classifier, self).__init__()
#         self.lstm_dim = lstm_dim
#         self.bidirectional = bidirectional

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.embed.load_state_dict(torch.load(embedding_path, map_location=torch.device(device)))
        self.embed.requires_grad = False
        # self.lstm = nn.LSTM(input_size=embedding_size, 
        #                     hidden_size=lstm_dim, 
        #                     num_layers=2, 
        #                     batch_first=False, 
        #                     dropout=0.5, 
        #                     bidirectional=bidirectional)
        
        # self.fc = nn.Linear(lstm_dim*2 if bidirectional else lstm_dim, 1)
        
        self.attention = nn.MultiheadAttention(#embed_dim=lstm_dim*2 if bidirectional else lstm_dim,
                                               embed_dim=embedding_size,
                                               num_heads=10,
                                               dropout=0.7,) # need to add mask for padding positions
        
        self.attention_2 = nn.MultiheadAttention(
                                               embed_dim=embedding_size,
                                               num_heads=5,
                                               dropout=0.7,) # need to add mask for padding positions

        self.fc = nn.Linear(embedding_size, 1)
#         init_range = (2 / (embedding_size + 1)) ** 0.5 # Xavier
#         self.fc.weight.data.uniform_(-init_range, init_range)

    def forward(self, inp, seq_lens):
        out = self.embed(inp)
        # out = pack_padded_sequence(out, seq_lens, batch_first=False, 
                                #    enforce_sorted = True)
        # out, _ = self.lstm(out)
        # out, _ = pad_packed_sequence(out, batch_first=False)

        pad_mask = mask_seq(seq_lens)
        att_out, _ = self.attention(out, out, out)#, key_padding_mask=pad_mask)
        out = F.layer_norm(out + att_out, (out.size(2), ))
        
        att_out, _ = self.attention_2(out, out, out)
        out = F.layer_norm(out + att_out, (out.size(2), ))

        out = self.fc(out)
        out = out.squeeze(2)
        pred = torch.zeros((out.size(1), 1))
        for i, seq_len in enumerate(seq_lens):
            pred[i, 0] = out[:seq_len, i].mean()
        return pred

clf = Classifier().to(device)

In [7]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(clf.parameters(), lr=learning_rate)

In [8]:
def predict_test():
    losses = 0
    acc_cnt = 0
    cnt = 0
    preds = []
    true_labels = []
    clf.eval()
    for texts, seq_lens, labels in test_loader:
        pred = clf(texts.to(device), seq_lens).detach().to('cpu')
        loss = criterion(pred, labels)
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred > 0) == (labels > 0)).item()
        preds.extend(pred.view(-1))
        true_labels.extend(labels.view(-1))
        cnt += texts.size(1)
    
    preds = np.array(preds) > 0
    macro_f1 = f1_score(true_labels, preds, average='macro')
    weighted_f1 = f1_score(true_labels, preds, average='weighted')
    return losses / cnt, acc_cnt / cnt, macro_f1, weighted_f1

## Training

In [9]:
list_test_acc = []
early_stop = 5

for epoch in range(1, epochs + 1):
    losses = 0.
    acc_cnt = 0
    cnt = 0
    clf.train()
    for texts, seq_lens, labels in tqdm(train_loader):
        optimizer.zero_grad()
        pred = clf(texts.to(device), seq_lens)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred.to('cpu') > 0) == (labels > 0)).item()
        cnt += texts.size(1)

    epoch_loss = losses / cnt
    epoch_acc = acc_cnt / cnt
    test_loss, test_acc, test_macro_f1, test_weighted_f1 = predict_test()
    print(f'Epoch {epoch:2}: Train loss: {epoch_loss:.4f}, acc: {epoch_acc:.4f}. '
        f'Test loss: {test_loss:.4f}, acc: {test_acc:.4f}, '
        f'macro_f1: {test_macro_f1:.4f}, weighted_f1: {test_weighted_f1:.4f}',
        flush=True)

    list_test_acc.append(test_acc)
    if len(list_test_acc) > early_stop and max(list_test_acc[-early_stop:]) <= max(list_test_acc[:-early_stop]):
        print(f'Early stopping: test accuracy does not increase after {early_stop} epochs')
        break
    

100%|██████████| 146/146 [00:19<00:00,  7.54it/s]


Epoch  1: Train loss: 1.2451, acc: 0.6722. Test loss: 0.6135, acc: 0.7859, macro_f1: 0.7859, weighted_f1: 0.7858


100%|██████████| 146/146 [00:22<00:00,  6.49it/s]


Epoch  2: Train loss: 0.9886, acc: 0.7958. Test loss: 0.5739, acc: 0.8138, macro_f1: 0.8105, weighted_f1: 0.8123


100%|██████████| 146/146 [00:22<00:00,  6.56it/s]


Epoch  3: Train loss: 0.8964, acc: 0.8140. Test loss: 0.5510, acc: 0.8060, macro_f1: 0.8057, weighted_f1: 0.8063


100%|██████████| 146/146 [00:22<00:00,  6.61it/s]


Epoch  4: Train loss: 0.8244, acc: 0.8309. Test loss: 0.5395, acc: 0.8176, macro_f1: 0.8165, weighted_f1: 0.8175


100%|██████████| 146/146 [00:22<00:00,  6.57it/s]


Epoch  5: Train loss: 0.8239, acc: 0.8440. Test loss: 0.5469, acc: 0.8184, macro_f1: 0.8174, weighted_f1: 0.8184


100%|██████████| 146/146 [00:22<00:00,  6.50it/s]


Epoch  6: Train loss: 0.7122, acc: 0.8599. Test loss: 0.5782, acc: 0.8091, macro_f1: 0.8089, weighted_f1: 0.8094


100%|██████████| 146/146 [00:23<00:00,  6.32it/s]


Epoch  7: Train loss: 0.6603, acc: 0.8734. Test loss: 0.5908, acc: 0.8168, macro_f1: 0.8162, weighted_f1: 0.8170


100%|██████████| 146/146 [00:22<00:00,  6.56it/s]


Epoch  8: Train loss: 0.5913, acc: 0.8835. Test loss: 0.6413, acc: 0.7960, macro_f1: 0.7959, weighted_f1: 0.7962


100%|██████████| 146/146 [00:22<00:00,  6.51it/s]


Epoch  9: Train loss: 0.5820, acc: 0.8953. Test loss: 0.6437, acc: 0.8161, macro_f1: 0.8143, weighted_f1: 0.8156


100%|██████████| 146/146 [00:22<00:00,  6.58it/s]


Epoch 10: Train loss: 0.4980, acc: 0.9101. Test loss: 0.7084, acc: 0.8130, macro_f1: 0.8118, weighted_f1: 0.8129
Early stopping: test accuracy does not increase after 5 epochs
