# Task 2

In [1]:
# Imports
import re
import string
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)
torch.cuda.manual_seed(123)

LANGUAGE = 'hi'

In [2]:
# from google.colab import files
# uploaded = files.upload()
# uploaded = files.upload()
# uploaded = files.upload()

In [3]:
embedding_path = '../Task_1/save/embedding_weights_hi_30_epoch_300_dim_5_wsize.pt'
embedding_size = 300
att_dim = 150
# lstm_dim = 100
# lstm_bidirectional = True
learning_rate = 1e-4
batch_size = 32
epochs = 20

## Load and preprocess data

In [4]:
train_data = pd.read_csv('../data/hindi_dataset.tsv', sep='\t')
print('train:')
display(train_data.head())

train_sentences = train_data['text'].to_numpy()
train_labels = train_data['task_1'].to_numpy()
train_labels[train_labels=='NOT'] = 0
train_labels[train_labels=='HOF'] = 1
train_labels = train_labels.astype(int)

test_data = pd.read_csv('../data/hasoc2019_hi_test_gold_2919.tsv', sep='\t')
print('test:')
display(test_data.head())

test_sentences = test_data['text'].to_numpy()
test_labels = test_data['task_1'].to_numpy()
test_labels[test_labels=='NOT'] = 0
test_labels[test_labels=='HOF'] = 1
test_labels = test_labels.astype(int)

train:


Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_hi_5556,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT,NONE,NONE
1,hasoc_hi_5648,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF,PRFN,UNT
2,hasoc_hi_164,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF,PRFN,TIN
3,hasoc_hi_3530,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT,NONE,NONE
4,hasoc_hi_5206,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT,NONE,NONE


test:


Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_hi_5061,"वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध...",NOT,NONE,NONE
1,hasoc_hi_2090,#कांग्रेस के इस #कमीने की #करतूत को देखिए देश ...,HOF,OFFN,TIN
2,hasoc_hi_2960,पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...,HOF,OFFN,TIN
3,hasoc_hi_864,जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...,NOT,NONE,NONE
4,hasoc_hi_54,नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...,NOT,NONE,NONE


In [5]:
def preprocess_texts(sentences):
    # remove user taggings
    user_tag_pattern = re.compile(r'\@\w*')
    sentences = [re.sub(user_tag_pattern, ' ', sentence) for sentence in sentences]
    # lower case
    sentences = [sentence.lower() for sentence in sentences]
    # remove punctuations
    http_re = re.compile('http://[^ ]*')
    https_re = re.compile('https://[^ ]*')
    punctuation = string.punctuation[:2] + string.punctuation[3:]
    translator = str.maketrans(punctuation, ' '*len(punctuation))
    def clean(s):
        s = re.sub(http_re, ' ', s)
        s = re.sub(https_re, ' ', s)
        s = s.translate(translator)
        return s

    sentences = [clean(sentence) for sentence in sentences]
    # remove number ?
    
    # remove stopwords
    if LANGUAGE == 'hi':
        stopwords = ['अंदर', 'अत', 'अदि', 'अप', 'अपना', 'अपनि', 'अपनी', 'अपने', 'अभि', 'अभी', 'आदि', 
                     'आप', 'इंहिं', 'इंहें', 'इंहों', 'इतयादि', 'इत्यादि', 'इन', 'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 
                     'इस', 'इसका', 'इसकि', 'इसकी', 'इसके', 'इसमें', 'इसि', 'इसी', 'इसे', 'उंहिं', 'उंहें', 
                     'उंहों', 'उन', 'उनका', 'उनकि', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 
                     'उसके', 'उसि', 'उसी', 'उसे', 'एक', 'एवं', 'एस', 'एसे', 'ऐसे', 'ओर', 'और', 'कइ', 
                     'कई', 'कर', 'करता', 'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफि', 
                     'काफ़ी', 'कि', 'किंहें', 'किंहों', 'कितना', 'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 
                     'किसि', 'किसी', 'किसे', 'की', 'कुछ', 'कुल', 'के', 'को', 'कोइ', 'कोई', 'कोन', 
                     'कोनसा', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जहां', 'जा', 'जिंहें', 'जिंहों', 
                     'जितना', 'जिधर', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जेसा', 'जेसे', 
                     'जैसा', 'जैसे', 'जो', 'तक', 'तब', 'तरह', 'तिंहें', 'तिंहों', 'तिन', 'तिन्हें', 'तिन्हों', 
                     'तिस', 'तिसे', 'तो', 'था', 'थि', 'थी', 'थे', 'दबारा', 'दवारा', 'दिया', 'दुसरा', 'दुसरे', 
                     'दूसरे', 'दो', 'द्वारा', 'न', 'नहिं', 'नहीं', 'ना', 'निचे', 'निहायत', 'नीचे', 'ने', 'पर', 
                     'पहले', 'पुरा', 'पूरा', 'पे', 'फिर', 'बनि', 'बनी', 'बहि', 'बही', 'बहुत', 'बाद', 'बाला', 
                     'बिलकुल', 'भि', 'भितर', 'भी', 'भीतर', 'मगर', 'मानो', 'मे', 'में', 'यदि', 'यह', 'यहाँ', 
                     'यहां', 'यहि', 'यही', 'या', 'यिह', 'ये', 'रखें', 'रवासा', 'रहा', 'रहे', 'ऱ्वासा', 'लिए', 
                     'लिये', 'लेकिन', 'व', 'वगेरह', 'वरग', 'वर्ग', 'वह', 'वहाँ', 'वहां', 'वहिं', 'वहीं', 'वाले', 
                     'वुह', 'वे', 'वग़ैरह', 'संग', 'सकता', 'सकते', 'सबसे', 'सभि', 'सभी', 'साथ', 'साबुत', 
                     'साभ', 'सारा', 'से', 'सो', 'हि', 'ही', 'हुअ', 'हुआ', 'हुइ', 'हुई', 'हुए', 'हे', 'हें', 
                     'है', 'हैं', 'हो', 'होता', 'होति', 'होती', 'होते', 'होना', 'होने']
    elif LANGUAGE == 'en':
        stopwords = stopwords.words('english')

    sentences = [[word for word in sentence.split() if word not in stopwords] for sentence in sentences]
    
    return sentences

train_sentences = preprocess_texts(train_sentences)
test_sentences = preprocess_texts(test_sentences)

In [6]:
# vocab_size and word->id and id->word
flattened_words = [word for sentence in train_sentences for word in sentence]
V = list(set(flattened_words))
vocab_size = len(V)
print(f'vocab_size: {vocab_size}')

word_to_int = {}
int_to_word = {}
for i, word in enumerate(V):
    word_to_int[word] = i
    int_to_word[i] = word

vocab_size: 19580


In [7]:
train_sentences = [[word_to_int[word] for word in sentence] for sentence in train_sentences]
sq_len = np.array([len(s) for s in train_sentences])
for id in np.where(sq_len == 0)[0][::-1]:
    print('delete training text id', id)
    del train_sentences[id], 
    np.delete(train_labels, id)
del sq_len

test_sentences = [[word_to_int[word] for word in sentence if word in word_to_int] for sentence in test_sentences]
print('Number of empty test sentences: ', sum([len(s) == 0 for s in test_sentences]))

delete training text id 1375
delete training text id 428
Number of empty test sentences:  0


## Build datasets

In [8]:
class HOFDataset(Dataset):
    def __init__(self, sentences, labels):
        self.data = []
        for sentence, label in zip(sentences, labels):
            self.data.append(
                (torch.tensor(sentence, dtype=torch.long), 
                 torch.tensor(label, dtype=torch.float))
            )
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    
def preprocess_batch(batch):
    texts, labels = list(zip(*batch))
    seq_lens = torch.tensor([len(text) for text in texts], dtype=torch.long)
    texts = pad_sequence(texts, padding_value=0)
    labels = torch.tensor(labels).unsqueeze(1)

    seq_lens, sorted_idx = seq_lens.sort(descending=True)
    texts = texts[:,sorted_idx]
    labels = labels[sorted_idx]
    return texts, seq_lens, labels

train_dataset = HOFDataset(train_sentences, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, 
                          shuffle=True, collate_fn=preprocess_batch)

test_dataset = HOFDataset(test_sentences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, 
                         shuffle=False, collate_fn=preprocess_batch)

## Network architecture

In [9]:
def mask_seq(seq_lens):
    mask = torch.zeros((len(seq_lens), max(seq_lens))).bool()
    for i, seq_len in enumerate(seq_lens):
        mask[i, seq_len:] = True
    return mask

class Classifier(Module):
    def __init__(self):
        super(Classifier, self).__init__()
#         self.lstm_dim = lstm_dim
#         self.bidirectional = bidirectional

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.embed.load_state_dict(torch.load(embedding_path))
        self.embed.requires_grad = False
        # self.lstm = nn.LSTM(input_size=embedding_size, 
        #                     hidden_size=lstm_dim, 
        #                     num_layers=2, 
        #                     batch_first=False, 
        #                     dropout=0.5, 
        #                     bidirectional=bidirectional)
        
        # self.fc = nn.Linear(lstm_dim*2 if bidirectional else lstm_dim, 1)
        
        self.attention = nn.MultiheadAttention(#embed_dim=lstm_dim*2 if bidirectional else lstm_dim,
                                               embed_dim=embedding_size,
                                               num_heads=10,
                                               dropout=0.5,) # need to add mask for padding positions

        self.fc = nn.Linear(embedding_size, 1)

    def forward(self, inp, seq_lens):
        out = self.embed(inp)
        # out = pack_padded_sequence(out, seq_lens, batch_first=False, 
                                #    enforce_sorted = True)
        # out, _ = self.lstm(out)
        # out, _ = pad_packed_sequence(out, batch_first=False)

        pad_mask = mask_seq(seq_lens)
        att_out, _ = self.attention(out, out, out)#, key_padding_mask=pad_mask)
        out = F.layer_norm(out + att_out, (out.size(2), ))

        out = self.fc(out)
        out = out.squeeze(2)
        pred = torch.zeros((out.size(1), 1))
        for i, seq_len in enumerate(seq_lens):
            pred[i, 0] = out[:seq_len, i].mean()
        return pred

clf = Classifier().to(device)

In [10]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(clf.parameters(), lr=learning_rate)

In [11]:
def predict_test():
    losses = 0
    acc_cnt = 0
    cnt = 0
    preds = []
    true_labels = []
    clf.eval()
    for texts, seq_lens, labels in test_loader:
        pred = clf(texts.to(device), seq_lens).detach().to('cpu')
        loss = criterion(pred, labels)
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred > 0) == (labels > 0)).item()
        preds.extend(pred.view(-1))
        true_labels.extend(labels.view(-1))
        cnt += texts.size(1)
    
    preds = np.array(preds) > 0
    macro_f1 = f1_score(true_labels, preds, average='macro')
    weighted_f1 = f1_score(true_labels, preds, average='weighted')
    return losses / cnt, acc_cnt / cnt, macro_f1, weighted_f1

In [12]:
list_test_acc = []
early_stop = 5

for epoch in range(1, epochs + 1):
    losses = 0.
    acc_cnt = 0
    cnt = 0
    clf.train()
    for texts, seq_lens, labels in tqdm(train_loader):
        optimizer.zero_grad()
        pred = clf(texts.to(device), seq_lens)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(texts)
        acc_cnt += sum((pred.to('cpu') > 0) == (labels > 0)).item()
        cnt += texts.size(1)

    epoch_loss = losses / cnt
    epoch_acc = acc_cnt / cnt
    test_loss, test_acc, test_macro_f1, test_weighted_f1 = predict_test()
    print(f'Epoch {epoch:2}: Train loss: {epoch_loss:.4f}, acc: {epoch_acc:.4f}. '
        f'Test loss: {test_loss:.4f}, acc: {test_acc:.4f}, '
        f'macro_f1: {test_macro_f1:.4f}, weighted_f1: {test_weighted_f1:.4f}',
        flush=True)

    list_test_acc.append(test_acc)
    if len(list_test_acc) > early_stop and max(list_test_acc[-early_stop:]) <= max(list_test_acc[:-early_stop]):
        print(f'Early stopping: test accuracy does not increase after {early_stop} epochs')
        break

100%|██████████| 146/146 [00:05<00:00, 26.87it/s]


Epoch  1: Train loss: 0.8894, acc: 0.5239. Test loss: 0.8302, acc: 0.5250, macro_f1: 0.5067, weighted_f1: 0.4989


100%|██████████| 146/146 [00:06<00:00, 23.47it/s]


Epoch  2: Train loss: 0.8766, acc: 0.5666. Test loss: 0.8228, acc: 0.5319, macro_f1: 0.4957, weighted_f1: 0.4846


100%|██████████| 146/146 [00:05<00:00, 25.40it/s]


Epoch  3: Train loss: 0.8620, acc: 0.5947. Test loss: 0.8181, acc: 0.5296, macro_f1: 0.4781, weighted_f1: 0.4647


100%|██████████| 146/146 [00:05<00:00, 25.54it/s]


Epoch  4: Train loss: 0.8314, acc: 0.6416. Test loss: 0.7809, acc: 0.6411, macro_f1: 0.6410, weighted_f1: 0.6406


100%|██████████| 146/146 [00:05<00:00, 25.38it/s]


Epoch  5: Train loss: 0.7938, acc: 0.6794. Test loss: 0.7639, acc: 0.6442, macro_f1: 0.6441, weighted_f1: 0.6439


100%|██████████| 146/146 [00:05<00:00, 25.62it/s]


Epoch  6: Train loss: 0.7497, acc: 0.7156. Test loss: 0.7638, acc: 0.6396, macro_f1: 0.6393, weighted_f1: 0.6384


100%|██████████| 146/146 [00:05<00:00, 25.22it/s]


Epoch  7: Train loss: 0.6973, acc: 0.7341. Test loss: 0.7848, acc: 0.6305, macro_f1: 0.6301, weighted_f1: 0.6290


100%|██████████| 146/146 [00:06<00:00, 23.26it/s]


Epoch  8: Train loss: 0.6233, acc: 0.7707. Test loss: 0.8321, acc: 0.6282, macro_f1: 0.6281, weighted_f1: 0.6276


100%|██████████| 146/146 [00:05<00:00, 25.74it/s]


Epoch  9: Train loss: 0.5415, acc: 0.8119. Test loss: 0.9339, acc: 0.5888, macro_f1: 0.5860, weighted_f1: 0.5832


100%|██████████| 146/146 [00:05<00:00, 25.42it/s]


Epoch 10: Train loss: 0.4676, acc: 0.8430. Test loss: 1.0243, acc: 0.5759, macro_f1: 0.5745, weighted_f1: 0.5725
Early stopping: test accuracy does not increase after 5 epochs


In [13]:
len(test_labels), sum(test_labels == 0) / len(test_labels)

(1318, 0.5409711684370258)

In [14]:
# The following setting got test acc: 0.6586
# embedding_path = '../Task_1/save/embedding_weights_hi_30_epoch_300_dim_5_wsize.pt'
# embedding_size = 300
# att_dim = 150
# # lstm_dim = 100
# # lstm_bidirectional = True
# learning_rate = 1e-4
# batch_size = 16
# epochs = 20
# self.attention = nn.MultiheadAttention(#embed_dim=lstm_dim*2 if bidirectional else lstm_dim,
#                                        embed_dim=embedding_size,
#                                        num_heads=10,
#                                        dropout=0.5,) # need to add mask for padding positions