# Task 2

In [28]:
# Imports
import re
import string
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords

device = 'cuda'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

LANGUAGE = 'hi'

In [29]:
embedding_path = 'save/embedding_weights_hi_30_epoch_100_dim_10_wsize.pt'
embedding_size = 100
lstm_dim = 50
batch_size = 1
epochs = 3

## Load and preprocess data

In [30]:
train_data = pd.read_csv('data/hindi_hatespeech.tsv', sep='\t')
print('train:')
display(train_data.head())

train_sentences = train_data['text'].to_numpy()
train_labels = train_data['task_1'].to_numpy()
train_labels[train_labels=='NOT'] = 0
train_labels[train_labels=='HOF'] = 1
train_labels = train_labels.astype(int)

test_data = pd.read_csv('data/hasoc2019_hi_test_gold_2919.tsv', sep='\t')
print('test:')
display(test_data.head())

test_sentences = test_data['text'].to_numpy()
test_labels = test_data['task_1'].to_numpy()
test_labels[test_labels=='NOT'] = 0
test_labels[test_labels=='HOF'] = 1
test_labels = test_labels.astype(int)

train:


Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_hi_5556,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT,NONE,NONE
1,hasoc_hi_5648,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF,PRFN,UNT
2,hasoc_hi_164,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF,PRFN,TIN
3,hasoc_hi_3530,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT,NONE,NONE
4,hasoc_hi_5206,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT,NONE,NONE


test:


Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_hi_5061,"वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध...",NOT,NONE,NONE
1,hasoc_hi_2090,#कांग्रेस के इस #कमीने की #करतूत को देखिए देश ...,HOF,OFFN,TIN
2,hasoc_hi_2960,पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...,HOF,OFFN,TIN
3,hasoc_hi_864,जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...,NOT,NONE,NONE
4,hasoc_hi_54,नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...,NOT,NONE,NONE


In [31]:
def preprocess_texts(sentences):
    # remove user taggings
    user_tag_pattern = re.compile(r'\@\w*')
    sentences = [re.sub(user_tag_pattern, ' ', sentence) for sentence in sentences]
    # lower case
    sentences = [sentence.lower() for sentence in sentences]
    # remove punctuations
    http_re = re.compile('http://[^ ]*')
    https_re = re.compile('https://[^ ]*')
    punctuation = string.punctuation[:2] + string.punctuation[3:]
    translator = str.maketrans(punctuation, ' '*len(punctuation))
    def clean(s):
        s = re.sub(http_re, ' ', s)
        s = re.sub(https_re, ' ', s)
        s = s.translate(translator)
        return s

    sentences = [clean(sentence) for sentence in sentences]
    # remove number ?
    
    # remove stopwords
    if LANGUAGE == 'hi':
        stopwords = ['अंदर', 'अत', 'अदि', 'अप', 'अपना', 'अपनि', 'अपनी', 'अपने', 'अभि', 'अभी', 'आदि', 
                     'आप', 'इंहिं', 'इंहें', 'इंहों', 'इतयादि', 'इत्यादि', 'इन', 'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 
                     'इस', 'इसका', 'इसकि', 'इसकी', 'इसके', 'इसमें', 'इसि', 'इसी', 'इसे', 'उंहिं', 'उंहें', 
                     'उंहों', 'उन', 'उनका', 'उनकि', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 
                     'उसके', 'उसि', 'उसी', 'उसे', 'एक', 'एवं', 'एस', 'एसे', 'ऐसे', 'ओर', 'और', 'कइ', 
                     'कई', 'कर', 'करता', 'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफि', 
                     'काफ़ी', 'कि', 'किंहें', 'किंहों', 'कितना', 'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 
                     'किसि', 'किसी', 'किसे', 'की', 'कुछ', 'कुल', 'के', 'को', 'कोइ', 'कोई', 'कोन', 
                     'कोनसा', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जहां', 'जा', 'जिंहें', 'जिंहों', 
                     'जितना', 'जिधर', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जेसा', 'जेसे', 
                     'जैसा', 'जैसे', 'जो', 'तक', 'तब', 'तरह', 'तिंहें', 'तिंहों', 'तिन', 'तिन्हें', 'तिन्हों', 
                     'तिस', 'तिसे', 'तो', 'था', 'थि', 'थी', 'थे', 'दबारा', 'दवारा', 'दिया', 'दुसरा', 'दुसरे', 
                     'दूसरे', 'दो', 'द्वारा', 'न', 'नहिं', 'नहीं', 'ना', 'निचे', 'निहायत', 'नीचे', 'ने', 'पर', 
                     'पहले', 'पुरा', 'पूरा', 'पे', 'फिर', 'बनि', 'बनी', 'बहि', 'बही', 'बहुत', 'बाद', 'बाला', 
                     'बिलकुल', 'भि', 'भितर', 'भी', 'भीतर', 'मगर', 'मानो', 'मे', 'में', 'यदि', 'यह', 'यहाँ', 
                     'यहां', 'यहि', 'यही', 'या', 'यिह', 'ये', 'रखें', 'रवासा', 'रहा', 'रहे', 'ऱ्वासा', 'लिए', 
                     'लिये', 'लेकिन', 'व', 'वगेरह', 'वरग', 'वर्ग', 'वह', 'वहाँ', 'वहां', 'वहिं', 'वहीं', 'वाले', 
                     'वुह', 'वे', 'वग़ैरह', 'संग', 'सकता', 'सकते', 'सबसे', 'सभि', 'सभी', 'साथ', 'साबुत', 
                     'साभ', 'सारा', 'से', 'सो', 'हि', 'ही', 'हुअ', 'हुआ', 'हुइ', 'हुई', 'हुए', 'हे', 'हें', 
                     'है', 'हैं', 'हो', 'होता', 'होति', 'होती', 'होते', 'होना', 'होने']
    elif LANGUAGE == 'en':
        stopwords = stopwords.words('english')

    sentences = [[word for word in sentence.split() if word not in stopwords] for sentence in sentences]
    
    return sentences

train_sentences = preprocess_texts(train_sentences)
test_sentences = preprocess_texts(test_sentences)

In [32]:
# vocab_size and word->id and id->word
flattened_words = [word for sentence in train_sentences for word in sentence]
V = list(set(flattened_words))
vocab_size = len(V)
print(f'vocab_size: {vocab_size}')

word_to_int = {}
int_to_word = {}
for i, word in enumerate(V):
    word_to_int[word] = i
    int_to_word[i] = word

vocab_size: 19580


In [33]:
train_sentences = [[word_to_int[word] for word in sentence] for sentence in train_sentences]

test_sentences = [[word_to_int[word] for word in sentence if word in word_to_int] for sentence in test_sentences]
print('Number of empty test sentences: ', sum([len(s) == 0 for s in test_sentences]))

Number of empty test sentences:  0


## Build datasets

In [7]:
class HOFDataset(Dataset):
    def __init__(self, sentences, labels):
        self.data = []
        for sentence, label in zip(sentences, labels):
            self.data.append(
                (torch.tensor(sentence, dtype=torch.long), 
                 torch.tensor(label, dtype=torch.long))
            )
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    

train_dataset = HOFDataset(train_sentences, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = HOFDataset(test_sentences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Network architecture

In [8]:
class Classifier(Module):
    def __init__(self, lstm_dim):
        super(Classifier, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.embed.load_state_dict(torch.load(embedding_path))
        self.embed.requires_grad = False
        
        self.lstm = nn.LSTM(embedding_size, lstm_dim)
        self.fc = nn.Linear(lstm_dim, 1)
    
    def forward(self, inp):
        out = self.embed(inp)
        out, _ = self.lstm(out)
        out = self.fc(out)
        return out

clf = Classifier(lstm_dim=lstm_dim).to(device)

  return torch._C._cuda_getDeviceCount() > 0


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(clf.parameters())

In [None]:
for epoch in range(1, epochs + 1):
    
    losses = 0.
    cnt = 0
    clf.train()
    for texts, labels in tqdm(train_loader):
        optimizer.zero_grad()
        pred = clf(texts.to(device))
        loss = criterion(pred, labels.to(device))
        loss.backward()
        optimizer.step()
        losses += loss.detach().item() * len(texts)
        cnt += len(texts)

    epoch_loss = losses / cnt
    print(f'Epoch {epoch:2}: training loss: {epoch_loss:.4f} over {cnt} training points.')   
    

In [12]:
len(train_sentences)

4665

In [11]:
tr_len = np.array([len(s) for s in train_sentences])
sum(tr_len == 0)

2

In [13]:
train_sentences = train_sentences[tr_len != 0]
len(train_sentences)

TypeError: only integer scalar arrays can be converted to a scalar index

In [14]:
type(train_sentences)

list

In [27]:
np.where(tr_len == 0)[0]

array([ 428, 1375])

In [25]:
train_sentences[list(np.where(tr_len != 0)[0])]

TypeError: list indices must be integers or slices, not list

In [26]:
z = torch.tensor(train_sentences)

ValueError: expected sequence of length 9 at dim 1 (got 11)

In [35]:
ben_data = pd.read_csv('data/bengali_hatespeech.csv')

In [36]:
ben_data

Unnamed: 0,sentence,hate,category
0,যত্তসব পাপন শালার ফাজলামী!!!!!,1,sports
1,পাপন শালা রে রিমান্ডে নেওয়া দরকার,1,sports
2,জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...,1,sports
3,শালা লুচ্চা দেখতে পাঠার মত দেখা যায়,1,sports
4,তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব,1,sports
...,...,...,...
29995,আমার মনে হচ্ছে মেনে নেয়া উচিত,0,"Meme, TikTok and others"
29996,আমি ধন্যবাদ জানাই আইনপসাসনকে,0,"Meme, TikTok and others"
29997,কাসমির কাসমিরই নিজশ্যই সাদিন হওয়ার দরকার,0,"Meme, TikTok and others"
29998,কলমি পিলিজ আপু মনি অনেক কিওট লাগছে,0,"Meme, TikTok and others"


In [38]:
ben_sentences = ben_data['sentence'].to_numpy()
ben_sentences

array(['যত্তসব পাপন শালার ফাজলামী!!!!!',
       'পাপন শালা রে রিমান্ডে নেওয়া দরকার',
       'জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা একটা দেশের মানুষ কোনো দিন ও ভাবতে পারেনি।ধন্যবাদ তাহসিন ভাই।',
       ..., 'কাসমির কাসমিরই নিজশ্যই সাদিন হওয়ার দরকার',
       'কলমি পিলিজ আপু মনি অনেক কিওট লাগছে', 'আমি পাকিস্তান এর সাথে জড়িত'],
      dtype=object)

In [39]:
ben = preprocess_texts(ben_sentences)

In [41]:
ben_vocab = list(set(word for sentence in ben for word in sentence))

In [43]:
len(ben_vocab)

57179

In [51]:
ben_data.describe()

Unnamed: 0,hate
count,30000.0
mean,0.333333
std,0.471412
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0
