# Task 3: preprocessing

In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cpu'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)
torch.cuda.manual_seed(123)

# Load Bengali datasets

In [2]:
# save sample Bengali datasets
bengali_train_df = pd.read_csv('../../Task_2/hindi_bengali/save/bengali_hatespeech_sample_train.csv')
bengali_test_df = pd.read_csv('../../Task_2/hindi_bengali/save/bengali_hatespeech_sample_test.csv')
bengali_other_df = pd.read_csv('../../Task_2/hindi_bengali/save/bengali_hatespeech_other.csv')

# Preprocessing

In [3]:
train_sentences = bengali_train_df['sentence']
test_sentences = bengali_test_df['sentence']
other_sentences = bengali_other_df['sentence']

In [4]:
# remove user taggings
user_tag_pattern = re.compile(r'\@\w*')
def remove_tagging(sentence):
    return re.sub(user_tag_pattern, ' ', sentence)

# remove punctuations and urls
http_re = re.compile('http://[^ ]*')
https_re = re.compile('https://[^ ]*')
punctuation = string.punctuation[:2] + string.punctuation[3:]
translator = str.maketrans(punctuation, ' '*len(punctuation))
def remove_punc_and_urls(s):
    s = re.sub(http_re, ' ', s)
    s = re.sub(https_re, ' ', s)
    s = s.translate(translator)
    return s

# substitute numbers
#   when there is a number in the string:
#   if that number is 0 or 1 or 2, then there is no change.
#   else, that number is substituted by a word describing how many digits it has.
def substitute_number(x):
    x = x.group(0)
    if x in {'0', '1', '2'}:
        return x
    return '{}_digits_number'.format(len(x))

# stopwords BENGALI (source: https://www.ranks.nl/stopwords/bengali)
stopwords = ['‡¶Ö‡¶¨‡¶∂‡ßç‡¶Ø', '‡¶Ö‡¶®‡ßá‡¶ï', '‡¶Ö‡¶®‡ßá‡¶ï‡ßá', '‡¶Ö‡¶®‡ßá‡¶ï‡ßá‡¶á', '‡¶Ö‡¶®‡ßç‡¶§‡¶§', '‡¶Ö‡¶•‡¶¨‡¶æ', '‡¶Ö‡¶•‡¶ö', '‡¶Ö‡¶∞‡ßç‡¶•‡¶æ‡¶§', '‡¶Ö‡¶®‡ßç‡¶Ø', '‡¶Ü‡¶ú', '‡¶Ü‡¶õ‡ßá', '‡¶Ü‡¶™‡¶®‡¶æ‡¶∞', 
             '‡¶Ü‡¶™‡¶®‡¶ø', '‡¶Ü‡¶¨‡¶æ‡¶∞', '‡¶Ü‡¶Æ‡¶∞‡¶æ', '‡¶Ü‡¶Æ‡¶æ‡¶ï‡ßá', '‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞', '‡¶Ü‡¶Æ‡¶æ‡¶∞', '‡¶Ü‡¶Æ‡¶ø', '‡¶Ü‡¶∞‡¶ì', '‡¶Ü‡¶∞', '‡¶Ü‡¶ó‡ßá', '‡¶Ü‡¶ó‡ßá‡¶á', '‡¶Ü‡¶á', 
             '‡¶Ö‡¶§‡¶è‡¶¨', '‡¶Ü‡¶ó‡¶æ‡¶Æ‡ßÄ', '‡¶Ö‡¶¨‡¶ß‡¶ø', '‡¶Ö‡¶®‡ßÅ‡¶Ø‡¶æ‡ßü‡ßÄ', '‡¶Ü‡¶¶‡ßç‡¶Ø‡¶≠‡¶æ‡¶ó‡ßá', '‡¶è‡¶á', '‡¶è‡¶ï‡¶á', '‡¶è‡¶ï‡ßá', '‡¶è‡¶ï‡¶ü‡¶ø', '‡¶è‡¶ñ‡¶®', '‡¶è‡¶ñ‡¶®‡¶ì', '‡¶è‡¶ñ‡¶æ‡¶®‡ßá', 
             '‡¶è‡¶ñ‡¶æ‡¶®‡ßá‡¶á', '‡¶è‡¶ü‡¶ø', '‡¶è‡¶ü‡¶æ', '‡¶è‡¶ü‡¶æ‡¶á', '‡¶è‡¶§‡¶ü‡¶æ‡¶á', '‡¶è‡¶¨‡¶Ç', '‡¶è‡¶ï‡¶¨‡¶æ‡¶∞', '‡¶è‡¶¨‡¶æ‡¶∞', '‡¶è‡¶¶‡ßá‡¶∞', '‡¶è‡¶Å‡¶¶‡ßá‡¶∞', '‡¶è‡¶Æ‡¶®', '‡¶è‡¶Æ‡¶®‡¶ï‡ßÄ', '‡¶è‡¶≤', 
             '‡¶è‡¶∞', '‡¶è‡¶∞‡¶æ', '‡¶è‡¶Å‡¶∞‡¶æ', '‡¶è‡¶∏', '‡¶è‡¶§', '‡¶è‡¶§‡ßá', '‡¶è‡¶∏‡ßá', '‡¶è‡¶ï‡ßá', '‡¶è', '‡¶ê', ' ‡¶á', '‡¶á‡¶π‡¶æ', '‡¶á‡¶§‡ßç‡¶Ø‡¶æ‡¶¶‡¶ø', '‡¶â‡¶®‡¶ø', '‡¶â‡¶™‡¶∞', 
             '‡¶â‡¶™‡¶∞‡ßá', '‡¶â‡¶ö‡¶ø‡¶§', '‡¶ì', '‡¶ì‡¶á', '‡¶ì‡¶∞', '‡¶ì‡¶∞‡¶æ', '‡¶ì‡¶Å‡¶∞', '‡¶ì‡¶Å‡¶∞‡¶æ', '‡¶ì‡¶ï‡ßá', '‡¶ì‡¶¶‡ßá‡¶∞', '‡¶ì‡¶Å‡¶¶‡ßá‡¶∞', '‡¶ì‡¶ñ‡¶æ‡¶®‡ßá', '‡¶ï‡¶§', '‡¶ï‡¶¨‡ßá', 
             '‡¶ï‡¶∞‡¶§‡ßá', '‡¶ï‡ßü‡ßá‡¶ï', '‡¶ï‡ßü‡ßá‡¶ï‡¶ü‡¶ø', '‡¶ï‡¶∞‡¶¨‡ßá', '‡¶ï‡¶∞‡¶≤‡ßá‡¶®', '‡¶ï‡¶∞‡¶æ‡¶∞', '‡¶ï‡¶æ‡¶∞‡¶ì', '‡¶ï‡¶∞‡¶æ', '‡¶ï‡¶∞‡¶ø', '‡¶ï‡¶∞‡¶ø‡ßü‡ßá', '‡¶ï‡¶∞‡¶æ‡¶∞', '‡¶ï‡¶∞‡¶æ‡¶á', 
             '‡¶ï‡¶∞‡¶≤‡ßá', '‡¶ï‡¶∞‡¶≤‡ßá‡¶®', '‡¶ï‡¶∞‡¶ø‡¶§‡ßá', '‡¶ï‡¶∞‡¶ø‡ßü‡¶æ', '‡¶ï‡¶∞‡ßá‡¶õ‡¶ø‡¶≤‡ßá‡¶®', '‡¶ï‡¶∞‡¶õ‡ßá', '‡¶ï‡¶∞‡¶õ‡ßá‡¶®', '‡¶ï‡¶∞‡ßá‡¶õ‡ßá‡¶®', '‡¶ï‡¶∞‡ßá‡¶õ‡ßá', '‡¶ï‡¶∞‡ßá‡¶®', '‡¶ï‡¶∞‡¶¨‡ßá‡¶®', 
             '‡¶ï‡¶∞‡¶æ‡ßü', '‡¶ï‡¶∞‡ßá', '‡¶ï‡¶∞‡ßá‡¶á', '‡¶ï‡¶æ‡¶õ', '‡¶ï‡¶æ‡¶õ‡ßá', '‡¶ï‡¶æ‡¶ú‡ßá', '‡¶ï‡¶æ‡¶∞‡¶£', '‡¶ï‡¶ø‡¶õ‡ßÅ', '‡¶ï‡¶ø‡¶õ‡ßÅ‡¶á', '‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ', '‡¶ï‡¶ø‡¶Ç‡¶¨‡¶æ', '‡¶ï‡¶ø', '‡¶ï‡ßÄ', '‡¶ï‡ßá‡¶â', 
             '‡¶ï‡ßá‡¶â‡¶á', '‡¶ï‡¶æ‡¶â‡¶ï‡ßá', '‡¶ï‡ßá‡¶®', '‡¶ï‡ßá', '‡¶ï‡ßã‡¶®‡¶ì', '‡¶ï‡ßã‡¶®‡ßã', '‡¶ï‡ßã‡¶®', '‡¶ï‡¶ñ‡¶®‡¶ì', '‡¶ï‡ßç‡¶∑‡ßá‡¶§‡ßç‡¶∞‡ßá', '‡¶ñ‡ßÅ‡¶¨	‡¶ó‡ßÅ‡¶≤‡¶ø', '‡¶ó‡¶ø‡ßü‡ßá', '‡¶ó‡¶ø‡ßü‡ßá‡¶õ‡ßá', 
             '‡¶ó‡ßá‡¶õ‡ßá', '‡¶ó‡ßá‡¶≤', '‡¶ó‡ßá‡¶≤‡ßá', '‡¶ó‡ßã‡¶ü‡¶æ', '‡¶ö‡¶≤‡ßá', '‡¶õ‡¶æ‡ßú‡¶æ', '‡¶õ‡¶æ‡ßú‡¶æ‡¶ì', '‡¶õ‡¶ø‡¶≤‡ßá‡¶®', '‡¶õ‡¶ø‡¶≤', '‡¶ú‡¶®‡ßç‡¶Ø', '‡¶ú‡¶æ‡¶®‡¶æ', '‡¶†‡¶ø‡¶ï', '‡¶§‡¶ø‡¶®‡¶ø', 
             '‡¶§‡¶ø‡¶®‡¶ê', '‡¶§‡¶ø‡¶®‡¶ø‡¶ì', '‡¶§‡¶ñ‡¶®', '‡¶§‡¶¨‡ßá', '‡¶§‡¶¨‡ßÅ', '‡¶§‡¶æ‡¶Å‡¶¶‡ßá‡¶∞', '‡¶§‡¶æ‡¶Å‡¶æ‡¶π‡¶æ‡¶∞‡¶æ', '‡¶§‡¶æ‡¶Å‡¶∞‡¶æ', '‡¶§‡¶æ‡¶Å‡¶∞', '‡¶§‡¶æ‡¶Å‡¶ï‡ßá', '‡¶§‡¶æ‡¶á', '‡¶§‡ßá‡¶Æ‡¶®', '‡¶§‡¶æ‡¶ï‡ßá', 
             '‡¶§‡¶æ‡¶π‡¶æ', '‡¶§‡¶æ‡¶π‡¶æ‡¶§‡ßá', '‡¶§‡¶æ‡¶π‡¶æ‡¶∞', '‡¶§‡¶æ‡¶¶‡ßá‡¶∞', '‡¶§‡¶æ‡¶∞‡¶™‡¶∞', '‡¶§‡¶æ‡¶∞‡¶æ', '‡¶§‡¶æ‡¶∞‡ßà', '‡¶§‡¶æ‡¶∞', '‡¶§‡¶æ‡¶π‡¶≤‡ßá', '‡¶§‡¶ø‡¶®‡¶ø', '‡¶§‡¶æ', '‡¶§‡¶æ‡¶ì', '‡¶§‡¶æ‡¶§‡ßá', 
             '‡¶§‡ßã', '‡¶§‡¶§', '‡¶§‡ßÅ‡¶Æ‡¶ø', '‡¶§‡ßã‡¶Æ‡¶æ‡¶∞', '‡¶§‡¶•‡¶æ', '‡¶•‡¶æ‡¶ï‡ßá', '‡¶•‡¶æ‡¶ï‡¶æ', '‡¶•‡¶æ‡¶ï‡¶æ‡ßü', '‡¶•‡ßá‡¶ï‡ßá', '‡¶•‡ßá‡¶ï‡ßá‡¶ì', '‡¶•‡¶æ‡¶ï‡¶¨‡ßá', '‡¶•‡¶æ‡¶ï‡ßá‡¶®', '‡¶•‡¶æ‡¶ï‡¶¨‡ßá‡¶®', 
             '‡¶•‡ßá‡¶ï‡ßá‡¶á', '‡¶¶‡¶ø‡¶ï‡ßá', '‡¶¶‡¶ø‡¶§‡ßá', '‡¶¶‡¶ø‡ßü‡ßá', '‡¶¶‡¶ø‡ßü‡ßá‡¶õ‡ßá', '‡¶¶‡¶ø‡ßü‡ßá‡¶õ‡ßá‡¶®', '‡¶¶‡¶ø‡¶≤‡ßá‡¶®', '‡¶¶‡ßÅ', '‡¶¶‡ßÅ‡¶ü‡¶ø', '‡¶¶‡ßÅ‡¶ü‡ßã', '‡¶¶‡ßá‡ßü', '‡¶¶‡ßá‡¶ì‡ßü‡¶æ', '‡¶¶‡ßá‡¶ì‡ßü‡¶æ‡¶∞', 
             '‡¶¶‡ßá‡¶ñ‡¶æ', '‡¶¶‡ßá‡¶ñ‡ßá', '‡¶¶‡ßá‡¶ñ‡¶§‡ßá', '‡¶¶‡ßç‡¶¨‡¶æ‡¶∞‡¶æ', '‡¶ß‡¶∞‡ßá', '‡¶ß‡¶∞‡¶æ', '‡¶®‡ßü', '‡¶®‡¶æ‡¶®‡¶æ', '‡¶®‡¶æ', '‡¶®‡¶æ‡¶ï‡¶ø', '‡¶®‡¶æ‡¶ó‡¶æ‡¶¶', '‡¶®‡¶ø‡¶§‡ßá', '‡¶®‡¶ø‡¶ú‡ßá', '‡¶®‡¶ø‡¶ú‡ßá‡¶á', 
             '‡¶®‡¶ø‡¶ú‡ßá‡¶∞', '‡¶®‡¶ø‡¶ú‡ßá‡¶¶‡ßá‡¶∞', '‡¶®‡¶ø‡ßü‡ßá', '‡¶®‡ßá‡¶ì‡ßü‡¶æ', '‡¶®‡ßá‡¶ì‡ßü‡¶æ‡¶∞', '‡¶®‡ßá‡¶á', '‡¶®‡¶æ‡¶á', '‡¶™‡¶ï‡ßç‡¶∑‡ßá', '‡¶™‡¶∞‡ßç‡¶Ø‡¶®‡ßç‡¶§', '‡¶™‡¶æ‡¶ì‡ßü‡¶æ', '‡¶™‡¶æ‡¶∞‡ßá‡¶®', '‡¶™‡¶æ‡¶∞‡¶ø', '‡¶™‡¶æ‡¶∞‡ßá', 
             '‡¶™‡¶∞‡ßá', '‡¶™‡¶∞‡ßá‡¶á', '‡¶™‡¶∞‡ßá‡¶ì', '‡¶™‡¶∞', '‡¶™‡ßá‡ßü‡ßá', '‡¶™‡ßç‡¶∞‡¶§‡¶ø', '‡¶™‡ßç‡¶∞‡¶≠‡ßÉ‡¶§‡¶ø', '‡¶™‡ßç‡¶∞‡¶æ‡ßü', '‡¶´‡ßá‡¶∞', '‡¶´‡¶≤‡ßá', '‡¶´‡¶ø‡¶∞‡ßá', '‡¶¨‡ßç‡¶Ø‡¶¨‡¶π‡¶æ‡¶∞', '‡¶¨‡¶≤‡¶§‡ßá', 
             '‡¶¨‡¶≤‡¶≤‡ßá‡¶®', '‡¶¨‡¶≤‡ßá‡¶õ‡ßá‡¶®', '‡¶¨‡¶≤‡¶≤', '‡¶¨‡¶≤‡¶æ', '‡¶¨‡¶≤‡ßá‡¶®', '‡¶¨‡¶≤‡ßá', '‡¶¨‡¶π‡ßÅ', '‡¶¨‡¶∏‡ßá', '‡¶¨‡¶æ‡¶∞', '‡¶¨‡¶æ', '‡¶¨‡¶ø‡¶®‡¶æ', '‡¶¨‡¶∞‡¶Ç', '‡¶¨‡¶¶‡¶≤‡ßá', '‡¶¨‡¶æ‡¶¶‡ßá', 
             '‡¶¨‡¶æ‡¶∞', '‡¶¨‡¶ø‡¶∂‡ßá‡¶∑', '‡¶¨‡¶ø‡¶≠‡¶ø‡¶®‡ßç‡¶®	‡¶¨‡¶ø‡¶∑‡ßü‡¶ü‡¶ø', '‡¶¨‡ßç‡¶Ø‡¶¨‡¶π‡¶æ‡¶∞', '‡¶¨‡ßç‡¶Ø‡¶æ‡¶™‡¶æ‡¶∞‡ßá', '‡¶≠‡¶æ‡¶¨‡ßá', '‡¶≠‡¶æ‡¶¨‡ßá‡¶á', '‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá', '‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá‡¶á', '‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá‡¶ì', '‡¶Æ‡¶ß‡ßç‡¶Ø‡¶≠‡¶æ‡¶ó‡ßá', 
             '‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá', '‡¶Æ‡¶æ‡¶§‡ßç‡¶∞', '‡¶Æ‡¶§‡ßã', '‡¶Æ‡¶§‡ßã‡¶á', '‡¶Æ‡ßã‡¶ü‡ßá‡¶á', '‡¶Ø‡¶ñ‡¶®', '‡¶Ø‡¶¶‡¶ø', '‡¶Ø‡¶¶‡¶ø‡¶ì', '‡¶Ø‡¶æ‡¶¨‡ßá', '‡¶Ø‡¶æ‡ßü', '‡¶Ø‡¶æ‡¶ï‡ßá', '‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ', '‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ‡¶∞', 
             '‡¶Ø‡¶§', '‡¶Ø‡¶§‡¶ü‡¶æ', '‡¶Ø‡¶æ', '‡¶Ø‡¶æ‡¶∞', '‡¶Ø‡¶æ‡¶∞‡¶æ', '‡¶Ø‡¶æ‡¶Å‡¶∞', '‡¶Ø‡¶æ‡¶Å‡¶∞‡¶æ', '‡¶Ø‡¶æ‡¶¶‡ßá‡¶∞', '‡¶Ø‡¶æ‡¶®', '‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßá', '‡¶Ø‡ßá‡¶§‡ßá', '‡¶Ø‡¶æ‡¶§‡ßá', '‡¶Ø‡ßá‡¶®', '‡¶Ø‡ßá‡¶Æ‡¶®', 
             '‡¶Ø‡ßá‡¶ñ‡¶æ‡¶®‡ßá', '‡¶Ø‡¶ø‡¶®‡¶ø', '‡¶Ø‡ßá', '‡¶∞‡ßá‡¶ñ‡ßá', '‡¶∞‡¶æ‡¶ñ‡¶æ', '‡¶∞‡ßü‡ßá‡¶õ‡ßá', '‡¶∞‡¶ï‡¶Æ', '‡¶∂‡ßÅ‡¶ß‡ßÅ', '‡¶∏‡¶ô‡ßç‡¶ó‡ßá', '‡¶∏‡¶ô‡ßç‡¶ó‡ßá‡¶ì', '‡¶∏‡¶Æ‡¶∏‡ßç‡¶§', '‡¶∏‡¶¨', '‡¶∏‡¶¨‡¶æ‡¶∞', '‡¶∏‡¶π', 
             '‡¶∏‡ßÅ‡¶§‡¶∞‡¶æ‡¶Ç', '‡¶∏‡¶π‡¶ø‡¶§', '‡¶∏‡ßá‡¶á', '‡¶∏‡ßá‡¶ü‡¶æ', '‡¶∏‡ßá‡¶ü‡¶ø', '‡¶∏‡ßá‡¶ü‡¶æ‡¶á', '‡¶∏‡ßá‡¶ü‡¶æ‡¶ì', '‡¶∏‡¶Æ‡ßç‡¶™‡ßç‡¶∞‡¶§‡¶ø', '‡¶∏‡ßá‡¶ñ‡¶æ‡¶®', '‡¶∏‡ßá‡¶ñ‡¶æ‡¶®‡ßá', '‡¶∏‡ßá', '‡¶∏‡ßç‡¶™‡¶∑‡ßç‡¶ü', '‡¶∏‡ßç‡¶¨‡ßü‡¶Ç', 
             '‡¶π‡¶á‡¶§‡ßá', '‡¶π‡¶á‡¶¨‡ßá', '‡¶π‡ßà‡¶≤‡ßá', '‡¶π‡¶á‡ßü‡¶æ', '‡¶π‡¶ö‡ßç‡¶õ‡ßá', '‡¶π‡¶§', '‡¶π‡¶§‡ßá', '‡¶π‡¶§‡ßá‡¶á', '‡¶π‡¶¨‡ßá', '‡¶π‡¶¨‡ßá‡¶®', '‡¶π‡ßü‡ßá‡¶õ‡¶ø‡¶≤', '‡¶π‡ßü‡ßá‡¶õ‡ßá', '‡¶π‡ßü‡ßá‡¶õ‡ßá‡¶®', '‡¶π‡ßü‡ßá', 
             '‡¶π‡ßü‡¶®‡¶ø', '‡¶π‡ßü', '‡¶π‡ßü‡ßá‡¶á', '‡¶π‡ßü‡¶§‡ßã', '‡¶π‡¶≤', '‡¶π‡¶≤‡ßá', '‡¶π‡¶≤‡ßá‡¶á', '‡¶π‡¶≤‡ßá‡¶ì', '‡¶π‡¶≤‡ßã', '‡¶π‡¶ø‡¶∏‡¶æ‡¶¨‡ßá', '‡¶π‡¶ì‡ßü‡¶æ', '‡¶π‡¶ì‡ßü‡¶æ‡¶∞', '‡¶π‡¶ì‡ßü‡¶æ‡ßü', '‡¶π‡¶®', 
             '‡¶π‡ßã‡¶ï', '‡¶ú‡¶®', '‡¶ú‡¶®‡¶ï‡ßá', '‡¶ú‡¶®‡ßá‡¶∞', '‡¶ú‡¶æ‡¶®‡¶§‡ßá', '‡¶ú‡¶æ‡¶®‡¶æ‡ßü', '‡¶ú‡¶æ‡¶®‡¶ø‡ßü‡ßá', '‡¶ú‡¶æ‡¶®‡¶æ‡¶®‡ßã', '‡¶ú‡¶æ‡¶®‡¶ø‡ßü‡ßá‡¶õ‡ßá', '‡¶ú‡¶®‡ßç‡¶Ø', '‡¶ú‡¶®‡ßç‡¶Ø‡¶ì‡¶ú‡ßá', '‡¶ú‡ßá', 
             '‡¶¨‡ßá‡¶∂', '‡¶¶‡ßá‡¶®', '‡¶§‡ßÅ‡¶≤‡ßá', '‡¶õ‡¶ø‡¶≤‡ßá‡¶®', '‡¶ö‡¶æ‡¶®', '‡¶ö‡¶æ‡ßü', '‡¶ö‡ßá‡ßü‡ßá', '‡¶Æ‡ßã‡¶ü', '‡¶Ø‡¶•‡ßá‡¶∑‡ßç‡¶ü', '‡¶ü‡¶ø']

In [5]:
def clean_texts(sentences):
    # tags
    sentences = [remove_tagging(sentence) for sentence in sentences]
    
    # lower case
    sentences = [sentence.lower() for sentence in sentences]
    
    # remove punctuations and urls 
    sentences = [remove_punc_and_urls(sentence) for sentence in sentences]
    
    # substitute numbers
    sentences = [re.sub('\\b[0-9]+\\b', substitute_number, sentence) for sentence in sentences]
    
    # remove stopwords
    sentences = [[word for word in sentence.split() if word not in stopwords] for sentence in sentences]
    
    return sentences

In [6]:
# perform cleaning

train_sentences = clean_texts(train_sentences)
train_texts = [' '.join(l) for l in train_sentences]
bengali_train_df['sentence'] = train_texts

test_sentences = clean_texts(test_sentences)
test_texts = [' '.join(l) for l in test_sentences]
bengali_test_df['sentence'] = test_texts

other_sentences = clean_texts(other_sentences)
other_texts = [' '.join(l) for l in other_sentences]
bengali_other_df['sentence'] = other_texts

In [7]:
print('train:')
display(bengali_train_df.head())
print('test:')
display(bengali_test_df.head())
print('other:')
display(bengali_other_df.head())

train:


Unnamed: 0,sentence,hate,category
0,‡¶∏‡¶Æ‡¶ï‡¶æ‡¶Æ‡ßÄ ‡¶π‡ßÅ‡¶ú‡ßÅ‡¶∞,0,religion
1,‡¶õ‡¶æ‡¶è‡¶≤‡ßÄ‡¶ó ‡¶∏‡¶æ‡¶≤‡¶æ ‡¶¶‡ßá‡¶∞ ‡¶®‡¶ø‡¶∏‡¶ø‡¶¶‡ßç‡¶¶ ‡¶π‡¶ï,1,politics
2,‡¶ï‡¶æ‡¶ì‡ßü‡¶æ ‡¶ó‡¶¶‡¶ø ‡¶õ‡¶æ‡¶∞‡¶≤‡ßá ‡¶¨‡ßÅ‡¶ú‡¶¨‡ßá ‡¶ú‡ßÅ‡¶§‡¶æ ‡¶ï‡ßá‡¶Æ‡¶®‡ßá ‡¶ñ‡¶æ‡ßü,0,politics
3,‡¶ï‡¶æ‡¶â‡ßü‡¶æ ‡¶ï‡¶æ‡¶¶‡ßá‡¶∞ ‡¶¨‡ßú ‡¶Æ‡¶æ‡¶ó‡ßÄ‡¶ñ‡ßã‡¶∞ ‡¶≠‡¶ø‡¶°‡¶ø‡¶ì ‡¶™‡¶ø‡¶ï ‡¶¶‡ßá‡¶ñ‡¶≤‡ßá ‡¶¨‡ßÅ‡¶ù‡¶æ ‡¶≤‡ßÅ...,1,politics
4,‡¶Ö‡¶™‡ßÅ ‡¶≠‡¶æ‡¶≤‡ßã ‡¶ï‡¶•‡¶æ ‡¶õ‡ßã‡¶ü ‡¶ï‡¶∞‡¶¨‡ßá‡¶®‡¶æ,0,"Meme, TikTok and others"


test:


Unnamed: 0,sentence,hate,category
0,‡¶Æ‡¶π‡¶ø‡¶≤‡¶æ‡¶ï‡ßá ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶®‡¶°‡ßá,0,crime
1,‡¶§‡ßÅ‡¶∞ ‡¶∞‡¶ø‡¶™‡¶æ‡¶§‡¶ï‡ßá ‡¶Æ‡¶® ‡¶ö‡¶æ‡¶á‡¶õ‡¶ø‡¶≤ ‡¶õ‡ßá‡¶°‡¶º‡ßá ‡¶ó‡ßá‡¶≤‡¶ø ‡¶∏‡¶æ‡¶•‡ßá ‡¶Æ‡¶ø‡¶§‡ßç‡¶Ø‡ßá ‡¶Ö...,0,crime
2,‡¶π‡ßÅ‡¶Æ‡¶æ‡ßü‡ßÅ‡¶® ‡¶Ü‡¶ú‡¶æ‡¶¶ ‡¶è‡¶§‡ßã ‡¶¨‡ßú ‡¶ï‡ßç‡¶∞‡¶æ‡¶ï ‡¶Æ‡¶æ‡¶§‡¶æ‡¶≤,0,religion
3,‡¶¨‡¶æ‡¶Ç‡¶æ‡¶≤ ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∏‡¶¨‡¶ö‡ßá‡ßü‡ßá ‡¶∏‡¶æ‡¶π‡¶∏‡¶ø ‡¶á‡¶â‡¶ü‡ßÅ‡¶¨‡¶æ‡¶∞üëçüëçüëç,0,politics
4,‡¶ï‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ß‡¶®‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶•‡¶æ ‡¶ö‡ßã‡¶∞ ‡¶ï‡¶æ‡¶¶‡ßá‡¶∞ ‡¶è‡¶ï‡¶ü‡¶æ ‡¶´‡¶æ‡¶ü‡¶æ ‡¶ï‡ßá‡¶∑‡ßç‡¶ü ‡¶ï‡ßã‡¶á ...,1,politics


other:


Unnamed: 0,sentence,hate,category
0,‡¶õ‡¶¨‡¶ø‡¶ü‡¶æ ‡¶ö‡¶ñ‡ßá ‡¶™‡¶æ‡¶®‡¶ø ‡¶¨‡¶æ‡¶≤ ‡¶è‡¶ï‡¶ü‡¶æ ‡¶õ‡¶¨‡¶ø,0,entertainment
1,‡¶™‡¶æ‡¶™‡¶® ‡¶ñ‡¶æ‡¶®‡¶ï‡¶ø‡¶∞ ‡¶™‡ßã‡¶≤‡¶æ ‡¶Æ‡ßá‡¶á‡¶® ‡¶∂‡ßü‡¶§‡¶æ‡¶® ‡¶ì‡¶∞‡ßá ‡¶≤‡¶æ‡¶•‡¶•‡ßÄ ‡¶Æ‡¶æ‡¶á‡¶∞‡¶æ ‡¶¨‡¶ø...,1,sports
2,‡¶™‡¶∞‡¶ø‡¶¨‡ßá‡¶∂‡¶ü‡¶æ ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞ ‡¶°‡¶ø‡¶∏‡¶ø ‡¶ï‡ßá‡¶π ‡¶ó‡¶æ‡¶≤‡¶ø ‡¶°‡¶ø‡¶∏‡¶ø ‡¶∏‡¶∞‡¶æ‡¶∏‡¶∞‡¶ø ‡¶¢‡ßá‡¶≤‡ßá...,0,crime
3,‡¶á‡ßü‡¶æ ‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π ‡¶á‡¶´‡¶§‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶Æ‡ßü ‡¶¶‡ßã‡ßü‡¶æ ‡¶ï‡¶¨‡ßÅ‡¶≤‡ßá‡¶∞ ‡¶∏‡¶Æ‡ßü ‡¶∏‡¶Æ‡ßü ‡¶¶‡ßã‡ßü...,0,religion
4,‡¶Æ‡¶æ‡¶¶‡¶æ‡¶∞‡¶ö‡ßã‡¶¶ ‡¶á‡¶π‡ßÅ‡¶ß‡¶ø‡¶∞ ‡¶¶‡¶æ‡¶≤‡¶æ‡¶≤ ‡¶¨‡¶æ‡¶¨‡¶∞‡¶ø ‡¶Æ‡¶∏‡¶ú‡¶ø‡¶¶ ‡¶≠‡¶æ‡¶Ç‡¶≤‡ßá ‡¶Æ‡¶®‡ßç‡¶ß‡¶ø‡¶∞...,1,"Meme, TikTok and others"


## Vocab and Word <-> int transformation

In [8]:
embed_train_df = pd.concat([bengali_train_df, bengali_other_df])

In [9]:
train_sentences = [sentence.split() for sentence in bengali_train_df['sentence']]
test_sentences = [sentence.split() for sentence in bengali_test_df['sentence']]
embed_train_sentences = [sentence.split() for sentence in embed_train_df['sentence']]

flattened_words = [word for sentence in embed_train_sentences for word in sentence]
V = sorted(list(set(flattened_words)))
vocab_size = len(V)
print(f'vocab_size: {vocab_size}')

word_to_int = {}
int_to_word = {}
for i, word in enumerate(V):
    word_to_int[word] = i
    int_to_word[i] = word

# save dicts for transformation word <-> int
with open('save/word_to_int_dict.json', 'w') as f:
    json.dump(word_to_int, f)
with open('save/int_to_word_dict.json', 'w') as f:
    json.dump(int_to_word, f)    

# save word-counter for sampling
word_counter = Counter(flattened_words)
with open('save/word_counter.json', 'w') as f:
    json.dump(word_counter, f)

vocab_size: 55189


## Save

In [10]:
bengali_train_df.to_csv('save/bengali_hatespeech_sample_train_preprocessed.csv', index=False)
bengali_test_df.to_csv('save/bengali_hatespeech_sample_test_preprocessed.csv', index=False)
bengali_other_df.to_csv('save/bengali_hatespeech_other_preprocessed.csv', index=False)
embed_train_df.to_csv('save/bengali_hatespeech_embed_train_preprocessed.csv', index=False)