In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

import datasets
from transformers import DistilBertTokenizer, DistilBertModel

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, classification_report

from tqdm import tqdm

pd.set_option("display.max_colwidth", None)

In [2]:
dataset = datasets.load_dataset("ucberkeley-dlab/measuring-hate-speech", "binary")
hate_speech_ucb = dataset["train"].to_pandas()

Using custom data configuration ucberkeley-dlab--measuring-hate-speech-f91f636a830ad73c
Reusing dataset parquet (C:\Users\UTKARSH\.cache\huggingface\datasets\parquet\ucberkeley-dlab--measuring-hate-speech-f91f636a830ad73c\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
device = torch.device("cuda:0") if torch.cuda.is_available() else "cpu"
device

device(type='cuda', index=0)

In [4]:
MODEL_NAME = "distilbert-base-uncased"  # "distilbert-base-uncased-finetuned-sst-2-english"
BATCH_SIZE = 16
MAX_LEN = 128
EPOCHS = 10
LEARNING_RATE = 1e-05
TOKENIZER = DistilBertTokenizer.from_pretrained(MODEL_NAME, truncation=True, do_lower_case=True)

In [5]:
hate_speech_ucb.iloc[:, :22].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135556 entries, 0 to 135555
Data columns (total 22 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   comment_id          135556 non-null  int32  
 1   annotator_id        135556 non-null  int32  
 2   platform            135556 non-null  int8   
 3   sentiment           135556 non-null  float64
 4   respect             135556 non-null  float64
 5   insult              135556 non-null  float64
 6   humiliate           135556 non-null  float64
 7   status              135556 non-null  float64
 8   dehumanize          135556 non-null  float64
 9   violence            135556 non-null  float64
 10  genocide            135556 non-null  float64
 11  attack_defend       135556 non-null  float64
 12  hatespeech          135556 non-null  float64
 13  hate_speech_score   135556 non-null  float64
 14  text                135556 non-null  object 
 15  infitms             135556 non-nul

In [6]:
hate_speech_ucb = hate_speech_ucb[["text", "hatespeech"]]
hate_speech_ucb["hatespeech"] = pd.to_numeric(hate_speech_ucb["hatespeech"], downcast="integer")

In [7]:
hate_speech_ucb["hatespeech"].value_counts()

0    80624
2    46021
1     8911
Name: hatespeech, dtype: int64

In [8]:
# # For testing purpose
# hate_speech_ucb = hate_speech_ucb[:5000]

In [9]:
hate_speech_ucb

Unnamed: 0,text,hatespeech
0,"Yes indeed. She sort of reminds me of the elder lady that played the part in the movie ""Titanic"" who was telling her story!!! And I wouldn't have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!",0
1,The trans women reading this tweet right now is beautiful,0
2,"Question: These 4 broads who criticize America, what country did they flee to get here? And now they want to make OUR America like THEIR former HELL HOLE. I don't think so!!!!!!!!!! Let them explain their GRATITUDE for letting them in OUR country.",2
3,It is about time for all illegals to go back to their country of origin and keep our freeway open and prevent heavy traffic.,0
4,For starters bend over the one in pink and kick that ass and pussy to get a taste until she's begging for a dick inside her.,2
...,...,...
135551,عاجل سماحة #السيد_عبدالملك_بدرالدين_الحوثي نصره الله على السعوديةأن تتعلم من عمليةالتاسع برمضان وتوقف الحرب #اليمن_مقبرة_الغزاة #Breaking_News |Sayyed Abdulmalik:The Saudi regime should learn from the 9th of Ramadan operation and stop its aggression and be for peace #Almasirah URL URL,0
135552,"Millions of #Yemen-is participated in mass rallies on 13squares on various governorates, to mark the International Day of Al-Quds #يوم_القدس_العالمي #لا_لصفقة_القرن #لا_لصفقة_ترامب #QudsDay #FreePalestine #IntenationalQudsDay #Palestine #news #media #press URL",0
135553,@AbeShinzo @realDonaldTrump @shinzoabe 独裁者は行きますこれは、滞在しているイランの人々です Dictator goes This is the people of Iran who are staying دیکتاتور میرود.این ما مردم ایران هستیم که میمانیم #NoDeal4TerroristRegime @realDonaldTrump,0
135554,"Millions of #Yemen-is participated in mass rallies on 13squares on various governorates, to mark the International Day of Al-Quds #يوم_القدس_العالمي #لا_لصفقة_القرن #لا_لصفقة_ترامب #QudsDay #FreePalestine #IntenationalQudsDay #Palestine #news #media #press URL",0


In [10]:
class HateDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = OneHotEncoder(sparse=False).fit_transform(np.array(self.data["hatespeech"]).reshape(-1, 1))
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets[index], dtype=torch.float)
        }

In [11]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
val_size = 0.1

train_data = hate_speech_ucb.sample(frac=train_size, random_state=210)
test_data = hate_speech_ucb.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
val_data = test_data.sample(frac=val_size / (1 - train_size), random_state=220).reset_index()
test_data = test_data.drop(val_data.index).reset_index(drop=True)

print(f"FULL Dataset: {hate_speech_ucb.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"VAL Dataset: {val_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

training_set = HateDataset(train_data, TOKENIZER, MAX_LEN)
validation_set = HateDataset(val_data, TOKENIZER, MAX_LEN)
testing_set = HateDataset(test_data, TOKENIZER, MAX_LEN)

FULL Dataset: (135556, 2)
TRAIN Dataset: (108445, 2)
VAL Dataset: (13556, 3)
TEST Dataset: (13555, 2)


In [12]:
train_params = {
    "batch_size": BATCH_SIZE,
    "shuffle": True,
    "num_workers": 0
}

val_params = {
    "batch_size": 1,
    "shuffle": False,
    "num_workers": 0
}

test_params = {
    "batch_size": 1,
    "shuffle": False,
    "num_workers": 0
}

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

In [13]:
class DistilBERTMultiClass(nn.Module):
    def __init__(self):
        super(DistilBERTMultiClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained(MODEL_NAME)
        self.pre_classifier = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 3)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [14]:
model = DistilBERTMultiClass()
model.to(device)

DistilBERTMultiClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (dropout): Dropout(p=0.1, inplace=False)
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p

In [15]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [16]:
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [17]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data["ids"].to(device, dtype=torch.long)
        mask = data["mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        targets = data["targets"].to(device, dtype=torch.float)
        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _ % 1000 == 0:
            print(f"Epoch: {epoch}, Loss: {loss.item()}")
        loss.backward()
        optimizer.step()

In [18]:
for epoch in range(EPOCHS):
    train(epoch)

1it [00:02,  2.56s/it]

Epoch: 0, Loss: 0.7027923464775085


1001it [04:59,  3.10it/s]

Epoch: 0, Loss: 0.3348482549190521


2001it [09:56,  3.10it/s]

Epoch: 0, Loss: 0.3232208788394928


3001it [14:55,  3.08it/s]

Epoch: 0, Loss: 0.4014842212200165


4001it [19:54,  3.09it/s]

Epoch: 0, Loss: 0.3571707606315613


5001it [24:53,  3.11it/s]

Epoch: 0, Loss: 0.30053970217704773


6001it [29:52,  3.08it/s]

Epoch: 0, Loss: 0.41926440596580505


6778it [33:44,  3.35it/s]
1it [00:00,  3.02it/s]

Epoch: 1, Loss: 0.19265049695968628


1001it [04:59,  3.08it/s]

Epoch: 1, Loss: 0.33590540289878845


2001it [09:58,  3.10it/s]

Epoch: 1, Loss: 0.32974714040756226


3001it [14:56,  3.09it/s]

Epoch: 1, Loss: 0.36316728591918945


4001it [19:55,  3.09it/s]

Epoch: 1, Loss: 0.3326778709888458


5001it [24:54,  3.08it/s]

Epoch: 1, Loss: 0.4588663578033447


6001it [29:53,  3.07it/s]

Epoch: 1, Loss: 0.25858694314956665


6778it [33:45,  3.35it/s]
1it [00:00,  3.03it/s]

Epoch: 2, Loss: 0.33017897605895996


1001it [04:59,  3.09it/s]

Epoch: 2, Loss: 0.27940675616264343


2001it [09:58,  3.09it/s]

Epoch: 2, Loss: 0.3070855140686035


3001it [14:57,  3.10it/s]

Epoch: 2, Loss: 0.2559090256690979


4001it [19:56,  3.10it/s]

Epoch: 2, Loss: 0.32584428787231445


5001it [24:55,  3.08it/s]

Epoch: 2, Loss: 0.2823101878166199


6001it [29:53,  3.09it/s]

Epoch: 2, Loss: 0.25728434324264526


6778it [33:46,  3.35it/s]
1it [00:00,  3.07it/s]

Epoch: 3, Loss: 0.31495413184165955


1001it [04:59,  3.09it/s]

Epoch: 3, Loss: 0.39375242590904236


2001it [09:58,  3.09it/s]

Epoch: 3, Loss: 0.19469043612480164


3001it [14:57,  3.10it/s]

Epoch: 3, Loss: 0.22779342532157898


4001it [19:56,  3.09it/s]

Epoch: 3, Loss: 0.2391335368156433


5001it [24:55,  3.10it/s]

Epoch: 3, Loss: 0.32870274782180786


6001it [29:54,  3.10it/s]

Epoch: 3, Loss: 0.21039769053459167


6778it [33:46,  3.34it/s]
1it [00:00,  2.99it/s]

Epoch: 4, Loss: 0.43223699927330017


1001it [04:59,  3.08it/s]

Epoch: 4, Loss: 0.3165038824081421


2001it [09:58,  3.10it/s]

Epoch: 4, Loss: 0.424699068069458


3001it [14:57,  3.10it/s]

Epoch: 4, Loss: 0.181780144572258


4001it [19:57,  3.08it/s]

Epoch: 4, Loss: 0.40844714641571045


5001it [24:56,  3.09it/s]

Epoch: 4, Loss: 0.2293248176574707


6001it [29:56,  3.09it/s]

Epoch: 4, Loss: 0.10798344016075134


6778it [33:48,  3.34it/s]
1it [00:00,  3.03it/s]

Epoch: 5, Loss: 0.33021724224090576


1001it [04:59,  3.10it/s]

Epoch: 5, Loss: 0.21122634410858154


2001it [09:58,  3.09it/s]

Epoch: 5, Loss: 0.29082030057907104


3001it [14:57,  3.09it/s]

Epoch: 5, Loss: 0.32094019651412964


4001it [19:57,  3.08it/s]

Epoch: 5, Loss: 0.27299371361732483


5001it [24:56,  3.09it/s]

Epoch: 5, Loss: 0.29187828302383423


6001it [29:55,  3.11it/s]

Epoch: 5, Loss: 0.41944068670272827


6778it [33:48,  3.34it/s]
1it [00:00,  3.03it/s]

Epoch: 6, Loss: 0.2552644610404968


1001it [04:59,  3.07it/s]

Epoch: 6, Loss: 0.2976244390010834


2001it [09:58,  3.09it/s]

Epoch: 6, Loss: 0.14374865591526031


3001it [14:58,  3.09it/s]

Epoch: 6, Loss: 0.19810275733470917


4001it [19:57,  3.09it/s]

Epoch: 6, Loss: 0.20370244979858398


5001it [24:56,  3.09it/s]

Epoch: 6, Loss: 0.3419370651245117


6001it [29:56,  3.10it/s]

Epoch: 6, Loss: 0.19980435073375702


6778it [33:48,  3.34it/s]
1it [00:00,  3.01it/s]

Epoch: 7, Loss: 0.23885749280452728


1001it [04:59,  3.07it/s]

Epoch: 7, Loss: 0.3619285821914673


2001it [09:58,  3.10it/s]

Epoch: 7, Loss: 0.18911629915237427


3001it [14:57,  3.08it/s]

Epoch: 7, Loss: 0.1431323140859604


4001it [19:57,  3.08it/s]

Epoch: 7, Loss: 0.20128116011619568


5001it [24:56,  3.11it/s]

Epoch: 7, Loss: 0.293302059173584


6001it [29:55,  3.09it/s]

Epoch: 7, Loss: 0.2174893617630005


6778it [33:47,  3.34it/s]
1it [00:00,  3.02it/s]

Epoch: 8, Loss: 0.557444155216217


1001it [04:59,  3.09it/s]

Epoch: 8, Loss: 0.2444854974746704


2001it [09:58,  3.10it/s]

Epoch: 8, Loss: 0.19205689430236816


3001it [14:58,  3.10it/s]

Epoch: 8, Loss: 0.1489662379026413


4001it [19:57,  3.09it/s]

Epoch: 8, Loss: 0.28874197602272034


5001it [24:57,  3.09it/s]

Epoch: 8, Loss: 0.3749621510505676


6001it [29:56,  3.09it/s]

Epoch: 8, Loss: 0.49509352445602417


6778it [33:49,  3.34it/s]
1it [00:00,  3.04it/s]

Epoch: 9, Loss: 0.20295989513397217


1001it [04:59,  3.09it/s]

Epoch: 9, Loss: 0.2575748562812805


2001it [09:59,  3.10it/s]

Epoch: 9, Loss: 0.12963566184043884


3001it [14:58,  3.09it/s]

Epoch: 9, Loss: 0.2902938723564148


4001it [19:58,  3.09it/s]

Epoch: 9, Loss: 0.2214321792125702


5001it [24:57,  3.08it/s]

Epoch: 9, Loss: 0.20245066285133362


6001it [29:57,  3.09it/s]

Epoch: 9, Loss: 0.21714180707931519


6778it [33:50,  3.34it/s]


In [19]:
def validation(model, loader):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(loader, 0)):
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [20]:
outputs, targets = validation(model, validation_loader)

final_outputs = np.argmax(outputs, axis=1)
targets = np.argmax(targets, axis=1)

13556it [02:39, 85.14it/s]


In [21]:
print(f"Got {sum(final_outputs == targets)} / {len(final_outputs)} correct")

Got 10421 / 13556 correct


In [22]:
micro_f1 = f1_score(targets, final_outputs, average="micro")
macro_f1 = f1_score(targets, final_outputs, average="macro")
weighted_f1 = f1_score(targets, final_outputs, average="weighted")

print(f"Micro F1 score:\t\t{round(micro_f1, 3)}")
print(f"Macro F1 score:\t\t{round(macro_f1, 3)}")
print(f"Weighted F1 score:\t{round(weighted_f1, 3)}")

Micro F1 score:		0.769
Macro F1 score:		0.548
Weighted F1 score:	0.753


In [23]:
print(classification_report(targets, final_outputs))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83      8030
           1       0.15      0.04      0.06       868
           2       0.69      0.81      0.75      4658

    accuracy                           0.77     13556
   macro avg       0.56      0.56      0.55     13556
weighted avg       0.75      0.77      0.75     13556



In [24]:
output_model_file = "../models/pytorch_distilbert.bin"
output_vocab_file = "../models/vocab_distilbert.bin"

torch.save(model, output_model_file)
TOKENIZER.save_vocabulary(output_vocab_file)

print("Model Saved")

Model Saved
