In [1]:
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertTokenizer, DistilBertModel

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, classification_report

from tqdm import tqdm

sys.path.insert(0, '..')
from src.data_collection import get_data

pd.set_option("display.max_colwidth", None)

In [2]:
hate_speech_ucb = get_data()

Fetching data...


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-f91f636a830ad73c
Reusing dataset parquet (C:\Users\UTKARSH\.cache\huggingface\datasets\parquet\ucberkeley-dlab--measuring-hate-speech-f91f636a830ad73c\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Processing...
Done!


In [3]:
device = torch.device("cuda:0") if torch.cuda.is_available() else "cpu"
device

device(type='cuda', index=0)

In [4]:
MODEL_NAME = "distilbert-base-uncased"  # "distilbert-base-uncased-finetuned-sst-2-english"
BATCH_SIZE = 16
MAX_LEN = 128
EPOCHS = 10
LEARNING_RATE = 1e-05
TOKENIZER = DistilBertTokenizer.from_pretrained(MODEL_NAME, truncation=True, do_lower_case=True)

In [5]:
hate_speech_ucb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39565 entries, 0 to 39564
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        39565 non-null  object
 1   hatespeech  39565 non-null  int8  
dtypes: int8(1), object(1)
memory usage: 347.9+ KB


In [6]:
hate_speech_ucb["hatespeech"].value_counts()

0    26608
1    12957
Name: hatespeech, dtype: int64

In [7]:
# # For testing purpose
# hate_speech_ucb = hate_speech_ucb[:5000]

In [8]:
hate_speech_ucb

Unnamed: 0,text,hatespeech
0,! thank u! im transmasc and generally present masc i Maybe look a bit gay so i was wondering about how that would go. im sorry u were heckled but its good to hear things are better there now. tysm my favorite lesbian :),0
1,!Go fuck yourself faggot!,1
2,!flair [I love women and minorities],0
3,!flair [death to all niggers and gays],1
4,""" 'convoluted' genealogy of Jesus""; was that comment really necessary? I am sure that you have regular Christian viewers--like myself--who might not have appreciated that very much.",0
...,...,...
39560,🤧 The limousine liberal Jews and WASPs who want to assuage their white guilt.. SJW maladjusted gays and nigs so they can pester and bother me; bullshit wack monotheist Judeo-Christianity values that's I'm suck when I leave Chinatown,1
39561,"🤬CONGRATULATIONS #MARYLAND #BALTIMORE LIBERALS & OBAMA... YOU REACHED A NEW LOW If I didn't know better, I would have thought this was Europe. Once again, Democrat Mayor Police Video Shows Whites Being Targeted During Memorial Day Chaos in Baltimore URL",0
39562,🥳 another bitch will when another bitch can't,1
39563,"🥴🥴 next time stay your ass in the car before our ""beaner"" asses beat yah URL",1


In [9]:
class HateDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = OneHotEncoder(sparse=False).fit_transform(np.array(self.data["hatespeech"]).reshape(-1, 1))
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets[index], dtype=torch.float)
        }

In [10]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
val_size = 0.1

train_data = hate_speech_ucb.sample(frac=train_size, random_state=210)
test_data = hate_speech_ucb.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
val_data = test_data.sample(frac=val_size / (1 - train_size), random_state=220).reset_index()
test_data = test_data.drop(val_data.index).reset_index(drop=True)

print(f"FULL Dataset: {hate_speech_ucb.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"VAL Dataset: {val_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

training_set = HateDataset(train_data, TOKENIZER, MAX_LEN)
validation_set = HateDataset(val_data, TOKENIZER, MAX_LEN)
testing_set = HateDataset(test_data, TOKENIZER, MAX_LEN)

FULL Dataset: (39565, 2)
TRAIN Dataset: (31652, 2)
VAL Dataset: (3957, 3)
TEST Dataset: (3956, 2)


In [11]:
train_params = {
    "batch_size": BATCH_SIZE,
    "shuffle": True,
    "num_workers": 0
}

val_params = {
    "batch_size": 1,
    "shuffle": False,
    "num_workers": 0
}

test_params = {
    "batch_size": 1,
    "shuffle": False,
    "num_workers": 0
}

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

In [12]:
class DistilBERTMultiClass(nn.Module):
    def __init__(self, n_classes):
        super(DistilBERTMultiClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained(MODEL_NAME)
        self.pre_classifier = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [13]:
N_CLASSES = hate_speech_ucb["hatespeech"].nunique()

In [14]:
model = DistilBERTMultiClass(n_classes=N_CLASSES)
model.to(device)

DistilBERTMultiClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (dropout): Dropout(p=0.1, inplace=False)
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p

In [15]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [16]:
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [17]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data["ids"].to(device, dtype=torch.long)
        mask = data["mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        targets = data["targets"].to(device, dtype=torch.float)
        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _ % 1000 == 0:
            print(f"Epoch: {epoch}, Loss: {loss.item()}")
        loss.backward()
        optimizer.step()

In [18]:
for epoch in range(EPOCHS):
    train(epoch)

1it [00:02,  2.45s/it]

Epoch: 0, Loss: 0.693435549736023


1001it [05:00,  3.08it/s]

Epoch: 0, Loss: 0.48804569244384766


1979it [09:54,  3.33it/s]
1it [00:00,  5.38it/s]

Epoch: 1, Loss: 0.3577663004398346


1001it [05:15,  2.73it/s]

Epoch: 1, Loss: 0.16716288030147552


1979it [10:51,  3.04it/s]
1it [00:00,  4.81it/s]

Epoch: 2, Loss: 0.38692086935043335


1001it [05:53,  2.62it/s]

Epoch: 2, Loss: 0.3680565357208252


1979it [10:59,  3.00it/s]
1it [00:00,  5.30it/s]

Epoch: 3, Loss: 0.3465307950973511


1001it [05:00,  3.08it/s]

Epoch: 3, Loss: 0.11053898930549622


1979it [09:54,  3.33it/s]
1it [00:00,  5.35it/s]

Epoch: 4, Loss: 0.2167648822069168


1001it [04:59,  3.08it/s]

Epoch: 4, Loss: 0.15157343447208405


1979it [09:52,  3.34it/s]
1it [00:00,  5.38it/s]

Epoch: 5, Loss: 0.13486319780349731


1001it [04:59,  3.07it/s]

Epoch: 5, Loss: 0.023350607603788376


1979it [09:52,  3.34it/s]
1it [00:00,  5.26it/s]

Epoch: 6, Loss: 0.006674409843981266


1001it [05:00,  3.09it/s]

Epoch: 6, Loss: 0.0315849743783474


1979it [09:53,  3.34it/s]
1it [00:00,  5.35it/s]

Epoch: 7, Loss: 0.022003773599863052


1001it [05:00,  3.06it/s]

Epoch: 7, Loss: 0.014893965795636177


1979it [09:54,  3.33it/s]
1it [00:00,  5.29it/s]

Epoch: 8, Loss: 0.017017576843500137


1001it [05:01,  3.06it/s]

Epoch: 8, Loss: 0.006195249035954475


1979it [09:55,  3.33it/s]
1it [00:00,  5.55it/s]

Epoch: 9, Loss: 0.018526898697018623


1001it [05:00,  3.08it/s]

Epoch: 9, Loss: 0.015277309343218803


1979it [09:53,  3.33it/s]


In [19]:
def validation(model, loader):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(loader, 0)):
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [20]:
outputs, targets = validation(model, validation_loader)

final_outputs = np.argmax(outputs, axis=1)
targets = np.argmax(targets, axis=1)

3957it [00:48, 81.94it/s]


In [21]:
print(f"Got {sum(final_outputs == targets)} / {len(final_outputs)} correct")

Got 3033 / 3957 correct


In [22]:
micro_f1 = f1_score(targets, final_outputs, average="micro")
macro_f1 = f1_score(targets, final_outputs, average="macro")
weighted_f1 = f1_score(targets, final_outputs, average="weighted")

print(f"Micro F1 score:\t\t{round(micro_f1, 3)}")
print(f"Macro F1 score:\t\t{round(macro_f1, 3)}")
print(f"Weighted F1 score:\t{round(weighted_f1, 3)}")

Micro F1 score:		0.766
Macro F1 score:		0.728
Weighted F1 score:	0.763


In [23]:
print(classification_report(targets, final_outputs))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83      2665
           1       0.66      0.60      0.62      1292

    accuracy                           0.77      3957
   macro avg       0.73      0.72      0.73      3957
weighted avg       0.76      0.77      0.76      3957



In [24]:
output_model_file = "../models/pytorch_distilbert.bin"
output_vocab_file = "../models/vocab_distilbert.bin"

torch.save(model, output_model_file)
TOKENIZER.save_vocabulary(output_vocab_file)

print("Model Saved")

Model Saved
