# HateBert + MLP

In [None]:
# Mount Google Drive at /content/drive (Colab-only; the dataset paths used later point there).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Third-party packages required by the cells below.
# NOTE(review): versions are not pinned, so a fresh runtime may pull different releases — consider pinning.
!pip install transformers  --quiet
!pip install emoji --quiet
!pip install torchmetrics --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m764.8/764.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gc
#import os
import emoji as emoji
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertModel, BertTokenizer, AdamW, AutoModel, AutoModelForMaskedLM, AutoTokenizer, DistilBertTokenizer, DistilBertModel, AutoModelForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchmetrics import *

In [None]:
# Tokenizer read as a notebook-level global by data_process().
tokenizer = AutoTokenizer.from_pretrained("GroNLP/hateBERT")

# NOTE(review): no visible cell uses this masked-LM model; the name `model` is rebound to a
# HATEBERT_Arch instance before training — confirm and consider dropping this load.
model = AutoModelForMaskedLM.from_pretrained("GroNLP/hateBERT")

Downloading (…)okenizer_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# MLP Model



In [None]:
class MLP(nn.Module):
    """Two-layer perceptron head that turns feature vectors into class log-probabilities.

    Args:
        input_size: Width of the incoming feature vector.
        hidden_size: Width of the single hidden layer.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after the hidden activation.
            Defaults to 0.1, the value that used to be hard-coded.
    """

    def __init__(self, input_size, hidden_size, num_classes, dropout=0.1):
        super().__init__()
        # Attribute names are unchanged so existing state_dict checkpoints keep loading.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        # Log-probabilities over dim=1, i.e. the class dimension of a (batch, classes) tensor.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes) for a (batch, input_size) input."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.softmax(self.fc2(hidden))

In [None]:
class HATEBERT_Arch(nn.Module):
    """Pre-trained transformer encoder followed by an MLP classification head.

    Args:
        modelName: Model id or path handed to ``AutoModel.from_pretrained``.
        num_classes: Number of target classes produced by the head.
    """

    def __init__(self, modelName, num_classes):
        super().__init__()
        self.hate_bert = AutoModel.from_pretrained(
            modelName,
            num_labels = num_classes
        )

        # Size the head from the encoder's own configuration instead of assuming 768;
        # 768 stays as the fallback so behaviour is unchanged when the attribute is absent.
        encoder_width = getattr(self.hate_bert.config, "hidden_size", 768)
        self.mlp = MLP(input_size=encoder_width, hidden_size=128, num_classes=num_classes)

    def forward(self, sent_id, mask):
        """Encode the token ids under the attention mask and return the head's output."""
        # With return_dict=False the encoder returns a tuple; its second element is fed to
        # the head (for BERT-style encoders this is the pooled [CLS] representation).
        _, pooled = self.hate_bert(sent_id, attention_mask=mask, return_dict=False)
        return self.mlp(pooled)

# Dataset preprocessing functions

In [None]:
def read_dataset(file, column_to_drop, textColumnName, labelColumnName):
    """Load a CSV, discard the unwanted columns and split it into texts and labels.

    Returns:
        A pair ``(texts, labels)``: the text column as a Python list and the
        label column as a pandas Series.
    """
    frame = pd.read_csv(file).drop(columns=column_to_drop)
    texts = frame[textColumnName].tolist()
    targets = frame[labelColumnName]
    return texts, targets

In [None]:
def pre_process_dataset(values):
    """Normalise raw texts into lower-cased strings with placeholder tokens.

    For every text, in this order: user mentions become ``<user>``, http(s) URLs
    become ``<url>``, emoji characters and ASCII emoticons become ``<emoticon>``,
    digit runs become ``<number>`` and hashtags become ``<hashtag>``. The text is
    then lower-cased, punctuation is removed (``<`` and ``>`` are kept so the
    placeholders survive) and repeated spaces are collapsed.

    Args:
        values: Iterable of raw text strings.

    Returns:
        list[str]: The cleaned texts, in input order.
    """
    # ASCII emoticons, replaced in list order by plain substring matching.
    emoticons = [':-)', ':)', '(:', '(-:', ':))', '((:', ':-D', ':D', 'X-D', 'XD', 'xD', 'xD', '<3', '</3', ':\\*',
                 ';-)',
                 ';)', ';-D', ';D', '(;', '(-;', ':-(', ':(', '(:', '(-:', ':,(', ':\'(', ':"(', ':((', ':D', '=D',
                 '=)',
                 '(=', '=(', ')=', '=-O', 'O-=', ':o', 'o:', 'O:', 'O:', ':-o', 'o-:', ':P', ':p', ':S', ':s', ':@',
                 ':>',
                 ':<', '^_^', '^.^', '>.>', 'T_T', 'T-T', '-.-', '*.*', '~.~', ':*', ':-*', 'xP', 'XP', 'XP', 'Xp',
                 ':-|',
                 ':->', ':-<', '$_$', '8-)', ':-P', ':-p', '=P', '=p', ':*)', '*-*', 'B-)', 'O.o', 'X-(', ')-X']
    # Every punctuation character is stripped except the placeholder brackets.
    removable = set(string.punctuation) - {"<", ">"}

    new_values = []
    for value in values:
        # re.sub rewrites each match at its own position. The earlier findall + str.replace
        # approach corrupted the text whenever one match was a prefix of another
        # (e.g. "@ab @abc" -> "<user> <user>c", "1 12" -> "<number> <number>2").
        text = re.sub(r"@\w+", "<user>", value)
        # Match plain http as well as https; the placeholder no longer carries a stray space.
        text = re.sub(r"https?://\S*", "<url>", text)
        # Emoji are looked up one code point at a time.
        text = "".join("<emoticon>" if ch in emoji.EMOJI_DATA else ch for ch in text)
        for emo in emoticons:
            text = text.replace(emo, "<emoticon>")
        text = re.sub(r"[0-9]+", "<number>", text)
        text = re.sub(r"#\S*", "<hashtag>", text)

        # Sentence punctuation becomes a space (so words stay separated) before the strip below.
        text = re.sub(r"[?.!,¿]", " ", text.lower())
        text = "".join(ch for ch in text if ch not in removable)
        text = re.sub(r" +", " ", text).strip()
        new_values.append(text)
    return new_values

In [None]:
def data_process(data, labels, max_length=32):
    """Tokenise sentences with the notebook-level ``tokenizer`` into fixed-length arrays.

    Args:
        data: Iterable of sentences (already pre-processed).
        labels: Array-like of labels; only converted to a NumPy array.
        max_length: Length every sentence is padded or truncated to. Defaults to
            32, the value that used to be hard-coded.

    Returns:
        tuple: ``(input_ids, attention_masks, labels)`` as NumPy arrays.
    """
    input_ids = []
    attention_masks = []
    for sentence in data:
        # padding='max_length' already pads to max_length, so the deprecated and
        # redundant pad_to_max_length flag is no longer passed.
        encoded = tokenizer(sentence, max_length=max_length,
                            padding='max_length',
                            truncation=True, return_token_type_ids=False)

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks), np.array(labels)

In [None]:
def load_and_process(dataset, column_to_drop, textColumnName, labelColumnName):
    """Read a CSV dataset, clean its texts and tokenise them.

    Args:
        dataset: Path of the CSV file.
        column_to_drop: Columns removed right after loading.
        textColumnName: Name of the column holding the raw text.
        labelColumnName: Name of the column holding the labels.

    Returns:
        A 5-tuple ``(input_ids, attention_masks, labels, data, labels)`` where
        ``data`` is the list of raw (un-cleaned) texts and ``labels`` is the
        label array produced by ``data_process``.
    """
    data, labels = read_dataset(dataset, column_to_drop, textColumnName, labelColumnName)
    input_ids, attention_masks, labels = data_process(pre_process_dataset(data), labels)

    # The label array intentionally appears twice: existing callers unpack five values.
    return input_ids, attention_masks, labels, data, labels

## Functions for the training, evaluation and test processes

In [None]:
def progress_bar(i, total, accuracy, loss, batch_size):
    """Redraw a one-line, 100-cell text progress bar for batch ``i`` (0-based) of ``total``.

    ``batch_size`` is accepted for call compatibility but is not used.
    """
    done = i + 1
    filled = int(100 * done // total)
    # A '>' head is shown until the bar is full; the remainder is dotted.
    head = '>' if filled < 100 else ''
    bar = '█' * filled + head + '.' * (99 - filled)
    percent = f"{100 * (done / float(total)):.2f}"
    print(f'\rBatch {done}/{total} |{bar}| {percent}% complete, loss={loss:.4f}, accuracy={accuracy:.4f}', end='')

In [None]:
def predict(model, sent_id, mask, labels, cross_entropy):
    """Run ``model`` on one batch and score the output against ``labels``.

    Returns:
        ``(preds, loss)`` — the raw model output and ``cross_entropy(preds, labels)``.
    """
    outputs = model(sent_id, mask)
    return outputs, cross_entropy(outputs, labels)

In [None]:
def init_batch(batch):
    """Move the three tensors of a batch to the notebook-level ``device`` and unpack them.

    Returns:
        ``(sent_id, mask, labels)`` on ``device``.
    """
    moved = [tensor.to(device) for tensor in batch]
    sent_id, mask, labels = moved
    return sent_id, mask, labels

In [None]:
def output_model(total_loss, train_dataloader, batch_size, total_preds):
    """Average the loss per sample and convert the first prediction tensor to NumPy.

    The loss is divided by ``len(train_dataloader) * batch_size``; only
    ``total_preds[0]`` is converted.
    """
    sample_count = len(train_dataloader) * batch_size
    avg_loss = total_loss / sample_count
    first_preds = total_preds[0].detach().cpu().numpy()
    return avg_loss, np.stack(first_preds, axis=0)

In [None]:
def train(model, train_dataloader, cross_entropy):
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level globals ``optimizer``, ``acc`` and ``batch_size``.

    Args:
        model: Module called as ``model(sent_id, mask)``.
        train_dataloader: Iterable of ``(sent_id, mask, labels)`` batches with a length.
        cross_entropy: Loss callable applied to ``(preds, labels)``.

    Returns:
        ``(mean loss, mean accuracy)`` averaged over the batches.
    """
    model.train()
    total_loss, total_accuracy = 0, 0
    total = len(train_dataloader)
    for i, batch in enumerate(train_dataloader):
        sent_id, mask, labels = init_batch(batch)
        # clear previously calculated gradients
        model.zero_grad()
        preds, loss = predict(model, sent_id, mask, labels, cross_entropy)
        loss.backward()
        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.mean().item()
        total_accuracy += acc(preds.argmax(axis=1), labels).mean().item()
        # Report only after this batch is accumulated: reporting first divided the totals
        # of i batches by i + 1, so the displayed loss and accuracy were biased low.
        progress_bar(i, total, total_accuracy / (i + 1), total_loss / (i + 1), batch_size)

    return total_loss / total, total_accuracy / total

In [None]:
# function for evaluating the model
def evaluate(model, val_dataloader, cross_entropy):
    """Score ``model`` on ``val_dataloader`` without updating it.

    Uses the notebook-level globals ``acc`` and ``batch_size``. The model is left
    in evaluation mode.

    Args:
        model: Module called as ``model(sent_id, mask)``.
        val_dataloader: Iterable of ``(sent_id, mask, labels)`` batches with a length.
        cross_entropy: Loss callable applied to ``(preds, labels)``.

    Returns:
        ``(mean loss, mean accuracy)`` averaged over the batches.
    """
    print("\n\nEvaluating...")
    # deactivate dropout layers
    model.eval()
    total_loss, total_accuracy = 0, 0
    total = len(val_dataloader)
    # deactivate autograd for the whole pass
    with torch.no_grad():
        for i, batch in enumerate(val_dataloader):
            sent_id, mask, labels = init_batch(batch)
            preds, loss = predict(model, sent_id, mask, labels, cross_entropy)
            total_loss += loss.mean().item()
            total_accuracy += acc(preds.argmax(axis=1), labels).mean().item()
            # Report only after this batch is accumulated; reporting first showed
            # the totals of i batches divided by i + 1 (biased low).
            progress_bar(i, total, total_accuracy / (i + 1), total_loss / (i + 1), batch_size)
    return total_loss / total, total_accuracy / total

In [None]:
def to_tensor(dict_text, label):
    """Build ``(seq, mask, y)`` tensors from a frame of per-row arrays and a label array.

    ``dict_text`` must expose ``'input_ids'`` and ``'attention_masks'`` columns
    whose cells are equal-length arrays; each column is stacked into one tensor.
    """
    def _stack_column(column):
        return torch.tensor(np.stack(dict_text[column].values))

    seq = _stack_column('input_ids')
    mask = _stack_column('attention_masks')
    return seq, mask, torch.tensor(label)

## Data-splitting functions for the train, validation and test sets

In [None]:
# Run Python garbage collection and release PyTorch's cached CUDA memory before building the dataloaders.
gc.collect()
torch.cuda.empty_cache()

In [None]:
def tokenitazionAndDataloader(text, labels):
    """Wrap one data partition in a randomly-sampled DataLoader.

    The partition is converted with ``to_tensor`` and batched using the
    notebook-level ``batch_size``.
    """
    seq, mask, target = to_tensor(text, labels)
    dataset = TensorDataset(seq, mask, target)
    return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

In [None]:
def train_val_test_to_dataloader(df, labels):
    """Split the data 80/10/10 (stratified, seed 1234) and build one DataLoader per part.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    # First carve off 20% of the data, then halve that remainder into validation and test.
    train_text, temp_text, train_labels, temp_labels = train_test_split(
        df, labels, random_state=1234, test_size=0.2, stratify=labels)
    val_text, test_text, val_labels, test_labels = train_test_split(
        temp_text, temp_labels, random_state=1234, test_size=0.5, stratify=temp_labels)

    del temp_text
    gc.collect()

    partitions = (
        (train_text, train_labels),
        (val_text, val_labels),
        (test_text, test_labels),
    )
    loaders = [tokenitazionAndDataloader(part, target) for part, target in partitions]
    return tuple(loaders)

# SemEval-2019

## Train

In [None]:
# CSV with the HatEval 2019 English training data on the mounted Drive.
dataset_link_ema = "/content/drive/MyDrive/Data science/NLP/dataset/hateval2019_en_train2.csv"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Two-class accuracy metric; read as the global `acc` inside train() and evaluate().
acc = Accuracy('multiclass', num_classes=2).to(device)

# `labels` is listed twice because load_and_process returns the label array in two slots.
input_ids, attention_masks, labels, data, labels = load_and_process(dataset_link_ema, [], "tweet", "class")

# One row per example; each cell holds that example's token-id / attention-mask array.
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Read as a global by tokenitazionAndDataloader(), train() and evaluate().
batch_size = 64

train_dataloader, val_dataloader, test_dataloader = train_val_test_to_dataloader(df, labels)

torch.cuda.empty_cache()

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Two-class HateBERT + MLP classifier; this rebinds the notebook-level name `model`.
model = HATEBERT_Arch("GroNLP/hateBERT", 2)

model = model.to(device)

In [None]:
# `optimizer` is read as a global inside train().
# NOTE(review): AdamW here is the class imported from transformers; newer transformers releases
# deprecate it in favour of torch.optim.AdamW — confirm before upgrading the dependency.
optimizer = AdamW(model.parameters(), lr=2e-5)
best_valid_loss = float('inf')

# NLLLoss matches the LogSoftmax output of the MLP head.
cross_entropy = nn.NLLLoss()

epochs = 3
current = 1

while current <= epochs:
    print(f'\nEpoch {current} / {epochs}:')

    train_loss, _ = train(model, train_dataloader, cross_entropy)
    valid_loss, _ = evaluate(model, val_dataloader, cross_entropy)

    # Only the best validation loss is remembered; no checkpoint is written.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1




Epoch 1 / 3:
Batch 163/163 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.5855, accuracy=0.6785

Evaluating...
Batch 21/21 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.4789, accuracy=0.7269

Training Loss: 0.588
Validation Loss: 0.512

Epoch 2 / 3:
Batch 163/163 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.4571, accuracy=0.7809

Evaluating...
Batch 21/21 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.4606, accuracy=0.7381

Training Loss: 0.459
Validation Loss: 0.484

Epoch 3 / 3:
Batch 163/163 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.3581, accuracy=0.8476

Evaluating...
Batch 2

## Test

In [None]:
# Evaluate the trained SemEval model on its held-out test split.
# NOTE(review): `j` and `total_preds` are never read in this cell.
# NOTE(review): model.eval() is not called here; the cell relies on the last evaluate() call
# having left `model` in evaluation mode — confirm when running cells out of order.
j = 0
preds_ = []
total_loss, total_accuracy, total_preds = 0, 0, []
test_y = []
total = len(test_dataloader)
for i, batch in enumerate(test_dataloader):
    # Collect the true labels in the same (shuffled) order the batches are predicted in.
    test_y.extend(list(batch[2].numpy()))
    # The bar is drawn before this batch is processed, so it shows the totals of the previous batches.
    progress_bar(i, total, total_accuracy / (i + 1), total_loss / (i + 1), batch_size)
    # This rebinds the notebook-level name `labels` to the current batch's label tensor.
    sent_id, mask, labels = init_batch(batch)
    with torch.no_grad():
        preds, loss = predict(model, sent_id, mask, labels, cross_entropy)
        total_loss += loss.mean().item()
        total_accuracy += acc(preds.argmax(axis=1), labels).mean().item()
        preds_.extend(list(preds.argmax(axis=1).detach().cpu().numpy()))


print("\nPerformance:")
print('Classification Report')
print(classification_report(test_y, preds_))

print("Accuracy: " + str(accuracy_score(test_y, preds_)))

Batch 21/21 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.5035, accuracy=0.7135
Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.81      0.75      0.78       753
           1       0.69      0.75      0.72       547

    accuracy                           0.75      1300
   macro avg       0.75      0.75      0.75      1300
weighted avg       0.76      0.75      0.75      1300

Accuracy: 0.7507692307692307


# WikiToxic

## Train

In [None]:
# CSV with the WikiToxic data on the mounted Drive.
dataset_link_alex = "/content/drive/MyDrive/Data science/NLP/Alex/wiki_toxic.csv"

# Columns removed right after loading.
column_to_drop = ['Unnamed: 0', 'id']

# `labels` is listed twice because load_and_process returns the label array in two slots.
input_ids, attention_masks, labels, data, labels = load_and_process(dataset_link_alex, column_to_drop, 'tweet', 'class')

# One row per example; each cell holds that example's token-id / attention-mask array.
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Read as a global by tokenitazionAndDataloader(), train() and evaluate().
batch_size = 64

train_dataloader_alex, val_dataloader_alex, test_dataloader_alex = train_val_test_to_dataloader(df, labels)

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Fresh two-class accuracy metric and classifier for the WikiToxic run.
acc = Accuracy('multiclass', num_classes=2).to(device)
model_alex = HATEBERT_Arch("GroNLP/hateBERT", 2)

model_alex = model_alex.to(device)

In [None]:
# Rebinds the global `optimizer` that train() reads, now over model_alex's parameters.
optimizer = AdamW(model_alex.parameters(), lr=2e-5)
best_valid_loss = float('inf')

# NLLLoss matches the LogSoftmax output of the MLP head.
cross_entropy = nn.NLLLoss()

epochs = 2
current = 1

while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss, _ = train(model_alex, train_dataloader_alex, cross_entropy)

    # evaluate model
    valid_loss, _ = evaluate(model_alex, val_dataloader_alex, cross_entropy)

    # remember the best validation loss (no checkpoint is written here)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1




Epoch 1 / 2:
Batch 562/562 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.2979, accuracy=0.8788

Evaluating...
Batch 71/71 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.2141, accuracy=0.8985

Training Loss: 0.298
Validation Loss: 0.219

Epoch 2 / 2:
Batch 562/562 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.1863, accuracy=0.9293

Evaluating...
Batch 71/71 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.2201, accuracy=0.9025

Training Loss: 0.187
Validation Loss: 0.222


## Test

In [None]:
# Evaluate the trained WikiToxic model on its held-out test split.
# NOTE(review): `j` and `total_preds_alex` are never read in this cell.
# NOTE(review): model_alex.eval() is not called here; the cell relies on the last evaluate()
# call having left the model in evaluation mode — confirm when running cells out of order.
j = 0
preds_alex = []
total_loss_alex, total_accuracy_alex, total_preds_alex = 0, 0, []
test_y_alex = []
total = len(test_dataloader_alex)
for i, batch in enumerate(test_dataloader_alex):
    # Collect the true labels in the same (shuffled) order the batches are predicted in.
    test_y_alex.extend(list(batch[2].numpy()))
    # The bar is drawn before this batch is processed, so it shows the totals of the previous batches.
    progress_bar(i, total, total_accuracy_alex / (i + 1), total_loss_alex / (i + 1), batch_size)
    sent_id, mask, labels = init_batch(batch)
    # deactivate autograd
    with torch.no_grad():
        preds, loss = predict(model_alex, sent_id, mask, labels, cross_entropy)
        total_loss_alex += loss.mean().item()
        total_accuracy_alex += acc(preds.argmax(axis=1), labels).mean().item()
        preds_alex.extend(list(preds.argmax(axis=1).detach().cpu().numpy()))


print("\nPerformance:")
print('Classification Report')
print(classification_report(test_y_alex, preds_alex))

print("Accuracy: " + str(accuracy_score(test_y_alex, preds_alex)))

Batch 71/71 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.2170, accuracy=0.9045
Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      2247
           1       0.91      0.93      0.92      2247

    accuracy                           0.92      4494
   macro avg       0.92      0.92      0.92      4494
weighted avg       0.92      0.92      0.92      4494

Accuracy: 0.9170004450378282


# Automated Hate Speech Detection Dataset

## Train

In [None]:
# CSV with the labelled hate-speech / offensive-language tweets on the mounted Drive.
dataset_link = "/content/drive/MyDrive/Data science/NLP/dataset/hate_speech_offensiveWithHatespeech/labeled_data.csv"
# Columns removed right after loading; 'tweet' and 'class' are the ones read below.
column_to_drop = ['count', 'hate_speech', 'offensive_language', 'neither']

# `labels` is listed twice because load_and_process returns the label array in two slots.
input_ids, attention_masks, labels, data, labels = load_and_process(dataset_link, column_to_drop, 'tweet', 'class')

# One row per example; each cell holds that example's token-id / attention-mask array.
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Read as a global by tokenitazionAndDataloader(), train() and evaluate().
batch_size = 64

train_dataloader_mina, val_dataloader_mina, test_dataloader_mina = train_val_test_to_dataloader(df, labels)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# This dataset has three classes, so both the metric and the classifier head use 3.
acc = Accuracy('multiclass', num_classes=3).to(device)

gc.collect()
torch.cuda.empty_cache()

model_mina = HATEBERT_Arch("GroNLP/hateBERT", 3)

model_mina = model_mina.to(device)

In [None]:
# Rebinds the global `optimizer` that train() reads, now over model_mina's parameters.
optimizer = AdamW(model_mina.parameters(), lr=2e-5)
best_valid_loss = float('inf')
# Per-class loss weights: class 0 counts twice as much as classes 1 and 2.
weights = torch.tensor([2., 1., 1.])
weight=weights.to(device)
# NOTE(review): the other experiments use nn.NLLLoss on the head's LogSoftmax output; here
# CrossEntropyLoss (which applies log-softmax itself) receives that same output — confirm this is intended.
cross_entropy = nn.CrossEntropyLoss(weight=weight)

epochs = 1
current = 1

while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss, _ = train(model_mina, train_dataloader_mina, cross_entropy)

    # evaluate model
    valid_loss, _ = evaluate(model_mina, val_dataloader_mina, cross_entropy)

    # remember the best validation loss (no checkpoint is written here)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1




Epoch 1 / 1:
Batch 310/310 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.4513, accuracy=0.8615

Evaluating...
Batch 39/39 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.3291, accuracy=0.8722

Training Loss: 0.452
Validation Loss: 0.341


## Test

In [None]:
# Evaluate the trained three-class model on its held-out test split.
# NOTE(review): `j` and `total_preds_mina` are never read in this cell.
# NOTE(review): model_mina.eval() is not called here; the cell relies on the last evaluate()
# call having left the model in evaluation mode — confirm when running cells out of order.
j = 0
preds_mina = []
total_loss_mina, total_accuracy_mina, total_preds_mina = 0, 0, []
test_y_mina = []
total = len(test_dataloader_mina)
for i, batch in enumerate(test_dataloader_mina):
    # Collect the true labels in the same (shuffled) order the batches are predicted in.
    test_y_mina.extend(list(batch[2].numpy()))
    # The bar is drawn before this batch is processed, so it shows the totals of the previous batches.
    progress_bar(i, total, total_accuracy_mina / (i + 1), total_loss_mina / (i + 1), batch_size)
    sent_id, mask, labels = init_batch(batch)
    # deactivate autograd
    with torch.no_grad():
        preds, loss = predict(model_mina, sent_id, mask, labels, cross_entropy)
        total_loss_mina += loss.mean().item()
        total_accuracy_mina += acc(preds.argmax(axis=1), labels).mean().item()
        preds_mina.extend(list(preds.argmax(axis=1).detach().cpu().numpy()))


print("\nPerformance:")
print('Classification Report')
print(classification_report(test_y_mina, preds_mina))

print("Accuracy: " + str(accuracy_score(test_y_mina, preds_mina)))

Batch 39/39 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.3405, accuracy=0.8698
Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.40      0.51      0.45       143
           1       0.95      0.92      0.94      1919
           2       0.86      0.88      0.87       417

    accuracy                           0.89      2479
   macro avg       0.74      0.77      0.75      2479
weighted avg       0.90      0.89      0.90      2479

Accuracy: 0.8939088342073417
