# Imports

The entire code is written using **PyTorch**.<br>
We'll be using the **transformers** library by [huggingface](https://github.com/huggingface/transformers) as they provide wrappers for multiple Transformer models.

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
%%capture

!pip install transformers
!pip install pytorch-lightning --upgrade
!pip install sentencepiece
!pip install datasets --upgrade
!pip install torchmetrics
!pip install wandb --upgrade
!pip install lightning
!pip install optuna
!pip install huggingface-hub

In [None]:
iscolab = False

In [None]:
if iscolab:
    from google.colab import files
    files.upload()
    ! mkdir ~/.kaggle
    ! cp kaggle.json ~/.kaggle/
    ! chmod 600 ~/.kaggle/kaggle.json
    ! kaggle datasets download -d sifalklioui/hatespeechdza
    !mkdir data
    !unzip hatespeechdza.zip -d ./data

In [None]:
import pandas as pd
from datasets import Dataset as hgdataset
from datasets import load_dataset
import numpy as np
import pickle
import seaborn as sns
import re
import copy
from tqdm.notebook import tqdm
import gc
import random
import torch
import wandb
from sklearn import metrics
from huggingface_hub import PyTorchModelHubMixin
import torchmetrics
import logging
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
from sklearn.metrics import f1_score
import pytorch_lightning as pl
from torch.optim import AdamW
from torchmetrics import Metric
from lightning.pytorch.loggers import WandbLogger



from transformers import (
    T5Tokenizer,
    T5Model,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)
wandb_logger = WandbLogger(project="HTarabT5")

In [3]:
@dataclass
class Config:
    seed = 203
    data_folder = "../input/hatespeechdza"
    output_dir = './logs'
    model_name_or_path = 'UBC-NLP/AraT5v2-base-1024'
    src_max_length = 150
    tgt_max_length = 2
    add_special_tokens = True
    truncation = True
    return_tensors = 'pt'
    padding = "max_length"
    weight_decay=0.0
    adam_epsilon=1e-8
    warmup_steps=0
    train_batch_size=16
    eval_batch_size=16
    num_train_epochs=2
    gradient_accumulation_steps=16
    n_gpu=1
    fp_16= False, # if you want to enable 16-bit training then install apex and set this to true
    max_grad_norm= 1 # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    learning_rate= float(3e-5)

config = Config()

pl.seed_everything(config.seed)

203

# 1.  Dataset & Dataloader

Now, we'll create a custom Dataset class inherited from the PyTorch Dataset class. We'll be using the **T5 tokenizer** that returns **input_ids** and **attention_mask**.<br><br>
The custom Dataset class will return a dict containing - <br>

- src_input_ids
- src_attention_mask
- tgt_input_ids'
-tgt_attention_mask

In [4]:
class HateDetect():
    def __init__(self,config,tokenizer, part):

        self.config = config
        self.part = part
        self.tokenizer = tokenizer


        data_paths = {
            'train': config.data_folder + "/dataset_prep_train.csv",
            'test': config.data_folder + "/dataset_prep_test.csv",
            'val': config.data_folder + "/dataset_prep_val.csv"
        }
        path = data_paths.get(self.part,None)
        if path is not None:
            df = pd.read_csv(path)
            df['label'].replace({0:"normal",1:"hate"}, inplace = True)
            self.data = hgdataset.from_pandas(df ,split=self.part)
        else:
            raise ValueError("Invalid value for self.part")


        self.dataset_scr,self.dataset_tgt = self.tokenize()

        # create funtion to tokenize data
    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):

        source_ids = self.dataset_scr["input_ids"][idx].squeeze()
        target_ids = self.dataset_tgt["input_ids"][idx].squeeze()

        src_mask    = self.dataset_scr["attention_mask"][idx].squeeze()
        target_mask = self.dataset_tgt["attention_mask"][idx].squeeze()

        return {"source_ids": source_ids,
                "source_mask": src_mask,
                "target_ids": target_ids,
                "target_mask": target_mask}


    def tokenize(self):

        tokenizer_params = {
            "src": {
                "max_length": self.config.src_max_length,
                "add_special_tokens": self.config.add_special_tokens,
                "truncation": self.config.truncation,
                "return_tensors": self.config.return_tensors,
                "padding": self.config.padding
            },
            "tgt": {
                "max_length": self.config.tgt_max_length,
                "add_special_tokens": self.config.add_special_tokens,
                "truncation": self.config.truncation,
                "return_tensors": self.config.return_tensors,
                "padding": self.config.padding
            }
        }
        dataset_scr = self.tokenizer(self.data['text'], **tokenizer_params["src"])
        dataset_tgt = self.tokenizer(self.data['label'], **tokenizer_params["tgt"])
        return dataset_scr,dataset_tgt

def get_dataset(config,tokenizer,part):
    return HateDetect(config,tokenizer,part)

In [19]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath= config.output_dir, monitor="f1_valid_epoch", mode="max", save_top_k=1)
    
train_params = dict(
    devices=config.n_gpu,
    strategy="auto",
    accelerator="gpu",
    max_epochs=config.num_train_epochs+8,
    precision= "16-mixed" if config.fp_16 else 32,
    gradient_clip_val=config.max_grad_norm,
    callbacks=[checkpoint_callback]
)

# 2. Model

Coming to the most interesting part - the model architecture! We'll create a class named **Model**, inherited from **torch.nn.Module**.<br><br>

### Flow
- We initialize our pretrained T5 model with a Conditional Generation Head.
- Pass in the src & tgt, input_ids & attention_mask.
- The model returns the decoder generated output ids (predicted labels in textual format), which we need to decode further using the tokenizer.

In [6]:
class MyAccuracy(Metric):
    def __init__(self):
        super().__init__()
        higher_is_better = True
        self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, preds, target):
        assert preds.shape == target.shape

        self.correct += torch.sum(preds == target)
        self.total += target.numel()

    def compute(self):
        return self.correct.float() / self.total

In [11]:
class FRP(Metric):
    def __init__(self):
        super().__init__()
        higher_is_better = True
        self.add_state("true_positives", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("false_positives", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("false_negatives", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, preds, target):
        assert preds.shape == target.shape

        self.true_positives += torch.sum((preds == 52459) & (target == 52459))
        self.false_positives += torch.sum((preds == 52459) & (target == 16147))
        self.false_negatives += torch.sum((preds == 16147) & (target == 52459))

    def compute(self):
        precision = self.true_positives.float() / (self.true_positives + self.false_positives).float()
        recall = self.true_positives.float() / (self.true_positives + self.false_negatives).float()

        f1_score = 2 * (precision * recall) / (precision + recall)
        return f1_score, recall, precision


In [12]:
class T5FineTuner(pl.LightningModule,PyTorchModelHubMixin):
    def __init__(self, config):
        super().__init__()
        gc.collect()
        torch.cuda.empty_cache() 
        self.config = config
        self.model = T5ForConditionalGeneration.from_pretrained(config.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(config.model_name_or_path)
        self.save_hyperparameters()
        self.valid_acc = MyAccuracy()
        self.FRP = FRP()
        self.training_step_outputs = []
        self.validation_step_outputs = []

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
        ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
            )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100
        outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
        )
        del lm_labels
        return outputs[0]

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        #self.log("train/loss", loss)
        self.training_step_outputs.append(loss.item())
        return loss

    def on_train_epoch_end(self):
        
        self.log("training_epoch_average", np.mean(self.training_step_outputs), sync_dist=True, prog_bar=True, logger=True, on_epoch=True)
        self.training_step_outputs.clear()  # free memory
        
    def validation_step(self, batch, batch_idx):
        
        pred_ids = self.model.generate(input_ids=batch['source_ids'],
                                       attention_mask=batch['source_mask'],
                                       max_length=2)
        
        target_ids = batch['target_ids'][:,0].flatten()
        self.valid_acc.update(pred_ids[:,1].flatten(),target_ids)
        self.FRP.update(pred_ids[:,1].flatten(),target_ids)
        del pred_ids
        del target_ids
        
    def on_validation_epoch_end(self):
        f1, recall, precision = self.FRP.compute()
        self.log("acc_valid_epoch", self.valid_acc.compute(), sync_dist=True, prog_bar=True, logger=True, on_epoch=True)
        self.log("f1_valid_epoch", f1, sync_dist=True, prog_bar=True, logger=True, on_epoch=True)  
        self.log("recall_valid_epoch", recall, sync_dist=True, prog_bar=True, logger=True, on_epoch=True) 
        self.log("precision_valid_epoch", precision, sync_dist=True, prog_bar=True, logger=True, on_epoch=True) 
        self.valid_acc.reset()
        self.FRP.reset()
        del f1
        del recall
        del precision
    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.config.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.config.learning_rate, eps=self.config.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.config.warmup_steps, num_training_steps=self.trainer.estimated_stepping_batches)

        return [optimizer],[scheduler]


    def train_dataloader(self):
        return DataLoader(get_dataset(config=self.config, tokenizer=self.tokenizer, part="train"), batch_size=self.config.train_batch_size, drop_last=True, shuffle=True,num_workers=0)
    def val_dataloader(self):
        return DataLoader(get_dataset(config=self.config,tokenizer=self.tokenizer, part="val"), batch_size=self.config.eval_batch_size, drop_last=True,num_workers=0)


In [13]:
model = T5FineTuner(config)

# 3. Training 

In [22]:
trainer = pl.Trainer(**train_params,logger=wandb_logger)

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO: Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model)

# 4. Evaluation

In [14]:
path ='/kaggle/working/logs/epoch=5-step=2814.ckpt'

In [30]:
tokenizer = T5Tokenizer.from_pretrained(config.model_name_or_path)

In [32]:
data_ = HateDetect(config,tokenizer=tokenizer, part="test")
loader = DataLoader(data_, batch_size=config.eval_batch_size,drop_last=True, num_workers=2)

In [33]:
outputs = []
targets = []
model.eval()
with torch.no_grad():
    for batch in tqdm(loader):
        outs = model_test.model.generate(input_ids=batch['source_ids'].cuda(),
                                  attention_mask=batch['source_mask'].cuda(),
                                  max_length=2)

        dec = [tokenizer.decode(ids[ids > 1 ]) for ids in outs]
        target = [tokenizer.decode((ids[ids > 1 ])) for ids in batch["target_ids"]]

        outputs.extend(dec)
        targets.extend(target)

  0%|          | 0/158 [00:00<?, ?it/s]

In [35]:
print(metrics.classification_report(targets, outputs))

              precision    recall  f1-score   support

        hate       0.76      0.88      0.82      1056
      normal       0.90      0.81      0.85      1472

    accuracy                           0.84      2528
   macro avg       0.83      0.84      0.83      2528
weighted avg       0.84      0.84      0.84      2528

