### [T5](https://github.com/google-research/text-to-text-transfer-transformer)
- **Text-To-Text Transfer Transformer**
- A unified framework that converts every language problem into a text-to-text format.
- Achieves state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more.

### Multi Class vs Multi Label Classification
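- **Multi Class** - There are multiple categories, but each instance is assigned exactly one of them; such problems are known as multi-class classification problems.
- **Multi Label** - Each instance can be assigned several categories at the same time; such problems are known as multi-label classification problems.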

# Imports

The entire code is written using **PyTorch**.<br>
We'll be using the **transformers** library by [huggingface](https://github.com/huggingface/transformers) as they provide wrappers for multiple Transformer models.

In [None]:
%%capture

!pip install transformers
!pip install pytorch-lightning --upgrade
!pip install sentencepiece
!pip install datasets --upgrade
!pip install torchmetrics
!pip install wandb
!pip install lightning

In [None]:
'''rom google.colab import files

files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d sifalklioui/hatespeechdza
!mkdir data
!unzip hatespeechdza.zip -d ./data'''

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from datasets import Dataset as hgdataset
from datasets import load_dataset
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import re
import copy
from tqdm.notebook import tqdm
import gc
import random
import torch
import wandb
import torchmetrics
import logging
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
from sklearn.metrics import f1_score
import pytorch_lightning as pl
from torch.optim import AdamW
from sklearn import metrics
from lightning.pytorch.loggers import WandbLogger

from transformers import (
    T5Tokenizer,
    T5Model,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)
# Authenticate with Weights & Biases without embedding a secret in the notebook.
# Called without an explicit key, wandb.login() relies on already configured
# credentials (e.g. the WANDB_API_KEY environment variable) or prompts for one.
# NOTE(review): the API key that used to be hard-coded on this line has been
# exposed in the notebook history and should be revoked.
wandb.login()
wandb_logger = WandbLogger(project="HTarabT5")

In [None]:
@dataclass
class Config:
    """Paths, tokenizer options and training hyper-parameters for the notebook.

    Every attribute is an annotated dataclass field with a default, so
    ``Config()`` reproduces the notebook settings and individual values can be
    overridden by keyword, e.g. ``Config(seed=1)``.
    """

    # Seed passed to the random number generators.
    seed: int = 203

    # Locations of the input CSV files and of the training outputs.
    data_folder: str = "../input/hatespeechdza"
    output_dir: str = './logs'

    # Identifier of the pretrained checkpoint (model and tokenizer).
    model_name_or_path: str = 'UBC-NLP/AraT5v2-base-1024'

    # Tokenizer settings: maximum token lengths of the source text and of the
    # target label, plus the options forwarded to the tokenizer call.
    src_max_length: int = 40
    tgt_max_length: int = 2
    add_special_tokens: bool = True
    truncation: bool = True
    return_tensors: str = 'pt'
    padding: str = "max_length"

    # Optimizer / schedule settings.
    weight_decay: float = 0.0
    adam_epsilon: float = 1e-8
    warmup_steps: int = 0
    learning_rate: float = 3e-4

    # Batch sizes and training length.
    train_batch_size: int = 48
    eval_batch_size: int = 48
    num_train_epochs: int = 5
    gradient_accumulation_steps: int = 16
    n_gpu: int = 1

    # if you want to enable 16-bit training then install apex and set this to true
    # (a trailing comma previously turned this value into the truthy tuple ``(False,)``)
    fp_16: bool = False
    # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    max_grad_norm: float = 0.5

config = Config()

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only seeded when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
set_seed(config.seed)

# Dataset & Dataloader

Now, we'll create a custom Dataset class inherited from the PyTorch Dataset class. We'll be using the **T5 tokenizer** that returns **input_ids** and **attention_mask**.<br><br>
The custom Dataset class will return a dict containing - <br>

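- `source_ids` - input_ids of the source text
- `source_mask` - attention_mask of the source text
- `target_ids` - input_ids of the target label
- `target_mask` - attention_mask of the target label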

In [None]:
class HateDetect(Dataset):
    """Map-style dataset of tokenized (text, label) pairs for one data split.

    The CSV of the requested split is read from ``config.data_folder``, the
    numeric labels are turned into the words the model has to generate
    (``0 -> "normal"``, ``1 -> "hate"``) and both the texts and the labels are
    tokenized once, up front.

    Args:
        config: Object providing ``data_folder`` and the tokenizer options
            (``src_max_length``, ``tgt_max_length``, ``add_special_tokens``,
            ``truncation``, ``return_tensors``, ``padding``).
        tokenizer: Callable tokenizer applied to the list of texts / labels.
        part: Split name, one of ``"train"``, ``"test"`` or ``"val"``.

    Raises:
        ValueError: If ``part`` is not one of the supported split names.
    """

    # CSV file of each supported split, appended to ``config.data_folder``.
    _SPLIT_FILES = {
        'train': "/dataset_prep_train.csv",
        'test': "/dataset_prep_test.csv",
        'val': "/dataset_prep_val.csv",
    }

    def __init__(self, config, tokenizer, part):
        self.config = config
        self.part = part
        self.tokenizer = tokenizer

        file_name = self._SPLIT_FILES.get(self.part)
        if file_name is None:
            raise ValueError("Invalid value for self.part")

        df = self._read_labelled_frame(config.data_folder + file_name)
        self.data = hgdataset.from_pandas(df, split=self.part)

        # Tokenize everything once; __getitem__ only indexes the results.
        self.dataset_scr, self.dataset_tgt = self.tokenize()

    @staticmethod
    def _read_labelled_frame(path):
        """Read one split CSV and replace the numeric labels by their names.

        The replaced column is assigned back explicitly: an in-place
        ``replace`` on ``df['label']`` operates on a column selection and is
        not guaranteed to update ``df`` itself.
        """
        df = pd.read_csv(path)
        df['label'] = df['label'].replace({0: "normal", 1: "hate"})
        return df

    def __len__(self):
        """Return the number of examples in the split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized source and target tensors of example ``idx``.

        Returns:
            dict with keys ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_mask``.
        """
        return {
            "source_ids": self.dataset_scr["input_ids"][idx].squeeze(),
            "source_mask": self.dataset_scr["attention_mask"][idx].squeeze(),
            "target_ids": self.dataset_tgt["input_ids"][idx].squeeze(),
            "target_mask": self.dataset_tgt["attention_mask"][idx].squeeze(),
        }

    def tokenize(self):
        """Tokenize the ``text`` and ``label`` columns of the split.

        Both calls share the same tokenizer options and differ only in the
        maximum length.

        Returns:
            Tuple ``(source_encoding, target_encoding)`` as returned by the
            tokenizer.
        """
        shared_options = {
            "add_special_tokens": self.config.add_special_tokens,
            "truncation": self.config.truncation,
            "return_tensors": self.config.return_tensors,
            "padding": self.config.padding,
        }
        dataset_scr = self.tokenizer(
            self.data['text'],
            max_length=self.config.src_max_length,
            **shared_options,
        )
        dataset_tgt = self.tokenizer(
            self.data['label'],
            max_length=self.config.tgt_max_length,
            **shared_options,
        )
        return dataset_scr, dataset_tgt

def get_dataset(config, tokenizer, part):
    """Build the ``HateDetect`` dataset of one split.

    Args:
        config: Configuration object forwarded to ``HateDetect``.
        tokenizer: Tokenizer forwarded to ``HateDetect``.
        part: Split name forwarded to ``HateDetect``.

    Returns:
        The newly created ``HateDetect`` instance.
    """
    dataset = HateDetect(config, tokenizer, part)
    return dataset

In [None]:
get_dataset

In [None]:
logger = logging.getLogger(__name__)

class DeviceCallback(pl.Callback):
    """Sanity check that the module's parameters live on a CUDA device."""

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
        """Assert before every training batch that the model is on the GPU.

        The check is attached to ``on_train_batch_start`` because the generic
        ``on_batch_start`` hook is not part of the Lightning 2.x callback API,
        so a method with that name is never invoked by the trainer.

        Raises:
            AssertionError: If the first parameter is not on a ``cuda`` device.
        """
        device_type = next(pl_module.parameters()).device.type
        assert device_type == "cuda", f"model is on '{device_type}', expected 'cuda'"

class LoggingCallback(pl.Callback):
    """Write the trainer's callback metrics to the module logger after validation."""

    def on_validation_end(self, trainer, pl_module):
        """Log every entry of ``trainer.callback_metrics``, sorted by key.

        Only the global-rank-zero process writes the metrics. The trainer's
        ``is_global_zero`` flag is used for that decision because the
        ``T5FineTuner`` module defines no ``is_logger`` method.
        """
        logger.info("***** Validation results *****")
        if not trainer.is_global_zero:
            return

        callback_metrics = trainer.callback_metrics
        # Log results
        for key in sorted(callback_metrics):
            if key in ("log", "progress_bar"):
                continue
            logger.info("%s = %s\n", key, callback_metrics[key])

In [None]:
    def on_test_end(self, trainer, pl_module):
        """Log every entry of ``trainer.callback_metrics`` once testing ends.

        Only the global-rank-zero process writes the metrics; the trainer's
        ``is_global_zero`` flag is used because the module defines no
        ``is_logger`` method. The banner names the test phase (it previously
        repeated the validation banner).
        """
        logger.info("***** Test results *****")
        if not trainer.is_global_zero:
            return

        callback_metrics = trainer.callback_metrics
        # Log results
        for key in sorted(callback_metrics):
            if key in ("log", "progress_bar"):
                continue
            logger.info("%s = %s\n", key, callback_metrics[key])

In [None]:
callbacks=[DeviceCallback()]

In [None]:
# Keep only the best checkpoint, ranked by validation accuracy (higher is
# better). The monitored key has to be a metric the module actually logs:
# T5FineTuner logs "acc_valid_epoch" at the end of each validation epoch,
# whereas "validation_epoch_average" is never logged.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=config.output_dir,
    monitor="acc_valid_epoch",
    mode="max",
    save_top_k=1,
)

#dc = pl.callbacks.DeviceStatsMonitor(cpu_stats=True)

# Keyword arguments forwarded to ``pl.Trainer``.
# NOTE(review): checkpoint_callback is not part of these arguments, so it only
# takes effect once it is passed to the trainer through ``callbacks=[...]``.
train_params = dict(
    devices=config.n_gpu,
    strategy="auto",
    accelerator="gpu",
    max_epochs=config.num_train_epochs-2,
    precision="16-mixed",
    gradient_clip_val=config.max_grad_norm,
)

In [None]:
LoggingCallback(),checkpoint_callback,    callbacks=[checkpoint_callback]

# Model

Coming to the most interesting part - the model architecture! We'll create a class named **T5FineTuner**, inherited from **pytorch_lightning.LightningModule**, which wraps the pretrained T5 model.<br><br>

### Flow
- We initialize our pretrained T5 model with a Conditional Generation Head.
- Pass in the src & tgt, input_ids & attention_mask.
- The model returns the decoder generated output ids (predicted labels in textual format), which we need to decode further using the tokenizer.

In [None]:
gc.collect()

In [None]:
rm -r /kaggle/working/lightning_logs

In [None]:
torch.cuda.empty_cache() 

In [None]:
%memit

In [None]:
%load_ext memory_profiler

%memit

In [None]:
from torchmetrics import Metric

class MyAccuracy(Metric):
    """Element-wise accuracy accumulated over successive ``update`` calls.

    Two states are kept and summed across processes: the number of matching
    elements (``correct``) and the number of compared elements (``total``).
    """

    # Declared on the class so it is visible as a metric attribute; as a local
    # variable inside ``__init__`` the assignment had no effect.
    higher_is_better = True

    def __init__(self):
        super().__init__()
        self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, preds, target):
        """Accumulate the matches between ``preds`` and ``target``.

        Args:
            preds: Tensor of predicted values.
            target: Tensor of reference values with the same shape as ``preds``.

        Raises:
            AssertionError: If the two tensors have different shapes.
        """
        assert preds.shape == target.shape

        self.correct += torch.sum(preds == target)
        self.total += target.numel()

    def compute(self):
        """Return ``correct / total`` as a floating point tensor."""
        return self.correct.float() / self.total

In [None]:
class T5FineTuner(pl.LightningModule):
    """Lightning module that fine-tunes a pretrained T5 model to generate labels.

    Training minimizes the sequence-to-sequence loss returned by the model.
    Validation generates a label for every source text and compares the first
    generated token with the first target token.

    Args:
        config: Configuration object providing the checkpoint name, the
            optimizer settings, the batch sizes and the dataset options.
    """

    def __init__(self, config):
        super().__init__()
        # Release as much host / GPU memory as possible before loading the model.
        gc.collect()
        torch.cuda.empty_cache()
        self.config = config
        self.model = T5ForConditionalGeneration.from_pretrained(config.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(config.model_name_or_path)
        self.valid_acc = MyAccuracy()
        # Per-batch values collected during an epoch and cleared at its end.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        """Forward all arguments to the wrapped T5 model and return its output."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        """Return the model loss for one batch.

        The labels are built on a copy of ``batch["target_ids"]`` so the batch
        itself is left untouched; padding positions are set to -100, the label
        value the loss ignores.
        """
        target_ids = batch["target_ids"]
        lm_labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)
        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=lm_labels,
            decoder_attention_mask=batch['target_mask'],
        )
        # With labels supplied, the first element of the model output is the loss.
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute the training loss and remember its value for the epoch average."""
        loss = self._step(batch)
        self.training_step_outputs.append(loss.item())
        return loss

    def on_train_epoch_end(self):
        """Log the mean training loss of the epoch and clear the accumulator."""
        self.log("training_epoch_average", np.mean(self.training_step_outputs), sync_dist=True, prog_bar=True, logger=True, on_epoch=True)
        self.training_step_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        """Generate a label per example and update the validation accuracy."""
        pred_ids = self.model.generate(
            input_ids=batch['source_ids'],
            attention_mask=batch['source_mask'],
            max_length=2,
        )
        # Column 0 of the generated ids is the decoder start token, so the
        # predicted label token sits in column 1; the target ids carry no start
        # token, so their label token is in column 0.
        self.valid_acc.update(pred_ids[:, 1].flatten(), batch['target_ids'][:, 0].flatten())

    def on_validation_epoch_end(self):
        """Log the accuracy of the finished validation epoch and reset the metric."""
        self.log("acc_valid_epoch", self.valid_acc.compute(), sync_dist=True, prog_bar=True, logger=True, on_epoch=True)
        # The computed value (not the metric object) is logged, so nothing
        # resets the metric automatically; without this the counts would keep
        # accumulating across epochs.
        self.valid_acc.reset()

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)."""
        # Parameters whose name contains one of these fragments get no weight
        # decay. "layer_norm.weight" covers the norm layers of the T5
        # implementation in transformers ("layer_norm" / "final_layer_norm").
        no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]

        decayed, not_decayed = [], []
        # Use the wrapped model of this instance, not a global variable.
        for name, param in self.model.named_parameters():
            if any(fragment in name for fragment in no_decay):
                not_decayed.append(param)
            else:
                decayed.append(param)

        optimizer_grouped_parameters = [
            {"params": decayed, "weight_decay": self.config.weight_decay},
            {"params": not_decayed, "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.config.learning_rate, eps=self.config.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )

        # The schedule length is expressed in optimizer steps, so the scheduler
        # has to advance after every step rather than once per epoch.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1},
        }

    def train_dataloader(self):
        """Return the shuffled DataLoader over the "train" split."""
        return DataLoader(
            get_dataset(config=self.config, tokenizer=self.tokenizer, part="train"),
            batch_size=self.config.train_batch_size,
            drop_last=True,
            shuffle=True,
            num_workers=0,
        )

    def val_dataloader(self):
        """Return the DataLoader over the "val" split."""
        # NOTE(review): drop_last=True leaves the final partial batch out of
        # the validation accuracy - confirm this is intended.
        return DataLoader(
            get_dataset(config=self.config, tokenizer=self.tokenizer, part="val"),
            batch_size=self.config.eval_batch_size,
            drop_last=True,
            num_workers=0,
        )


In [None]:
def test_dataloader(self):
    """Return the DataLoader over the "test" split.

    Batches have ``self.config.eval_batch_size`` examples; the last incomplete
    batch is dropped and one worker process loads the data.
    """
    return DataLoader(
        get_dataset(config=self.config, tokenizer=self.tokenizer, part="test"),
        batch_size=self.config.eval_batch_size,
        drop_last=True,
        num_workers=1,
    )

In [None]:
def test_step(self, batch, batch_idx):
    """Compute the loss of one test batch and store it for the epoch average.

    Args:
        batch: Batch forwarded to ``self._step``.
        batch_idx: Index of the batch (unused).

    Returns:
        The loss returned by ``self._step``.
    """
    loss = self._step(batch)
    # Nothing visible creates the accumulator beforehand, so create it on
    # first use instead of failing with an AttributeError.
    if not hasattr(self, "test_step_outputs"):
        self.test_step_outputs = []
    self.test_step_outputs.append(loss)
    return loss

def on_test_epoch_end(self):
    """Log the mean of the collected test losses, then empty the accumulator."""
    losses = self.test_step_outputs
    log_options = dict(sync_dist=True, prog_bar=True, logger=True, on_epoch=True)
    self.log("test_epoch_average", torch.stack(losses).mean(), **log_options)
    # ``losses`` is the same list object as ``self.test_step_outputs``.
    losses.clear()  # free memory


In [None]:
        #self.outputsf1.append(outs.detach().tolist())
        #self.targetsf1.append(target.detach().tolist())

In [None]:
#target_binary = mlb.fit_transform(self.targetsf1)
        #output_binary = mlb.transform(self.outputsf1)
        
        
        #out_flat = [pair[0] for sublist in self.outputsf1 for pair in sublist]
        #target_flat = [pair[0] for sublist in self.targetsf1 for pair in sublist]
        #targets = [ids[0] for batch in self.targetsf1 for ids in batch]
        #outputs = [ids[0] for batch in self.outputsf1 for ids in batch]
        
        #f1 = f1_score(targets,outputs,average='macro')
        #self.log("validation_f1_step",float(0.5) ,  sync_dist=True, prog_bar=True, logger=True, on_epoch=True)

In [33]:
model = T5FineTuner(config)

In [35]:
trainer = pl.Trainer(**train_params)

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [36]:
trainer.fit(model)

INFO: Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=3` reached.


In [37]:
tokenizer = T5Tokenizer.from_pretrained(config.model_name_or_path)

In [45]:
data_ = HateDetect(config,tokenizer=tokenizer, part="val")
loader = DataLoader(data_, batch_size=config.eval_batch_size,drop_last=True, num_workers=2)

In [None]:
del batch

In [40]:
import glob

# Define the path pattern
path_pattern = '/kaggle/working/lightning_logs/version_*/checkpoints/*'

# Use glob to list files matching the pattern
file_list = glob.glob(path_pattern)

# Print the list of matching files
for file_path in file_list:
    print(file_path)


/kaggle/working/lightning_logs/version_2/checkpoints/epoch=2-step=468.ckpt
/kaggle/working/lightning_logs/version_0/checkpoints/epoch=0-step=156.ckpt
/kaggle/working/lightning_logs/version_1/checkpoints/epoch=2-step=596.ckpt


In [46]:
path = "/kaggle/working/lightning_logs/version_1/checkpoints/epoch=2-step=596.ckpt"
model_test = T5FineTuner.load_from_checkpoint(path,config=config)

# Put the model on one explicit device and send the inputs to that same device,
# instead of assuming the restored weights already live on the GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_test.to(device)

# disable randomness, dropout, etc...
model_test.eval()

# Decoded predictions and reference labels of every evaluated example, in
# loader order.
outputs = []
targets = []
with torch.no_grad():
    for batch in tqdm(loader):
        outs = model_test.model.generate(input_ids=batch['source_ids'].to(device),
                                  attention_mask=batch['source_mask'].to(device),
                                  max_length=2)

        # Token ids 0 and 1 are dropped before decoding (presumably the pad and
        # end-of-sequence ids of this tokenizer - TODO confirm).
        dec = [tokenizer.decode(ids[ids > 1 ]) for ids in outs]
        target = [tokenizer.decode((ids[ids > 1 ])) for ids in batch["target_ids"]]

        outputs.extend(dec)
        targets.extend(target)

  0%|          | 0/52 [00:00<?, ?it/s]

In [48]:
print(metrics.classification_report(targets, outputs))

              precision    recall  f1-score   support

        hate       0.87      0.70      0.78      1068
      normal       0.80      0.92      0.86      1428

    accuracy                           0.83      2496
   macro avg       0.84      0.81      0.82      2496
weighted avg       0.83      0.83      0.82      2496



In [None]:
import textwrap

In [None]:
# Show the first evaluated examples next to their reference and predicted label.
# The texts come from the dataset behind ``loader``, which iterates in dataset
# order, so index i of ``texts``, ``targets`` and ``outputs`` is the same example.
# ``outputs`` is used rather than ``dec``, which only holds the last batch.
texts = data_.data["text"]
for i in range(min(32, len(outputs))):
    lines = textwrap.wrap("Review:\n%s\n" % texts[i], width=100)
    print("\n".join(lines))
    print("\nActual sentiment: %s" % targets[i])
    print("Predicted sentiment: %s" % outputs[i])
    print("=====================================================================\n")