### [T5](https://github.com/google-research/text-to-text-transfer-transformer) 
- **Text-To-Text Transfer Transformer**
- A unified framework that converts every language problem into a text-to-text format.
- Achieves state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more.

### Multi Class vs Multi Label Classification
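- **Multi Class** - There are multiple categories, but each instance is assigned exactly one of them; such problems are known as multi-class classification problems.
- **Multi Label** - Each instance can be assigned several categories at the same time; such problems are known as multi-label classification problems.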

# Imports

The entire code is written using **PyTorch**.<br>
We'll be using the **transformers** library by [huggingface](https://github.com/huggingface/transformers) as they provide wrappers for multiple Transformer models.

In [1]:
# ! pip3 install transformers

In [2]:
#pip install pytorch-lightning --upgrade

In [3]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import re
import copy
from tqdm.notebook import tqdm
import gc
import random 
import torch
import logging 
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
import pytorch_lightning as pl
from torch.optim import AdamW

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    classification_report
)

from transformers import (
    T5Tokenizer, 
    T5Model,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
!mkdir logs

@dataclass
class Config:
    """Hyper-parameters and paths shared by the dataset, the model and the trainer.

    Every attribute is annotated so that ``@dataclass`` registers it as a
    field; the defaults below can therefore be overridden per instance,
    e.g. ``Config(seed=1, fp_16=False)``.
    """

    # --- reproducibility / locations ---
    seed: int = 203
    output_dir: str = './logs'
    model_name_or_path: str = 'UBC-NLP/AraT5v2-base-1024'

    # --- tokenizer arguments (forwarded as-is to the tokenizer) ---
    src_max_length: int = 160
    tgt_max_length: int = 2
    add_special_tokens: bool = True
    truncation: bool = True
    return_tensors: str = 'pt'
    padding: str = "max_length"

    # --- optimisation ---
    weight_decay: float = 0.0
    adam_epsilon: float = 1e-8
    warmup_steps: int = 0
    train_batch_size: int = 8
    eval_batch_size: int = 8
    num_train_epochs: int = 2
    gradient_accumulation_steps: int = 16
    n_gpu: int = 2
    # Set to True to enable 16-bit training. Must be a plain bool: a trailing
    # comma here would turn the value into a tuple, which is always truthy.
    fp_16: bool = True
    # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    max_grad_norm: float = 1.0
    learning_rate: float = 3e-4


config = Config()

In [5]:
def set_seed(seed):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    When CUDA is available, the generators of all CUDA devices are seeded
    as well.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Same order as always: stdlib, NumPy, then the PyTorch CPU generator.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Seed every random number generator once, using the seed stored in the config.
set_seed(config.seed)

# Dataset & Dataloader

Now, we'll create a custom dataset class. It implements `__len__` and `__getitem__`, so it can be passed directly to a PyTorch `DataLoader`. We'll be using the **T5 tokenizer**, which returns **input_ids** and **attention_mask**.<br><br>
The custom Dataset class will return a dict containing - <br>

- source_ids
- source_mask
- target_ids
- target_mask

In [6]:
import pandas as pd
from datasets import Dataset,load_dataset

class HateDetect():
    """Tokenized hate-speech split exposed through ``__len__`` / ``__getitem__``.

    The CSV file of the requested split is read, the numeric ``label`` column
    is turned into the words ``"normal"`` / ``"hate"``, and both the ``text``
    and ``label`` columns are tokenized once, up front. Individual items are
    then simple look-ups into the pre-tokenized tensors.

    Args:
        config: Object providing the tokenizer settings read in ``tokenize``
            (``src_max_length``, ``tgt_max_length``, ``add_special_tokens``,
            ``truncation``, ``return_tensors``, ``padding``).
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        part: One of ``'train'``, ``'test'`` or ``'val'``.

    Raises:
        ValueError: If ``part`` is not one of the supported split names.
    """

    # CSV file backing each supported split.
    DATA_PATHS = {
        'train': "/kaggle/input/hatespeechdza/dataset_prep_train.csv",
        'test': "/kaggle/input/hatespeechdza/dataset_prep_test.csv",
        'val': "/kaggle/input/hatespeechdza/dataset_prep_val.csv"
    }

    def __init__(self,config,tokenizer, part):

        self.config = config
        self.part = part
        self.tokenizer = tokenizer

        path = self.DATA_PATHS.get(self.part)
        if path is None:
            raise ValueError("Invalid value for self.part")

        df = pd.read_csv(path)
        # Assign the result back rather than calling
        # `df['label'].replace(..., inplace=True)`: an in-place edit on a
        # selected column is chained assignment, which pandas warns about and
        # which leaves `df` unchanged when Copy-on-Write is enabled.
        df['label'] = df['label'].replace({0: "normal", 1: "hate"})
        self.data = Dataset.from_pandas(df, split=self.part)

        self.dataset_scr, self.dataset_tgt = self.tokenize()

    def __len__(self):
        """Return the number of rows in the split."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return the tokenized source and target of row ``idx``.

        Returns:
            dict with the keys ``source_ids``, ``source_mask``,
            ``target_ids`` and ``target_mask``.
        """
        source_ids = self.dataset_scr["input_ids"][idx].squeeze()
        target_ids = self.dataset_tgt["input_ids"][idx].squeeze()

        src_mask = self.dataset_scr["attention_mask"][idx].squeeze()
        target_mask = self.dataset_tgt["attention_mask"][idx].squeeze()

        return {"source_ids": source_ids,
                "source_mask": src_mask,
                "target_ids": target_ids,
                "target_mask": target_mask}

    def tokenize(self):
        """Tokenize the ``text`` and ``label`` columns of the whole split.

        Source and target share every tokenizer argument except
        ``max_length``.

        Returns:
            Tuple ``(source_encoding, target_encoding)`` as returned by
            ``tokenizer.batch_encode_plus``.
        """
        shared_params = {
            "add_special_tokens": self.config.add_special_tokens,
            "truncation": self.config.truncation,
            "return_tensors": self.config.return_tensors,
            "padding": self.config.padding,
        }
        # Materialise the columns as plain lists so the tokenizer receives a
        # concrete sequence whatever container type the column access returns.
        dataset_scr = self.tokenizer.batch_encode_plus(
            list(self.data['text']),
            max_length=self.config.src_max_length,
            **shared_params,
        )
        dataset_tgt = self.tokenizer.batch_encode_plus(
            list(self.data['label']),
            max_length=self.config.tgt_max_length,
            **shared_params,
        )
        return dataset_scr,dataset_tgt

def get_dataset(config,tokenizer,part):
    """Build the ``HateDetect`` dataset for the split named by ``part``.

    Args:
        config: Configuration object handed to ``HateDetect``.
        tokenizer: Tokenizer handed to ``HateDetect``.
        part: Split name handed to ``HateDetect``.

    Returns:
        The newly constructed ``HateDetect`` instance.
    """
    dataset = HateDetect(config=config, tokenizer=tokenizer, part=part)
    return dataset

In [7]:
# Module-level logger used by the callbacks below.
logger = logging.getLogger(__name__)

class DeviceCallback(pl.Callback):
    """Sanity check that the module's parameters live on a CUDA device."""

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
        """Assert, before each training batch, that the model is on CUDA.

        The per-batch training hook is ``on_train_batch_start``; a method
        named ``on_batch_start`` is not a current Lightning hook, so the
        check would not run under that name.

        Raises:
            AssertionError: If the first parameter of ``pl_module`` is not on
                a CUDA device.
        """
        device_type = next(pl_module.parameters()).device.type
        assert device_type == "cuda", (
            "expected model parameters on 'cuda', found {!r}".format(device_type)
        )

class LoggingCallback(pl.Callback):
    """Log the trainer's callback metrics after validation and after testing.

    Both hooks only report when ``pl_module.is_logger()`` is true. After
    testing, the metrics are additionally written to ``test_results.txt``
    inside ``pl_module.config.output_dir``.
    """

    # Metric keys that are never reported.
    _SKIPPED_KEYS = ("log", "progress_bar")

    def on_validation_end(self, trainer, pl_module):
        """Log every callback metric (sorted by key) to the module logger."""
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in self._SKIPPED_KEYS:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        """Log every callback metric and save them to ``test_results.txt``."""
        # Imported here because the file's import cell does not import `os`;
        # without it the path join below raises NameError.
        import os

        logger.info("***** Test results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log and save results to file
            output_test_results_file = os.path.join(pl_module.config.output_dir, "test_results.txt")
            with open(output_test_results_file, "w", encoding="utf-8") as writer:
                for key in sorted(metrics):
                    if key not in self._SKIPPED_KEYS:
                        line = "{} = {}\n".format(key, str(metrics[key]))
                        logger.info(line)
                        writer.write(line)

In [8]:
# Keep only the checkpoint with the lowest validation loss. The monitored key
# has to be a metric that the LightningModule really logs: T5FineTuner logs its
# epoch-level validation loss under "validation_epoch_average" (there is no
# "val_loss" metric, so monitoring that name cannot find a value).
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=config.output_dir, monitor="validation_epoch_average", mode="min", save_top_k=1
)

# Keyword arguments for pl.Trainer.
train_params = dict(
    devices="auto",
    strategy="auto",
    accumulate_grad_batches=config.gradient_accumulation_steps,
    accelerator="gpu",
    max_epochs=config.num_train_epochs,
    # '32-true' is the full-precision setting; '32bfloat' is not a valid value.
    precision='16-mixed' if config.fp_16 else '32-true',
    gradient_clip_val=config.max_grad_norm,
    callbacks=[LoggingCallback(), checkpoint_callback, DeviceCallback()],
)

# Model

Coming to the most interesting part - the model architecture! We'll create a class named **T5FineTuner**, inherited from **pytorch_lightning.LightningModule**.<br><br>

### Flow
- We initialize our pretrained T5 model with a Conditional Generation Head.
- Pass in the src & tgt, input_ids & attention_mask.
- The model returns the decoder generated output ids (predicted labels in textual format), which we need to decode further using the tokenizer.

In [25]:
class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 conditional-generation model.

    The module owns the pretrained model, its tokenizer, the train / val /
    test dataloaders and the optimizer with its learning-rate schedule, so
    ``trainer.fit(model)`` needs no further arguments.

    Args:
        config: Configuration object; the attributes read here are
            ``model_name_or_path``, ``weight_decay``, ``learning_rate``,
            ``adam_epsilon``, ``warmup_steps``, ``train_batch_size`` and
            ``eval_batch_size`` (the dataset reads further attributes).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = T5ForConditionalGeneration.from_pretrained(config.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(config.model_name_or_path)
        # Per-step losses of the running epoch; each list is averaged and
        # cleared in the matching on_*_epoch_end hook.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []

    def is_logger(self):
        """Return True when the trainer's global rank is 0 or lower."""
        return self.trainer.global_rank <= 0

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
        ):
        """Run the wrapped T5 model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
            )

    def _step(self, batch):
        """Compute the loss of one batch.

        Args:
            batch: dict with the keys ``source_ids``, ``source_mask``,
                ``target_ids`` and ``target_mask``.

        Returns:
            The first element of the model output (the loss, since labels
            are passed).
        """
        # Mask a copy so the caller's batch tensors are left untouched.
        lm_labels = batch["target_ids"].clone()
        # Padding positions are set to -100, the label value that the
        # HuggingFace loss computation ignores.
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=lm_labels,
            decoder_attention_mask=batch['target_mask'],
        )
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Return the training loss and remember it for the epoch average."""
        loss = self._step(batch)
        # Store a detached copy: only the value is needed for the average,
        # and the stored tensor should not keep the autograd graph alive.
        self.training_step_outputs.append(loss.detach())
        return loss

    def on_train_epoch_end(self):
        """Log the mean training loss of the epoch."""
        epoch_average = torch.stack(self.training_step_outputs).mean()
        self.log("training_epoch_average", epoch_average)
        self.training_step_outputs.clear()  # free memory

    def test_step(self, batch, batch_idx):
        """Return the test loss and remember it for the epoch average."""
        loss = self._step(batch)
        self.test_step_outputs.append(loss.detach())
        return loss

    def on_test_epoch_end(self):
        """Log the mean test loss of the epoch."""
        epoch_average = torch.stack(self.test_step_outputs).mean()
        self.log("test_epoch_average", epoch_average)
        self.test_step_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        """Return the validation loss and remember it for the epoch average."""
        loss = self._step(batch)
        self.validation_step_outputs.append(loss.detach())
        return loss

    def on_validation_epoch_end(self):
        """Log the mean validation loss of the epoch."""
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay).

        The scheduler is handed to Lightning with ``interval="step"``, so
        Lightning advances it after every optimizer step; no custom
        ``optimizer_step`` override is needed for that.

        Returns:
            dict with the optimizer and the step-wise scheduler config.
        """
        # Parameters whose name contains one of these fragments get no weight
        # decay. "layer_norm.weight" is included because T5 names its
        # normalisation layers `layer_norm` / `final_layer_norm`.
        no_decay = ("bias", "LayerNorm.weight", "layer_norm.weight")
        decay_params, no_decay_params = [], []
        for name, param in self.model.named_parameters():
            if any(fragment in name for fragment in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer = AdamW(
            [
                {"params": decay_params, "weight_decay": self.config.weight_decay},
                {"params": no_decay_params, "weight_decay": 0.0},
            ],
            lr=self.config.learning_rate,
            eps=self.config.adam_epsilon,
        )
        # Let the trainer work out the number of optimizer steps (it knows the
        # device count, gradient accumulation and epoch count) instead of
        # deriving it from a hard-coded GPU count.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        # Kept as attributes for code that reads `self.opt` / `self.lr_scheduler`.
        self.opt = optimizer
        self.lr_scheduler = scheduler
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1},
        }

    def get_tqdm_dict(self):
        """Return the average loss and last learning rate for a progress bar.

        NOTE(review): reads ``self.trainer.avg_loss`` and is not called
        anywhere in this file - confirm it is still needed.
        """
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the shuffled training dataloader (incomplete last batch dropped)."""
        train_dataset = get_dataset(config=self.config, tokenizer=self.tokenizer, part="train")
        return DataLoader(
            train_dataset,
            batch_size=self.config.train_batch_size,
            drop_last=True,
            shuffle=True,
            num_workers=2,
            pin_memory=True,
        )

    def val_dataloader(self):
        """Build the validation dataloader."""
        val_dataset = get_dataset(config=self.config, tokenizer=self.tokenizer, part="val")
        return DataLoader(val_dataset, batch_size=self.config.eval_batch_size, num_workers=2, pin_memory=True)

    def test_dataloader(self):
        """Build the test dataloader."""
        test_dataset = get_dataset(config=self.config, tokenizer=self.tokenizer, part="test")
        return DataLoader(test_dataset, batch_size=self.config.eval_batch_size, num_workers=2, pin_memory=True)

In [26]:
# Build the fine-tuning module from the shared configuration.
model = T5FineTuner(config)

In [11]:
# Create the Lightning trainer from the keyword arguments collected in train_params.
trainer = pl.Trainer(**train_params)

In [27]:
# Start training; no dataloaders are passed because the module defines its own.
trainer.fit(model)