# T5-base

For this model, I used [this notebook](https://www.kaggle.com/code/bunnyyy/t5-tuning-for-paraphrasing-questions/notebook) as a reference.

In [1]:
import os
import random
import argparse
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


## Setup

In [2]:
# Load the toxic / non-toxic sentence pairs; the first CSV column is used as the row index.
raw_pairs_path = "../data/raw/suitable.csv"
df = pd.read_csv(raw_pairs_path, index_col=0)
df.head()

Unnamed: 0,toxic,non-toxic
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.
13,"Come on, Cal, leave that shit alone.","come on, Cal, put it down."
22,"Real life starts the first time you fuck, kid.","boy, real life starts up first."
25,"Shit, this one I can't even pronounce.","gosh, I can't even pronounce this."


In [3]:
# Total number of rows available before any subsampling.
len(df)

349705

In [4]:
# One fixed seed drives both draws so the subsamples are reproducible.
seed = 177013

# 10,000 rows for training; validation is drawn only from rows that were not picked for training.
train = df.sample(n=10000, random_state=seed)
rows_outside_train = df.drop(index=train.index)
val = rows_outside_train.sample(n=1000, random_state=seed)
print(len(train), len(val))

10000 1000


In [5]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (plus every CUDA device when CUDA is available)."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(177013)

In [None]:
class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a pretrained T5 model on (source, target) text pairs.

    `hparams` is an attribute-style config object (e.g. `argparse.Namespace`). The
    attributes read by this class are: `model_name_or_path`, `tokenizer_name_or_path`,
    `weight_decay`, `learning_rate`, `adam_epsilon`, `warmup_steps`,
    `train_batch_size`, `eval_batch_size`, `num_train_epochs`,
    `gradient_accumulation_steps`, `n_gpu` and, optionally, `num_workers`.
    The whole object is also forwarded to the notebook-level `get_dataset`.

    NOTE(review): the hooks used here (`training_epoch_end`, `get_tqdm_dict`,
    `optimizer_step(..., second_order_closure)`, "log"/"progress_bar" result dicts)
    look like the API of early pytorch-lightning releases — confirm against the
    installed version before upgrading.
    """

    def __init__(self, hparams):
        import inspect  # stdlib; imported locally so this cell does not depend on the import cell

        super(T5FineTuner, self).__init__()
        self.hparams = hparams

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

        # transformers renamed T5's `lm_labels` argument to `labels`. Use whichever
        # name the installed model's forward() declares so both generations work.
        forward_params = inspect.signature(self.model.forward).parameters
        self._labels_kwarg = "labels" if "labels" in forward_params else "lm_labels"

    def is_logger(self):
        """Always reports that this process is not the logging process."""
        return False

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        """Run the wrapped T5 model and return its raw outputs.

        `lm_labels` keeps its original name for callers; it is passed to the model
        under the label keyword detected in `__init__`.
        """
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            **{self._labels_kwarg: lm_labels},
        )

    def _step(self, batch):
        """Compute the loss for one batch produced by the dataset's `__getitem__` dicts."""
        # Work on a copy so the batch's own `target_ids` tensor is left untouched.
        lm_labels = batch["target_ids"].clone()
        # Replace padding with -100, the label value the Hugging Face loss skips.
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        # With labels supplied, the first element of the model output is the loss.
        loss = outputs[0]

        return loss

    def training_step(self, batch, batch_idx):
        """Return the batch loss together with a `train_loss` log entry."""
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        """Average the per-step training losses collected over the epoch."""
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch."""
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses collected over the epoch."""
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay).

        Only the optimizer is created and returned here; it is also stored on
        `self.opt` because `train_dataloader` builds the LR scheduler from it.
        """
        model = self.model
        # Parameters whose name contains one of these substrings get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        """Apply one optimizer update, clear gradients and advance the LR schedule."""
        if self.trainer.use_tpu:
            # NOTE(review): `xm` is not imported in the visible cells; this branch presumably
            # expects `import torch_xla.core.xla_model as xm` and would raise NameError as written.
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        # `self.lr_scheduler` is created in train_dataloader().
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        """Progress-bar entries: the trainer's running average loss and the current learning rate."""
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def _num_workers(self):
        """DataLoader worker count: `hparams.num_workers` when set, otherwise the original value of 4.

        NOTE(review): worker processes in a notebook on Windows may be unable to load a
        dataset class defined in the notebook itself; set `num_workers=0` in the config if so.
        """
        return getattr(self.hparams, "num_workers", 4)

    def train_dataloader(self):
        """Build the shuffled training loader and, as a side effect, the linear LR scheduler.

        Relies on `configure_optimizers` having already set `self.opt`.
        """
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=self._num_workers())
        # Total optimizer updates: batches per epoch across devices, divided by the
        # accumulation factor, times the number of epochs.
        t_total = (
                (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                // self.hparams.gradient_accumulation_steps
                * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the (unshuffled) validation loader from the "valid" split."""
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="valid", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=self._num_workers())

In [12]:
class GrammerDataset(Dataset):
    """Eagerly tokenized source/target sentence pairs read from `<data_dir>/<type_path>.csv`.

    The CSV must contain a "toxic" column (model input) and a "non-toxic" column
    (expected output). Every row is tokenized once in the constructor; indexing
    returns a dict with `source_ids`, `source_mask`, `target_ids` and `target_mask`.
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=256):
        self.path = os.path.join(data_dir, type_path + '.csv')

        # Names of the input column and of the expected-output column.
        self.question = "toxic"
        self.target_column = "non-toxic"
        self.data = pd.read_csv(self.path)
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        encoded_source = self.inputs[index]
        encoded_target = self.targets[index]

        # Each encoding comes from a one-element batch; squeeze() drops that size-1 axis.
        return {
            "source_ids": encoded_source["input_ids"].squeeze(),
            "source_mask": encoded_source["attention_mask"].squeeze(),
            "target_ids": encoded_target["input_ids"].squeeze(),
            "target_mask": encoded_target["attention_mask"].squeeze(),
        }

    def _encode(self, text):
        """Tokenize one string as a single-item batch padded to `self.max_len`."""
        return self.tokenizer.batch_encode_plus(
            [text], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
        )

    def _build(self):
        """Tokenize every row and fill `self.inputs` / `self.targets` in row order."""
        sources = self.data[self.question]
        references = self.data[self.target_column]
        for source_text, reference_text in zip(sources, references):
            # Task prefix in front of the input; explicit end-of-sequence marker on both sides.
            prompt = "detoxify: " + source_text + ' </s>'
            expected = reference_text + " </s>"

            self.inputs.append(self._encode(prompt))
            self.targets.append(self._encode(expected))

In [18]:
def get_dataset(tokenizer, type_path, args):
    """Build the `GrammerDataset` for one split.

    `type_path` is the CSV base name (the notebook uses "train" and "valid");
    the data directory and maximum sequence length are taken from `args`.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "data_dir": args.data_dir,
        "type_path": type_path,
        "max_len": args.max_seq_length,
    }
    return GrammerDataset(**dataset_kwargs)

In [7]:
# Write both splits (without the index column) to the CSV files that
# get_dataset() later loads as the "train" and "valid" splits.
for split_frame, split_csv in ((train, '../data/raw/train.csv'), (val, '../data/raw/valid.csv')):
    split_frame.to_csv(split_csv, index=False)

In [6]:
# Load the stock `t5-base` tokenizer (its files are downloaded on first use).
# NOTE(review): this `tokenizer` variable is not referenced by any later visible cell —
# T5FineTuner loads its own tokenizer and the evaluation cells use `my_tokenizer`.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 2.73MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 3.73MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<?, ?B/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.mo

## Model training

In [11]:
# Paths and hyperparameters for fine-tuning; converted to an attribute-style
# namespace in the next cell.
args_dict = {
    # --- paths and pretrained weights ---
    "data_dir": '../data/raw',  # path for data files
    "output_dir": '../models/t5',  # path to save the checkpoints
    "model_name_or_path": 't5-base',
    "tokenizer_name_or_path": 't5-base',
    # --- optimisation ---
    "max_seq_length": 64,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 32,
    # --- hardware / trainer behaviour ---
    "n_gpu": 1,
    "early_stop_callback": False,
    "fp_16": False,  # if you want to enable 16-bit training then install apex and set this to true
    "opt_level": 'O1',  # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0,  # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "save_strategy": "no",
    "seed": 177013,
}

In [None]:
# Expose the config entries as attributes (args.learning_rate, args.data_dir, ...),
# which is how T5FineTuner and get_dataset read them.
args = argparse.Namespace(**args_dict)

In [17]:
# Keyword arguments for `pl.Trainer`, derived from `args` so the config cell
# stays the single place where training settings are edited.
# NOTE(review): `gpus`, `early_stop_callback`, `amp_level` and `checkpoint_callback`
# look like Trainer arguments from older pytorch-lightning releases — confirm
# against the installed version.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    early_stop_callback=args.early_stop_callback,  # previously a hard-coded False duplicating the config value
    precision=16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=False,  # no automatic checkpoints; the model is saved explicitly after training
    logger=False,
)

In [19]:
# Build the Lightning module (loads the pretrained T5 weights and tokenizer named in `args`).
print ("Initialize model")
model = T5FineTuner(args)

# Build the trainer from the keyword arguments prepared in the previous cell.
trainer = pl.Trainer(**train_params)

Initialize model


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


In [20]:
# Fine-tune; the module supplies its own train/validation dataloaders.
print(" Training model")
trainer.fit(model)

print("training finished")

print("Saving model")
# Save only the underlying Hugging Face model (not the Lightning wrapper) to the
# configured output directory. The directory is created first so a missing folder
# cannot fail the save after the whole training run has completed.
os.makedirs(args.output_dir, exist_ok=True)
model.model.save_pretrained(args.output_dir)

print("Saved model")

 Training model



  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


training finished
Saving model
Saved model


## Check model

In [8]:
# Reload the fine-tuned weights from disk and switch straight to inference mode
# (`.eval()` returns the module itself, so the call can be chained).
my_model = T5ForConditionalGeneration.from_pretrained("../models/t5").eval()
# NOTE(review): turns off the model's `use_cache` config flag — presumably the decoder
# key/value cache used during generation; confirm why it is disabled here.
my_model.config.use_cache = False

In [9]:
# Tokenizer for inference; loaded from the stock `t5-base` checkpoint, the same
# tokenizer name the training config uses.
my_tokenizer = T5Tokenizer.from_pretrained('t5-base')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
def generate(model, tokenizer, prompt, max_length=64):
    """Rewrite one sentence with the fine-tuned model and return the decoded text.

    Args:
        model: a model exposing `generate(input_ids=..., max_length=...)` and `.device`.
        tokenizer: the tokenizer matching `model`.
        prompt: the raw input sentence (the "detoxify: " task prefix is added here).
        max_length: token limit applied both when truncating the encoded prompt and
            when generating. The default of 64 matches the sequence length used for
            training in this notebook.

    Returns:
        The generated sentence with special tokens removed.
    """
    input_ = "detoxify: " + prompt + ' </s>'
    encoded = tokenizer.batch_encode_plus(
        [input_], max_length=max_length, truncation=True, return_tensors="pt"
    )
    # Keep the inputs on the same device as the model so this also works after model.to("cuda").
    input_ids = encoded.input_ids.to(model.device)
    # Pass the limit explicitly: without it generation stops at the library default
    # (20 tokens in Hugging Face's default generation settings), cutting long outputs short.
    outputs = model.generate(input_ids=input_ids, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Quick sanity check of the reloaded model on a single hand-written sentence.
generate(my_model, my_tokenizer, "last time I got drunk, I fucked the firemen.")

'last time I was drunk, I sucked firemen.'

In [12]:
# Hold-out sample: 1,000 rows drawn from everything not used for training or validation.
rows_unseen_in_training = df.drop(index=train.index).drop(index=val.index)
test = rows_unseen_in_training.sample(n=1000, random_state=seed)
print(len(test))

1000


In [13]:
# Peek at the first rows of the hold-out sample before running the model on it.
test.head()

Unnamed: 0,toxic,non-toxic
399522,I'll show you who's got bigger butts on this s...,I'll show you who's got The bigger bottom roun...
32832,"""I sit on the floor and pick my nose and think...","""I sit down and I pick my nose, and on my nose..."
126205,did you know I really thought this grease monk...,"You know, I thought that greaser really did tu..."
151854,"Just go, tyler, get the hell away from me.","go, Tyler, get away from me."
218096,"'You're putrid, you know that?' Danny said.","""you're cute, you know?"" Said Danny."


In [14]:
# Run the fine-tuned model on every toxic sentence of the hold-out sample, one sentence
# at a time, and store the rewrites next to the reference column.
test['generated'] = [generate(my_model, my_tokenizer, prompt) for prompt in test['toxic']]
test.head()



Unnamed: 0,toxic,non-toxic,generated
399522,I'll show you who's got bigger butts on this s...,I'll show you who's got The bigger bottom roun...,I'll show you who's bigger on this station!
32832,"""I sit on the floor and pick my nose and think...","""I sit down and I pick my nose, and on my nose...","""I sit on the floor and pick my nose and think..."
126205,did you know I really thought this grease monk...,"You know, I thought that greaser really did tu...",did you know I thought this grease monkey was ...
151854,"Just go, tyler, get the hell away from me.","go, Tyler, get away from me.","just go, tyler, get away from me."
218096,"'You're putrid, you know that?' Danny said.","""you're cute, you know?"" Said Danny.","""You know what?"" Danny asked."


## Save results

In [16]:
# Persist the hold-out sample together with the generated column. `index=False` is not
# passed, so the DataFrame index is written as the first CSV column.
test.to_csv("../data/interim/t5_pred.csv")