In [44]:
!pip install sentencepiece==0.1.97
!pip install transformers==4.27.2
!pip install pytorch_lightning==1.5.10



In [45]:
from sklearn.model_selection import train_test_split

from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers import AdamW
import pandas as pd
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

import warnings

# Fix the global random seed through Lightning so repeated runs are comparable.
pl.seed_everything(100)

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

In [46]:
# Only the first 5000 pairs are used. `nrows` stops parsing after that many
# rows instead of loading the whole file and slicing it afterwards, and it
# returns a standalone frame rather than a slice of a larger one.
data_raw = pd.read_csv("../data/raw/filtered.tsv", sep='\t', nrows=5000)
data_raw.head(3)

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068


In [47]:
# Orient every pair so that `reference` holds the more toxic sentence and
# `translation` the less toxic one: rows whose reference scores lower than
# their translation are swapped.
#
# The toxicity scores are swapped together with the texts. That keeps each
# score attached to its sentence and makes this cell idempotent: after one
# pass no row satisfies `ref_tox < trn_tox`, so re-running it is a no-op
# (swapping only the texts would flip every pair back on a second run).
needs_swap = data_raw["ref_tox"] < data_raw["trn_tox"]

# Snapshot the four columns first so each new column is built from the
# original values, not from a column that was already overwritten.
reference_old = data_raw["reference"].copy()
translation_old = data_raw["translation"].copy()
ref_tox_old = data_raw["ref_tox"].copy()
trn_tox_old = data_raw["trn_tox"].copy()

# Series.mask(cond, other) takes the value from `other` wherever cond is True.
data_raw["reference"] = reference_old.mask(needs_swap, translation_old)
data_raw["translation"] = translation_old.mask(needs_swap, reference_old)
data_raw["ref_tox"] = ref_tox_old.mask(needs_swap, trn_tox_old)
data_raw["trn_tox"] = trn_tox_old.mask(needs_swap, ref_tox_old)

In [48]:
# Keep only the sentence pair; the id and score columns are not used for training.
data = data_raw.drop(
    ['Unnamed: 0', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox'],
    axis="columns",
)

In [49]:
# Preview the first four sentence pairs.
data.iloc[:4]

Unnamed: 0,reference,translation
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t..."
1,you're becoming disgusting.,Now you're getting nasty.
2,"well, we can spare your life.","Well, we could spare your life, for one."
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it."


In [50]:
# Number of sentence pairs available for the train/validation split.
print("No of rows:", len(data))

No of rows: 5000


In [51]:
# Sequence limits and batch settings shared by the cells below.
INPUT_MAX_LEN = 128      # maximum number of tokens in an encoded input sentence
OUTPUT_MAX_LEN = 128     # maximum number of tokens in an encoded target sentence
TRAIN_BATCH_SIZE = 8     # batch size used by the training loader
VAL_BATCH_SIZE = 2       # batch size used by the validation loader
EPOCHS = 5               # number of epochs

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

In [52]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
MODEL_NAME = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(
    MODEL_NAME,
    model_max_length=512,
)


Example of how the T5 tokenizer actually works.

In [53]:
text = "How are you?"    # sample sentence to tokenize

input_tokenize = tokenizer(
    text,
    add_special_tokens=True,        # T5 has no [CLS]/[SEP]; this appends the end-of-sequence token </s>
    max_length=INPUT_MAX_LEN,       # same limit the training inputs use
    padding='max_length',           # pad up to max_length so every sequence has equal length
    truncation=True,                # cut the text off if it is longer than max_length
    return_attention_mask=True,     # also return the mask separating real tokens from padding
    return_tensors="pt"             # return PyTorch tensors
)

In [54]:
# Flatten the (1, max_length) encodings into plain vectors before printing.
flat_ids = input_tokenize['input_ids'].flatten()
flat_mask = input_tokenize['attention_mask'].flatten()

print("input_ids: ", flat_ids)
print("-----------------------------------------------------------------------------")
print("Attention Mask: ", flat_mask)

input_ids:  tensor([8774,    6,  149,   33,   25,  469,   58,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
-----------------------------------------------------------------------------
Attention Mask:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0

In [55]:
class T5Dataset:
    """Map-style dataset of (toxic, non-toxic) sentence pairs encoded for T5.

    Args:
        toxic: indexable collection of source (toxic) sentences.
        non_toxic: indexable collection of target (detoxified) sentences,
            aligned index-by-index with ``toxic``.

    Each item is a dict with the whitespace-normalised ``'toxic'`` and
    ``'non-toxic'`` strings, the encoder ``'input_ids'`` and
    ``'attention_mask'`` (1-D tensors of length ``INPUT_MAX_LEN``) and the
    ``'target'`` label ids (1-D tensor of length ``OUTPUT_MAX_LEN``).
    """

    def __init__(self, toxic, non_toxic):
        self.toxic = toxic
        self.non_toxic = non_toxic
        self.tokenizer = tokenizer
        self.input_max_len = INPUT_MAX_LEN
        self.output_max_len = OUTPUT_MAX_LEN

    def __len__(self):
        """Return the number of sentence pairs in the dataset."""
        return len(self.toxic)

    @staticmethod
    def _normalize(text):
        """Collapse every run of whitespace in ``text`` into a single space."""
        # Joining with ' ' (not '') keeps the word boundaries; joining with
        # the empty string would glue all words of the sentence together.
        return ' '.join(str(text).split())

    def _encode(self, text, max_length):
        """Tokenize ``text``, padded/truncated to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

    def __getitem__(self, item):
        """Return the encoded pair stored at index ``item``."""
        toxic = self._normalize(self.toxic[item])
        non_toxic = self._normalize(self.non_toxic[item])

        input_tokenize = self._encode(toxic, self.input_max_len)
        output_tokenize = self._encode(non_toxic, self.output_max_len)

        input_ids = input_tokenize["input_ids"].flatten()
        attention_mask = input_tokenize["attention_mask"].flatten()

        # Padding positions in the labels are set to -100, the index the
        # cross-entropy loss ignores, so the model is not trained to
        # predict padding.
        labels = output_tokenize['input_ids'].flatten()
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'toxic': toxic,
            'non-toxic': non_toxic,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target': labels
        }

# Data Loader

In [56]:
class T5DataLoad(pl.LightningDataModule):
    """Lightning data module that serves the training and validation pairs.

    Args:
        df_train: frame with ``reference`` and ``translation`` columns used
            for training.
        df_test: frame with the same columns used for validation.
    """

    def __init__(self, df_train, df_test):
        super().__init__()
        self.df_train, self.df_test = df_train, df_test
        self.tokenizer = tokenizer
        self.input_max_len, self.out_max_len = INPUT_MAX_LEN, OUTPUT_MAX_LEN

    def setup(self, stage=None):
        """Wrap both frames in ``T5Dataset`` objects (``stage`` is not used)."""
        train_frame, valid_frame = self.df_train, self.df_test

        self.train_data = T5Dataset(
            toxic=train_frame["reference"].values,
            non_toxic=train_frame["translation"].values,
        )
        self.valid_data = T5Dataset(
            toxic=valid_frame["reference"].values,
            non_toxic=valid_frame["translation"].values,
        )

    def train_dataloader(self):
        """Return the shuffled training loader."""
        loader_options = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=2)
        return torch.utils.data.DataLoader(self.train_data, **loader_options)

    def val_dataloader(self):
        """Return the validation loader (no shuffling requested)."""
        loader_options = dict(batch_size=VAL_BATCH_SIZE, num_workers=2)
        return torch.utils.data.DataLoader(self.valid_data, **loader_options)

# Building T5 Model

In [57]:
class T5Model(pl.LightningModule):
    """Lightning wrapper around a pretrained T5 conditional-generation model."""

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``loss`` is whatever the wrapped model reports for the given
        ``labels`` (with ``labels=None`` no targets are supplied).
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Compute the loss for ``batch`` and log it under ``log_name``."""
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["target"])
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch, logged as ``train_loss``."""
        return {'loss': self._shared_step(batch, "train_loss")}

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch, logged as ``val_loss``."""
        return {'val_loss': self._shared_step(batch, "val_loss")}

    def configure_optimizers(self):
        """Return the AdamW optimizer used for fine-tuning."""
        # `transformers.AdamW` is deprecated in favour of the PyTorch
        # implementation. `eps` and `weight_decay` are set explicitly to the
        # values the transformers class used by default, so the optimizer
        # settings stay the same after the switch.
        return torch.optim.AdamW(
            self.parameters(),
            lr=0.0001,
            eps=1e-6,
            weight_decay=0.0,
        )

# Final Training Step

In [None]:
def run(max_epochs=1):
    """Fine-tune T5 on an 80/20 split of ``data`` and checkpoint the best epoch.

    Args:
        max_epochs: number of training epochs. Defaults to 1; pass ``EPOCHS``
            to train for the number of epochs set in the configuration cell.

    The checkpoint with the lowest ``val_loss`` is written to
    ``t5_models/best-model.ckpt``.
    """
    df_train, df_test = train_test_split(data, test_size=0.2, random_state=100)

    # The Trainer calls `setup()` on the data module and moves the model to
    # the selected device itself, so neither is done by hand here.
    dataload = T5DataLoad(df_train, df_test)
    model = T5Model()

    # save_top_k=1 keeps a single file, so 'best-model.ckpt' always is the
    # best checkpoint of this run. With more than one kept checkpoint and a
    # fixed filename the extra files get a version suffix, and the
    # unsuffixed file is not necessarily the best one.
    # NOTE(review): a 'best-model.ckpt' left over from an earlier run likely
    # makes Lightning write 'best-model-v1.ckpt' instead - clear the
    # directory before re-training; confirm against the Lightning version.
    checkpoint = ModelCheckpoint(
        dirpath="t5_models",
        filename='best-model',
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )

    # Follow DEVICE instead of hard-coding the GPU, so the notebook also
    # runs on a machine without CUDA.
    use_gpu = DEVICE.type == "cuda"
    trainer = pl.Trainer(
        callbacks=[checkpoint],
        max_epochs=max_epochs,
        gpus=1 if use_gpu else 0,
        accelerator="gpu" if use_gpu else "cpu"
    )
    trainer.fit(model, dataload)
run()

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:
train_model = T5Model.load_from_checkpoint('t5_models/best-model.ckpt')
train_model.freeze()
# Run inference on the same device the rest of the notebook is configured for.
train_model.to(DEVICE)

def detoxicate(text):
    """Generate a detoxified rewrite of ``text`` with the fine-tuned model.

    Args:
        text: the sentence to rewrite.

    Returns:
        The decoded beam-search output as a single string.
    """
    inputs_encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=INPUT_MAX_LEN,
        padding='max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt"
    )

    # The tokenizer returns CPU tensors; place them wherever the model lives
    # so generation works on both CPU and GPU.
    model_device = train_model.device

    generate_ids = train_model.model.generate(
        input_ids=inputs_encoding["input_ids"].to(model_device),
        attention_mask=inputs_encoding["attention_mask"].to(model_device),
        max_length=OUTPUT_MAX_LEN,      # limit on the generated (output) sequence
        num_beams=4,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    preds = [
        tokenizer.decode(
            gen_id,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for gen_id in generate_ids
    ]

    return "".join(preds)


# Model Evaluation

In [None]:
# Try the model on a hand-written toxic sentence.
text = "Are you stupid or you blind?"
print("Original: ", text)

detoxified = detoxicate(text)
print("Detox: ", detoxified)

In [None]:
# Try the model on the second reference sentence of the dataset.
text = data["reference"].iloc[1]
print("Original:", text)

detoxified = detoxicate(text)
print("Detox: ", detoxified)