## References

Based on https://github.com/s-nlp/detox/blob/main/emnlp2021/style_transfer/mining_parallel_corpus/finetune_t5_on_mined.ipynb

In [1]:
import pandas as pd
import numpy as np
import torch

from tqdm import tqdm, trange
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer, T5TokenizerFast,
    get_linear_schedule_with_warmup
)

from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("mps")
# device = torch.device("cuda")

# Dataset preprocessing

Firstly, we can notice that sometimes translated sentences in the dataset are more toxic than reference sentences.

If we don't want to confuse our model, we have to fix that

In [3]:
dataset_path = "../data/raw/filtered.tsv"

df = pd.read_csv(dataset_path, delimiter="\t")
df.head()

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [4]:
true_xs = []
true_ys = []

for i, row in df.iterrows():
    if row["trn_tox"] > row["ref_tox"]:
        true_xs.append(row["translation"])
        true_ys.append(row["reference"])
    else:
        true_xs.append(row["reference"])
        true_ys.append(row["translation"])

df = pd.DataFrame({"source": true_xs, "target": true_ys})

## Model

In [5]:
model_name = "ceshine/t5-paraphrase-paws-msrp-opinosis"
tokenizer = T5TokenizerFast.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
test_size = 100
df_train, df_test = train_test_split(df, test_size=test_size)

In [7]:
df_train.head()

Unnamed: 0,source,target
519216,Don't get the heart and the ass mixed up.,don't get your heart out!
321933,Bank robbers and gun nuts aren't typically sex...,bank robbers and armed shrinks aren't usually ...
345405,you smell like vindaloo.,Smells like vindaloo.
103260,Threats and intimidation won't save your ass t...,threats and intimidation will not save your ne...
184358,he's a fucking genius.,"Oh, he's a genius."


In [8]:
tokenized_train_xs = tokenizer(df_train.source.tolist(), truncation=True)
tokenized_train_ys = tokenizer(df_train.target.tolist(), truncation=True)

tokenized_test_xs = tokenizer(df_test.source.tolist(), truncation=True)
tokenized_test_ys = tokenizer(df_test.target.tolist(), truncation=True)

In [9]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __getitem__(self, idx):
        assert idx < len(self.inputs['input_ids'])
        
        item = { key: val[idx] for key, val in self.inputs.items() }

        item['decoder_attention_mask'] = self.targets['attention_mask'][idx]
        item['labels'] = self.targets['input_ids'][idx]

        return item
    
    @property
    def n(self):
        return len(self.inputs['input_ids'])

    def __len__(self):
        return self.n # * 2


In [10]:
train_dataset = MyDataset(tokenized_train_xs, tokenized_train_ys)
test_dataset = MyDataset(tokenized_test_xs, tokenized_test_ys)

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)
test_dataloader = DataLoader(test_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)

In [12]:
checkpoint_name = 'SkolkovoInstitute/t5-paraphrase-paws-msrp-opinosis-paranmt'
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [13]:
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple

class TrAr(TrainingArguments):
    @cached_property
    def _setup_devices(self):
        return device

In [14]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        ) 
        batch['labels'] = ybatch['input_ids']
        batch['decoder_attention_mask'] = ybatch['attention_mask']
        
        return {k: torch.tensor(v) for k, v in batch.items()}

In [15]:
save_name = '../models/trained/t5-detox-finetuned'

training_args = TrAr(
    output_dir=save_name,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,   # batch size per device during training
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    warmup_steps=300,                # number of warmup steps for learning rate scheduler
    weight_decay=0,                  # strength of weight decay
    learning_rate=3e-5,
    logging_dir='./logs',
    logging_steps=100,
    eval_steps=100,
    evaluation_strategy='steps',
    save_total_limit=1,
    save_steps=5000,
)


In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

NameError: name 'accelerate_version' is not defined

In [None]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
model.eval()

In [None]:
inputs = tokenizer('It is a shit day.', return_tensors='pt')
inputs = {k: v.to(device) for k, v in inputs.items()}

for t in model.generate(**inputs, num_return_sequences=10, do_sample=False, num_beams=10):
    print(tokenizer.decode(t, skip_special_tokens=True))