# Exploring the t5 model for text detoxification



In [None]:
# installing huggingface libraries for dataset, models and metrics
!pip install datasets transformers[sentencepiece] sacrebleu

!pip install numpy==1.24.3

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [None]:
# Necessary inputs
import warnings

from datasets import load_dataset, load_metric
import transformers
import datasets
import random
import pandas as pd
import numpy as np
from IPython.display import display, HTML

warnings.filterwarnings('ignore')

## Selecting the model
For the example purpose we select as model checkpoint the smallest transformer in T5 family - `t5_small`. Other pre-trained models can be found [here](https://huggingface.co/docs/transformers/model_doc/t5#:~:text=T5%20comes%20in%20different%20sizes%3A).

In [None]:
# selecting model checkpoint
model_checkpoint = "t5-small"

## Loading the dataset

In [None]:
# setting random seed for transformers library
transformers.set_seed(42)

# Load the WMT16 dataset
#raw_datasets = load_dataset("wmt16", "de-en")

# Load the BLUE metric
metric = load_metric("sacrebleu")

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [None]:
df = pd.read_csv("processed.tsv", sep ="\t")
df.head(5)

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,I've spent the last seven years with four smel...,I've spent the last seven years with four men.,0.95,0.12963,0.990042,4.1e-05
1,tell me something I don't fucking know.,Tell me something I don't know!,0.949999,0.2,0.971418,7.9e-05
2,you're such a fucking tough guy.,You're such a Tough guy.,0.949997,0.242424,0.977852,0.000238
3,someone tried to kill her with an injection of...,Someone tried to kill her by injecting the virus?,0.949996,0.137931,0.980873,0.10218
4,I could look at women's legs for hours.,I could look at a woman's legs for hours.,0.949996,0.047619,0.941838,0.006438


## Dataset
Downloaded from HuggingFace dataset is a `DatasetDict`. It contains keys `["train", "validation", "test"]` - which represents a dataset splits

In [None]:
#df.drop(columns =['lenght_diff', 'similarity', 'ref_tox', 'trn_tox'])

In [None]:
train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

In [None]:
train_dataset = datasets.Dataset.from_dict(train)
test_dataset = datasets.Dataset.from_dict(test)
validation_dataset = datasets.Dataset.from_dict(validate)

In [None]:
raw_datasets = datasets.DatasetDict({"train":train_dataset,"test":test_dataset, "validation": validation_dataset})

In [None]:
# samples from train dataset
raw_datasets["train"][:5]

{'reference': ["Every woman in California thinks... she's either too fat or too thin or too something.",
  "Listen, give me your dog, and I'll forget it. You're nuts.",
  'Without my rifle, I am useless.',
  'You guys are worse than NASA.',
  "I'm sorry I can't crap like Bono."],
 'translation': ["every woman in California thinks she's too fat or too skinny.",
  "listen, give me your dog, and I'll forget it.",
  '"I\'m useless without my rifle.',
  "you're worse than NASA.",
  "I'm sorry I don't like Bono."],
 'similarity': [0.91169195304,
  0.937651404656,
  0.949683975862,
  0.933995157371,
  0.906056473784],
 'lenght_diff': [0.2873563218390804,
  0.2203389830508474,
  0.03125,
  0.2,
  0.1470588235294117],
 'ref_tox': [0.9432735443115234,
  0.999486804008484,
  0.9850885272026062,
  0.8082457184791565,
  0.9785056114196776],
 'trn_tox': [0.4230548143386841,
  0.0024958848953247,
  0.2508155405521393,
  0.090722881257534,
  0.0002564493624959]}

In [None]:
fake_preds = ["hello there", "general kenobi", "Can I get an A"]
fake_labels = [["hello there"], ["general kenobi"], ['Can I get a C']]
metric.compute(predictions=fake_preds, references=fake_labels)

{'score': 45.59274666224604,
 'counts': [7, 4, 1, 0],
 'totals': [9, 6, 3, 2],
 'precisions': [77.77777777777777,
  66.66666666666667,
  33.333333333333336,
  25.0],
 'bp': 1.0,
 'sys_len': 9,
 'ref_len': 9}

## Preprocessing the data
As usual we will need to preprocess data and tokenize it before passing to model

In [None]:
from transformers import AutoTokenizer

# we will use autotokenizer for this purpose
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
tokenizer("Hello, this one sentence!")

{'input_ids': [8774, 6, 48, 80, 7142, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [None]:
# prefix for model input
prefix = "Detoxify this sentence:"

In [None]:
raw_datasets["train"][:5]

{'reference': ["Every woman in California thinks... she's either too fat or too thin or too something.",
  "Listen, give me your dog, and I'll forget it. You're nuts.",
  'Without my rifle, I am useless.',
  'You guys are worse than NASA.',
  "I'm sorry I can't crap like Bono."],
 'translation': ["every woman in California thinks she's too fat or too skinny.",
  "listen, give me your dog, and I'll forget it.",
  '"I\'m useless without my rifle.',
  "you're worse than NASA.",
  "I'm sorry I don't like Bono."],
 'similarity': [0.91169195304,
  0.937651404656,
  0.949683975862,
  0.933995157371,
  0.906056473784],
 'lenght_diff': [0.2873563218390804,
  0.2203389830508474,
  0.03125,
  0.2,
  0.1470588235294117],
 'ref_tox': [0.9432735443115234,
  0.999486804008484,
  0.9850885272026062,
  0.8082457184791565,
  0.9785056114196776],
 'trn_tox': [0.4230548143386841,
  0.0024958848953247,
  0.2508155405521393,
  0.090722881257534,
  0.0002564493624959]}

In [None]:
max_input_length = 128
max_target_length = 128
source_lang = "toxic"
target_lang = "detoxified"

def preprocess_function(examples):
    inputs = [prefix + ex for ex in examples["reference"]]
    targets = [ex for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# example of preprocessing
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[374, 235, 226, 4921, 48, 7142, 10, 18403, 2335, 16, 1826, 317, 7, 233, 255, 31, 7, 893, 396, 2886, 42, 396, 5551, 42, 396, 424, 5, 1], [374, 235, 226, 4921, 48, 7142, 10, 21310, 35, 6, 428, 140, 39, 1782, 6, 11, 27, 31, 195, 2612, 34, 5, 148, 31, 60, 11446, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[334, 2335, 16, 1826, 317, 7, 255, 31, 7, 396, 2886, 42, 396, 28451, 5, 1], [3011, 6, 428, 140, 39, 1782, 6, 11, 27, 31, 195, 2612, 34, 5, 1]]}

In [None]:
# for the example purpose we will crop the dataset and select first 5000 for train
# and 500 for validation and test
cropped_datasets = raw_datasets
tokenized_datasets = cropped_datasets.map(preprocess_function, batched=True)
tokenized_datasets['train'][0]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'reference': "Every woman in California thinks... she's either too fat or too thin or too something.",
 'translation': "every woman in California thinks she's too fat or too skinny.",
 'similarity': 0.91169195304,
 'lenght_diff': 0.2873563218390804,
 'ref_tox': 0.9432735443115234,
 'trn_tox': 0.4230548143386841,
 'input_ids': [374,
  235,
  226,
  4921,
  48,
  7142,
  10,
  18403,
  2335,
  16,
  1826,
  317,
  7,
  233,
  255,
  31,
  7,
  893,
  396,
  2886,
  42,
  396,
  5551,
  42,
  396,
  424,
  5,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [334,
  2335,
  16,
  1826,
  317,
  7,
  255,
  31,
  7,
  396,
  2886,
  42,
  396,
  28451,
  5,
  1]}

## Fine-tuning the model

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# create a model for the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# defining the parameters for training
batch_size = 32
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-detoxify",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    report_to='tensorboard',
)

In [None]:
# instead of writing collate_fn function we will use DataCollatorForSeq2Seq
# simliarly it implements the batch creation for training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import numpy as np

# simple postprocessing for text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

# compute metrics function to pass to trainer
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# instead of writing train loop we will use Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.6134,1.242498,39.5687,11.8084
2,1.3231,1.173971,40.9354,11.7992
3,1.2639,1.144733,41.3117,11.762
4,1.2274,1.12586,41.6391,11.7724
5,1.2036,1.114355,41.8065,11.7655
6,1.1812,1.105925,41.9682,11.7693
7,1.1746,1.099475,42.0469,11.7621
8,1.1658,1.095707,42.1469,11.7706
9,1.1631,1.093117,42.1351,11.7621
10,1.1566,1.092396,42.1248,11.7594


TrainOutput(global_step=9380, training_loss=1.2360559003947895, metrics={'train_runtime': 2134.6969, 'train_samples_per_second': 140.535, 'train_steps_per_second': 4.394, 'total_flos': 4069914065240064.0, 'train_loss': 1.2360559003947895, 'epoch': 10.0})

In [None]:
# saving model
trainer.save_model('best')

In [None]:
# loading the model and run inference for it
model = AutoModelForSeq2SeqLM.from_pretrained('best')
model.eval()
model.config.use_cache = False

In [None]:
def translate(model, inference_request, tokenizer=tokenizer):
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    outputs = model.generate(input_ids=input_ids)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True,temperature=0))

In [43]:
inference_request = prefix + 'You are a fucking piece of shit'
translate(model, inference_request,tokenizer)

you're a piece of shit.


In [45]:
inference_request = prefix + "kill yourself"
translate(model, inference_request,tokenizer)

Kill yourself


This is just because I'm using a google collab

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [None]:
!zip -r /content/best_t5.zip /content/best

  adding: content/best/ (stored 0%)
  adding: content/best/generation_config.json (deflated 29%)
  adding: content/best/pytorch_model.bin (deflated 9%)
  adding: content/best/tokenizer.json (deflated 74%)
  adding: content/best/tokenizer_config.json (deflated 95%)
  adding: content/best/config.json (deflated 62%)
  adding: content/best/special_tokens_map.json (deflated 86%)
  adding: content/best/spiece.model (deflated 48%)
  adding: content/best/training_args.bin (deflated 50%)
