# Finetuning paraphrasing LLM

In [26]:
!pip install datasets transformers torch evaluate nltk rouge_score accelerate

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [8]:
!wget https://github.com/IGragon/TextDetoxification/raw/master/data/interm/high_low_tox.tsv

--2023-11-01 10:56:58--  https://github.com/IGragon/TextDetoxification/raw/master/data/interm/high_low_tox.tsv
Resolving github.com (github.com)... 20.29.134.23
Connecting to github.com (github.com)|20.29.134.23|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/IGragon/TextDetoxification/master/data/interm/high_low_tox.tsv [following]
--2023-11-01 10:56:58--  https://media.githubusercontent.com/media/IGragon/TextDetoxification/master/data/interm/high_low_tox.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 120972363 (115M) [text/plain]
Saving to: ‘high_low_tox.tsv’


2023-11-01 10:56:59 (384 MB/s) - ‘high_low_tox.tsv’ saved [120972363/120972363]



In [1]:
import nltk
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import pandas as pd
import evaluate

import torch
import random
import os

def set_random_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

set_random_seed()

## Prepare data
Load processed high_low_tox.tsv dataset from our github

### Load .tsv file as datasets' dataset

In [35]:
dataset_path = "/content/high_low_tox.tsv"

dataframe = pd.read_csv(dataset_path, sep='\t')
dataframe = dataframe.sort_values('tox_diff', ascending=False)
dataframe.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,similarity,lenght_diff,tox_diff,tox_low,tox_high,score_low,score_high
513596,513596,513596,0.76959,0.290909,0.999681,"""meaningful relationship"" with Stella.","with his stupid ""meaningful"" relationship with...",3.9e-05,0.99972
155243,155243,155243,0.792757,0.142857,0.999678,That's the day we started our own business and...,we started our own firm that day and said good...,3.9e-05,0.999718
506123,506123,506123,0.633159,0.039216,0.999677,"These are the types of station, there are those.","those idiots from the station, I'm sure it's t...",3.9e-05,0.999716
336425,336425,336425,0.785826,0.206897,0.999677,"think about it, buddy.","Think about that shit, dawg.",4.3e-05,0.99972
429942,429942,429942,0.898997,0.162791,0.999677,"how do you like your game now, Tom?","How do you like your stupid game now, Tom?",4.3e-05,0.99972
481493,481493,481493,0.75943,0.05,0.999676,Why did we ever come to this country?,why did we come to this stupid country?,4.3e-05,0.999719
472539,472539,472539,0.737238,0.204545,0.999676,but this does not necessarily mean that the so...,That does not necessarily mean that the idiot’...,3.8e-05,0.999714
315254,315254,315254,0.676025,0.291667,0.999675,do they really think they're going to get a pl...,How dumb do these morons have to be to think t...,3.7e-05,0.999712
103089,103089,103089,0.859214,0.3,0.999675,you think you could have found something bigger?,Do you think you could have found something bi...,4.1e-05,0.999716
467697,467697,467697,0.653986,0.214286,0.999675,that's one of those guest lists.,That'sjust one of her stupid guest lists.,4.1e-05,0.999716


In [36]:
dataset = Dataset.from_pandas(dataframe[["tox_high", "tox_low"]].iloc[:20000]).shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['tox_high', 'tox_low', '__index_level_0__'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['tox_high', 'tox_low', '__index_level_0__'],
        num_rows: 4000
    })
})

### Tokenize dataset

In [37]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    model_inputs = tokenizer(examples["tox_high"], max_length=256, truncation=True)

    labels = tokenizer(text_target=examples["tox_low"], max_length=256, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [38]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [39]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['tox_high', 'tox_low', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['tox_high', 'tox_low', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4000
    })
})

## Getting ready for evaluation

In [40]:
# Setup evaluation
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

## Training step

In [41]:
# Load pretrained model and evaluate model after each epoch
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    fp16=True,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.927,1.708806,0.587761,0.37449,0.577746,0.577707
2,1.8262,1.639703,0.598893,0.387275,0.588332,0.588344
3,1.7704,1.60876,0.606865,0.395005,0.5965,0.596723
4,1.7388,1.591991,0.608973,0.396512,0.598427,0.598726
5,1.7347,1.587126,0.610845,0.398297,0.600378,0.600626




TrainOutput(global_step=5000, training_loss=1.8434371948242188, metrics={'train_runtime': 1640.9829, 'train_samples_per_second': 48.751, 'train_steps_per_second': 3.047, 'total_flos': 577951786598400.0, 'train_loss': 1.8434371948242188, 'epoch': 5.0})

In [42]:
trainer.evaluate()

{'eval_loss': 1.5871257781982422,
 'eval_rouge1': 0.6108447199300288,
 'eval_rouge2': 0.39829732130759843,
 'eval_rougeL': 0.6003777196361278,
 'eval_rougeLsum': 0.6006263857148544,
 'eval_runtime': 227.3171,
 'eval_samples_per_second': 17.597,
 'eval_steps_per_second': 4.399,
 'epoch': 5.0}

In [61]:
texts = ["I have orders to kill her",
         "You a fuckboy. You ain't even that cute.",
         "Imagine being such a piece of shit that your \"\"Nightmare Scenario\"\" is other people being able to afford to buy houses",
         "Elon Musk is a piece of shit, greedy capitalist who exploits workers, and offers nothing of real benefit to the world. All he’s done is make a name for himself on the backs of other people, using dirty money from his family’s emerald mine they acquired during apartheid. I don’t care that he’s autistic."]

encoded_texts = tokenizer(texts, return_tensors="pt", padding=True)
encoded_texts = {k: v.to(trainer.model.device) for k,v in encoded_texts.items()}
outputs = trainer.model.generate(encoded_texts["input_ids"])
for output in outputs:
  decoded_output = tokenizer.decode(output, skip_special_tokens=True)
  print(decoded_output)

I have orders to kill her.
you're not even that cute.
Imagine being so a piece of a slap that you're able to
Elon Musk is a savage, greedy capitalist who exploits workers


### Save model

In [62]:
from huggingface_hub import notebook_login

In [63]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [64]:
trainer.push_to_hub("IGragon/t5-small-detoxification")

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

'https://huggingface.co/IGragon/results/tree/main/'