In [14]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [15]:
data = pd.read_csv("training_data.csv", index_col=0)

In [16]:
data.rename(columns={"clean_title":"input", "inverse_title":"output"}, inplace=True)

In [17]:
data

Unnamed: 0,input,output
1,the best soundtrack ever to anything,the worst soundtrack ever to anything
7,glorious story,inglorious story
10,the worst,the best
14,awful beyond belief,nice beyond belief
16,a romantic zen baseball comedy,a classicist zen baseball tragedy
...,...,...
3599772,a sweet scent,a sour scent
3599778,useful for everything boat related,useless for everything boat related
3599781,we love tyler,we hate tyler
3599789,looks nice low functionality,looks nice high functionality


In [26]:
hf_data = Dataset.from_pandas(data)


In [27]:
hf_data

Dataset({
    features: ['input', 'output', '__index_level_0__'],
    num_rows: 1145379
})

In [28]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [29]:
train_dataset, val_dataset = hf_data.train_test_split(test_size=0.2).values()

In [30]:
train_dataset, _ = train_dataset.train_test_split(train_size=0.3).values()

In [31]:
def tokenize_function(examples):
    inputs = examples["input"]
    targets = examples["output"]
    
    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=72, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=72, truncation=True, padding="max_length")
    
    # Add labels (labels for training)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [32]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/274890 [00:00<?, ? examples/s]

Map: 100%|██████████| 274890/274890 [00:33<00:00, 8315.54 examples/s]
Map: 100%|██████████| 229076/229076 [00:26<00:00, 8518.88 examples/s]


In [33]:
training_args = TrainingArguments(
    output_dir="./results",               # Output directory for model checkpoints
    eval_strategy="steps",               # Evaluate every X steps
    eval_steps=50000,                    # Evaluate after every 50k training steps
    save_steps=50000,                    # Save checkpoint every 50k steps (aligned with eval_steps)
    logging_dir="./logs",                # Directory for logs
    logging_steps=5000,                  # Log every 5k steps
    learning_rate=3e-5,                  # Learning rate
    per_device_train_batch_size=32,      # Batch size for training
    per_device_eval_batch_size=32,       # Batch size for evaluation
    gradient_accumulation_steps=1,       # No gradient accumulation needed
    num_train_epochs=1,                  # Number of epochs
    weight_decay=0.01,                   # Weight decay for optimization
    save_total_limit=2,                  # Limit the number of saved checkpoints
    fp16=True,                           # Enable FP16 for mixed precision
    load_best_model_at_end=True,         # Load the best model at the end of training
    metric_for_best_model="eval_loss",   # Metric to determine the best model
    greater_is_better=False,             # Lower eval loss is better
    optim="adafactor"
)

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,  # Now using the validation dataset
)

In [35]:
trainer.train()

  0%|          | 0/8591 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
 58%|█████▊    | 5000/8591 [1:11:21<51:12,  1.17it/s]  

{'loss': 0.2031, 'grad_norm': 0.14186440408229828, 'learning_rate': 1.255383540914911e-05, 'epoch': 0.58}


100%|██████████| 8591/8591 [2:02:31<00:00,  1.17it/s]

{'train_runtime': 7351.0299, 'train_samples_per_second': 37.395, 'train_steps_per_second': 1.169, 'train_loss': 0.13136793351287185, 'epoch': 1.0}





TrainOutput(global_step=8591, training_loss=0.13136793351287185, metrics={'train_runtime': 7351.0299, 'train_samples_per_second': 37.395, 'train_steps_per_second': 1.169, 'total_flos': 5231827660308480.0, 'train_loss': 0.13136793351287185, 'epoch': 1.0})

In [41]:
model.save_pretrained("finetunedt5")

In [40]:
tokenizer.save_pretrained("finetunedt5")

('finetunedt5/tokenizer_config.json',
 'finetunedt5/special_tokens_map.json',
 'finetunedt5/spiece.model',
 'finetunedt5/added_tokens.json')

In [None]:
input_text = "This book was quite nice, it had neat themes and an good story"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()
output_ids = model.generate(input_ids)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
output_text