In [2]:
from datasets import load_dataset
from datasets import DatasetDict

dataset = load_dataset("HamdanXI/beethoven_qa")

# Split the dataset into training and testing sets (90% train, 10% test)
train_test_split = dataset["train"].train_test_split(test_size=0.1)

# Create a DatasetDict to keep the splits organized
dataset_split = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

print(f"Training set size: {len(dataset_split['train'])}")
print(f"Testing set size: {len(dataset_split['test'])}")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = ["question: " + q for q in examples["question"]]
    targets = ["answer: " + a for a in examples["answer"]]
    model_inputs = {"input_ids": inputs, "labels": targets}
    return model_inputs

tokenized_dataset = dataset_split.map(preprocess_function, batched=True)

  from .autonotebook import tqdm as notebook_tqdm


Training set size: 1215
Testing set size: 136


Map: 100%|██████████| 1215/1215 [00:00<00:00, 46855.76 examples/s]
Map: 100%|██████████| 136/136 [00:00<00:00, 9720.62 examples/s]


In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

model_name = "google-t5/t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
'''
def get_max_length(batch):
    max_length_input = max(len(tokenizer.encode(q)) for q in batch['question'])
    max_length_output = max(len(tokenizer.encode(a)) for a in batch['answer'])
    max_length = max(max_length_input, max_length_output)
    return max_length

max_pad = get_max_length(dataset['train'])

if max_pad > 512:
    max_pad = 512
    
print(max_pad)
'''

In [4]:
# Tokenize the inputs and labels
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_ids"], padding="max_length", truncation=True)
    labels = tokenizer(examples["labels"], padding="max_length", truncation=True).input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1215/1215 [00:01<00:00, 1079.35 examples/s]
Map: 100%|██████████| 136/136 [00:00<00:00, 966.92 examples/s] 


In [8]:
import torch

if torch.cuda.is_available():
    print(f"CUDA is available. Device: {torch.cuda.get_device_name(0)}")
    model = model.to("cuda")
else:
    print("CUDA is not available. Check your installation.")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

Step,Training Loss
10,12.7224
20,12.601
30,12.4211
40,11.7706
50,11.0233
60,9.9405
70,8.8797
80,7.8782
90,6.7502
100,5.7499


TrainOutput(global_step=456, training_loss=3.0648302733898163, metrics={'train_runtime': 613.7938, 'train_samples_per_second': 5.938, 'train_steps_per_second': 0.743, 'total_flos': 493320866365440.0, 'train_loss': 3.0648302733898163, 'epoch': 3.0})

In [9]:
trainer.save_model("t5-small-finetuned-beethoven-qa")

In [10]:
tokenizer.save_pretrained("t5-small-finetuned-beethoven-qa")

('t5-small-finetuned-beethoven-qa\\tokenizer_config.json',
 't5-small-finetuned-beethoven-qa\\special_tokens_map.json',
 't5-small-finetuned-beethoven-qa\\spiece.model',
 't5-small-finetuned-beethoven-qa\\added_tokens.json')