In [1]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Step 1: Download and Preprocess the Dataset
# Fetch the question/answer dataset by its identifier; the result is keyed by split.
dataset = load_dataset(path='HamdanXI/beethoven_qa')

def preprocess_data(examples):
    """Fold one question/answer record into a single training string.

    Args:
        examples: Mapping with a 'question' and an 'answer' entry (one record,
            as passed by a non-batched ``map``).

    Returns:
        A dict with a single 'text' key holding
        "Question: <question>\\nAnswer: <answer>\\n".
    """
    question = examples['question']
    answer = examples['answer']
    text = "Question: {}\nAnswer: {}\n".format(question, answer)
    return {'text': text}

# Build the single 'text' column and drop the raw question/answer fields.
# The dataset is deliberately left in its default format: 'text' holds Python
# strings, which a torch format cannot turn into tensors, so forcing
# type='torch' on it would only mutate the dataset's format state in place.
preprocessed_dataset = dataset.map(preprocess_data, remove_columns=['question', 'answer'])

# Step 2: Load GPT-2 and Prepare for Fine-tuning
# Tokenizer for the 'gpt2' checkpoint; the end-of-sequence token doubles as the
# padding token so that padded batches can be built.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Pretrained 'gpt2' weights with the language-modeling head.
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path='gpt2')

In [4]:
# Display the preprocessed dataset to check its splits, columns and row counts.
preprocessed_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1351
    })
})

In [7]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of formatted question/answer strings.

    Args:
        examples: Batch supplied by ``map(..., batched=True)``; its "text"
            entry is passed straight to the module-level ``tokenizer``.

    Returns:
        The tokenizer output for the batch, truncated but left unpadded.
    """
    # No padding at this stage: inside a batched map, padding='longest' pads
    # every row to the longest text of its whole map batch and stores that
    # padding in the dataset. The DataCollatorForLanguageModeling used for
    # training pads each training batch on the fly instead.
    return tokenizer(examples["text"], truncation=True)

# Tokenize every split in batches, then drop the raw strings so that only the
# tokenizer's output columns remain.
tokenized_datasets = (
    preprocessed_dataset
    .map(function=tokenize_function, batched=True)
    .remove_columns(column_names=["text"])
)

# Data collator
# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

Map:   0%|          | 0/1351 [00:00<?, ? examples/s]

In [8]:
# Step 3: Fine-tune GPT-2
training_args = TrainingArguments(
    output_dir="./beethoven_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # NOTE(review): with roughly 1.3k training rows, batch size 4 and 3 epochs
    # the run is presumably ~1k optimizer steps on a single device, so this
    # threshold is never reached and no intermediate checkpoint is written
    # (save_total_limit is then moot) -- confirm this is intended.
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)

# NOTE(review): a previously recorded run of this cell logged a training loss
# of exactly 0.0; that is suspicious for a language-modeling objective, so
# inspect a collated batch (input_ids / labels) before trusting the weights.
trainer.train()

# Step 4: Save & Use the Fine-tuned Model
finetuned_dir = "./beethoven_gpt2_finetuned"
trainer.save_model(finetuned_dir)
# The Trainer was not given the tokenizer, so save_model alone leaves the
# directory without tokenizer files. Saving it here keeps the pad-token setting
# and makes the directory loadable on its own with from_pretrained.
tokenizer.save_pretrained(finetuned_dir)

Step,Training Loss
500,0.0
1000,0.0
