In [39]:
# Import necessary libraries
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling
)

In [40]:
# --- Configuration ---
BASE_MODEL = "gpt2"
FINETUNED_MODEL_PATH = "../models/gpt2-social-story-finetuned"
TRAIN_DATA_FILE = "../data/finetuning_dataset.txt"

In [41]:
# --- 1. Load Tokenizer and Model ---
print(f"Loading base model '{BASE_MODEL}'...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

Loading base model 'gpt2'...


In [42]:
# Add a padding token if it doesn't exist. GPT-2 doesn't have one by default.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [43]:
# --- 2. Prepare Dataset ---
print(f"Loading training data from '{TRAIN_DATA_FILE}'...")
# Create a dataset object for training
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=TRAIN_DATA_FILE,
    block_size=128  # The block size for text chunks
)

Loading training data from '../data/finetuning_dataset.txt'...




In [44]:
# Create a data collator, which batches the data for training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Masked Language Modeling is false for causal models like GPT-2
)

In [45]:
# --- 3. Define Training Arguments ---
print("Defining training arguments...")
training_args = TrainingArguments(
    output_dir=FINETUNED_MODEL_PATH,
    overwrite_output_dir=True,
    num_train_epochs=10,  # Increase epochs for small datasets
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=50,
    report_to="none" # Disable wandb or other reporting
)

Defining training arguments...


In [46]:
# --- 4. Initialize and Run Trainer ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

Initializing Trainer...


In [47]:
print("--- Starting Fine-Tuning ---")
trainer.train()
print("--- Fine-Tuning Complete ---")

--- Starting Fine-Tuning ---




Step,Training Loss


--- Fine-Tuning Complete ---


In [48]:
# --- 5. Save the Fine-Tuned Model and Tokenizer ---
print(f"Saving fine-tuned model to '{FINETUNED_MODEL_PATH}'...")
model.save_pretrained(FINETUNED_MODEL_PATH)
tokenizer.save_pretrained(FINETUNED_MODEL_PATH)
print("Model saved successfully!")

Saving fine-tuned model to '../models/gpt2-social-story-finetuned'...
Model saved successfully!
