In [59]:
# Import necessary libraries
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling
)
import os

In [60]:
# --- Configuration ---
# We are using the better distilgpt2 model now
BASE_MODEL = "distilgpt2" 
FINETUNED_MODEL_PATH = "../models/distilgpt2-social-story-finetuned" # <-- Updated path
TRAIN_DATA_FILE = "../data/finetuning_dataset.txt"

In [61]:
# --- 1. Load Tokenizer and Model ---
print(f"Loading base model '{BASE_MODEL}'...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

Loading base model 'distilgpt2'...


In [62]:
# --- NEW: Define and Add Special Tokens ---
# This is the critical fix. We explicitly define our special tokens.
special_tokens_dict = {
    'bos_token': '<|startoftext|>', # Beginning of sequence
    'eos_token': '<|endoftext|>',   # End of sequence
    'pad_token': '<|pad|>'          # Padding token
}

In [63]:
print("Adding special tokens to the tokenizer...")
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

Adding special tokens to the tokenizer...


In [64]:
# --- NEW: Resize model embeddings ---
# We must resize the model's embedding layer to match the new tokenizer size.
model.resize_token_embeddings(len(tokenizer))
print(f"Resized model embeddings to fit {num_added_toks} new tokens.")

Resized model embeddings to fit 3 new tokens.


In [65]:
# --- 2. Prepare Dataset ---
print(f"Loading training data from '{TRAIN_DATA_FILE}'...")
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=TRAIN_DATA_FILE,
    block_size=128
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


Loading training data from '../data/finetuning_dataset.txt'...




In [66]:
# --- 3. Define Training Arguments ---
print("Defining training arguments...")
# Create the output directory if it doesn't exist
os.makedirs(FINETUNED_MODEL_PATH, exist_ok=True)

training_args = TrainingArguments(
    output_dir=FINETUNED_MODEL_PATH,
    overwrite_output_dir=True,
    num_train_epochs=15,  # Increased epochs for better learning
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    report_to="none"
)

Defining training arguments...


In [67]:
# --- 4. Initialize and Run Trainer ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

Initializing Trainer...


In [68]:
print("--- Starting Fine-Tuning ---")
trainer.train()
print("--- Fine-Tuning Complete ---")

--- Starting Fine-Tuning ---




Step,Training Loss


--- Fine-Tuning Complete ---


In [69]:
# --- 5. Save the Fine-Tuned Model and Tokenizer ---
print(f"Saving fine-tuned model to '{FINETUNED_MODEL_PATH}'...")
model.save_pretrained(FINETUNED_MODEL_PATH)
tokenizer.save_pretrained(FINETUNED_MODEL_PATH)
print("Model saved successfully!")

Saving fine-tuned model to '../models/distilgpt2-social-story-finetuned'...
Model saved successfully!
