In [1]:
# Step 1: Install Required Libraries
!pip install transformers datasets




In [2]:
# Step 2: Import Necessary Libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset


In [3]:
# Step 3: Load the Dataset
# `output.json` must provide the "Context", "Question" and "Answer" fields
# that the preprocessing step reads.
data_file = "output.json"  # Replace with your dataset file
dataset = load_dataset(path="json", data_files=data_file)

In [4]:
# Split into train (90%) and validation (10%) sets.
# A fixed seed keeps the split identical across kernel restarts, so the
# validation loss is comparable between runs.
# NOTE: this cell rebinds `dataset` to the split result. Re-running it without
# first re-running the load cell would split the already-reduced train split
# again, so always re-run the load cell before this one.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_data = dataset["train"]
val_data = dataset["test"]

In [5]:
# Step 4: Load the Tokenizer and Model
# Both objects are created from the same checkpoint name so that the
# tokenizer vocabulary matches the model.
model_name = "distilgpt2"  # Small and efficient pre-trained GPT model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)




generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
# Add a padding token if the tokenizer doesn't already have one.
# The loaded tokenizer reports no pad token (reading the unset attribute is
# what prints "Using pad_token, but it is not set yet."), so the end-of-sequence
# token is reused for padding. After this, pad and EOS are the same token, so
# padding cannot be told apart from a real EOS by token id alone.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


Using pad_token, but it is not set yet.


In [13]:
def preprocess_function(examples, max_length=256):
    """Tokenize a batch of QA examples for causal-LM fine-tuning.

    Every example is turned into ONE token sequence ``prompt + answer + EOS``.
    A causal LM compares ``labels`` with ``input_ids`` position by position
    (after its internal one-token shift), so the answer has to be part of the
    input sequence itself; tokenizing prompt and answer into two separately
    padded arrays leaves the labels misaligned with the inputs.

    Only the answer tokens and the closing EOS are supervised. Prompt and
    padding positions are set to -100 in ``labels`` so they are ignored by the
    loss. Padding is masked by position rather than by token id, because the
    pad token may be the same token as EOS and masking by id would also hide
    the real EOS.

    Args:
        examples: Batch mapping with "Context", "Question" and "Answer"
            entries, each a list of strings of equal length (the shape
            ``Dataset.map(..., batched=True)`` passes in).
        max_length: Fixed length of every returned sequence. The answer is
            capped at ``max_length - 1`` tokens so EOS always fits; if the
            prompt does not fit in the remaining room, its *leading* tokens
            are dropped so the question at the end of the prompt is kept.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", each a list of
        ``max_length``-long lists of ints.
    """
    prompts = [
        f"Context: {c}\nQuestion: {q}"
        for c, q in zip(examples["Context"], examples["Question"])
    ]
    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    if not prompts:
        return batch

    # No padding/truncation here: sequences are assembled and padded by hand
    # below so that the prompt/answer boundary is known exactly.
    prompt_ids = tokenizer(prompts, add_special_tokens=False)["input_ids"]
    answer_ids = tokenizer(list(examples["Answer"]), add_special_tokens=False)["input_ids"]

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id

    for p_ids, a_ids in zip(prompt_ids, answer_ids):
        # Supervised part: the answer followed by EOS (EOS is never truncated).
        target = list(a_ids)[: max_length - 1] + [eos_id]

        # Keep as much of the END of the prompt as fits next to the target.
        room = max_length - len(target)
        p_ids = list(p_ids)
        context = p_ids[max(len(p_ids) - room, 0):]

        n_real = len(context) + len(target)
        n_pad = max_length - n_real

        batch["input_ids"].append(context + target + [pad_id] * n_pad)
        batch["attention_mask"].append([1] * n_real + [0] * n_pad)
        batch["labels"].append([-100] * len(context) + target + [-100] * n_pad)

    return batch


In [14]:
# Tokenize each split batch-wise, then have the datasets return torch tensors.
train_dataset = (
    train_data
    .map(function=preprocess_function, batched=True)
    .with_format(type="torch")
)
val_dataset = (
    val_data
    .map(function=preprocess_function, batched=True)
    .with_format(type="torch")
)


Map:   0%|          | 0/8987 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [9]:
# Step 6: Define the Data Collator
# Batches examples into PyTorch tensors, padding where lengths differ.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt",
)


In [10]:
# Step 7: Configure Training Arguments
training_args = TrainingArguments(
    # --- Checkpointing ---
    output_dir="./fine_tuned_distilgpt2",   # Checkpoints are written here
    save_steps=500,                         # Checkpoint interval, in steps
    save_total_limit=2,                     # Older checkpoints beyond 2 are removed
    # --- Optimisation ---
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,          # Accumulate 4 micro-batches per update
    # --- Evaluation and logging ---
    evaluation_strategy="epoch",            # Evaluate once per epoch
    logging_steps=10,                       # Log every 10 steps
    report_to="none",                       # No external experiment trackers
    # --- Hardware ---
    fp16=torch.cuda.is_available(),         # Mixed precision only when CUDA is present
    no_cuda=not torch.cuda.is_available(),  # Fall back to CPU otherwise
)

In [15]:
# Step 8: Initialize the Trainer
# Wires together the model, its hyper-parameters, both tokenized splits and
# the batching logic.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Step 9: Fine-Tune the Model
# Runs training with the Trainer built in Step 8; the call's return value is
# displayed as this cell's output.
# NOTE(review): no cell visible here calls trainer.save_model() or
# tokenizer.save_pretrained() afterwards -- confirm the final weights are
# persisted somewhere, not just the periodic checkpoints.
trainer.train()


Epoch,Training Loss,Validation Loss


In [16]:
# Sanity check: the first training example should have equally long
# input and label sequences.
example = train_dataset[0]
print(
    f"Input IDs Shape: {len(example['input_ids'])}",
    f"Labels Shape: {len(example['labels'])}",
    sep="\n",
)


Input IDs Shape: 256
Labels Shape: 256
