# Installs and Imports

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import json

  from .autonotebook import tqdm as notebook_tqdm


# Data

In [2]:
model_name = "distilgpt2"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)



In [3]:
# Define max length for the sequences
MAX_LENGTH = 512

def format_alpaca_prompt(example):
    """Format the instruction and input into a prompt"""
    if example["input"]:
        prompt = f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n"
    else:
        prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n"
    return prompt

def tokenize_function(examples):
    """Tokenize the texts and prepare them for training"""
    # First tokenize without padding to get actual lengths
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=False,  # Changed from "max_length"
        max_length=MAX_LENGTH,
        return_tensors=None
    )
    
    # Now add padding
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding='max_length',  # Add padding up to max_length
        max_length=MAX_LENGTH,
        return_tensors=None
    )
    
    labels = tokenized["input_ids"].copy()
    
    for idx, text in enumerate(examples["text"]):
        # Find the start of the response
        response_start = text.find("### Response:\n") + len("### Response:\n")
        
        # Convert text before response to tokens
        prompt_tokens = len(tokenizer(text[:response_start], truncation=True, max_length=MAX_LENGTH)["input_ids"])
        
        # Mask out the prompt tokens in labels
        labels[idx][:prompt_tokens] = [-100] * prompt_tokens
        
        # If sequence is longer than max_length, truncate labels too
        if len(labels[idx]) > MAX_LENGTH:
            labels[idx] = labels[idx][:MAX_LENGTH]
    
    # Ensure all sequences are exactly MAX_LENGTH
    for idx in range(len(labels)):
        if len(labels[idx]) < MAX_LENGTH:
            labels[idx].extend([-100] * (MAX_LENGTH - len(labels[idx])))
        if len(tokenized["input_ids"][idx]) < MAX_LENGTH:
            tokenized["input_ids"][idx].extend([tokenizer.pad_token_id] * (MAX_LENGTH - len(tokenized["input_ids"][idx])))
            tokenized["attention_mask"][idx].extend([0] * (MAX_LENGTH - len(tokenized["attention_mask"][idx])))
    
    tokenized["labels"] = labels
    return tokenized

In [4]:
def load_alpaca_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def prepare_dataset(data):
    """Convert the JSON data into a format suitable for the model"""
    formatted_data = []
    for item in data:
        prompt = format_alpaca_prompt(item)
        formatted_data.append({
            "text": prompt + item["output"]  # Combine prompt and output
        })
    return Dataset.from_list(formatted_data)

# Load and prepare the data
alpaca_data = load_alpaca_data('alpaca_data.json')  # Make sure this path is correct
train_size = int(0.9 * len(alpaca_data))
train_data = alpaca_data[:train_size]
eval_data = alpaca_data[train_size:]

# Convert to Dataset format
train_dataset = prepare_dataset(train_data)
eval_dataset = prepare_dataset(eval_data)

# Tokenize the datasets
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=8,
    remove_columns=train_dataset.column_names
)
tokenized_eval = eval_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=8,
    remove_columns=eval_dataset.column_names
)

# Set the tensor format
tokenized_train.set_format("torch")
tokenized_eval.set_format("torch")

Map: 100%|██████████| 46801/46801 [00:11<00:00, 3930.79 examples/s]
Map: 100%|██████████| 5201/5201 [00:01<00:00, 3729.24 examples/s]


In [5]:
# Scale datasets for testing
def scale_dataset(dataset, max_samples=1000):
    """Scale down a dataset to a maximum number of samples"""
    if len(dataset) > max_samples:
        scaled_indices = list(range(max_samples))
        return dataset.select(scaled_indices)
    return dataset

# Set your desired size
MAX_SAMPLES = 20000  # Adjust this number as needed

# Scale both datasets
print(f"Original sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")

tokenized_train = scale_dataset(tokenized_train, MAX_SAMPLES)
tokenized_eval = scale_dataset(tokenized_eval, max(50, int(MAX_SAMPLES * 0.1)))  # Keep eval set ~10% of train

print(f"Scaled sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")

Original sizes - Train: 46801, Eval: 5201
Scaled sizes - Train: 20000, Eval: 2000


# Fine Tuning Setup

In [6]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="epoch",
#     learning_rate=1e-5,  # Reduced learning rate
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     warmup_steps=100,
#     logging_steps=100,
#     save_total_limit=2,
#     logging_dir="./logs",
#     # Add these parameters
#     max_grad_norm=1.0,  # Add gradient clipping
#     gradient_accumulation_steps=4,
#     fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train,
#     eval_dataset=tokenized_eval,
# )

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Changed from "epoch" to "steps"
    eval_steps=250,        # Evaluate every 250 steps
    learning_rate=2e-5,    # Slightly increased from 1e-5
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=0.5,    # Reduced from 3 to 1
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=100,
    save_total_limit=2,
    logging_dir="./logs",
    max_grad_norm=1.0,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),
    # Add early stopping
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=250,
)

# Add early stopping callback
from transformers import EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    callbacks=[early_stopping]
)



# Fine Tune

In [7]:
trainer.train()

 16%|█▌        | 100/625 [03:19<17:40,  2.02s/it]

{'loss': 2.5214, 'grad_norm': 0.6819438338279724, 'learning_rate': 2e-05, 'epoch': 0.08}


 32%|███▏      | 200/625 [06:38<14:00,  1.98s/it]

{'loss': 0.3544, 'grad_norm': 0.6945923566818237, 'learning_rate': 1.6190476190476193e-05, 'epoch': 0.16}


                                                 
 40%|████      | 250/625 [09:36<12:26,  1.99s/it]

{'eval_loss': 0.3479721248149872, 'eval_runtime': 77.7671, 'eval_samples_per_second': 25.718, 'eval_steps_per_second': 6.429, 'epoch': 0.2}


 48%|████▊     | 300/625 [11:18<11:02,  2.04s/it]  

{'loss': 0.3398, 'grad_norm': 0.5778724551200867, 'learning_rate': 1.2380952380952383e-05, 'epoch': 0.24}


 64%|██████▍   | 400/625 [14:36<07:22,  1.97s/it]

{'loss': 0.3585, 'grad_norm': 0.521521806716919, 'learning_rate': 8.571428571428571e-06, 'epoch': 0.32}


 80%|████████  | 500/625 [17:59<04:10,  2.00s/it]

{'loss': 0.3613, 'grad_norm': 0.6080738306045532, 'learning_rate': 4.761904761904762e-06, 'epoch': 0.4}


                                                 
 80%|████████  | 500/625 [19:17<04:10,  2.00s/it]

{'eval_loss': 0.3432808220386505, 'eval_runtime': 78.5957, 'eval_samples_per_second': 25.447, 'eval_steps_per_second': 6.362, 'epoch': 0.4}


 96%|█████████▌| 600/625 [22:38<00:49,  1.99s/it]

{'loss': 0.3536, 'grad_norm': 0.4921528100967407, 'learning_rate': 9.523809523809525e-07, 'epoch': 0.48}


100%|██████████| 625/625 [23:27<00:00,  2.04s/it]There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 625/625 [23:29<00:00,  2.26s/it]

{'train_runtime': 1409.4076, 'train_samples_per_second': 7.095, 'train_steps_per_second': 0.443, 'train_loss': 0.7001194046020508, 'epoch': 0.5}





TrainOutput(global_step=625, training_loss=0.7001194046020508, metrics={'train_runtime': 1409.4076, 'train_samples_per_second': 7.095, 'train_steps_per_second': 0.443, 'total_flos': 1306483752960000.0, 'train_loss': 0.7001194046020508, 'epoch': 0.5})

In [8]:
model_save_path = "./fine_tuned_distilgpt2_02"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./fine_tuned_distilgpt2_02/tokenizer_config.json',
 './fine_tuned_distilgpt2_02/special_tokens_map.json',
 './fine_tuned_distilgpt2_02/vocab.json',
 './fine_tuned_distilgpt2_02/merges.txt',
 './fine_tuned_distilgpt2_02/added_tokens.json',
 './fine_tuned_distilgpt2_02/tokenizer.json')