# Installs and Imports

In [1]:
import os
import torch

# CUDA settings
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True

# Check CUDA availability and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    # Set default CUDA device
    torch.cuda.set_device(0)
    
# Import other libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import json

CUDA available: True
CUDA device: NVIDIA GeForce RTX 4070 Ti SUPER
CUDA version: 11.8


  from .autonotebook import tqdm as notebook_tqdm


# Data

### load model and tokenizers

In [2]:
model_name = "distilgpt2"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Print tokenizer info
print(f"Tokenizer pad token: {tokenizer.pad_token}")
print(f"Tokenizer pad token ID: {tokenizer.pad_token_id}")

# Create model with explicit config
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_name)
config.pad_token_id = tokenizer.pad_token_id

# Try loading model with device placement in from_pretrained
try:
    print("Attempting to load model directly to device...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        config=config,
        device_map="auto" if torch.cuda.is_available() else None,
        torch_dtype=torch.float32  # Explicitly set dtype
    )
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model directly to device: {e}")
    print("Attempting alternate loading method...")
    # Try alternate loading method
    model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
    if torch.cuda.is_available():
        print("Moving model to CUDA...")
        # Try moving parts of the model gradually
        for param in model.parameters():
            param.data = param.data.to('cuda')
    print("Model loading complete")

# Print model device info
print(f"\nModel device check:")
print(f"Model is on CUDA: {next(model.parameters()).is_cuda}")

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Attempting to load model directly to device...
Model loaded successfully

Model device check:
Model is on CUDA: True


### tokenize function

In [3]:
# Define max length for the sequences
MAX_LENGTH = 512

def format_alpaca_prompt(example):
    """Format the instruction and input into a prompt"""
    if example["input"]:
        prompt = f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n"
    else:
        prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n"
    return prompt

def tokenize_function(examples):
    """Tokenize the texts and prepare them for training"""
    # Tokenize with padding and truncation
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors=None
    )
    
    labels = tokenized["input_ids"].copy()
    
    # Find the start of the response for each example
    for idx, text in enumerate(examples["text"]):
        response_start = text.find("### Response:\n") + len("### Response:\n")
        # Get prompt tokens
        prompt_tokens = len(tokenizer(text[:response_start], 
                                    truncation=True, 
                                    max_length=MAX_LENGTH)["input_ids"])
        
        # Mask out prompt tokens in labels
        labels[idx][:prompt_tokens] = [-100] * prompt_tokens
        
        # Ensure no out-of-bounds indices
        if prompt_tokens > MAX_LENGTH:
            labels[idx] = [-100] * MAX_LENGTH
    
    # Convert to PyTorch tensors
    input_ids = torch.tensor(tokenized["input_ids"], dtype=torch.long)
    attention_mask = torch.tensor(tokenized["attention_mask"], dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [4]:
def load_alpaca_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def prepare_dataset(data):
    """Convert the JSON data into a format suitable for the model"""
    formatted_data = []
    for item in data:
        prompt = format_alpaca_prompt(item)
        formatted_data.append({
            "text": prompt + item["output"]  # Combine prompt and output
        })
    return Dataset.from_list(formatted_data)

# Load and prepare the data
alpaca_data = load_alpaca_data('alpaca_data_cleaned.json')
train_size = int(0.9 * len(alpaca_data))
train_data = alpaca_data[:train_size]
eval_data = alpaca_data[train_size:]

# Convert to Dataset format
train_dataset = prepare_dataset(train_data)
eval_dataset = prepare_dataset(eval_data)

# Tokenize the datasets with smaller batch size and add error handling
def safe_map_tokenization(dataset, batch_size=4):
    try:
        return dataset.map(
            tokenize_function,
            batched=True,
            batch_size=batch_size,
            remove_columns=dataset.column_names
        )
    except Exception as e:
        print(f"Error during tokenization: {e}")
        raise

print("Tokenizing training dataset...")
tokenized_train = safe_map_tokenization(train_dataset)
print("Tokenizing evaluation dataset...")
tokenized_eval = safe_map_tokenization(eval_dataset)

# Set the tensor format
tokenized_train.set_format("torch")
tokenized_eval.set_format("torch")

# Verify data format
print("\nVerifying data format:")
sample = tokenized_train[0]
for key, value in sample.items():
    print(f"{key}: shape={value.shape}, dtype={value.dtype}")

# Add validation check
def validate_dataset(dataset, name):
    print(f"\nValidating {name}:")
    invalid_samples = 0
    for i, sample in enumerate(dataset):
        if not all(isinstance(v, torch.Tensor) for v in sample.values()):
            print(f"Sample {i} has non-tensor values")
            invalid_samples += 1
        if any(v.dtype not in [torch.long, torch.int64] for v in sample.values()):
            print(f"Sample {i} has incorrect dtype")
            invalid_samples += 1
    print(f"Found {invalid_samples} invalid samples")
    return invalid_samples == 0

validate_dataset(tokenized_train, "training dataset")
validate_dataset(tokenized_eval, "evaluation dataset")

Tokenizing training dataset...


Map: 100%|██████████| 46584/46584 [00:17<00:00, 2657.88 examples/s]


Tokenizing evaluation dataset...


Map: 100%|██████████| 5176/5176 [00:01<00:00, 2645.99 examples/s]



Verifying data format:
input_ids: shape=torch.Size([512]), dtype=torch.int64
attention_mask: shape=torch.Size([512]), dtype=torch.int64
labels: shape=torch.Size([512]), dtype=torch.int64

Validating training dataset:
Found 0 invalid samples

Validating evaluation dataset:
Found 0 invalid samples


True

In [5]:
# Scale datasets for testing
def scale_dataset(dataset, max_samples=1000):
    """Scale down a dataset to a maximum number of samples"""
    if len(dataset) > max_samples:
        scaled_indices = list(range(max_samples))
        return dataset.select(scaled_indices)
    return dataset

# Set your desired size
MAX_SAMPLES = 40000  # Adjust this number as needed

# Scale both datasets
print(f"Original sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")

tokenized_train = scale_dataset(tokenized_train, MAX_SAMPLES)
tokenized_eval = scale_dataset(tokenized_eval, max(50, int(MAX_SAMPLES * 0.1)))  # Keep eval set ~10% of train

print(f"Scaled sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")

Original sizes - Train: 46584, Eval: 5176
Scaled sizes - Train: 40000, Eval: 4000


# Fine Tuning Setup

### validate setup

In [6]:
# Check vocab sizes and data ranges
print(f"Tokenizer vocab size: {len(tokenizer)}")
print(f"Model vocab size: {model.config.vocab_size}")

# Function to check tensor values
def check_tensor_values(tensor, name):
    if isinstance(tensor, torch.Tensor):
        print(f"{name} - Min: {tensor.min().item()}, Max: {tensor.max().item()}, Shape: {tensor.shape}")

# Check a sample from the dataset
sample = tokenized_train[0]
for key, value in sample.items():
    check_tensor_values(value, key)

Tokenizer vocab size: 50257
Model vocab size: 50257
input_ids - Min: -100, Max: 50256, Shape: torch.Size([512])
attention_mask - Min: 0, Max: 1, Shape: torch.Size([512])
labels - Min: -100, Max: 50256, Shape: torch.Size([512])


### custom collator

In [7]:
from dataclasses import dataclass
from typing import Optional, Union, List, Dict, Any


@dataclass
class CustomDataCollatorForLanguageModeling:
    tokenizer: AutoTokenizer
    mlm: bool = False
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Extract the relevant fields
        input_ids = [example["input_ids"] for example in examples]
        attention_mask = [example["attention_mask"] for example in examples]
        labels = [example["labels"] for example in examples]

        # Convert to tensors if they aren't already
        if not isinstance(input_ids[0], torch.Tensor):
            input_ids = [torch.tensor(ids, dtype=torch.long) for ids in input_ids]
        if not isinstance(attention_mask[0], torch.Tensor):
            attention_mask = [torch.tensor(mask, dtype=torch.long) for mask in attention_mask]
        if not isinstance(labels[0], torch.Tensor):
            labels = [torch.tensor(lab, dtype=torch.long) for lab in labels]

        # Pad sequences
        max_length = max(ids.size(0) for ids in input_ids)
        
        def pad_sequence(sequences, pad_value):
            result = torch.full((len(sequences), max_length), pad_value, dtype=torch.long)
            for i, seq in enumerate(sequences):
                length = seq.size(0)
                result[i, :length] = seq
            return result

        # Pad and create batch
        input_ids_padded = pad_sequence(input_ids, self.tokenizer.pad_token_id)
        attention_mask_padded = pad_sequence(attention_mask, 0)
        labels_padded = pad_sequence(labels, -100)

        # Ensure values are within vocabulary bounds
        vocab_size = len(self.tokenizer)
        input_ids_padded = torch.clamp(input_ids_padded, min=0, max=vocab_size-1)
        labels_padded = torch.where(
            (labels_padded >= 0) & (labels_padded < vocab_size),
            labels_padded,
            torch.tensor(-100, dtype=torch.long)
        )

        return {
            "input_ids": input_ids_padded,
            "attention_mask": attention_mask_padded,
            "labels": labels_padded
        }

### Training args

In [8]:
# Training arguments with safe defaults
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=100,
    save_total_limit=2,
    logging_dir="./logs",
    max_grad_norm=0.5,
    gradient_accumulation_steps=16,
    fp16=False,
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    prediction_loss_only=True,
    seed=42,
    full_determinism=False,
)

# Create custom collator
data_collator = CustomDataCollatorForLanguageModeling(tokenizer=tokenizer)

# Initialize trainer with custom collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

# Test the collator with a small batch
test_batch = data_collator([tokenized_train[i] for i in range(2)])
print("\nTest batch shapes:")
for k, v in test_batch.items():
    print(f"{k}: {v.shape}, dtype: {v.dtype}, range: [{v.min()}, {v.max()}]")




Test batch shapes:
input_ids: torch.Size([2, 512]), dtype: torch.int64, range: [0, 50256]
attention_mask: torch.Size([2, 512]), dtype: torch.int64, range: [0, 1]
labels: torch.Size([2, 512]), dtype: torch.int64, range: [-100, 50256]


# Fine Tune

In [9]:
# Clear CUDA cache and start training
if torch.cuda.is_available():
    torch.cuda.empty_cache()

try:
    trainer.train()
except RuntimeError as e:
    print(f"Training error: {e}")
    if torch.cuda.is_available():
        print("\nCUDA Memory Summary:")
        print(torch.cuda.memory_summary())
    raise

  1%|▏         | 100/7500 [00:54<1:07:24,  1.83it/s]

{'loss': 3.3474, 'grad_norm': 19.765625, 'learning_rate': 1e-05, 'epoch': 0.04}


  3%|▎         | 200/7500 [01:49<1:06:38,  1.83it/s]

{'loss': 0.8815, 'grad_norm': 17.582889556884766, 'learning_rate': 9.864864864864865e-06, 'epoch': 0.08}


  4%|▍         | 300/7500 [02:44<1:05:57,  1.82it/s]

{'loss': 0.799, 'grad_norm': 13.08103084564209, 'learning_rate': 9.729729729729732e-06, 'epoch': 0.12}


  5%|▌         | 400/7500 [03:39<1:04:23,  1.84it/s]

{'loss': 0.814, 'grad_norm': 16.895729064941406, 'learning_rate': 9.594594594594594e-06, 'epoch': 0.16}


  7%|▋         | 500/7500 [04:33<1:03:30,  1.84it/s]

{'loss': 0.7882, 'grad_norm': 15.232738494873047, 'learning_rate': 9.45945945945946e-06, 'epoch': 0.2}


  8%|▊         | 600/7500 [05:29<1:02:55,  1.83it/s]

{'loss': 0.8054, 'grad_norm': 14.510175704956055, 'learning_rate': 9.324324324324325e-06, 'epoch': 0.24}


  9%|▉         | 700/7500 [06:24<1:01:50,  1.83it/s]

{'loss': 0.8102, 'grad_norm': 14.472683906555176, 'learning_rate': 9.189189189189191e-06, 'epoch': 0.28}


 11%|█         | 800/7500 [07:18<1:00:56,  1.83it/s]

{'loss': 0.8215, 'grad_norm': 19.76947784423828, 'learning_rate': 9.054054054054054e-06, 'epoch': 0.32}


 12%|█▏        | 900/7500 [08:13<1:00:24,  1.82it/s]

{'loss': 0.8231, 'grad_norm': 14.972772598266602, 'learning_rate': 8.91891891891892e-06, 'epoch': 0.36}


 13%|█▎        | 1000/7500 [09:08<59:32,  1.82it/s] 

{'loss': 0.7909, 'grad_norm': 10.043087005615234, 'learning_rate': 8.783783783783785e-06, 'epoch': 0.4}


 15%|█▍        | 1100/7500 [10:03<57:59,  1.84it/s]  

{'loss': 0.7734, 'grad_norm': 13.947961807250977, 'learning_rate': 8.64864864864865e-06, 'epoch': 0.44}


 16%|█▌        | 1200/7500 [10:58<57:16,  1.83it/s]

{'loss': 0.796, 'grad_norm': 16.60686492919922, 'learning_rate': 8.513513513513514e-06, 'epoch': 0.48}


 17%|█▋        | 1300/7500 [11:52<56:12,  1.84it/s]

{'loss': 0.7882, 'grad_norm': 15.195953369140625, 'learning_rate': 8.378378378378378e-06, 'epoch': 0.52}


 19%|█▊        | 1400/7500 [12:47<55:23,  1.84it/s]

{'loss': 0.7819, 'grad_norm': 14.62504768371582, 'learning_rate': 8.243243243243245e-06, 'epoch': 0.56}


 20%|██        | 1500/7500 [13:42<54:40,  1.83it/s]

{'loss': 0.782, 'grad_norm': 17.986557006835938, 'learning_rate': 8.108108108108109e-06, 'epoch': 0.6}


 21%|██▏       | 1600/7500 [14:37<53:29,  1.84it/s]  

{'loss': 0.7814, 'grad_norm': 18.64618492126465, 'learning_rate': 7.972972972972974e-06, 'epoch': 0.64}


 23%|██▎       | 1700/7500 [15:33<52:32,  1.84it/s]  

{'loss': 0.7701, 'grad_norm': 14.773008346557617, 'learning_rate': 7.837837837837838e-06, 'epoch': 0.68}


 24%|██▍       | 1800/7500 [16:27<51:42,  1.84it/s]

{'loss': 0.7671, 'grad_norm': 15.92533016204834, 'learning_rate': 7.702702702702704e-06, 'epoch': 0.72}


 25%|██▌       | 1900/7500 [17:22<50:46,  1.84it/s]

{'loss': 0.7983, 'grad_norm': 16.766387939453125, 'learning_rate': 7.567567567567569e-06, 'epoch': 0.76}


 27%|██▋       | 2000/7500 [18:16<50:05,  1.83it/s]

{'loss': 0.7934, 'grad_norm': 19.247142791748047, 'learning_rate': 7.4324324324324324e-06, 'epoch': 0.8}


 28%|██▊       | 2100/7500 [19:12<49:09,  1.83it/s]  

{'loss': 0.7908, 'grad_norm': 13.56124210357666, 'learning_rate': 7.297297297297298e-06, 'epoch': 0.84}


 29%|██▉       | 2200/7500 [20:06<48:41,  1.81it/s]

{'loss': 0.7637, 'grad_norm': 16.501035690307617, 'learning_rate': 7.162162162162163e-06, 'epoch': 0.88}


 31%|███       | 2300/7500 [21:01<47:23,  1.83it/s]

{'loss': 0.7675, 'grad_norm': 15.458815574645996, 'learning_rate': 7.027027027027028e-06, 'epoch': 0.92}


 32%|███▏      | 2400/7500 [21:57<46:52,  1.81it/s]  

{'loss': 0.7966, 'grad_norm': 13.347047805786133, 'learning_rate': 6.891891891891892e-06, 'epoch': 0.96}


 33%|███▎      | 2500/7500 [22:51<45:23,  1.84it/s]

{'loss': 0.7871, 'grad_norm': 15.287588119506836, 'learning_rate': 6.7567567567567575e-06, 'epoch': 1.0}


                                                   
 33%|███▎      | 2500/7500 [23:21<45:23,  1.84it/s]

{'eval_loss': 0.7420307397842407, 'eval_runtime': 29.0715, 'eval_samples_per_second': 137.592, 'eval_steps_per_second': 34.398, 'epoch': 1.0}


 35%|███▍      | 2600/7500 [24:16<44:31,  1.83it/s]   

{'loss': 0.7804, 'grad_norm': 16.078922271728516, 'learning_rate': 6.621621621621622e-06, 'epoch': 1.04}


 36%|███▌      | 2700/7500 [25:10<43:23,  1.84it/s]

{'loss': 0.7738, 'grad_norm': 15.662795066833496, 'learning_rate': 6.486486486486487e-06, 'epoch': 1.08}


 37%|███▋      | 2800/7500 [26:05<44:16,  1.77it/s]

{'loss': 0.7763, 'grad_norm': 16.26728630065918, 'learning_rate': 6.351351351351351e-06, 'epoch': 1.12}


 39%|███▊      | 2900/7500 [27:01<42:16,  1.81it/s]

{'loss': 0.7786, 'grad_norm': 17.553720474243164, 'learning_rate': 6.2162162162162164e-06, 'epoch': 1.16}


 40%|████      | 3000/7500 [27:56<41:52,  1.79it/s]

{'loss': 0.7506, 'grad_norm': 16.709016799926758, 'learning_rate': 6.081081081081082e-06, 'epoch': 1.2}


 41%|████▏     | 3100/7500 [28:53<40:18,  1.82it/s]  

{'loss': 0.7676, 'grad_norm': 17.314279556274414, 'learning_rate': 5.945945945945947e-06, 'epoch': 1.24}


 43%|████▎     | 3200/7500 [29:49<40:01,  1.79it/s]

{'loss': 0.7928, 'grad_norm': 14.609395980834961, 'learning_rate': 5.810810810810811e-06, 'epoch': 1.28}


 44%|████▍     | 3300/7500 [30:45<38:43,  1.81it/s]

{'loss': 0.7393, 'grad_norm': 16.277332305908203, 'learning_rate': 5.675675675675676e-06, 'epoch': 1.32}


 45%|████▌     | 3400/7500 [31:41<37:43,  1.81it/s]

{'loss': 0.7627, 'grad_norm': 15.818199157714844, 'learning_rate': 5.540540540540541e-06, 'epoch': 1.36}


 47%|████▋     | 3500/7500 [32:36<37:15,  1.79it/s]

{'loss': 0.7682, 'grad_norm': 16.223543167114258, 'learning_rate': 5.405405405405406e-06, 'epoch': 1.4}


 48%|████▊     | 3600/7500 [33:32<35:40,  1.82it/s]

{'loss': 0.7869, 'grad_norm': 15.99527645111084, 'learning_rate': 5.2702702702702705e-06, 'epoch': 1.44}


 49%|████▉     | 3700/7500 [34:28<35:25,  1.79it/s]

{'loss': 0.7636, 'grad_norm': 13.232833862304688, 'learning_rate': 5.135135135135135e-06, 'epoch': 1.48}


 51%|█████     | 3800/7500 [35:23<34:55,  1.77it/s]

{'loss': 0.7651, 'grad_norm': 16.112136840820312, 'learning_rate': 5e-06, 'epoch': 1.52}


 52%|█████▏    | 3900/7500 [36:21<33:45,  1.78it/s]

{'loss': 0.7849, 'grad_norm': 15.292614936828613, 'learning_rate': 4.864864864864866e-06, 'epoch': 1.56}


 53%|█████▎    | 4000/7500 [37:18<32:53,  1.77it/s]

{'loss': 0.7695, 'grad_norm': 17.34326171875, 'learning_rate': 4.72972972972973e-06, 'epoch': 1.6}


 55%|█████▍    | 4100/7500 [38:15<32:13,  1.76it/s]

{'loss': 0.7533, 'grad_norm': 17.508766174316406, 'learning_rate': 4.594594594594596e-06, 'epoch': 1.64}


 56%|█████▌    | 4200/7500 [39:12<31:06,  1.77it/s]

{'loss': 0.7795, 'grad_norm': 17.97136878967285, 'learning_rate': 4.45945945945946e-06, 'epoch': 1.68}


 57%|█████▋    | 4300/7500 [40:09<30:10,  1.77it/s]

{'loss': 0.7618, 'grad_norm': 16.25119400024414, 'learning_rate': 4.324324324324325e-06, 'epoch': 1.72}


 59%|█████▊    | 4400/7500 [41:06<29:42,  1.74it/s]

{'loss': 0.7271, 'grad_norm': 16.641748428344727, 'learning_rate': 4.189189189189189e-06, 'epoch': 1.76}


 60%|██████    | 4500/7500 [42:03<27:29,  1.82it/s]

{'loss': 0.7239, 'grad_norm': 15.436436653137207, 'learning_rate': 4.0540540540540545e-06, 'epoch': 1.8}


 61%|██████▏   | 4600/7500 [42:59<26:36,  1.82it/s]

{'loss': 0.7598, 'grad_norm': 13.645821571350098, 'learning_rate': 3.918918918918919e-06, 'epoch': 1.84}


 63%|██████▎   | 4700/7500 [43:54<25:26,  1.83it/s]

{'loss': 0.7711, 'grad_norm': 16.235597610473633, 'learning_rate': 3.7837837837837844e-06, 'epoch': 1.88}


 64%|██████▍   | 4800/7500 [44:49<24:32,  1.83it/s]

{'loss': 0.7294, 'grad_norm': 16.433752059936523, 'learning_rate': 3.648648648648649e-06, 'epoch': 1.92}


 65%|██████▌   | 4900/7500 [45:43<23:41,  1.83it/s]

{'loss': 0.7572, 'grad_norm': 13.83145523071289, 'learning_rate': 3.513513513513514e-06, 'epoch': 1.96}


 67%|██████▋   | 5000/7500 [46:38<22:50,  1.82it/s]

{'loss': 0.76, 'grad_norm': 16.386600494384766, 'learning_rate': 3.3783783783783788e-06, 'epoch': 2.0}


                                                   
 67%|██████▋   | 5000/7500 [47:08<22:50,  1.82it/s]

{'eval_loss': 0.7303917407989502, 'eval_runtime': 29.2547, 'eval_samples_per_second': 136.73, 'eval_steps_per_second': 34.183, 'epoch': 2.0}


 68%|██████▊   | 5100/7500 [48:03<21:52,  1.83it/s]  

{'loss': 0.7366, 'grad_norm': 19.54833984375, 'learning_rate': 3.2432432432432437e-06, 'epoch': 2.04}


 69%|██████▉   | 5200/7500 [48:58<20:58,  1.83it/s]

{'loss': 0.7827, 'grad_norm': 16.02777671813965, 'learning_rate': 3.1081081081081082e-06, 'epoch': 2.08}


 71%|███████   | 5300/7500 [49:52<20:01,  1.83it/s]

{'loss': 0.7522, 'grad_norm': 12.973928451538086, 'learning_rate': 2.9729729729729736e-06, 'epoch': 2.12}


 72%|███████▏  | 5400/7500 [50:47<19:05,  1.83it/s]

{'loss': 0.7145, 'grad_norm': 15.294984817504883, 'learning_rate': 2.837837837837838e-06, 'epoch': 2.16}


 73%|███████▎  | 5500/7500 [51:43<18:11,  1.83it/s]

{'loss': 0.7528, 'grad_norm': 16.698314666748047, 'learning_rate': 2.702702702702703e-06, 'epoch': 2.2}


 75%|███████▍  | 5600/7500 [52:38<17:13,  1.84it/s]

{'loss': 0.7429, 'grad_norm': 18.034725189208984, 'learning_rate': 2.5675675675675675e-06, 'epoch': 2.24}


 76%|███████▌  | 5700/7500 [53:33<16:17,  1.84it/s]

{'loss': 0.7245, 'grad_norm': 9.858894348144531, 'learning_rate': 2.432432432432433e-06, 'epoch': 2.28}


 77%|███████▋  | 5800/7500 [54:28<15:26,  1.83it/s]

{'loss': 0.7528, 'grad_norm': 15.5429105758667, 'learning_rate': 2.297297297297298e-06, 'epoch': 2.32}


 79%|███████▊  | 5900/7500 [55:22<14:30,  1.84it/s]

{'loss': 0.7635, 'grad_norm': 15.096558570861816, 'learning_rate': 2.1621621621621623e-06, 'epoch': 2.36}


 80%|████████  | 6000/7500 [56:17<13:35,  1.84it/s]

{'loss': 0.7329, 'grad_norm': 13.952903747558594, 'learning_rate': 2.0270270270270273e-06, 'epoch': 2.4}


 81%|████████▏ | 6100/7500 [57:12<12:44,  1.83it/s]

{'loss': 0.7483, 'grad_norm': 17.364910125732422, 'learning_rate': 1.8918918918918922e-06, 'epoch': 2.44}


 83%|████████▎ | 6200/7500 [58:07<11:54,  1.82it/s]

{'loss': 0.7562, 'grad_norm': 14.73144817352295, 'learning_rate': 1.756756756756757e-06, 'epoch': 2.48}


 84%|████████▍ | 6300/7500 [59:01<10:54,  1.83it/s]

{'loss': 0.768, 'grad_norm': 14.492931365966797, 'learning_rate': 1.6216216216216219e-06, 'epoch': 2.52}


 85%|████████▌ | 6400/7500 [59:56<09:58,  1.84it/s]

{'loss': 0.7643, 'grad_norm': 17.433677673339844, 'learning_rate': 1.4864864864864868e-06, 'epoch': 2.56}


 87%|████████▋ | 6500/7500 [1:00:51<09:05,  1.83it/s]

{'loss': 0.7692, 'grad_norm': 17.034439086914062, 'learning_rate': 1.3513513513513515e-06, 'epoch': 2.6}


 88%|████████▊ | 6600/7500 [1:01:46<08:11,  1.83it/s]

{'loss': 0.757, 'grad_norm': 18.697004318237305, 'learning_rate': 1.2162162162162164e-06, 'epoch': 2.64}


 89%|████████▉ | 6700/7500 [1:02:41<07:16,  1.83it/s]

{'loss': 0.7565, 'grad_norm': 16.491140365600586, 'learning_rate': 1.0810810810810812e-06, 'epoch': 2.68}


 91%|█████████ | 6800/7500 [1:03:35<06:22,  1.83it/s]

{'loss': 0.7888, 'grad_norm': 15.6122407913208, 'learning_rate': 9.459459459459461e-07, 'epoch': 2.72}


 92%|█████████▏| 6900/7500 [1:04:30<05:29,  1.82it/s]

{'loss': 0.7522, 'grad_norm': 16.12258529663086, 'learning_rate': 8.108108108108109e-07, 'epoch': 2.76}


 93%|█████████▎| 7000/7500 [1:05:25<04:33,  1.83it/s]

{'loss': 0.7725, 'grad_norm': 15.24124526977539, 'learning_rate': 6.756756756756758e-07, 'epoch': 2.8}


 95%|█████████▍| 7100/7500 [1:06:20<03:37,  1.84it/s]

{'loss': 0.7631, 'grad_norm': 18.25636100769043, 'learning_rate': 5.405405405405406e-07, 'epoch': 2.84}


 96%|█████████▌| 7200/7500 [1:07:15<02:43,  1.84it/s]

{'loss': 0.7491, 'grad_norm': 19.60845375061035, 'learning_rate': 4.0540540540540546e-07, 'epoch': 2.88}


 97%|█████████▋| 7300/7500 [1:08:09<01:49,  1.83it/s]

{'loss': 0.7423, 'grad_norm': 16.948884963989258, 'learning_rate': 2.702702702702703e-07, 'epoch': 2.92}


 99%|█████████▊| 7400/7500 [1:09:04<00:54,  1.83it/s]

{'loss': 0.7416, 'grad_norm': 19.66415023803711, 'learning_rate': 1.3513513513513515e-07, 'epoch': 2.96}


100%|██████████| 7500/7500 [1:09:59<00:00,  1.81it/s]

{'loss': 0.7457, 'grad_norm': 15.98219108581543, 'learning_rate': 0.0, 'epoch': 3.0}


                                                     
100%|██████████| 7500/7500 [1:10:29<00:00,  1.77it/s]

{'eval_loss': 0.7273839712142944, 'eval_runtime': 29.2309, 'eval_samples_per_second': 136.842, 'eval_steps_per_second': 34.21, 'epoch': 3.0}
{'train_runtime': 4229.342, 'train_samples_per_second': 28.373, 'train_steps_per_second': 1.773, 'train_loss': 0.8044352213541667, 'epoch': 3.0}





### Save

In [10]:
# Save paths
model_save_path = "./fine_tuned_alpaca_gpt2"

# Save the model and tokenizer
print("Saving model and tokenizer...")
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

# Quick verification that the save worked
print("\nVerifying save...")
try:
    # Try to load the model and tokenizer
    test_model = AutoModelForCausalLM.from_pretrained(model_save_path)
    test_tokenizer = AutoTokenizer.from_pretrained(model_save_path)
    print("✓ Successfully loaded saved model and tokenizer")
except Exception as e:
    print(f"Error verifying save: {e}")

Saving model and tokenizer...
Model and tokenizer saved to ./fine_tuned_alpaca_gpt2

Verifying save...
✓ Successfully loaded saved model and tokenizer
