# Installs and Imports

In [1]:
import os
import torch

# CUDA settings
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True

# Check CUDA availability and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    # Set default CUDA device
    torch.cuda.set_device(0)
    
# Import other libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import json

CUDA available: True
CUDA device: NVIDIA GeForce RTX 4070 Ti SUPER
CUDA version: 11.8


  from .autonotebook import tqdm as notebook_tqdm


# Data

### load model and tokenizers

In [2]:
model_name = "distilgpt2"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Print tokenizer info
print(f"Tokenizer pad token: {tokenizer.pad_token}")
print(f"Tokenizer pad token ID: {tokenizer.pad_token_id}")

# Create model with explicit config
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_name)
config.pad_token_id = tokenizer.pad_token_id

# Try loading model with device placement in from_pretrained
try:
    print("Attempting to load model directly to device...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        config=config,
        device_map="auto" if torch.cuda.is_available() else None,
        torch_dtype=torch.float32  # Explicitly set dtype
    )
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model directly to device: {e}")
    print("Attempting alternate loading method...")
    # Try alternate loading method
    model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
    if torch.cuda.is_available():
        print("Moving model to CUDA...")
        # Try moving parts of the model gradually
        for param in model.parameters():
            param.data = param.data.to('cuda')
    print("Model loading complete")

# Print model device info
print(f"\nModel device check:")
print(f"Model is on CUDA: {next(model.parameters()).is_cuda}")

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Attempting to load model directly to device...
Model loaded successfully

Model device check:
Model is on CUDA: True


### tokenize function

In [3]:
# Define max length for the sequences
MAX_LENGTH = 512

def format_alpaca_prompt(example):
    """Format the instruction and input into a prompt"""
    if example["input"]:
        prompt = f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n"
    else:
        prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n"
    return prompt

def tokenize_function(examples):
    """Tokenize the texts and prepare them for training"""
    # Tokenize with padding and truncation
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors=None
    )
    
    labels = tokenized["input_ids"].copy()
    
    # Find the start of the response for each example
    for idx, text in enumerate(examples["text"]):
        response_start = text.find("### Response:\n") + len("### Response:\n")
        # Get prompt tokens
        prompt_tokens = len(tokenizer(text[:response_start], 
                                    truncation=True, 
                                    max_length=MAX_LENGTH)["input_ids"])
        
        # Mask out prompt tokens in labels
        labels[idx][:prompt_tokens] = [-100] * prompt_tokens
        
        # Ensure no out-of-bounds indices
        if prompt_tokens > MAX_LENGTH:
            labels[idx] = [-100] * MAX_LENGTH
    
    # Convert to PyTorch tensors
    input_ids = torch.tensor(tokenized["input_ids"], dtype=torch.long)
    attention_mask = torch.tensor(tokenized["attention_mask"], dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [4]:
def load_alpaca_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def prepare_dataset(data):
    """Convert the JSON data into a format suitable for the model"""
    formatted_data = []
    for item in data:
        prompt = format_alpaca_prompt(item)
        formatted_data.append({
            "text": prompt + item["output"]  # Combine prompt and output
        })
    return Dataset.from_list(formatted_data)

# Load and prepare the data
alpaca_data = load_alpaca_data('alpaca_data_cleaned.json')
train_size = int(0.9 * len(alpaca_data))
train_data = alpaca_data[:train_size]
eval_data = alpaca_data[train_size:]

# Convert to Dataset format
train_dataset = prepare_dataset(train_data)
eval_dataset = prepare_dataset(eval_data)

# Tokenize the datasets with smaller batch size and add error handling
def safe_map_tokenization(dataset, batch_size=4):
    try:
        return dataset.map(
            tokenize_function,
            batched=True,
            batch_size=batch_size,
            remove_columns=dataset.column_names
        )
    except Exception as e:
        print(f"Error during tokenization: {e}")
        raise

print("Tokenizing training dataset...")
tokenized_train = safe_map_tokenization(train_dataset)
print("Tokenizing evaluation dataset...")
tokenized_eval = safe_map_tokenization(eval_dataset)

# Set the tensor format
tokenized_train.set_format("torch")
tokenized_eval.set_format("torch")

# Verify data format
print("\nVerifying data format:")
sample = tokenized_train[0]
for key, value in sample.items():
    print(f"{key}: shape={value.shape}, dtype={value.dtype}")

# Add validation check
def validate_dataset(dataset, name):
    print(f"\nValidating {name}:")
    invalid_samples = 0
    for i, sample in enumerate(dataset):
        if not all(isinstance(v, torch.Tensor) for v in sample.values()):
            print(f"Sample {i} has non-tensor values")
            invalid_samples += 1
        if any(v.dtype not in [torch.long, torch.int64] for v in sample.values()):
            print(f"Sample {i} has incorrect dtype")
            invalid_samples += 1
    print(f"Found {invalid_samples} invalid samples")
    return invalid_samples == 0

validate_dataset(tokenized_train, "training dataset")
validate_dataset(tokenized_eval, "evaluation dataset")

Tokenizing training dataset...


Map: 100%|██████████| 46584/46584 [00:21<00:00, 2160.57 examples/s]


Tokenizing evaluation dataset...


Map: 100%|██████████| 5176/5176 [00:02<00:00, 1964.88 examples/s]



Verifying data format:
input_ids: shape=torch.Size([512]), dtype=torch.int64
attention_mask: shape=torch.Size([512]), dtype=torch.int64
labels: shape=torch.Size([512]), dtype=torch.int64

Validating training dataset:
Found 0 invalid samples

Validating evaluation dataset:
Found 0 invalid samples


True

In [5]:
# Scale datasets for testing
def scale_dataset(dataset, max_samples=1000):
    """Scale down a dataset to a maximum number of samples"""
    if len(dataset) > max_samples:
        scaled_indices = list(range(max_samples))
        return dataset.select(scaled_indices)
    return dataset

# Set your desired size
MAX_SAMPLES = 2000  # Adjust this number as needed

# Scale both datasets
print(f"Original sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")

tokenized_train = scale_dataset(tokenized_train, MAX_SAMPLES)
tokenized_eval = scale_dataset(tokenized_eval, max(50, int(MAX_SAMPLES * 0.1)))  # Keep eval set ~10% of train

print(f"Scaled sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")

Original sizes - Train: 46584, Eval: 5176
Scaled sizes - Train: 2000, Eval: 200


# Fine Tuning Setup

### validate setup

In [6]:
# Check vocab sizes and data ranges
print(f"Tokenizer vocab size: {len(tokenizer)}")
print(f"Model vocab size: {model.config.vocab_size}")

# Function to check tensor values
def check_tensor_values(tensor, name):
    if isinstance(tensor, torch.Tensor):
        print(f"{name} - Min: {tensor.min().item()}, Max: {tensor.max().item()}, Shape: {tensor.shape}")

# Check a sample from the dataset
sample = tokenized_train[0]
for key, value in sample.items():
    check_tensor_values(value, key)

Tokenizer vocab size: 50257
Model vocab size: 50257
input_ids - Min: -100, Max: 50256, Shape: torch.Size([512])
attention_mask - Min: 0, Max: 1, Shape: torch.Size([512])
labels - Min: -100, Max: 50256, Shape: torch.Size([512])


### custom collator

In [7]:
from dataclasses import dataclass
from typing import Optional, Union, List, Dict, Any


@dataclass
class CustomDataCollatorForLanguageModeling:
    tokenizer: AutoTokenizer
    mlm: bool = False
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Extract the relevant fields
        input_ids = [example["input_ids"] for example in examples]
        attention_mask = [example["attention_mask"] for example in examples]
        labels = [example["labels"] for example in examples]

        # Convert to tensors if they aren't already
        if not isinstance(input_ids[0], torch.Tensor):
            input_ids = [torch.tensor(ids, dtype=torch.long) for ids in input_ids]
        if not isinstance(attention_mask[0], torch.Tensor):
            attention_mask = [torch.tensor(mask, dtype=torch.long) for mask in attention_mask]
        if not isinstance(labels[0], torch.Tensor):
            labels = [torch.tensor(lab, dtype=torch.long) for lab in labels]

        # Pad sequences
        max_length = max(ids.size(0) for ids in input_ids)
        
        def pad_sequence(sequences, pad_value):
            result = torch.full((len(sequences), max_length), pad_value, dtype=torch.long)
            for i, seq in enumerate(sequences):
                length = seq.size(0)
                result[i, :length] = seq
            return result

        # Pad and create batch
        input_ids_padded = pad_sequence(input_ids, self.tokenizer.pad_token_id)
        attention_mask_padded = pad_sequence(attention_mask, 0)
        labels_padded = pad_sequence(labels, -100)

        # Ensure values are within vocabulary bounds
        vocab_size = len(self.tokenizer)
        input_ids_padded = torch.clamp(input_ids_padded, min=0, max=vocab_size-1)
        labels_padded = torch.where(
            (labels_padded >= 0) & (labels_padded < vocab_size),
            labels_padded,
            torch.tensor(-100, dtype=torch.long)
        )

        return {
            "input_ids": input_ids_padded,
            "attention_mask": attention_mask_padded,
            "labels": labels_padded
        }

### Training args

In [8]:
# Training arguments with safe defaults
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=100,
    save_total_limit=2,
    logging_dir="./logs",
    max_grad_norm=0.5,
    gradient_accumulation_steps=16,
    fp16=False,
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    prediction_loss_only=True,
    seed=42,
    full_determinism=False,
)

# Create custom collator
data_collator = CustomDataCollatorForLanguageModeling(tokenizer=tokenizer)

# Initialize trainer with custom collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

# Test the collator with a small batch
test_batch = data_collator([tokenized_train[i] for i in range(2)])
print("\nTest batch shapes:")
for k, v in test_batch.items():
    print(f"{k}: {v.shape}, dtype: {v.dtype}, range: [{v.min()}, {v.max()}]")




Test batch shapes:
input_ids: torch.Size([2, 512]), dtype: torch.int64, range: [0, 50256]
attention_mask: torch.Size([2, 512]), dtype: torch.int64, range: [0, 1]
labels: torch.Size([2, 512]), dtype: torch.int64, range: [-100, 50256]


# Fine Tune

In [9]:
# Clear CUDA cache and start training
if torch.cuda.is_available():
    torch.cuda.empty_cache()

try:
    trainer.train()
except RuntimeError as e:
    print(f"Training error: {e}")
    if torch.cuda.is_available():
        print("\nCUDA Memory Summary:")
        print(torch.cuda.memory_summary())
    raise

 27%|██▋       | 100/375 [00:56<02:30,  1.83it/s]

{'loss': 3.3951, 'grad_norm': 16.91701889038086, 'learning_rate': 1e-05, 'epoch': 0.8}


                                                 
 33%|███▎      | 125/375 [01:11<02:16,  1.84it/s]

{'eval_loss': 0.7909576296806335, 'eval_runtime': 1.5058, 'eval_samples_per_second': 132.818, 'eval_steps_per_second': 33.205, 'epoch': 1.0}


 53%|█████▎    | 200/375 [01:54<01:36,  1.81it/s]

{'loss': 0.7923, 'grad_norm': 10.684296607971191, 'learning_rate': 6.363636363636364e-06, 'epoch': 1.6}


                                                 
 67%|██████▋   | 250/375 [02:24<01:08,  1.81it/s]

{'eval_loss': 0.7763142585754395, 'eval_runtime': 1.4697, 'eval_samples_per_second': 136.078, 'eval_steps_per_second': 34.019, 'epoch': 2.0}


 80%|████████  | 300/375 [02:52<00:42,  1.78it/s]

{'loss': 0.7842, 'grad_norm': 18.6118106842041, 'learning_rate': 2.7272727272727272e-06, 'epoch': 2.4}


                                                 
100%|██████████| 375/375 [03:36<00:00,  1.73it/s]

{'eval_loss': 0.7742096185684204, 'eval_runtime': 1.4948, 'eval_samples_per_second': 133.795, 'eval_steps_per_second': 33.449, 'epoch': 3.0}
{'train_runtime': 216.2747, 'train_samples_per_second': 27.742, 'train_steps_per_second': 1.734, 'train_loss': 1.4781442565917968, 'epoch': 3.0}





### Save

In [10]:
# Save paths
model_save_path = "../FinalModels/NEWfine_tuned_alpaca_gpt2"

# Save the model and tokenizer
print("Saving model and tokenizer...")
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

# Quick verification that the save worked
print("\nVerifying save...")
try:
    # Try to load the model and tokenizer
    test_model = AutoModelForCausalLM.from_pretrained(model_save_path)
    test_tokenizer = AutoTokenizer.from_pretrained(model_save_path)
    print("✓ Successfully loaded saved model and tokenizer")
except Exception as e:
    print(f"Error verifying save: {e}")

Saving model and tokenizer...
Model and tokenizer saved to ../FinalModels/NEWfine_tuned_alpaca_gpt2

Verifying save...
✓ Successfully loaded saved model and tokenizer
