# DSPy Fine-tuning Debug Notebook

This notebook includes extensive debugging to understand why training loss is 0.

In [2]:
import os
# Set CUDA device to GPU 1
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

import torch
torch.cuda.set_device(0)

print(f"Using GPU: {torch.cuda.get_device_name(0)}")
print(f"Memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

Using GPU: NVIDIA GeForce RTX 3090
Memory allocated: 0.00 GB


In [3]:
# Import and setup the debugging function
from dspy.clients import lm_local
import dspy
import json

# Store the original function
original_train_sft_locally = lm_local.train_sft_locally

In [4]:
# Define the debugging version
def fixed_train_sft_locally(model_name, train_data, train_kwargs):
    """Fixed version of train_sft_locally with extensive debugging"""
    print("\n" + "="*80)
    print("🔍 DEBUG: Starting fixed_train_sft_locally")
    print("="*80)
    
    # Debug: Print training data
    print(f"\n📊 Training data info:")
    print(f"  - Number of examples: {len(train_data)}")
    print(f"  - First example keys: {list(train_data[0].keys()) if train_data else 'No data'}")
    if train_data and len(train_data) > 0:
        print(f"\n  - First example content:")
        first_example = train_data[0]
        print(json.dumps(first_example, indent=2))
    
    try:
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from trl import SFTConfig, SFTTrainer, setup_chat_format
    except ImportError:
        raise ImportError(
            "For local finetuning, please install torch, transformers, and trl "
            "by running `pip install -U torch transformers accelerate trl peft`"
        )

    device = train_kwargs.get("device", None)
    if device is None:
        device = "cuda:0"  # Use first visible device
    print(f"\n🖥️  Using device: {device}")

    # Load model with explicit device mapping
    print(f"\n📥 Loading model: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_name,
        torch_dtype=torch.bfloat16,  # Use bf16
        device_map=device,  # Explicit device mapping
        use_cache=False,
    )
    print(f"✅ Model loaded successfully")
    
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

    # Set up the chat format
    try:
        model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)
        print("✅ Chat format setup successful")
    except Exception as e:
        print(f"⚠️  Chat format setup failed (this is often OK): {e}")

    if tokenizer.pad_token_id is None:
        print("➕ Adding pad token to tokenizer")
        tokenizer.add_special_tokens({"pad_token": "[!#PAD#!]"})

    print("\n🔄 Creating dataset")
    if "max_seq_length" not in train_kwargs:
        train_kwargs["max_seq_length"] = 512  # Reduced from 4096
        print(f"📏 Set max_seq_length to {train_kwargs['max_seq_length']}")

    from datasets import Dataset
    hf_dataset = Dataset.from_list(train_data)

    def tokenize_function(example):
        result = lm_local.encode_sft_example(example, tokenizer, train_kwargs["max_seq_length"])
        # Debug first tokenized example
        if hasattr(tokenize_function, 'first_call'):
            tokenize_function.first_call = False
            print("\n🔍 DEBUG: First tokenized example:")
            print(f"  - Input IDs shape: {result['input_ids'].shape}")
            print(f"  - Labels shape: {result['labels'].shape}")
            print(f"  - Non-negative labels: {(result['labels'] != -100).sum().item()}")
            # Decode some tokens to see what we're training on
            non_neg_indices = (result['labels'] != -100).nonzero(as_tuple=True)[0]
            if len(non_neg_indices) > 0:
                sample_tokens = result['input_ids'][non_neg_indices[:50]]  # First 50 non-masked tokens
                decoded = tokenizer.decode(sample_tokens, skip_special_tokens=False)
                print(f"  - Sample decoded tokens: {decoded[:200]}...")
        return result
    
    tokenize_function.first_call = True

    print("\n🔄 Tokenizing dataset...")
    tokenized_dataset = hf_dataset.map(tokenize_function, batched=False)
    tokenized_dataset.set_format(type="torch")
    
    # Debug dataset statistics
    print(f"\n📊 Dataset statistics:")
    print(f"  - Dataset size: {len(tokenized_dataset)}")
    
    # Check if any examples have non-masked labels
    total_non_masked = 0
    for i in range(min(5, len(tokenized_dataset))):
        example = tokenized_dataset[i]
        non_masked = (example["labels"] != -100).sum().item()
        total_non_masked += non_masked
        print(f"  - Example {i}: {non_masked} non-masked tokens out of {len(example['labels'])}")
    
    if total_non_masked == 0:
        print("\n⚠️  WARNING: No non-masked tokens found in first 5 examples!")
        print("    This means the model has nothing to learn from!")

    use_peft = train_kwargs.get("use_peft", False)
    peft_config = None

    if use_peft:
        from peft import LoraConfig
        peft_config = LoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
            bias="none",
            target_modules="all-linear",
            task_type="CAUSAL_LM",
        )
        print("✅ PEFT/LoRA config created")

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()
    print("✅ Gradient checkpointing enabled")

    # Print training configuration
    print(f"\n⚙️  Training configuration:")
    print(f"  - Output dir: {train_kwargs['output_dir']}")
    print(f"  - Epochs: {train_kwargs.get('num_train_epochs', 2)}")
    print(f"  - Batch size: {train_kwargs.get('per_device_train_batch_size', 1)}")
    print(f"  - Gradient accumulation: {train_kwargs.get('gradient_accumulation_steps', 8)}")
    print(f"  - Learning rate: {train_kwargs.get('learning_rate', 2e-5)}")
    print(f"  - Max seq length: {train_kwargs['max_seq_length']}")
    print(f"  - Packing: False (disabled)")
    print(f"  - BF16: True")

    sft_config = SFTConfig(
        output_dir=train_kwargs["output_dir"],
        num_train_epochs=train_kwargs.get("num_train_epochs", 2),
        per_device_train_batch_size=train_kwargs.get("per_device_train_batch_size", 1),
        gradient_accumulation_steps=train_kwargs.get("gradient_accumulation_steps", 8),
        learning_rate=train_kwargs.get("learning_rate", 2e-5),
        max_grad_norm=2.0,
        logging_steps=5,  # More frequent logging
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        save_steps=50,
        save_total_limit=2,
        bf16=True,  # Always use bf16
        max_seq_length=train_kwargs["max_seq_length"],
        packing=False,  # Disable packing to avoid issues
        dataset_kwargs={
            "add_special_tokens": False,
            "append_concat_token": False,
        },
        gradient_checkpointing=True,
        optim=train_kwargs.get("optim", "adamw_torch_fused"),
        logging_first_step=True,  # Log the first step
        eval_on_start=False,  # Don't evaluate at start
    )

    print("\n🏋️ Starting training...")
    
    # Create a custom trainer to add more debugging
    class DebugSFTTrainer(SFTTrainer):
        def compute_loss(self, model, inputs,
                         num_items_in_batch=None, return_outputs=False):
    
            loss_or_tuple = super().compute_loss(
                model, inputs,
                num_items_in_batch=num_items_in_batch,
                return_outputs=return_outputs
            )
    
            # ── Unpack if necessary ──────────────────────────────
            if isinstance(loss_or_tuple, tuple):
                loss_value, outputs = loss_or_tuple           # trl ≥0.8 path
            else:
                loss_value, outputs = loss_or_tuple, None     # older trl path
    
            # Only print once
            if not hasattr(self, "_debug_printed"):
                self._debug_printed = True
                print("\n🔍 DEBUG: First batch in compute_loss:")
                print(f"  - Input shape:  {inputs['input_ids'].shape}")
                print(f"  - Labels shape: {inputs['labels'].shape}")
                print(f"  - Non‑masked labels: {(inputs['labels'] != -100).sum().item()}")
                print(f"  - Loss value: {loss_value.item()}")
    
            # ── Return what the Trainer expects ─────────────────
            if return_outputs:
                return loss_value, outputs                    # keep the tuple form
            else:
                return loss_value                             # scalar tensor only
    
        
    trainer = DebugSFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=tokenized_dataset,
        peft_config=peft_config,
    )

    # Train!
    print("\n🚂 Training started...")
    trainer.train()
    print("✅ Training completed!")

    # Save the model!
    print(f"\n💾 Saving model to {sft_config.output_dir}")
    trainer.save_model()
    tokenizer.save_pretrained(sft_config.output_dir)

    if use_peft:
        from peft import AutoPeftModelForCausalLM
        print("\n🔄 Merging PEFT adapter...")
        # Load PEFT model on CPU
        model_ = AutoPeftModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=sft_config.output_dir,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        )
        merged_model = model_.merge_and_unload()
        merged_model.save_pretrained(sft_config.output_dir, safe_serialization=True, max_shard_size="5GB")
        print("✅ PEFT adapter merged")

    # Clean up!
    import gc
    del model
    del tokenizer
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    print(f"\n✅ Returning output dir: {sft_config.output_dir}")
    print("="*80 + "\n")
    
    return sft_config.output_dir

# Apply the patch
lm_local.train_sft_locally = fixed_train_sft_locally

In [5]:
# Load the data
import random
from dspy.datasets import DataLoader
from datasets import load_dataset

# Load the Banking77 dataset
ds = load_dataset("banking77", split="train")
CLASSES = ds.features['label'].names
print(f"Number of classes: {len(CLASSES)}")
print(f"First 5 classes: {CLASSES[:5]}")

Number of classes: 77
First 5 classes: ['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up']


In [6]:
# Prepare the data
kwargs = dict(fields=("text", "label"), input_keys=("text",), split="train")
raw_data = [
    dspy.Example(x, label=CLASSES[x.label]).with_inputs("text")
    for x in DataLoader().from_huggingface(dataset_name="banking77", **kwargs)[:1000]
]
random.Random(0).shuffle(raw_data)

unlabeled_trainset = [dspy.Example(text=x.text).with_inputs("text") for x in raw_data[:500]]
print(f"Prepared {len(unlabeled_trainset)} unlabeled examples")
print(f"First example text: {unlabeled_trainset[0].text[:100]}...")

Prepared 500 unlabeled examples
First example text: What if there is an error on the exchange rate?...


In [None]:
# Setup authentication
os.environ["HF_HUB_TOKEN"] = "hf_[REDACTED]"

from huggingface_hub import login
login(token="hf_[REDACTED]")

In [8]:
# Setup DSPy
from typing import Literal
from dspy.clients.lm_local import LocalProvider
dspy.settings.experimental = True

classify = dspy.ChainOfThought(f"text -> label: Literal{CLASSES}")

student_lm_name = "meta-llama/Llama-3.2-1B-Instruct"
student_lm = dspy.LM(model=f"openai/local:{student_lm_name}",
                     provider=LocalProvider(), max_tokens=2000)
teacher_lm = dspy.LM('openai/gpt-4o-mini', max_tokens=3000)

student_classify = classify.deepcopy()
student_classify.set_lm(student_lm)

teacher_classify = classify.deepcopy()
teacher_classify.set_lm(teacher_lm)

print("Models configured")

Models configured


In [9]:
# First, let's manually check what the teacher generates
print("Testing teacher model output...")
test_example = unlabeled_trainset[0]
teacher_output = teacher_classify(text=test_example.text)
print(f"\nInput text: {test_example.text[:100]}...")
print(f"Teacher predicted label: {teacher_output.label}")
print(f"Teacher reasoning: {teacher_output.reasoning[:200]}..." if hasattr(teacher_output, 'reasoning') else "No reasoning available")

Testing teacher model output...

Input text: What if there is an error on the exchange rate?...
Teacher predicted label: card_payment_wrong_exchange_rate
Teacher reasoning: The question pertains to an error regarding the exchange rate, which suggests a concern about the accuracy of the rates applied during transactions. This aligns with the label 'card_payment_wrong_exch...


In [None]:
# Training kwargs optimized for GPU 1
train_kwargs = {
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "gradient_checkpointing": True,
    "bf16": True,
    "optim": "adamw_torch_fused",
    "max_seq_length": 2048,
    "packing": False,
    "num_train_epochs": 2,
    "use_peft": False,  # Disable PEFT for now
}

print("Starting DSPy BootstrapFinetune...")
optimizer = dspy.BootstrapFinetune(num_threads=16, train_kwargs=train_kwargs)

# Use a smaller subset for testing
small_trainset = unlabeled_trainset[:10]  # Even smaller for debugging

classify_ft = optimizer.compile(
    student_classify,
    teacher=teacher_classify,
    trainset=small_trainset
)

In [32]:
classify_ft.save("tst", save_program = True)

In [34]:
import os
os.environ["DSPY_CACHEDIR"]

KeyError: 'DSPY_CACHEDIR'

In [35]:
classify_ft.get_lm().launch()

2025/07/12 13:45:51 INFO dspy.clients.lm_local: Grabbing a free port to launch an SGLang server for model /home/maxime/.dspy_cache/finetune/c337d0d9b6c05490_aji0y6_meta-llama-Llama-3.2-1B-Instruct_2025-07-12_13-13-30
2025/07/12 13:45:51 INFO dspy.clients.lm_local: We see that CUDA_VISIBLE_DEVICES is 1
2025/07/12 13:45:51 INFO dspy.clients.lm_local: SGLang server process started with PID 12919.


[2025-07-12 13:45:56] server_args=ServerArgs(model_path='/home/maxime/.dspy_cache/finetune/c337d0d9b6c05490_aji0y6_meta-llama-Llama-3.2-1B-Instruct_2025-07-12_13-13-30', tokenizer_path='/home/maxime/.dspy_cache/finetune/c337d0d9b6c05490_aji0y6_meta-llama-Llama-3.2-1B-Instruct_2025-07-12_13-13-30', tokenizer_mode='auto', skip_tokenizer_init=False, skip_server_warmup=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='/home/maxime/.dspy_cache/finetune/c337d0d9b6c05490_aji0y6_meta-llama-Llama-3.2-1B-Instruct_2025-07-12_13-13-30', chat_template=None, completion_template=None, is_embedding=False, enable_multimodal=None, revision=None, hybrid_kvcache_ratio=None, impl='auto', host='0.0.0.0', port=35121, nccl_port=None, mem_fraction_static=0.879, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=2048, m

2025/07/12 13:46:10 INFO dspy.clients.lm_local: Server ready on random port 35121! Logs are available via lm.get_logs() method on returned lm.


In [37]:
print("\n✅ DSPy BootstrapFinetune completed!")

# Test the result
print("\n🧪 Testing fine-tuned model...")
test_text = unlabeled_trainset[0].text
result = classify_ft(text=test_text)
print(f"Test input: {test_text[:100]}...")
print(f"Predicted label: {result.label}")


✅ DSPy BootstrapFinetune completed!

🧪 Testing fine-tuned model...
Test input: What if there is an error on the exchange rate?...
Predicted label: exchange_rate


In [38]:
devset = raw_data[500:600]
devset[0]

Example({'text': 'Which fiat currencies do you currently support? Will this change in this future?', 'label': 'fiat_currency_support'}) (input_keys={'text'})

In [39]:
metric = (lambda x, y, trace=None: x.label == y.label)
evaluate = dspy.Evaluate(devset=devset, metric=metric, display_progress=True, display_table=5, num_threads=16)

In [40]:
evaluate(classify_ft)

Average Metric: 13.00 / 20 (65.0%):  19%|██▊            | 19/100 [00:01<00:05, 14.45it/s]



Average Metric: 67.00 / 100 (67.0%): 100%|█████████████| 100/100 [00:05<00:00, 17.88it/s]

2025/07/12 14:03:39 INFO dspy.evaluate.evaluate: Average Metric: 67 / 100 (67.0%)





Unnamed: 0,text,example_label,reasoning,pred_label,<lambda>
0,Which fiat currencies do you currently support? Will this change i...,fiat_currency_support,"The question is about the support for fiat currencies, which relat...",fiat_currency_support,✔️ [True]
1,I didn't receive my money earlier and it says the transaction is s...,pending_cash_withdrawal,The user is inquiring about a situation where they did not receive...,cash_withdrawal_not_recognised,
2,what currencies do you accept?,fiat_currency_support,"The question is about the currencies that the service accepts, whi...",fiat_currency_support,✔️ [True]
3,Where can I find your exchange rates?,exchange_rate,"The user is inquiring about where to find exchange rates, which re...",exchange_rate,✔️ [True]
4,why hasnt my card come in yet?,card_arrival,"The user is inquiring about the status of their card, specifically...",card_arrival,✔️ [True]


67.0

## Additional Debug: Manually inspect the training data

Let's manually look at what DSPy is generating for training data:

In [None]:
# Let's manually inspect the training data file
import json
import glob

# Find the most recent training data file
data_files = glob.glob("/home/maxime/.dspy_cache/finetune/*.jsonl")
if data_files:
    latest_file = max(data_files, key=os.path.getctime)
    print(f"Latest training data file: {latest_file}")
    
    # Read first few examples
    with open(latest_file, 'r') as f:
        for i, line in enumerate(f):
            if i >= 3:  # Only show first 3 examples
                break
            example = json.loads(line)
            print(f"\n--- Example {i+1} ---")
            print(json.dumps(example, indent=2))
else:
    print("No training data files found")