# BDH Full Pipeline: Train → Merge → Fine-tune (Kaggle GPU)

**Purpose:** Train better specialists, merge them, and fine-tune with frozen sparse layers so that:
- Specialist losses reach ~0.5-0.7 (down from ~0.9)
- Merged fine-tuned loss drops to ~1.0-1.5 on both languages
- Heritage probe shows clear language routing (French input → French neurons)

**What this notebook does:**
1. Retrain French & Portuguese specialists for **15,000 iterations** (was 5,000)
2. Merge the two specialists (concatenate neurons)
3. Fine-tune with `--freeze-sparse` to fix embed/lm_head while **preserving specialist neuron identity**
4. Evaluate all 4 models + heritage probe → produce `merge_data.json`

**Settings:** Kaggle → GPU T4 x2, Internet ON

**Time estimate:** ~60-80 min total (retrain ~40 min + download + finetune + eval)

## Step 0: Verify GPU

In [None]:
import torch
# Report the PyTorch build and whether this kernel can see a CUDA device.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if not cuda_ok:
    print("WARNING: No GPU detected! This will be very slow on CPU.")
    print("Go to Settings > Accelerator > GPU T4 x2")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    props = torch.cuda.get_device_properties(0)
    # Prefer `total_memory`; fall back to `total_mem` (then 0) if it is absent or falsy.
    mem = getattr(props, 'total_memory', None)
    if not mem:
        mem = getattr(props, 'total_mem', 0)
    print(f"Memory: {mem / 1e9:.1f} GB")

## Step 1: Setup Project

In [None]:
import os, shutil, glob, subprocess

# Find project in Kaggle input
INPUT_BASE = "/kaggle/input"
WORK_DIR = "/kaggle/working/bdh"

# ── 1. Diagnostics: list the top level of /kaggle/input so a missing or
#       unattached dataset is obvious before the search below runs ──
print("Contents of /kaggle/input:")
if not os.path.exists(INPUT_BASE):
    print("  /kaggle/input does not exist!")
else:
    top = os.listdir(INPUT_BASE)
    if top:
        for entry in top:
            entry_path = os.path.join(INPUT_BASE, entry)
            # Size is only queried for regular files, never for directories.
            if os.path.isdir(entry_path):
                kind = "dir"
            else:
                kind = f"file ({os.path.getsize(entry_path)/1e6:.1f} MB)"
            print(f"  {entry}  [{kind}]")
    else:
        # Nothing attached: walk the user through adding the dataset.
        for line in (
            "  (empty!)",
            "\n⚠ You need to attach the dataset to this notebook:",
            "  1. Click 'Add Data' in the right sidebar",
            "  2. Search for your uploaded dataset name",
            "  3. Click 'Add' to attach it",
            "  4. Re-run this cell\n",
        ):
            print(line)

# ── 2. If there's a zip file, extract it first ──
# Kaggle mounts /kaggle/input read-only, so an archive cannot be unpacked next
# to the zip itself. Extract into a writable scratch directory that mirrors the
# input layout instead.
import zipfile

EXTRACT_BASE = "/kaggle/working/_input_extracted"

for root, dirs, files in os.walk(INPUT_BASE):
    for f in files:
        if not f.endswith('.zip'):
            continue
        zpath = os.path.join(root, f)
        rel_root = os.path.relpath(root, INPUT_BASE)
        extract_to = os.path.normpath(
            os.path.join(EXTRACT_BASE, rel_root, f[:-len('.zip')] + '_extracted')
        )
        if os.path.exists(extract_to):
            continue  # already unpacked by an earlier run of this cell
        print(f"\nFound zip: {zpath} — extracting...")
        try:
            with zipfile.ZipFile(zpath, 'r') as zf:
                zf.extractall(extract_to)
        except zipfile.BadZipFile as err:
            # Drop partial output so a re-run retries instead of treating the
            # half-written folder as a finished extraction.
            shutil.rmtree(extract_to, ignore_errors=True)
            print(f"  Skipping unreadable zip: {err}")
            continue
        print(f"  Extracted to: {extract_to}")
    # Only look for archives in the first couple of directory levels.
    depth = root.replace(INPUT_BASE, '').count(os.sep)
    if depth >= 2:
        dirs.clear()

# ── 3. Search for project root (contains training/bdh.py) ──
# The attached dataset is searched first, then anything unpacked above.
src = None
for search_base in (INPUT_BASE, EXTRACT_BASE):
    for root, dirs, files in os.walk(search_base):
        if 'training' in dirs:
            bdh_check = os.path.join(root, 'training', 'bdh.py')
            if os.path.exists(bdh_check):
                src = root
                break
        # Stop descending once we are 6 levels below the search root.
        depth = root.replace(search_base, '').count(os.sep)
        if depth >= 6:
            dirs.clear()
    if src is not None:
        break

# ── 4. Outcome: abort with a directory listing, or copy the project over ──
if src is None:
    print("\n❌ ERROR: Could not find BDH project (training/bdh.py) in /kaggle/input/")
    print("\nFull directory listing:")
    for walk_root, walk_dirs, walk_files in os.walk(INPUT_BASE):
        level = walk_root.replace(INPUT_BASE, '').count(os.sep)
        pad = "  " * level
        print(f"{pad}{os.path.basename(walk_root)}/")
        # Do not descend further once 4 levels deep.
        if level >= 4:
            walk_dirs.clear()
        # Show at most 8 file names per directory, then a count of the rest.
        for fname in walk_files[:8]:
            print(f"{pad}  {fname}")
        hidden = len(walk_files) - 8
        if hidden > 0:
            print(f"{pad}  ... and {hidden} more files")
    raise RuntimeError("Dataset not found — see instructions above")

print(f"\n✓ Found project: {src}")
# Start from a clean working copy: remove any previous one before copying.
if os.path.exists(WORK_DIR):
    shutil.rmtree(WORK_DIR)
shutil.copytree(src, WORK_DIR)
# Later cells use paths relative to the project root.
os.chdir(WORK_DIR)
print(f"  Copied to: {WORK_DIR}")
print(f"  Top-level: {sorted(os.listdir('.'))}")

In [None]:
# Patch bdh.py for PyTorch 2.6+ (weights_only=False)
# PyTorch 2.6 changed the default of torch.load(weights_only=...) to True, so the
# call in training/bdh.py is rewritten to pass weights_only=False explicitly.
import re
bdh_path = os.path.join(WORK_DIR, 'training', 'bdh.py')
with open(bdh_path, 'r', encoding='utf-8') as f:
    content = f.read()

OLD_LOAD_CALL = 'torch.load(path, map_location=device)'
NEW_LOAD_CALL = 'torch.load(path, map_location=device, weights_only=False)'

if 'weights_only' in content:
    # The file already mentions weights_only; leave it untouched.
    print('bdh.py already patched')
elif OLD_LOAD_CALL in content:
    patched = content.replace(OLD_LOAD_CALL, NEW_LOAD_CALL)
    with open(bdh_path, 'w', encoding='utf-8') as f:
        f.write(patched)
    print('Patched bdh.py for PyTorch 2.6+')
else:
    # Neither marker is present: say so instead of claiming the patch succeeded.
    print('WARNING: expected torch.load(path, map_location=device) call not found '
          'in bdh.py; nothing was patched')

In [None]:
# Install PyYAML (used in Step 4 of this notebook to write the training configs),
# plus tqdm and requests (presumably needed by the project scripts — verify).
# -q keeps pip's output quiet.
!pip install -q pyyaml tqdm requests

## Step 2: Verify Checkpoints Exist

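The merged checkpoint normally already exists from the Pipeline notebook. If it is missing but both specialist checkpoints are present, the next cell merges them; if neither is available, that cell raises an error.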

In [None]:
import os

# Check for checkpoints - try multiple known paths
# Each list is in priority order: find_first() below returns the first entry
# that exists on disk. All paths are relative to the current working directory.
MERGED_CANDIDATES = [
    "checkpoints/merged_polyglot/checkpoint_best.pt",
    "checkpoints/merged_polyglot.pt",
]
FRENCH_CANDIDATES = [
    "checkpoints/french_specialist/checkpoint_best.pt",
    "checkpoints/french/french_best.pt",
]
# Only one known location for the Portuguese specialist.
PORTUGUESE_CANDIDATES = [
    "checkpoints/portuguese_specialist/checkpoint_best.pt",
]

def find_first(candidates, label):
    """Return the first path in ``candidates`` that exists, or ``None``.

    Prints one status line prefixed with ``label``: the matching path and its
    size in MB, or a NOT FOUND message listing every path that was tried.
    """
    hit = next((path for path in candidates if os.path.exists(path)), None)
    if hit is None:
        print(f"  {label}: NOT FOUND (tried {candidates})")
        return None
    print(f"  {label}: {hit} ({os.path.getsize(hit)/1e6:.1f} MB)")
    return hit

# Resolve each checkpoint to the first candidate path that exists (None if none do).
MERGED_PATH = find_first(MERGED_CANDIDATES, "Merged")
FRENCH_PATH = find_first(FRENCH_CANDIDATES, "French")
PORTUGUESE_PATH = find_first(PORTUGUESE_CANDIDATES, "Portuguese")

# A merge is needed (and possible) only when the merged checkpoint is absent
# and both specialists are available.
NEED_MERGE = bool(not MERGED_PATH and FRENCH_PATH and PORTUGUESE_PATH)

if not MERGED_PATH:
    print("\nERROR: No merged checkpoint found!")
    print("You need to run the BDH_Kaggle_Pipeline notebook first.")
    print("\nOr if specialists exist, we can merge now:")
    if NEED_MERGE:
        print("  Specialists found! Will merge in next cell.")
    else:
        print("  No specialists either. Run Pipeline notebook first.")
else:
    # Only claim success when the specialists were found as well.
    missing_specialists = [
        name
        for name, path in (("French", FRENCH_PATH), ("Portuguese", PORTUGUESE_PATH))
        if not path
    ]
    if missing_specialists:
        print(f"\nMerged checkpoint found, but specialist checkpoint(s) missing: "
              f"{', '.join(missing_specialists)}")
    else:
        print("\nAll checkpoints found!")

In [None]:
# Merge if needed (only runs if merged checkpoint was missing)
# NEED_MERGE comes from the previous cell: it is True only when no merged
# checkpoint was found but both specialist checkpoints were.
if NEED_MERGE and FRENCH_PATH and PORTUGUESE_PATH:
    # Destination for the newly merged model; its folder is created first.
    MERGED_PATH = "checkpoints/merged_polyglot/checkpoint_best.pt"
    os.makedirs(os.path.dirname(MERGED_PATH), exist_ok=True)
    # IPython substitutes {NAME} with the notebook variable before running the
    # shell command.
    !python analysis/merge.py \
        --model1 {FRENCH_PATH} \
        --model2 {PORTUGUESE_PATH} \
        --output {MERGED_PATH} \
        --skip-eval
    # NOTE(review): a `!` command does not raise on a non-zero exit status, so
    # this message is printed even if merge.py failed — check the output above.
    print(f"Merged checkpoint created: {MERGED_PATH}")
elif not MERGED_PATH:
    # NOTE(review): Steps 4-5 of this notebook retrain the specialists and
    # rebuild the merged checkpoint, so this raise stops a fresh "Run All"
    # before they execute — confirm that is intended.
    raise RuntimeError("No merged checkpoint and can't create one. Run Pipeline first.")

## Step 3: Download Europarl Data

We need the **real** Europarl .bin files for both training and fine-tuning.
This downloads ~200MB of parallel corpus for both language pairs.

In [None]:
# Check if data already exists (from Pipeline run)
FR_TRAIN = "data/en-fr/train.bin"
FR_VAL   = "data/en-fr/val.bin"
PT_TRAIN = "data/en-pt/train.bin"
PT_VAL   = "data/en-pt/val.bin"

have_data = all(os.path.exists(p) for p in [FR_TRAIN, FR_VAL, PT_TRAIN, PT_VAL])

if have_data:
    for p in [FR_TRAIN, FR_VAL, PT_TRAIN, PT_VAL]:
        print(f"  {p}: {os.path.getsize(p)/1e6:.1f} MB")
    print("\nData already exists!")
else:
    print("Downloading Europarl data (this takes 5-10 min)...")
    !python training/download_europarl.py --languages en-fr en-pt --output data/
    # Verify
    for p in [FR_TRAIN, FR_VAL, PT_TRAIN, PT_VAL]:
        if os.path.exists(p):
            print(f"  {p}: {os.path.getsize(p)/1e6:.1f} MB")
        else:
            print(f"  {p}: MISSING!")

## Step 4: Retrain Specialists (15,000 iterations)

The original Pipeline trained for only 5,000 iterations → specialist loss ~0.9.
Training for 15,000 iterations should reach ~0.5-0.7 loss.

Optimized for T4: `float16` + `batch_size=64` + `grad_accum=1` + `torch.compile`
- **~10-15 min per specialist** (plus ~2 min one-time compile overhead)

In [None]:
import yaml, os

# Training setup tuned for a T4 GPU:
#   - float16 (T4 has no bfloat16 tensor cores)
#   - batch_size=64 (model is tiny, T4 has 16GB)
#   - gradient_accumulation_steps=1 (not needed with the large batch)
#   - compile_model=True (one-time 2-3 min compile, then 1.5x faster)
french_config = {
    # data
    'train_data': 'data/en-fr/train.bin',
    'val_data': 'data/en-fr/val.bin',
    # architecture
    'n_layer': 6,
    'n_embd': 192,
    'n_head': 4,
    'mlp_multiplier': 64,
    'dropout': 0.1,
    'vocab_size': 256,
    # batching / schedule
    'batch_size': 64,
    'block_size': 256,
    'max_iters': 15000,
    # optimizer
    'learning_rate': 1.0e-3,
    'min_lr': 1.0e-4,
    'warmup_iters': 500,
    'weight_decay': 0.1,
    'grad_clip': 1.0,
    'gradient_accumulation_steps': 1,
    # logging / evaluation / checkpointing cadence
    'log_interval': 100,
    'eval_interval': 1000,
    'save_interval': 5000,
    'eval_iters': 100,
    # output location and runtime
    'output_dir': 'checkpoints',
    'run_name': 'french_specialist',
    'device': 'cuda',
    'dtype': 'float16',
    'compile_model': True,
}

# The Portuguese run differs only in its data files and run name.
portuguese_config = {
    **french_config,
    'train_data': 'data/en-pt/train.bin',
    'val_data': 'data/en-pt/val.bin',
    'run_name': 'portuguese_specialist',
}

os.makedirs('training/configs', exist_ok=True)
for cfg_path, cfg in (
    ('training/configs/french_15k.yaml', french_config),
    ('training/configs/portuguese_15k.yaml', portuguese_config),
):
    with open(cfg_path, 'w') as f:
        yaml.dump(cfg, f)

# Summary of what was written (both configs share these settings).
tokens_per_iter = french_config['batch_size'] * french_config['block_size']
print(f"Configs written: 15,000 iters, 6L 192D 4H, mlp_multiplier=64")
print(f"  dtype: float16 (T4 tensor cores)")
print(f"  batch_size: {french_config['batch_size']}, grad_accum: {french_config['gradient_accumulation_steps']}")
print(f"  Tokens/iter: {tokens_per_iter:,}")
print(f"  torch.compile: {french_config['compile_model']}")
print(f"  Est. ~10-15 min per specialist (after ~2 min compile)")

## Step 4b: Train French Specialist (~12 min + 2 min compile)

In [None]:
# Run training/train.py in a subprocess with the French config written in Step 4.
# Step 5 expects the result at checkpoints/french_specialist/checkpoint_best.pt
# (presumably output_dir/run_name from the config — verify against train.py).
!python training/train.py --config training/configs/french_15k.yaml

## Step 4c: Train Portuguese Specialist (~12 min, compile cached)

In [None]:
# Run training/train.py in a subprocess with the Portuguese config written in Step 4.
# Step 5 expects the result at checkpoints/portuguese_specialist/checkpoint_best.pt
# (presumably output_dir/run_name from the config — verify against train.py).
!python training/train.py --config training/configs/portuguese_15k.yaml

## Step 5: Merge the Retrained Specialists

Concatenate neuron spaces: French(N=3072) + Portuguese(N=3072) → Merged(N=6144)

In [None]:
# Point at the freshly retrained specialists. These assignments override
# whatever paths Step 2 resolved, and fix where the merged model is written.
FRENCH_PATH = "checkpoints/french_specialist/checkpoint_best.pt"
PORTUGUESE_PATH = "checkpoints/portuguese_specialist/checkpoint_best.pt"
MERGED_PATH = "checkpoints/merged_polyglot/checkpoint_best.pt"

import os
# Make sure the output folder exists before merge.py writes into it.
os.makedirs(os.path.dirname(MERGED_PATH), exist_ok=True)

# --skip-eval: we'll do full eval in Step 7 after fine-tuning
# IPython substitutes {NAME} with the notebook variable before running the command.
!python analysis/merge.py \
    --model1 {FRENCH_PATH} \
    --model2 {PORTUGUESE_PATH} \
    --output {MERGED_PATH} \
    --skip-eval \
    --device cuda

## Step 6: Fine-tune with Frozen Sparse Layers

Key: `--freeze-sparse` freezes the concatenated specialist neurons (encoder/decoder/freqs)
and only trains `embed.weight` + `lm_head` (the averaged shared layers that need routing adaptation).

This preserves specialist neuron identity → heritage probe shows proper language routing.

In [None]:
# Where the fine-tuned checkpoint is written; Steps 7, 9 and 10 read this path.
FINETUNED_PATH = "checkpoints/merged_finetuned/checkpoint_best.pt"

import os
# Make sure the output folder exists before the script writes into it.
os.makedirs(os.path.dirname(FINETUNED_PATH), exist_ok=True)

# Requires MERGED_PATH (Step 5) and FR_TRAIN / PT_TRAIN (Step 3) to be defined
# in the notebook namespace; IPython substitutes {NAME} before running the command.
!python analysis/finetune_merged.py \
    --checkpoint {MERGED_PATH} \
    --output {FINETUNED_PATH} \
    --french-data {FR_TRAIN} \
    --portuguese-data {PT_TRAIN} \
    --iters 1000 \
    --lr 1e-4 \
    --batch-size 4 \
    --block-size 256 \
    --freeze-sparse \
    --device cuda

## Step 7: Full Evaluation + Heritage Probe

Run `merge.py` with `--skip-merge` to evaluate all 4 models and run the heritage probe.
This produces the `merge_data.json` for the frontend visualization.

In [None]:
# Evaluation-only run of merge.py (--skip-merge) over the checkpoints produced
# above. Requires FRENCH_PATH / PORTUGUESE_PATH / MERGED_PATH (Step 5),
# FINETUNED_PATH (Step 6) and FR_VAL / PT_VAL (Step 3) in the notebook namespace.
# Step 8 reads the resulting frontend/public/merge/merge_data.json.
!python analysis/merge.py \
    --model1 {FRENCH_PATH} \
    --model2 {PORTUGUESE_PATH} \
    --output {MERGED_PATH} \
    --finetuned {FINETUNED_PATH} \
    --french-val {FR_VAL} \
    --portuguese-val {PT_VAL} \
    --skip-merge \
    --device cuda

## Step 8: Inspect Results

Parse the JSON to see the 4-model narrative and heritage probe routing quality.

In [None]:
import json


def _fmt_num(value, spec):
    """Format ``value`` with ``spec`` when it is a number; otherwise return ``str(value)``.

    Keeps placeholders such as "N/A" or "?" printable instead of raising
    ValueError when a float format spec is applied to a string.
    """
    return format(value, spec) if isinstance(value, (int, float)) else str(value)


# Load the evaluation JSON
json_path = "frontend/public/merge/merge_data.json"
with open(json_path, encoding="utf-8") as f:
    data = json.load(f)

# Model losses
print("=" * 60)
print("  MODEL EVALUATION RESULTS")
print("=" * 60)
for m in data.get("models", []):
    fr_str = _fmt_num(m.get("french_loss", "N/A"), ".4f")
    pt_str = _fmt_num(m.get("portuguese_loss", "N/A"), ".4f")
    name = str(m.get("name", "?"))
    print(f"  {name:30s}  FR={fr_str}  PT={pt_str}")

# Heritage probe
probe = data.get("probe_data", {})
if probe:
    summary = probe.get("summary", {})
    fr_pct = _fmt_num(summary.get('french_input_french_pct', '?'), ".1f")
    pt_pct = _fmt_num(summary.get('portuguese_input_portuguese_pct', '?'), ".1f")
    print(f"\n  Heritage Probe:")
    print(f"    French input  → FR neurons {fr_pct}%")
    print(f"    Portuguese input → PT neurons {pt_pct}%")
    rq = summary.get("routing_quality", 0)
    if not isinstance(rq, (int, float)):
        # A null or non-numeric value counts as "no routing" rather than
        # crashing the format spec and comparison below.
        rq = 0
    print(f"    Routing quality: {rq:.1f}%")
    if rq > 60:
        print("    ✓ Heritage probe shows language routing!")
    else:
        print("    ✗ Probe shows weak routing — consider more freeze-sparse iters")
else:
    print("\n  No heritage probe data found in JSON")

# Finetune info
fi = data.get("finetune_info", {})
if fi:
    print(f"\n  Fine-tune info:")
    print(f"    Iterations: {fi.get('iters', '?')}")
    print(f"    Loss: {fi.get('pre_loss', '?')} → {fi.get('post_loss', '?')}")

print("=" * 60)

## Step 9: Quick Generation Sanity Check

Generate some text from the fine-tuned model to visually verify output quality.

In [None]:
import sys, torch
sys.path.insert(0, 'training')
from bdh import load_model

# Load the fine-tuned checkpoint on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_model(FINETUNED_PATH, device)

# One prompt per language; each is fed to the model as raw UTF-8 bytes.
prompts = [
    ("French", "Le parlement européen a adopté"),
    ("Portuguese", "O parlamento europeu adoptou"),
    ("English", "The European Parliament has adopted"),
]

banner = "=" * 60
print(banner)
print("  GENERATION SAMPLES (fine-tuned model)")
print(banner)
for label, prompt in prompts:
    prompt_ids = list(prompt.encode('utf-8'))
    tokens = torch.tensor([prompt_ids], dtype=torch.long, device=device)
    with torch.no_grad():
        out = model.generate(tokens, max_new_tokens=100, top_k=5, temperature=0.8)
    # Turn the returned ids back into text; invalid UTF-8 is replaced, not raised.
    sample_ids = out[0].cpu().tolist()
    text = bytes(sample_ids).decode('utf-8', errors='replace')
    print(f"\n  [{label}]")
    print(f"  {text[:200]}")

# Drop the model and release cached GPU memory before packaging.
del model
torch.cuda.empty_cache()
print("\n" + banner)

## Step 10: Package Results for Download

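Create a zip with the key outputs (only files that actually exist are included):
- `merge_data.json` (frontend visualization data)
- `checkpoint_best.pt` (fine-tuned model)
- `checkpoint_best.heritage.json` (heritage metadata, for the fine-tuned and the merged checkpoints)
- the French and Portuguese specialist checkpoints and the merged checkpoint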

In [None]:
import zipfile, os, glob

OUTPUT_ZIP = "bdh_results.zip"

# Candidate artefacts, in the order they are added to the archive.
merge_json = "frontend/public/merge/merge_data.json"              # frontend viz data
heritage_json = FINETUNED_PATH.replace('.pt', '.heritage.json')   # fine-tuned heritage
merged_heritage = MERGED_PATH.replace('.pt', '.heritage.json')    # merged heritage
wanted = [
    merge_json,
    FINETUNED_PATH,
    heritage_json,
    merged_heritage,
    # Specialists and the merged checkpoint, for the frontend.
    FRENCH_PATH,
    PORTUGUESE_PATH,
    MERGED_PATH,
]

# Keep only what is actually on disk; missing artefacts are skipped silently.
files_to_zip = [path for path in wanted if os.path.exists(path)]

print(f"Packaging {len(files_to_zip)} files into {OUTPUT_ZIP}:")
with zipfile.ZipFile(OUTPUT_ZIP, 'w', zipfile.ZIP_DEFLATED) as zf:
    for fp in files_to_zip:
        # Archive names equal the relative paths, preserving directory structure.
        arcname = fp
        zf.write(fp, arcname)
        size_mb = os.path.getsize(fp) / 1e6
        print(f"  {arcname} ({size_mb:.1f} MB)")

total_mb = os.path.getsize(OUTPUT_ZIP) / 1e6
print(f"\n  Created: {OUTPUT_ZIP} ({total_mb:.1f} MB)")
print("  Download this file from the Kaggle output panel →")