# BDH Full Pipeline — Train + Merge + Evaluate

**Platform:** Kaggle GPU T4 x2  |  **Est. time:** ~45 min

### Before running — 3 things:
1. Upload the project zip (created locally with `create_kaggle_zip.ps1`) as a Kaggle Dataset named `bdh-project`
2. Add Data (right sidebar) -> search `bdh-project` -> Add
3. Settings: set Accelerator to GPU T4 x2 and turn Internet ON

In [None]:
# === Step 0: Verify GPU ===
import torch, os, sys, shutil, json, glob
from pathlib import Path

# Report the runtime, then refuse to continue without a CUDA device.
print('PyTorch:', torch.__version__)
cuda_ok = torch.cuda.is_available()
print('CUDA:', cuda_ok)

if not cuda_ok:
    raise RuntimeError('No GPU! Settings -> Accelerator -> GPU T4 x2, then restart.')

# One line per visible device: its name and total memory in decimal GB.
for gpu_idx in range(torch.cuda.device_count()):
    gpu_name = torch.cuda.get_device_name(gpu_idx)
    # mem_get_info yields (free_bytes, total_bytes); only the total is displayed.
    total_bytes = torch.cuda.mem_get_info(gpu_idx)[1]
    print(f'  GPU {gpu_idx}: {gpu_name} ({total_bytes / 1e9:.1f} GB)')

print('\nGPU ready')

In [None]:
# === Step 1: Setup project files ===
WORK = Path('/kaggle/working')
PROJECT = WORK / 'bdh'

# Locate the project by its marker file anywhere under the attached datasets.
input_root = Path('/kaggle/input')
hits = list(input_root.rglob('training/bdh.py'))
if len(hits) == 0:
    # Nothing matched: list what is attached (first 15 sorted entries per
    # dataset) so the user can see what went wrong before the error is raised.
    print('Datasets found in /kaggle/input/:')
    for dataset in input_root.iterdir():
        print(f'  {dataset.name}/')
        for entry in sorted(dataset.rglob('*'))[:15]:
            print(f'    {entry.relative_to(dataset)}')
    raise RuntimeError(
        'Project not found! Make sure you:\n'
        '  1. Ran create_kaggle_zip.ps1 locally\n'
        '  2. Uploaded zip as Kaggle Dataset named bdh-project\n'
        '  3. Clicked Add Data in sidebar and added it')

# <root>/training/bdh.py -> <root>
project_src = hits[0].parents[1]
print(f'Found: {project_src}')

# Kaggle input is read-only, so work on a fresh copy in the writable area.
if PROJECT.exists():
    shutil.rmtree(PROJECT)
shutil.copytree(project_src, PROJECT)
os.chdir(PROJECT)
print(f'Copied to: {PROJECT}')

# Every later step depends on these scripts; stop at the first one missing.
required = (
    'training/bdh.py',
    'training/train.py',
    'training/download_europarl.py',
    'analysis/merge.py',
)
for rel_path in required:
    present = Path(rel_path).exists()
    status = 'ok' if present else 'MISSING'
    print(f'  {status}: {rel_path}')
    if not present:
        raise RuntimeError(f'Missing: {rel_path}')

print('\nProject ready')

In [None]:
# === Step 1b: Patch bdh.py for PyTorch 2.6+ ===
# PyTorch 2.6+ changed torch.load to default weights_only=True.
# bdh.py uses torch.load without weights_only=False, which will crash.
# We patch it here since we copied to a writable directory.

# Rewrite the exact torch.load call in the copied bdh.py so it passes
# weights_only=False; any other spelling of the call is left untouched.
bdh_path = PROJECT / 'training' / 'bdh.py'
bdh_code = bdh_path.read_text()

old = 'checkpoint = torch.load(checkpoint_path, map_location=device)'
new = 'checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)'

needs_patch = old in bdh_code and new not in bdh_code
if not needs_patch:
    print('bdh.py already patched or has different code, skipping')
else:
    bdh_code = bdh_code.replace(old, new)
    bdh_path.write_text(bdh_code)
    print('Patched bdh.py: added weights_only=False to torch.load')

In [None]:
# Install pyyaml, tqdm and requests (presumably needed by the project scripts —
# not verifiable from this notebook). `-q` keeps pip quiet and `2>/dev/null`
# discards stderr, so a failed install will NOT show an error in this cell.
!pip install -q pyyaml tqdm requests 2>/dev/null

In [None]:
# === Step 2: Check internet and download data ===
import urllib.request

# Probe a well-known host so a disabled Internet setting fails here with a
# clear message instead of partway through the dataset download.
try:
    # The context manager closes the HTTP response (and its socket) immediately;
    # only reachability matters, so the body is never read.
    with urllib.request.urlopen('https://www.google.com', timeout=10):
        pass
except Exception as e:
    # Deliberately broad: any failure here (DNS, TLS, timeout, HTTP error)
    # means the notebook has no usable internet access.
    raise RuntimeError(f'No internet ({e}). Settings -> Internet -> On, then restart.') from e
else:
    print('Internet: OK')

In [None]:
# Download Europarl (~5-10 min)
os.chdir(PROJECT)
status = os.system('python training/download_europarl.py --languages en-fr en-pt --output data/')
if status != 0:
    # os.system reports failure only through its return value; stop here so the
    # following cells do not run against missing or partial data.
    raise RuntimeError('Europarl download failed. Check the output above, then re-run this cell.')

In [None]:
# Verify data: both language pairs need their train.bin and val.bin files.
expected_files = [
    Path(f'data/{pair}/{fname}')
    for pair in ('en-fr', 'en-pt')
    for fname in ('train.bin', 'val.bin')
]
for data_file in expected_files:
    # Fail on the first missing file; files checked before it are already reported.
    if not data_file.exists():
        raise RuntimeError(f'Missing: {data_file}. Re-run download cell above.')
    size_mb = data_file.stat().st_size / 1024 / 1024
    print(f'  ok: {data_file} ({size_mb:.1f} MB)')
print('\nData ready')

In [None]:
# === Step 3: Write training configs ===
# Architecture MUST be identical for French and Portuguese (required for merge)

# Shared YAML template. Only the data paths and the run name are substituted,
# so both specialists are written with identical architecture settings.
# NOTE(review): dtype is "bfloat16" while this notebook targets T4 GPUs, which
# predate native bfloat16 hardware support — confirm train.py copes with this
# (or whether float16 was intended).
CONFIG = """train_data: "{train_data}"
val_data: "{val_data}"
n_layer: 6
n_embd: 192
n_head: 4
mlp_multiplier: 64
dropout: 0.1
vocab_size: 256
batch_size: 16
block_size: 256
max_iters: 5000
learning_rate: 1.0e-3
min_lr: 1.0e-4
warmup_iters: 500
weight_decay: 0.1
grad_clip: 1.0
gradient_accumulation_steps: 8
log_interval: 100
eval_interval: 500
save_interval: 2500
eval_iters: 100
output_dir: "checkpoints"
run_name: "{run_name}"
device: "cuda"
dtype: "bfloat16"
compile_model: false
"""

os.makedirs('training/configs', exist_ok=True)

# (config file stem, data sub-directory, run name) for each specialist.
specialists = (
    ('french_kaggle', 'en-fr', 'french_specialist'),
    ('portuguese_kaggle', 'en-pt', 'portuguese_specialist'),
)
for stem, pair, run in specialists:
    rendered = CONFIG.format(
        train_data=f'data/{pair}/train.bin',
        val_data=f'data/{pair}/val.bin',
        run_name=run,
    )
    (Path('training/configs') / f'{stem}.yaml').write_text(rendered)

print('Architecture: 6L, 192D, 4H, 64x MLP = 3072 neurons/head')
print('Configs written')

In [None]:
# === Step 4a: Train French (~15-20 min) ===
os.chdir(PROJECT)
status = os.system('python training/train.py --config training/configs/french_kaggle.yaml')
if status != 0:
    # os.system reports failure only through its return value; without this
    # check a crashed run would let the following cells proceed.
    raise RuntimeError('French training failed. Check the output above.')

In [None]:
# === Step 4b: Train Portuguese (~15-20 min) ===
os.chdir(PROJECT)
status = os.system('python training/train.py --config training/configs/portuguese_kaggle.yaml')
if status != 0:
    # os.system reports failure only through its return value; without this
    # check a crashed run would let the following cells proceed.
    raise RuntimeError('Portuguese training failed. Check the output above.')

In [None]:
# === Verify checkpoints ===
def find_ckpt(name):
    """Return the path of the preferred checkpoint for run ``name``, or None.

    Looks in ``checkpoints/<name>`` relative to the current working directory.
    Preference order: ``checkpoint_best.pt``, then ``checkpoint_latest.pt``,
    then the ``checkpoint_*.pt`` file with the highest trailing number.

    Args:
        name: Run directory name under ``checkpoints/``.

    Returns:
        The checkpoint path as a string, or None when the run directory does
        not exist or contains no matching file.
    """
    import re  # local import keeps this notebook cell self-contained

    base = Path(f'checkpoints/{name}')
    if not base.exists():
        return None
    for f in ['checkpoint_best.pt', 'checkpoint_latest.pt']:
        p = base / f
        if p.exists():
            return str(p)

    def _order(path):
        # Compare the trailing number numerically (presumably the training
        # step): a plain name sort would place checkpoint_10000.pt before
        # checkpoint_2500.pt. Files without a number sort last, by name.
        numbers = re.findall(r'\d+', path.stem)
        if numbers:
            return (0, int(numbers[-1]), path.name)
        return (1, 0, path.name)

    pts = sorted(base.glob('checkpoint_*.pt'), key=_order)
    return str(pts[-1]) if pts else None

# Resolve one checkpoint per specialist; later cells reuse these two names.
fr_ckpt = find_ckpt('french_specialist')
pt_ckpt = find_ckpt('portuguese_specialist')

# Report each result, with the file size in MB when a checkpoint was found.
found = {'French': fr_ckpt, 'Portuguese': pt_ckpt}
for label, ckpt_path in found.items():
    if not ckpt_path:
        print(f'  {label}: NOT FOUND')
        continue
    size_mb = os.path.getsize(ckpt_path) / 1024 / 1024
    print(f'  {label}: {ckpt_path} ({size_mb:.1f} MB)')

# Both checkpoints are required before continuing.
if not (fr_ckpt and pt_ckpt):
    raise RuntimeError('Both checkpoints needed. Check training output above.')
print('\nBoth checkpoints ready')

In [None]:
# === Step 5: Merge + Evaluate ===
os.chdir(PROJECT)
os.makedirs('frontend/public/merge', exist_ok=True)

# Build the merge command from the two specialist checkpoints; the paths are
# quoted so the shell passes each one as a single argument.
cmd = (
    f'python analysis/merge.py'
    f' --model1 "{fr_ckpt}"'
    f' --model2 "{pt_ckpt}"'
    f' --output checkpoints/merged_polyglot.pt'
    f' --name1 french --name2 portuguese'
    f' --french-val data/en-fr/val.bin'
    f' --portuguese-val data/en-pt/val.bin'
    f' --frontend-json frontend/public/merge/merge_data.json'
    f' --device cuda'
)
print(cmd, '\n')
# On POSIX os.system returns a wait status (exit code in the high byte), so
# decode it; otherwise exit code 1 would be reported as 256.
ret = os.waitstatus_to_exitcode(os.system(cmd))
if ret != 0:
    # Best-effort: only warn here; the results cell reports a missing
    # merge_data.json on its own.
    print(f'\nMerge exited with code {ret}. Check output above.')

In [None]:
# === Step 6: View Results ===
os.chdir(PROJECT)
merge_json = Path('frontend/public/merge/merge_data.json')
rule = '=' * 50

if merge_json.exists():
    data = json.loads(merge_json.read_text())

    print(rule)
    print('  MERGE RESULTS')
    print(rule)

    # One summary line per model entry; missing fields fall back to defaults.
    for info in data.get('models', {}).values():
        flag = info.get('flag', '')
        label = info.get('name', '?')
        neurons = info.get('n_neurons', 0)
        params = info.get('params', 0)
        print(f"  {flag} {label:<20} {neurons:>6} N/head  {params:>10,} params")

    # Loss table; a loss that is absent or None is shown as a dash.
    ev = data.get('evaluation', {})
    if ev:
        print(f"\n  {'Model':<22} {'Fr Loss':>8} {'Pt Loss':>8}")
        print(f"  {'-'*40}")
        for model_name, vals in ev.items():
            cells = []
            for key in ('french_loss', 'portuguese_loss'):
                loss = vals.get(key)
                cells.append('  -' if loss is None else f"{loss:.4f}")
            fr, pt = cells
            print(f"  {model_name:<22} {fr:>8} {pt:>8}")

    # Generated samples, cut to the first 120 characters.
    for s in data.get('samples', []):
        print(f"\n  [{s.get('label','')}]")
        print(f"  {s.get('generated','')[:120]}...")

    print('\n' + rule)
else:
    print('merge_data.json not found. Merge may have failed.')
    print('Check Step 5 output for errors.')

In [None]:
# === Step 7 (Optional): Monosemanticity precompute ===
os.chdir(PROJECT)
script = Path('scripts/precompute_monosemanticity.py')

# Runs only when the optional script is present and a French checkpoint was found.
can_run = script.exists() and fr_ckpt
if not can_run:
    print('Skipping: script or checkpoint not found.')
else:
    os.makedirs('frontend/public/monosemanticity', exist_ok=True)
    mono_cmd = (
        f'python scripts/precompute_monosemanticity.py'
        f' --model "{fr_ckpt}"'
        f' --output frontend/public/monosemanticity/precomputed.json'
    )
    os.system(mono_cmd)

In [None]:
# === Step 8: Package for download ===
os.chdir(PROJECT)
OUTPUT = Path('/kaggle/working/bdh_output')
# Start from an empty staging directory on every run.
if OUTPUT.exists():
    shutil.rmtree(OUTPUT)

# Best and latest checkpoint of each specialist, then the merge artefacts.
to_copy = [
    f'checkpoints/{run}/{ckpt}'
    for run in ('french_specialist', 'portuguese_specialist')
    for ckpt in ('checkpoint_best.pt', 'checkpoint_latest.pt')
] + [
    'checkpoints/merged_polyglot.pt',
    'checkpoints/merged_polyglot.heritage.json',
    'frontend/public/merge/merge_data.json',
    'frontend/public/monosemanticity/precomputed.json',
]

# Mirror each existing file under OUTPUT at the same relative path; missing
# files are reported and skipped.
for rel in to_copy:
    src = PROJECT / rel
    if not src.exists():
        print(f'  skip:   {rel} (not found)')
        continue
    dst = OUTPUT / rel
    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    size_mb = src.stat().st_size/1024/1024
    print(f'  copied: {rel} ({size_mb:.1f} MB)')

In [None]:
# Create zip for download
os.chdir('/kaggle/working')
zp = Path('bdh_results.zip')

# `zip -r` updates an existing archive in place, so an archive left over from
# an earlier run would keep stale entries and would also make the existence
# check below pass after a failed zip. Remove it first.
if zp.exists():
    zp.unlink()
status = os.system('zip -r bdh_results.zip bdh_output/')

# Success requires both a zero return status and the archive actually existing.
if status == 0 and zp.exists():
    print(f'\nbdh_results.zip: {zp.stat().st_size/1024/1024:.1f} MB')
    print('\nDONE! To download:')
    print('  1. Save Version (top right)')
    print('  2. Output tab -> download bdh_results.zip')
    print('  3. Unzip, copy checkpoints/ and frontend/ into your project')
    print('  4. cd frontend && npm install && npm run dev')
else:
    print('zip creation failed')