# Stage 3.5 Final Baseline (train_gpt_original.py, 1 seed) vs Stage 3 Results

In [None]:
%%bash
# Stop at the first failing command so "Kernel installed." is only printed
# when both the activation and the kernel registration succeeded.
set -e
# Activate the project's virtual environment so `python` below resolves to it.
source /workspace/ese-3060-project/.venv/bin/activate
# Register that interpreter as a per-user Jupyter kernel named "ese3060-venv".
python -m ipykernel install --user --name ese3060-venv --display-name "ese3060 venv"
echo "Kernel installed."

In [None]:
from pathlib import Path
import os, subprocess, json, glob, re
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%cd /workspace/ese-3060-project

# Resolve the project root. An explicit PROJ_ROOT environment variable wins;
# otherwise fall back to the working directory, stepping up one level when the
# notebook is being executed from inside notebooks/.
cwd = Path.cwd().expanduser().resolve()
if cwd.name == 'notebooks':
    default_root = cwd.parent
else:
    default_root = cwd
PROJ_ROOT = Path(os.environ.get('PROJ_ROOT', default_root)).expanduser().resolve()

# Every location used by the later cells is derived from the project root.
SCRIPT_PATH = PROJ_ROOT.joinpath('train_gpt_original.py')
RESULTS_ALL = PROJ_ROOT.joinpath('experiments', 'results.csv')
RESULTS_STAGE3 = PROJ_ROOT.joinpath('experiments', 'results_stage3.csv')
RESULTS_STAGE3_ALT = PROJ_ROOT.joinpath('experiments', 'results_stage_3.csv')
BASELINE_OUT = PROJ_ROOT.joinpath('experiments', 'baseline.csv')
LOG_DIR = PROJ_ROOT.joinpath('logs')

# Echo the resolved locations so a wrong root is obvious before anything runs.
print('Project root:', PROJ_ROOT)
print('Using script:', SCRIPT_PATH)
print('Results files checked:', RESULTS_STAGE3, RESULTS_STAGE3_ALT, RESULTS_ALL)
print('Logs:', LOG_DIR)
print('Baseline export:', BASELINE_OUT)

In [None]:
# Runtime knobs (one seed baseline)
NPROC = None                # auto-detect GPU count if None
SEED = 1337                # train_gpt_original.py reads SEED env
VAL_EVERY = 125            # train_gpt_original.py reads VAL_EVERY env
TORCHRUN = "torchrun"
LAUNCH = False              # set True to run


def count_listed_gpus(listing):
    """Return how many GPU entries appear in `nvidia-smi --list-gpus` output.

    Only lines that start with "GPU " are counted, so any other text in the
    output (diagnostics, indented sub-device lines) is not mistaken for a
    device.

    Args:
        listing: Decoded standard output of `nvidia-smi --list-gpus`.

    Returns:
        The number of lines beginning with "GPU ".
    """
    return sum(1 for line in listing.splitlines() if line.startswith("GPU "))


# train_gpt_original.py uses built-in defaults for lr/iters/warmdown; not adjustable here
if NPROC is None:
    try:
        # Run nvidia-smi directly (no shell pipeline) so a missing binary or a
        # non-zero exit is reported as an exception instead of being hidden
        # behind the exit status of `wc`.
        gpu_listing = subprocess.check_output(
            ["nvidia-smi", "--list-gpus"]
        ).decode(errors="replace")
    except (OSError, subprocess.CalledProcessError):
        # No usable nvidia-smi: treat as "no GPUs detected".
        gpu_count = 0
    else:
        gpu_count = count_listed_gpus(gpu_listing)
    # torchrun needs at least one process per node.
    NPROC = max(gpu_count, 1)

assert SCRIPT_PATH.exists(), f"Missing train script: {SCRIPT_PATH}"

def run_baseline(seed):
    """Launch one baseline training run through torchrun, or dry-run it.

    The child process inherits the current environment with SEED and
    VAL_EVERY overridden. When the module-level LAUNCH flag is False the
    command is only announced, not executed.

    Args:
        seed: Value exported to the child process as the SEED variable.

    Returns:
        0, both for a dry run and for a launched run that exited cleanly.

    Raises:
        RuntimeError: If the launched process exits with a non-zero code.
    """
    env = os.environ.copy()
    env.update({
        "SEED": str(seed),
        "VAL_EVERY": str(VAL_EVERY),
    })
    cmd = [TORCHRUN, "--standalone", f"--nproc_per_node={NPROC}", str(SCRIPT_PATH)]
    print(f"\n>>> Launching baseline (train_gpt_original) seed={seed} nproc={NPROC}")
    if not LAUNCH:
        return 0
    proc = subprocess.run(cmd, env=env)
    if proc.returncode != 0:
        raise RuntimeError(f"Run failed: baseline seed {seed} rc={proc.returncode}")
    # Return the exit code on the success path too, so a dry run and a real
    # run give the caller the same kind of value.
    return proc.returncode

# Start (or dry-run) the single-seed baseline, then report which mode was used.
run_baseline(SEED)
print(f"Done (LAUNCH={LAUNCH})")


In [None]:
# Parse latest baseline log and save to baseline.csv
VAL_RE = re.compile(r"step:(\d+)/(\d+) val_loss:([0-9.]+) train_time:(\d+)ms step_avg:([0-9.]+)ms")

def parse_latest_log(log_dir: Path):
    """Summarise the validation lines of the most recently modified log.

    Picks the ``*.txt`` file in ``log_dir`` with the newest modification
    time, collects every line matching ``VAL_RE`` and condenses them into a
    one-row DataFrame.

    Args:
        log_dir: Directory holding the ``*.txt`` training logs.

    Returns:
        A ``(run_id, frame)`` pair where ``run_id`` is the log's file stem.
        ``(None, None)`` when the directory is missing or has no ``*.txt``
        file; ``(run_id, None)`` when the newest log has no matching line.
        Otherwise ``frame`` has one row: the final and best (lowest)
        ``val_loss``, the total step count, cumulative train time and average
        step time from the last matching line, the module-level ``SEED``, and
        ``attn_gate`` fixed to ``'none'``. ``learning_rate`` and
        ``warmdown_iters`` are taken from module-level ``LR`` /
        ``WARMDOWN_ITERS`` when those names exist and are ``None`` otherwise.
    """
    if not log_dir.exists():
        return None, None
    # Newest file by mtime; `default=None` covers a directory without logs.
    path = max(log_dir.glob('*.txt'), key=lambda p: p.stat().st_mtime, default=None)
    if path is None:
        return None, None

    rows = []
    # Context manager so the log's file handle is closed deterministically.
    with open(path) as log_file:
        for line in log_file:
            m = VAL_RE.search(line)
            if m:
                rows.append((
                    int(m.group(1)),    # step
                    int(m.group(2)),    # total steps
                    float(m.group(3)),  # val_loss
                    int(m.group(4)),    # train_time in ms
                    float(m.group(5)),  # step_avg in ms
                ))
    if not rows:
        return path.stem, None

    final = rows[-1]
    best = min(rows, key=lambda r: r[2])
    # LR and WARMDOWN_ITERS may not be defined alongside this function; read
    # them defensively so a parsed log yields a row instead of a NameError.
    module_globals = globals()
    data = {
        'run_id': path.stem,
        'seed': SEED,
        'learning_rate': module_globals.get('LR'),
        'num_iterations': final[1],
        'warmdown_iters': module_globals.get('WARMDOWN_ITERS'),
        'final_val_loss': final[2],
        'best_val_loss': best[2],
        'train_time_ms': final[3],
        'ms_per_step': final[4],
        'attn_gate': 'none',
    }
    return path.stem, pd.DataFrame([data])

# Parse the newest log; when that succeeds, write the one-row summary to
# BASELINE_OUT and show it, otherwise explain why there is nothing to save.
log_id, baseline_df = parse_latest_log(LOG_DIR)
if baseline_df is None:
    print('No baseline log parsed; ensure run was launched.')
else:
    # Create experiments/ on first use.
    BASELINE_OUT.parent.mkdir(parents=True, exist_ok=True)
    baseline_df.to_csv(BASELINE_OUT, index=False)
    display(baseline_df)
    print(f"Saved baseline run ({log_id}) to {BASELINE_OUT}")

In [None]:
# Load stage 3 results for comparison.
# The candidate files are tried in priority order and only the first one that
# exists is read; with none present the comparison table is an empty frame.
stage3 = next(
    (
        pd.read_csv(results_path)
        for results_path in (RESULTS_STAGE3, RESULTS_STAGE3_ALT, RESULTS_ALL)
        if results_path.exists()
    ),
    pd.DataFrame(),
)

# Optional filter for full-run params if needed (currently none is applied).

# Cell output: a preview of the loaded rows, or the empty frame itself.
stage3 if stage3.empty else stage3.head()

In [None]:
# Compare baseline (parsed) vs stage3 elementwise/baseline rows
if baseline_df is not None and not baseline_df.empty:
    # align columns of interest
    cols = ['attn_gate','learning_rate','best_val_loss','final_val_loss','ms_per_step','train_time_ms','run_id','seed']
    comp_list = [baseline_df[cols]]
    if not stage3.empty:
        stage_subset = stage3.copy()
        # Pad columns the stage-3 file lacks so both frames share one layout.
        missing = [c for c in cols if c not in stage_subset.columns]
        for c in missing:
            stage_subset[c] = pd.NA
        comp_list.append(stage_subset[cols])
    comp = pd.concat(comp_list, ignore_index=True)
    display(comp)

    # Simple bar for best_val_loss: one bar per row.
    # Bars are placed at integer positions and labelled afterwards. Using
    # attn_gate directly as the x value would put every row with the same gate
    # at the same position, and missing labels (pd.NA / NaN from the padding
    # above) are not accepted as category names.
    bar_labels = comp['attn_gate'].astype(str)
    # Padded or unparsable losses become NaN, which is drawn as a gap.
    bar_heights = pd.to_numeric(comp['best_val_loss'], errors='coerce').to_numpy(
        dtype=float, na_value=float('nan')
    )
    bar_positions = list(range(len(comp)))

    fig, ax = plt.subplots(figsize=(6,4))
    ax.bar(bar_positions, bar_heights)
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(bar_labels, rotation=45, ha='right')
    ax.set_ylabel('best_val_loss')
    ax.set_title('Baseline (train_gpt_original) vs stage3 results')
    ax.grid(True, alpha=0.3)
    plt.tight_layout(); plt.show()
else:
    print('No baseline_df to compare.')

In [None]:
# Compute theoretical loss at baseline wall-clock time using linear interpolation
import numpy as np
from collections import defaultdict

# Determine baseline wall-clock time from baseline_df if present
baseline_time_ms = None
if baseline_df is not None and not baseline_df.empty:
    baseline_time_ms = float(baseline_df.iloc[0]['train_time_ms'])
else:
    print('Baseline timing not found; cannot compute theoretical losses.')

# `curves` is not created by any cell shown here, so look it up defensively:
# a missing table then reports the message below instead of raising NameError.
# NOTE(review): the code below reads the columns attn_gate, run_id,
# train_time_ms and val_loss from it; presumably it is produced by an earlier
# stage's notebook - confirm where it should be loaded from.
curve_table = globals().get('curves')

if baseline_time_ms is not None and curve_table is not None and not curve_table.empty:
    def interpolate_loss(sub, target_ms):
        """Return one run's val_loss at `target_ms`.

        Linearly interpolates between logged points and clamps to the first
        or last logged value when `target_ms` lies outside the logged range.
        """
        sub = sub.sort_values('train_time_ms')
        t = sub['train_time_ms'].values
        v = sub['val_loss'].values
        if target_ms <= t.min():
            return v[0]
        if target_ms >= t.max():
            return v[-1]
        return np.interp(target_ms, t, v)

    rows = []
    for gate, sub in curve_table.groupby('attn_gate'):
        # average across runs at the baseline time by interpolating each run then averaging
        vals = [
            interpolate_loss(rsub, baseline_time_ms)
            for _, rsub in sub.groupby('run_id')
        ]
        if vals:
            rows.append({'attn_gate': gate, 'theoretical_val_loss_at_baseline_time': float(np.mean(vals))})
    final_table = pd.DataFrame(rows)
    display(final_table)
else:
    print('No curves or baseline time; cannot compute theoretical losses.')
