In [2]:
# %% Config Generator for NanoGPT Training
# Run this cell to generate all config files

import os
import math

# === CONFIGURATION ===
# Steps listed in every generated config under "save_checkpoint_steps".
checkpoint_steps = [50000, 100000, 105000]
# Value written to every generated config as "max_iters".
max_iters = 105001
# Number of attention heads, shared by every model size variant.
num_heads = 12

# Model size variants: name -> (n_embd, head_size, mlp_factor, filename suffix)
model_sizes = {
    "base":            (768, 64, 4, ""),
    "smaller_mlp":     (768, 64, 3.5, "_smaller_mlp"),
    "larger_mlp":      (768, 64, 4.5, "_larger_mlp"),
    "smaller_overall": (744, 62, 4, "_smaller_overall"),
}

# Query modes: name -> (description, use_half_scale, weight_decay)
# weight_decay is a string because it is pasted verbatim into the generated
# file as a Python expression (e.g. "2**-5").
query_modes = {
    "original":      ("Q = X @ W_Q (original query weights)", False, "0.1"),
    "identity":      ("Q = X (no query weights, W_Q = Identity)", True, "2**-5"),
}


# === HELPERS ===
def _check_head_dims(sizes, heads):
    """Raise ValueError if a variant's head_size disagrees with n_embd / heads.

    The generated attention scale is written in terms of ``n_embd // num_heads``
    while "head_size" is emitted as a separate entry, so a mismatch would
    silently yield a self-inconsistent config file.
    """
    for name, (embd, head, _mlp, _suffix) in sizes.items():
        if head * heads != embd:
            raise ValueError(
                f"model size {name!r}: head_size ({head}) * num_heads ({heads}) "
                f"!= n_embd ({embd})"
            )


def _is_generated(size_name, query_mode):
    """Return True if a config is wanted for this (size, query mode) pair."""
    # smaller_mlp, smaller_overall: only for original (removing params)
    if size_name in ("smaller_mlp", "smaller_overall"):
        return query_mode == "original"
    # larger_mlp: only for identity (adding params to compensate)
    if size_name == "larger_mlp":
        return query_mode == "identity"
    return True


def _scale_expr(n_embd, heads, use_half_scale):
    """Return the source text of the attention-scale expression.

    The text is evaluated when the generated config is imported; the
    half-scale variant carries an extra factor of 2 in the denominator.
    """
    head_dim = f"{n_embd}//{heads}"
    if use_half_scale:
        return f"1/(2*math.sqrt({head_dim}))"
    return f"1/(math.sqrt({head_dim}))"


def _mlp_expr(n_embd, mlp_factor):
    """Return the source text of the MLP hidden-size expression.

    Whole-number factors are written as a plain product; fractional factors
    are wrapped in ``int(...)`` so the generated value is an integer.
    """
    if mlp_factor == int(mlp_factor):
        return f"{n_embd} * {int(mlp_factor)}"
    return f"int({n_embd} * {mlp_factor})"


# === GENERATE CONFIGS ===
count = 0

dir_name = "configs_tied"
weight_str = "tiedw"

# Validate before touching the filesystem so a bad variant writes nothing.
_check_head_dims(model_sizes, num_heads)
os.makedirs(dir_name, exist_ok=True)

for size_name, (n_embd, head_size, mlp_factor, size_suffix) in model_sizes.items():
    for query_mode, (description, use_half_scale, weight_decay) in query_modes.items():
        if not _is_generated(size_name, query_mode):
            continue

        scale_expr = _scale_expr(n_embd, num_heads, use_half_scale)
        mlp_expr = _mlp_expr(n_embd, mlp_factor)

        filename = f"{dir_name}/config_{weight_str}_{query_mode}{size_suffix}.py"

        # Doubled braces ({{ }}) are literal braces in the generated file.
        content = f'''"""
NanoGPT Configuration

Weight Tying: Tied weights
Query Mode: {query_mode}
- {description}
Model Size: {size_name} (n_embd={n_embd}, mlp_factor={mlp_factor})

Checkpoint saves at steps: {checkpoint_steps}
Training stops at step: {max_iters}
"""

import math

model_args = {{
# Model architecture
"block_size": 1024,
"vocab_size": 50304,
"n_layer": 12,
"num_heads": {num_heads},
"n_embd": {n_embd},
"head_size": {head_size},
"mlp_hidden_size": {mlp_expr},

# Weight configuration
"tie_weights": True,

# Query mode: "original", "identity", "residual", "residual_gelu"
"query_mode": "{query_mode}",

# Regularization
"dropout": 0.0,
"bias": False,

# Training batch configuration
"batch_size": 12,
"accumulation_size": 40,

# Attention scale
"scale": {scale_expr},

# Optimizer settings
"learning_rate": 6e-4,
"weight_decay": {weight_decay},
"beta1": 0.9,
"beta2": 0.95,
"grad_clip": 1.0,

# Learning rate schedule
"decay_lr": True,
"warmup_iters": 2000,
"lr_decay_iters": 600000,
"min_lr": 6e-5,

# Checkpoint and early stopping
"save_checkpoint_steps": {checkpoint_steps},
"max_iters": {max_iters},
}}
'''
        # Explicit encoding: do not depend on the platform's locale default.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Generated: {filename}")
        count += 1

print(f"\nDone! Generated {count} config files.")

Generated: configs_tied/config_tiedw_original.py
Generated: configs_tied/config_tiedw_identity.py
Generated: configs_tied/config_tiedw_original_smaller_mlp.py
Generated: configs_tied/config_tiedw_identity_larger_mlp.py
Generated: configs_tied/config_tiedw_original_smaller_overall.py

Done! Generated 5 config files.
