# randyGPT — PyTorch Training on Colab GPU

Trains a randyGPT model on an NVIDIA GPU (T4/V100/A100) and saves RGPT0003-compatible checkpoints
that can be loaded directly by the Rust CPU inference server.

**Setup:** Runtime → Change runtime type → GPU

In [None]:
# ── Cell 1: GPU check + dependencies ──────────────────────────────────────────

import subprocess, sys

# Show GPU info
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'],
                        capture_output=True, text=True)
if result.returncode == 0:
    gpu_info = result.stdout.strip()
    print(f'GPU: {gpu_info}')
else:
    print('WARNING: No GPU detected! Go to Runtime > Change runtime type > GPU')

# Install dependencies
!pip install -q safetensors transformers

import torch
print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')

# ── Estimated iteration times by GPU and model size ───────────────────────────
print('''
Estimated ms/iter (batch=64, block=256, BPE):
┌────────┬──────────┬───────────┬───────────┐
│ Model  │ T4 fp16  │ V100 fp16 │ A100 bf16 │
├────────┼──────────┼───────────┼───────────┤
│ xs     │ ~250ms   │ ~120ms    │ ~35ms     │
│ s      │ ~500ms   │ ~240ms    │ ~70ms     │
│ ds     │ ~800ms   │ ~380ms    │ ~110ms    │
│ l      │ ~1600ms  │ ~760ms    │ ~220ms    │
└────────┴──────────┴───────────┴───────────┘
1000 iters on T4 (model-s, fp16): ~8 minutes
''')

In [None]:
# ── Cell 2: Mount Google Drive ────────────────────────────────────────────────
#
# Why Drive is mounted:
#   - checkpoints survive across Colab sessions (auto-copy on new best val loss)
#   - train.txt / vocab.json can be uploaded from your local machine
#   - training can be resumed after a session timeout

import os

from google.colab import drive

# Mount point for the whole Drive.
drive.mount('/content/drive')

# Project folder inside Drive; created on first run, reused afterwards.
DRIVE_DIR = '/content/drive/MyDrive/randyGPT'
os.makedirs(DRIVE_DIR, exist_ok=True)

print(f'Drive directory ready: {DRIVE_DIR}')
print(f'Contents: {os.listdir(DRIVE_DIR)}')

In [None]:
# ── Cell 3: Upload files ──────────────────────────────────────────────────────
#
# Option A (recommended): Copy from Drive if you already uploaded there
#   !cp "$DRIVE_DIR/train.txt" /content/train.txt
#   !cp "$DRIVE_DIR/vocab.json" /content/vocab.json
#
# Option B: Upload directly via file picker
#   from google.colab import files
#   files.upload()  # select train.txt and vocab.json
#
# Option C (for scripts): Clone or upload the randyGPT repo
#   !git clone https://github.com/yourname/randyGPT /content/randyGPT
#   OR upload the scripts/ directory manually

import sys, os

# ── Copy training data from Drive ─────────────────────────────────────────────
!cp "$DRIVE_DIR/train.txt" /content/train.txt 2>/dev/null || echo 'train.txt not in Drive — upload it'
!cp "$DRIVE_DIR/vocab.json" /content/vocab.json 2>/dev/null || echo 'vocab.json not in Drive — upload it'

# ── Verify scripts are available ──────────────────────────────────────────────
SCRIPTS_DIR = '/content/randyGPT/scripts'
if not os.path.exists(SCRIPTS_DIR):
    print(f'Scripts not found at {SCRIPTS_DIR}')
    print('Upload or clone the randyGPT repo:')
    print('  !git clone https://github.com/yourname/randyGPT /content/randyGPT')
else:
    sys.path.insert(0, SCRIPTS_DIR)
    print(f'Scripts ready: {os.listdir(SCRIPTS_DIR)}')

# ── Check data ─────────────────────────────────────────────────────────────────
for f in ['/content/train.txt', '/content/vocab.json']:
    if os.path.exists(f):
        size_mb = os.path.getsize(f) / 1e6
        print(f'  ✓ {f} ({size_mb:.1f} MB)')
    else:
        print(f'  ✗ {f} MISSING')

In [None]:
# ── Cell 4: Training configuration ───────────────────────────────────────────
#
# Edit these values, then run Cell 5 to start training.
# This cell only builds and prints TRAIN_CMD; it does not launch anything.

MODEL_SIZE  = 's'        # xs / s / ds / m / l / deep / xl
ITERS       = 1000       # total training iterations
DTYPE       = 'bf16'     # bf16 (A100) / fp16 (T4/V100) / fp32 (CPU debug)
BATCH_SIZE  = 64         # per-step batch size
GRAD_ACCUM  = 1          # gradient accumulation steps (effective batch = BATCH_SIZE * GRAD_ACCUM)

# Set to a .bin or .safetensors path to resume from a previous checkpoint:
RESUME      = ''         # e.g. '/content/drive/MyDrive/randyGPT/checkpoint_best.bin'

TRAIN_FILE  = '/content/train.txt'
VOCAB_FILE  = '/content/vocab.json'
OUTPUT_DIR  = '/content/output'
SCRIPTS_DIR = '/content/randyGPT/scripts'

# Drive backup: copies checkpoint_best.bin here after each new best val loss
DRIVE_BACKUP = f'{DRIVE_DIR}/checkpoint_best.bin'

import os
import shlex
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Build the command as a list of individual argv tokens (one entry per word)
# so that every value can be quoted on its own.
cmd_parts = [
    'python', f'{SCRIPTS_DIR}/train_torch.py',
    '--model-size', MODEL_SIZE,
    '--iters', ITERS,
    '--bpe',
    '--batch-size', BATCH_SIZE,
    '--grad-accum', GRAD_ACCUM,
    '--dtype', DTYPE,
    '--train-file', TRAIN_FILE,
    '--vocab', VOCAB_FILE,
    '--output', OUTPUT_DIR,
    '--drive', DRIVE_BACKUP,
]
if RESUME:
    cmd_parts += ['--resume', RESUME]

# shlex.quote leaves plain tokens untouched (so the default command reads the
# same as an unquoted join) but wraps any value containing whitespace or shell
# metacharacters, keeping the printed command unambiguous and paste-safe.
TRAIN_CMD = ' '.join(shlex.quote(str(part)) for part in cmd_parts)
print('Training command:')
print(TRAIN_CMD)

In [None]:
# ── Cell 5: Run training ──────────────────────────────────────────────────────
#
# Runs TRAIN_CMD as a child process and streams its combined stdout/stderr
# into the cell output line by line.
#
# Output: checkpoint.bin and checkpoint_best.bin in OUTPUT_DIR
#         checkpoint_best.bin also copied to DRIVE_BACKUP after each new best
#
# If the Colab session disconnects, re-run Cells 1-3, set RESUME in Cell 4:
#   RESUME = f'{DRIVE_DIR}/checkpoint_best.bin'
# then re-run Cell 4 (to rebuild TRAIN_CMD) and this cell.

import shlex
import subprocess, sys

# shlex.split honours shell quoting, so an argument that contains spaces and
# is quoted in TRAIN_CMD stays a single argv entry (str.split would break it).
# The context manager closes the pipe and waits for the child on exit, which
# also populates proc.returncode.
with subprocess.Popen(
    shlex.split(TRAIN_CMD),
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,   # interleave stderr with stdout
    text=True,
    bufsize=1,                  # line-buffered so progress appears promptly
) as proc:
    for line in proc.stdout:
        print(line, end='', flush=True)
print(f'\nExit code: {proc.returncode}')

In [None]:
# ── Cell 6: Export to HuggingFace format ──────────────────────────────────────
#
# Produces: OUTPUT_DIR/hf_export/ with config.json, model.safetensors, tokenizer.json

HF_OUTPUT = f'{OUTPUT_DIR}/hf_export'
BEST_CKPT = f'{OUTPUT_DIR}/checkpoint_best.bin'

!python {SCRIPTS_DIR}/export_hf.py \
    --checkpoint {BEST_CKPT} \
    --vocab {VOCAB_FILE} \
    --output {HF_OUTPUT} \
    --model-size {MODEL_SIZE}

import os
print('\nExported files:')
for f in sorted(os.listdir(HF_OUTPUT)):
    size = os.path.getsize(f'{HF_OUTPUT}/{f}')
    print(f'  {f} ({size/1e3:.1f} KB)')

# Copy HF export to Drive for safekeeping
import shutil
hf_drive = f'{DRIVE_DIR}/hf_export'
if os.path.exists(hf_drive):
    shutil.rmtree(hf_drive)
shutil.copytree(HF_OUTPUT, hf_drive)
print(f'\n✓ HF export copied to Drive: {hf_drive}')

In [None]:
# ── Cell 7: Quick generation test ─────────────────────────────────────────────
#
# Load the exported model and generate 200 tokens to verify it works.
# Reads config.json and model.safetensors from HF_OUTPUT and the tokenizer
# vocabulary from VOCAB_FILE.

import sys
sys.path.insert(0, SCRIPTS_DIR)
sys.path.insert(0, HF_OUTPUT)  # for modeling_randygpt.py in the export

import json

import torch
from safetensors.torch import load_file
from modeling_randygpt import RandyGPTConfig, RandyGPTForCausalLM
from tokenizer_randygpt import RandyGPTTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load config and model.
# The context manager closes the file handle deterministically.
with open(f'{HF_OUTPUT}/config.json', encoding='utf-8') as cfg_file:
    cfg_data = json.load(cfg_file)
cfg = RandyGPTConfig(
    vocab_size=cfg_data['vocab_size'],
    n_embd=cfg_data['n_embd'],
    n_head=cfg_data['n_head'],
    n_layer=cfg_data['n_layer'],
    block_size=cfg_data['block_size'],
)
model = RandyGPTForCausalLM(cfg)
state = load_file(f'{HF_OUTPUT}/model.safetensors', device=str(device))
# strict=True: fail loudly if the export and the model definition disagree.
model.load_state_dict(state, strict=True)
model = model.to(device).eval()
print(f'Model loaded: {sum(p.numel() for p in model.parameters())/1e6:.2f}M params')

# Load tokenizer
tok = RandyGPTTokenizer.from_file(VOCAB_FILE)

# Generate
PROMPTS = [
    'Once upon a time',
    'The old man',
    'It was a dark and stormy night',
]

# Inference only: disable autograd so no graph is retained during sampling.
# (Harmless if generate_text already disables gradients internally.)
with torch.no_grad():
    for prompt in PROMPTS:
        ids     = torch.tensor([tok.encode(prompt)], dtype=torch.long, device=device)
        out_ids = model.generate_text(ids, max_new_tokens=200, temperature=0.8, top_p=0.9)
        text    = tok.decode(out_ids[0].tolist())
        print(f'\n{"─"*60}')
        print(f'Prompt: "{prompt}"')
        print(f'Output:\n{text}')

print(f'\n{"─"*60}')
print('Generation test complete.')