### NanoGPT

In [1]:
# !git clone https://github.com/karpathy/nanoGPT.git

In [2]:
# pip install torch numpy transformers datasets tiktoken wandb tqdm

In [1]:
import os
import time
import math
import pickle
from contextlib import nullcontext
from dataclasses import dataclass

import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from nanoGPT.model import GPT
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [2]:
@dataclass
class GPTConfig:
    """Single bundle of data-loading and model-shape settings used by this notebook.

    One instance (`config`, created right below) is handed to `GPT(...)`, read by
    `get_batch` for the batch geometry, embedded in the wandb run name, and stored
    in saved checkpoints.
    """
    batch_size: int = 32  # number of sequences get_batch samples per call
    block_size: int = 1024  # number of tokens in each sampled sequence
    vocab_size: int = 50304  # forwarded to GPT; presumably GPT-2's vocabulary padded up - TODO confirm
    n_layer: int = 6  # forwarded to GPT (model defined in nanoGPT.model)
    n_head: int = 8  # forwarded to GPT
    n_embd: int = 768  # forwarded to GPT
    dropout: float = 0.1  # forwarded to GPT
    bias: bool = False  # forwarded to GPT; also encoded in the wandb run name
    model_type: str = 'reflex'  # in this notebook only used as a label inside the wandb run name
    
# Module-wide configuration instance read by the cells below.
config = GPTConfig()

In [3]:
# Weights & Biases logging: enabled flag, project, and a run name that encodes
# the main knobs of `config` so runs can be told apart in the dashboard.
wandb_log = True
wandb_project = 'research-task'
# textual form of the bias flag, embedded in the run name below
if config.bias:
    is_True = 'True'
else:
    is_True = 'False'
wandb_run_name = f'all_layer-{config.model_type}-{config.block_size}-{is_True}-{config.n_embd}-{config.dropout}'

# directory that receives ckpt.pt
out_dir = 'mini-gpt'

# cadence, in training iterations
log_interval = 10     # print the training loss every this many iterations
eval_interval = 250   # run estimate_loss() every this many iterations
eval_iters = 200      # batches averaged per split inside estimate_loss()

In [4]:
# ---- learning-rate schedule (consumed by get_lr) ----
decay_lr = True          # when False the training loop uses the constant learning_rate
learning_rate = 1e-3     # peak learning rate
min_lr = 1e-5            # floor returned once the decay phase is over
warmup_iters = 400       # length of the linear warmup
lr_decay_iters = 5000    # iteration at which the cosine decay reaches min_lr
max_iters = 10000        # the training loop stops once iter_num exceeds this

# ---- optimizer settings (passed to model.configure_optimizers) ----
beta1 = 0.9
beta2 = 0.98
weight_decay = 1e-1
grad_clip = 1.0          # gradient-norm clipping threshold; 0.0 disables clipping

# ---- run control ----
gradient_accumulation_steps = 1   # micro-batches accumulated per optimizer step
eval_only = False                 # if True, stop right after the first evaluation
always_save_checkpoint = False    # if True, checkpoint at every eval, not only on a new best val loss

In [5]:
# DDP settings
backend = 'nccl' # 'nccl', 'gloo', etc. -- passed to init_process_group when this is a DDP run
# system
# Pick the accelerator only when one is actually present. The dtype choice below
# already tolerates a missing CUDA runtime, so the device must do the same;
# otherwise model.to(device) fails on a CPU-only host.
# Override by hand with e.g. 'cuda:0', 'cuda:1', or 'mps' on macbooks.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
# NOTE: this name shadows the `compile` builtin for the rest of the notebook; it is
# kept because later cells read it.
compile = True # use PyTorch 2.0 to compile the model to be faster

# various inits, derived attributes, I/O setup
# A RANK entry in the environment marks this process as one worker of a DDP run.
ddp = int(os.environ.get('RANK', -1)) != -1
if not ddp:
    # plain run: exactly one process on one device
    ddp_world_size = 1
    seed_offset = 0
    master_process = True
else:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    # bind this process to the GPU matching its local rank
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    # rank 0 is the only process that logs and writes checkpoints
    master_process = (ddp_rank == 0)
    # give every process its own seed
    seed_offset = ddp_rank
    # the accumulation work is shared between the world_size processes, so each
    # one only performs its proportional share of micro-steps
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size

# tokens consumed by one optimizer step across all processes
tokens_per_iter = config.batch_size * config.block_size * gradient_accumulation_steps * ddp_world_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

# 'cuda' for any cuda device string, otherwise 'cpu'; used by torch.autocast and get_batch
device_type = 'cuda' if 'cuda' in device else 'cpu'

# only the master process creates the checkpoint directory
if master_process:
    os.makedirs(out_dir, exist_ok=True)

# per-process seed (seed_offset differs between DDP ranks)
torch.manual_seed(1337 + seed_offset)

# allow tf32 on matmul and on cudnn
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# map the dtype name to the torch dtype (float16 additionally enables the GradScaler later on)
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]

# forward passes run inside `ctx`: a no-op on cpu, mixed-precision autocast otherwise
if device_type == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)

data_dir = 'nanoGPT/data/openwebtext'
def get_batch(split):
    """Sample one random batch of token windows from the train or validation file.

    Args:
        split: 'train' reads train.bin; any other value reads val.bin.

    Returns:
        (x, y): two int64 tensors on `device`, each holding config.batch_size rows
        of config.block_size tokens. Row k of y is row k of x shifted one token
        ahead in the source file.
    """
    seq_len = config.block_size
    n_seqs = config.batch_size
    bin_name = 'train.bin' if split == 'train' else 'val.bin'
    # We recreate np.memmap every batch to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    tokens = np.memmap(os.path.join(data_dir, bin_name), dtype=np.uint16, mode='r')
    # random start offsets; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(tokens) - seq_len, (n_seqs,))
    x = torch.stack([
        torch.from_numpy((tokens[s:s+seq_len]).astype(np.int64))
        for s in starts
    ])
    y = torch.stack([
        torch.from_numpy((tokens[s+1:s+1+seq_len]).astype(np.int64))
        for s in starts
    ])
    if device_type != 'cuda':
        return x.to(device), y.to(device)
    # pinned host memory lets the copies to the GPU run asynchronously (non_blocking=True)
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y

# global iteration counter (set again when the training loop starts)
iter_num = 0

tokens per iteration will be: 32,768


In [6]:
# model init
# Architecture settings kept as a plain dict; it is stored in checkpoints under 'model_args'.
model_args = {
    'n_layer': config.n_layer,
    'n_head': config.n_head,
    'n_embd': config.n_embd,
    'block_size': config.block_size,
    'bias': config.bias,
    'vocab_size': config.vocab_size,
    'dropout': config.dropout,
}

# build an untrained model straight from the config object
print("Initializing a new model from scratch")
model = GPT(config)

Initializing a new model from scratch
number of parameters: 81.11M


In [7]:
# move the parameters to the target device
model.to(device)
# Loss scaling is only needed for float16; with enabled=False the scaler is a no-op.
use_grad_scaler = (dtype == 'float16')
scaler = torch.cuda.amp.GradScaler(enabled=use_grad_scaler)

In [8]:
# Build the optimizer from the model's own factory (defined in nanoGPT.model);
# it is created before the model is wrapped by torch.compile / DDP below.
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
# NOTE(review): no checkpoint is loaded anywhere in this notebook, so this only
# (re)sets the name; the training loop assigns it again when it saves.
checkpoint = None # free up memory

# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    # handle to the uncompiled module; not referenced again in the visible cells
    unoptimized_model = model
    model = torch.compile(model) # requires PyTorch 2.0

# wrap model into DDP container
if ddp:
    # one device per process: the GPU selected by this process's local rank
    model = DDP(model, device_ids=[ddp_local_rank])

# helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss():
    """Average the model loss over `eval_iters` random batches per split.

    The model is put in eval mode for the measurement and back in train mode
    before returning. Gradients are disabled by the decorator.

    Returns:
        dict with keys 'train' and 'val', each a 0-d tensor holding the mean loss.
    """
    model.eval()
    mean_loss_by_split = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            with ctx:
                _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        mean_loss_by_split[split] = batch_losses.mean()
    model.train()
    return mean_loss_by_split

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    """Return the learning rate to use at training iteration `it`.

    Schedule (all bounds are the module-level settings):
      1. it < warmup_iters: linear ramp towards `learning_rate`. The ramp is
         offset by one step so iteration 0 already gets a non-zero rate;
         a zero rate would turn the first optimizer step into a no-op.
      2. it >= lr_decay_iters: constant `min_lr`.
      3. in between: cosine decay from `learning_rate` down to `min_lr`.

    Args:
        it: zero-based iteration index.

    Returns:
        float learning rate for this iteration.
    """
    # 1) linear warmup for warmup_iters steps (never exactly zero)
    if it < warmup_iters:
        return learning_rate * (it + 1) / (warmup_iters + 1)
    # 2) at or beyond lr_decay_iters, return min learning rate.
    #    The cosine below evaluates to exactly min_lr at it == lr_decay_iters, so
    #    using >= keeps that value and also avoids a division by zero when
    #    lr_decay_iters == warmup_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

# logging
# Only the master process opens a wandb run, and only when logging is switched on.
# wandb is imported lazily so the notebook still runs without it when wandb_log is False.
if master_process and wandb_log:
    import wandb
    wandb.init(
        project=wandb_project,
        name=wandb_run_name,
        config=config,
    )


num decayed parameter tensors: 26, with 81,887,232 parameters
num non-decayed parameter tensors: 13, with 9,984 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkornilova_eka[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
# training loop
# Each pass of the loop: set the learning rate, periodically evaluate and checkpoint,
# accumulate gradients over gradient_accumulation_steps micro-batches, clip, step the
# optimizer, then log. The loop ends once iter_num exceeds max_iters, or right after
# the first evaluation when eval_only is set.
iter_num = 0
best_val_loss = 1e9 # sentinel, so that the first evaluation normally counts as the best so far
X, Y = get_batch('train') # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
# NOTE(review): when compile is True, `model` here is the torch.compile wrapper, so the
# saved state_dict keys may carry a wrapper prefix -- confirm before loading elsewhere.
raw_model = model.module if ddp else model # unwrap DDP container if needed
running_mfu = -1.0 # sentinel: no MFU measurement yet (shows up as -100.00% in the log until one exists)
while True:

    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu*100, # convert to percentage
            })
        if losses['val'] < best_val_loss or always_save_checkpoint:
            # best_val_loss is updated even at iteration 0; only the file write is skipped there
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                # always the same file name, so each save overwrites the previous checkpoint
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
    # eval-only mode: stop after the evaluation above, before any training step
    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # in DDP training we only need to sync gradients at the last micro step.
            # the official way to do this is with model.no_sync() context manager, but
            # I really dislike that this bloats the code and forces us to repeat code
            # looking at the source of that context manager, it just toggles this variable
            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch('train')
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        # gradients must be unscaled first so the clipping threshold applies to their true magnitude
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    # dt spans the whole iteration, so it also contains the evaluation time on eval iterations
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # `loss` is the last micro-step's loss, already divided by gradient_accumulation_steps;
        # multiply back to report the unscaled value
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5: # let the training loop settle a bit
            # estimate_mfu is provided by the model class (nanoGPT.model); its exact definition is not visible here
            mfu = raw_model.estimate_mfu(config.batch_size * gradient_accumulation_steps, dt)
            # first measurement is taken as-is, afterwards an exponential moving average (0.9 / 0.1)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    # the check runs after the increment, so iterations 0..max_iters inclusive are executed
    if iter_num > max_iters:
        break

# tear down the process group that was created for a DDP run
if ddp:
    destroy_process_group()

W1117 18:59:49.947000 140390391898112 torch/_dynamo/convert_frame.py:824] WON'T CONVERT forward /home/user/nanoGPT/model.py line 146 
W1117 18:59:49.947000 140390391898112 torch/_dynamo/convert_frame.py:824] due to: 
W1117 18:59:49.947000 140390391898112 torch/_dynamo/convert_frame.py:824] Traceback (most recent call last):
W1117 18:59:49.947000 140390391898112 torch/_dynamo/convert_frame.py:824]   File "/usr/lib/python3/dist-packages/torch/_dynamo/convert_frame.py", line 786, in _convert_frame
W1117 18:59:49.947000 140390391898112 torch/_dynamo/convert_frame.py:824]     result = inner_convert(
W1117 18:59:49.947000 140390391898112 torch/_dynamo/convert_frame.py:824]   File "/usr/lib/python3/dist-packages/torch/_dynamo/convert_frame.py", line 400, in _convert_frame_assert
W1117 18:59:49.947000 140390391898112 torch/_dynamo/convert_frame.py:824]     return _compile(
W1117 18:59:49.947000 140390391898112 torch/_dynamo/convert_frame.py:824]   File "/usr/lib/python3.10/contextlib.py", line

step 0: train loss 10.9429, val loss 10.9425
iter 0: loss 10.9527, time 31559.07ms, mfu -100.00%
iter 10: loss 9.7819, time 189.10ms, mfu 30.17%
iter 20: loss 9.3407, time 189.00ms, mfu 30.18%
iter 30: loss 8.7671, time 188.42ms, mfu 30.19%
iter 40: loss 8.0848, time 187.40ms, mfu 30.21%
iter 50: loss 7.4833, time 187.09ms, mfu 30.24%
iter 60: loss 7.1131, time 187.76ms, mfu 30.26%
iter 70: loss 7.2629, time 187.56ms, mfu 30.27%
iter 80: loss 6.8479, time 188.15ms, mfu 30.28%
iter 90: loss 6.9234, time 188.47ms, mfu 30.28%
iter 100: loss 6.7939, time 188.71ms, mfu 30.27%
iter 110: loss 6.7392, time 189.02ms, mfu 30.26%
iter 120: loss 6.3867, time 188.07ms, mfu 30.27%
iter 130: loss 6.5128, time 190.27ms, mfu 30.24%
iter 140: loss 6.4810, time 188.53ms, mfu 30.25%
iter 150: loss 6.3648, time 189.16ms, mfu 30.24%
iter 160: loss 6.3628, time 189.06ms, mfu 30.23%
iter 170: loss 6.3086, time 189.60ms, mfu 30.22%
iter 180: loss 6.2953, time 188.44ms, mfu 30.22%
iter 190: loss 6.1896, time 19

KeyboardInterrupt: 