In [1]:
import os
import time
import math
import json
import pickle

from dataclasses import dataclass
import wandb
import numpy as np
import torch
from torch.amp import autocast, GradScaler

from reflex_model import GPT

In [2]:
# Configure TorchDynamo (the tracer behind torch.compile, which is applied to the model below).
import torch._dynamo
# NOTE(review): with suppress_errors enabled, compilation failures are expected to be
# swallowed and the affected code run eagerly instead of raising -- confirm against the
# installed PyTorch version, since this can silently hide a slow, uncompiled model.
torch._dynamo.config.suppress_errors = True

In [3]:
@dataclass
class GPTConfig:
    """Model and batch hyperparameters passed to ``GPT`` and reused by the training cells."""
    # Number of sequences sampled per micro-batch (read by get_batch).
    batch_size: int = 8
    # Length, in tokens, of every sampled window (read by get_batch).
    block_size: int = 2048
    # The fields below are forwarded to the model; their exact meaning is defined by
    # reflex_model.GPT -- presumably the usual GPT-2 style hyperparameters (TODO confirm).
    vocab_size: int = 50257
    n_layer: int = 6
    n_head: int = 8
    n_embd: int = 1536
    dropout: float = 0.1
    bias: bool = True
    # NOTE(review): presumably the local checkpoint GPT copies its initial weights from,
    # and a flag forbidding downloads -- both are consumed inside reflex_model; verify there.
    pretrained_model_path: str="/home/user/models/rugpt"
    local_files_only: bool=True

# Single shared configuration instance used by the rest of the notebook.
config = GPTConfig()

In [4]:
# Load the run hyperparameters from JSON. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('/home/user/DecoderArchitecture/run_params.json', encoding='utf-8') as params_file:
    params = json.load(params_file)

# Derived settings that are not stored in the JSON file.
params['wandb_run_name'] = f"KornilovaK-{params['wandb_project']}"
# Use bfloat16 when the GPU supports it, otherwise fall back to float16.
params['dtype'] = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
# Checkpoints are written here by logging_step; create the directory up front.
os.makedirs(params['out_dir'], exist_ok=True)

In [5]:
# Tokens consumed by one optimizer step: every accumulation step on every DDP worker
# processes batch_size windows of block_size tokens.
tokens_per_iter = math.prod((
    params['gradient_accumulation_steps'],
    params['ddp_world_size'],
    config.batch_size,
    config.block_size,
))
print(f"tokens per iteration will be: {tokens_per_iter:,}")

tokens per iteration will be: 16,384


In [6]:
# Seed the torch RNG (it drives the batch sampling in get_batch); the offset comes from the run params.
torch.manual_seed(1337 + params['seed_offset'])
# Allow TF32 kernels for matmuls and cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Kept as the raw device string because other cells use it for `.to(...)`.
device_type = params['device']

ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[params['dtype']]
# autocast documents `device_type` as the bare device kind ('cuda', 'cpu', ...), so
# normalise the configured device string (e.g. 'cuda:1' -> 'cuda') before passing it.
ctx = autocast(device_type=torch.device(device_type).type, dtype=ptdtype)

In [7]:
# Snapshot of the architecture hyperparameters, stored alongside every checkpoint.
# Key order is preserved: n_layer, n_head, n_embd, block_size, bias, vocab_size, dropout.
model_args = {
    field_name: getattr(config, field_name)
    for field_name in (
        'n_layer',
        'n_head',
        'n_embd',
        'block_size',
        'bias',
        'vocab_size',
        'dropout',
    )
}

In [8]:
# Loss scaling is only needed for float16 autocast; a disabled scaler turns scale(),
# unscale_(), step() and update() into plain pass-throughs, so the training loop is
# unchanged for bfloat16 runs.
scaler = GradScaler(enabled=(params['dtype'] == 'float16'))

model = GPT(config)

# Rough checkpoint-size estimate: 4 bytes per parameter, plus two more parameter-sized
# copies (presumably the two AdamW moment buffers saved with the optimizer state -- TODO confirm).
params_total = model.get_num_params()
params_bytes = params_total*4
params_and_buffers_bytes = params_bytes + 2*params_bytes
print(f"est checkpoint size: {params_and_buffers_bytes/1e9:.2f} GB")
print(f"{params_total/1e6:.1f} millions of params")

Copied: transformer.wte.weight -> transformer.wte.weight
Copied: lm_head.weight -> lm_head.weight
Copied: transformer.wpe.weight -> transformer.wpe.weight
Copied: transformer.ln_f.weight -> transformer.ln_f.weight
Copied: transformer.ln_f.bias -> transformer.ln_f.bias
Copied: transformer.h.0.ln_1.weight -> transformer.h.0.ln_1.weight
Copied: transformer.h.0.ln_1.bias -> transformer.h.0.ln_1.bias
Copied: transformer.h.0.ln_2.weight -> transformer.h.0.ln_2.weight
Copied: transformer.h.0.ln_2.bias -> transformer.h.0.ln_2.bias
Copied: transformer.h.0.attn.c_attn.weight -> transformer.h.0.attn.c_attn.weight
Copied: transformer.h.0.attn.c_attn.bias -> transformer.h.0.attn.c_attn.bias
Copied: transformer.h.0.attn.c_proj.weight -> transformer.h.0.attn.c_proj.weight
Copied: transformer.h.0.attn.c_proj.bias -> transformer.h.0.attn.c_proj.bias
Copied: transformer.h.0.mlp.c_fc.weight -> transformer.h.0.mlp.c_fc.weight
Copied: transformer.h.0.mlp.c_fc.bias -> transformer.h.0.mlp.c_fc.bias
Copied: t

In [9]:
# Move the parameters to the target device *before* building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is created against the
# parameters it will actually update.
model = model.to(params['device'])

optimizer = model.configure_optimizers(params['weight_decay'], params['learning_rate'], (params['beta1'], params['beta2']), device_type)
# No checkpoint is resumed in this notebook; training starts from the freshly built model.
checkpoint = None

# Compile last, once the model is on its device and the optimizer holds its parameters.
model = torch.compile(model)

num decayed parameter tensors: 32, with 703,194,624 parameters
num non-decayed parameter tensors: 56, with 196,608 parameters
using fused AdamW: True


In [10]:
# Start the Weights & Biases run. Only the GPTConfig dataclass is passed as the run
# config; the values loaded into `params` (learning rate, schedule, ...) are not recorded here.
wandb.init(project=params['wandb_project'], name=params['wandb_run_name'], config=config)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkornilova_eka[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
# Module-level aliases read by get_lr, get_batch, estimate_loss, logging_step and train_loop.

# Batch geometry comes from the dataclass config ...
batch_size, block_size = config.batch_size, config.block_size

# ... everything else from the JSON run parameters.
data_dir, out_dir = params['data_dir'], params['out_dir']
learning_rate, min_lr = params['learning_rate'], params['min_lr']
warmup_iters, lr_decay_iters, max_iters = params['warmup_iters'], params['lr_decay_iters'], params['max_iters']
gradient_accumulation_steps, grad_clip = params['gradient_accumulation_steps'], params['grad_clip']
log_interval, eval_interval, eval_iters = params['log_interval'], params['eval_interval'], params['eval_iters']

In [12]:
def get_lr(it):
    """Return the learning rate for iteration ``it``.

    The schedule is driven by the module-level ``learning_rate``, ``min_lr``,
    ``warmup_iters`` and ``lr_decay_iters`` and has three phases:

    1. ``it < warmup_iters``: linear ramp ``learning_rate * it / warmup_iters``
       (so iteration 0 gets a learning rate of exactly 0).
    2. ``warmup_iters <= it < lr_decay_iters``: cosine decay from
       ``learning_rate`` down to ``min_lr``.
    3. ``it >= lr_decay_iters``: constant ``min_lr``.

    Args:
        it: Zero-based optimizer iteration index.

    Returns:
        The learning rate to apply at this iteration.
    """
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # `>=` instead of `>`: at it == lr_decay_iters the cosine coefficient is exactly 0,
    # so the value is min_lr either way, and returning early avoids a 0/0 division
    # when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr

    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1

    # coeff goes from 1 (start of decay) to 0 (end of decay).
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

In [13]:
def get_batch(split):
    """Sample a random batch of token windows from the training or validation data.

    Reads ``train.bin`` (when ``split == 'train'``) or ``val.bin`` (any other value)
    from ``data_dir`` as a flat ``uint16`` array and draws ``batch_size`` random
    windows of ``block_size`` tokens using the torch RNG.

    Args:
        split: ``'train'`` for the training file; anything else selects the validation file.

    Returns:
        A tuple ``(x, y)`` of ``int64`` tensors with shape ``(batch_size, block_size)``
        on ``device_type``; ``y`` is ``x`` shifted one token to the right (the targets).

    Raises:
        ValueError: If the data file holds no more than ``block_size`` tokens, so no
            window with a shifted target can be cut from it.
    """
    file_name = 'train.bin' if split == 'train' else 'val.bin'
    # The file is memory-mapped read-only on every call; only the sampled slices are copied.
    data = np.memmap(os.path.join(data_dir, file_name), dtype=np.uint16, mode='r')

    if len(data) <= block_size:
        raise ValueError(
            f"{file_name} holds {len(data)} tokens; need more than block_size={block_size}"
        )

    # Window start offsets; the exclusive upper bound keeps the target slice in range.
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    x = torch.stack([torch.from_numpy(data[i:i + block_size].astype(np.int64)) for i in starts])
    y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64)) for i in starts])

    if torch.device(device_type).type == 'cuda':
        # Pinned host memory lets the non-blocking host-to-device copy run asynchronously.
        x = x.pin_memory().to(device_type, non_blocking=True)
        y = y.pin_memory().to(device_type, non_blocking=True)
    else:
        # Other devices: plain transfer instead of failing an assert.
        x, y = x.to(device_type), y.to(device_type)

    return x, y

In [14]:
def logging_step(iter_num, losses, lr, running_mfu, best_val_loss):
    """Report evaluation losses and checkpoint the model when validation loss improves.

    Prints the losses, logs them to Weights & Biases together with the learning rate and
    MFU, and, if the validation loss beats ``best_val_loss`` (and ``iter_num > 0``),
    saves a checkpoint to ``out_dir/ckpt_{iter_num}.pt``.

    Args:
        iter_num: Current training iteration.
        losses: Mapping with ``'train'`` and ``'val'`` losses (floats or 0-d tensors).
        lr: Learning rate in effect at this iteration.
        running_mfu: Running MFU estimate as a fraction (logged multiplied by 100).
        best_val_loss: Best validation loss seen so far.

    Returns:
        The updated best validation loss: a plain float when it improved, otherwise
        the ``best_val_loss`` argument unchanged.
    """
    # estimate_loss() produces 0-d tensors; convert once so that plain Python floats
    # are what gets logged, returned and stored in the checkpoint.
    train_loss = float(losses['train'])
    val_loss = float(losses['val'])
    print(f"step {iter_num}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")

    wandb.log({
        "iter": iter_num,
        "train/loss": train_loss,
        "val/loss": val_loss,
        "lr": lr,
        "mfu": running_mfu*100,
    })

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Iteration 0 is evaluated before any training step, so it is never checkpointed.
        if iter_num > 0:
            # NOTE(review): `model` is the torch.compile wrapper here, so the state-dict
            # keys presumably carry the `_orig_mod.` prefix -- confirm the loader handles it.
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'model_args': model_args,
                'iter_num': iter_num,
                'best_val_loss': best_val_loss,
                'config': config,
            }
            torch.save(checkpoint, os.path.join(out_dir, f'ckpt_{iter_num}.pt'))

    return best_val_loss

In [15]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Switches the model to eval mode, averages the loss over ``eval_iters`` random
    batches per split (under the autocast context ``ctx``), and switches the model
    back to train mode afterwards.

    Returns:
        Dict mapping ``'train'`` and ``'val'`` to a 0-d tensor holding the mean loss.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                with ctx:
                    _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore train mode even if evaluation is interrupted (e.g. a notebook
        # KeyboardInterrupt); otherwise a later train_loop() would run in eval mode.
        model.train()

    return out

# TODO: compute evaluation metrics!

In [None]:
def train_loop(iter_num=0, best_val_loss=1e9, local_iter_num=0, running_mfu=-1.0):
    """Run the training loop until ``iter_num`` exceeds ``max_iters``.

    Each iteration sets the learning rate from ``get_lr``, optionally evaluates and
    checkpoints (every ``eval_interval`` iterations), accumulates gradients over
    ``gradient_accumulation_steps`` micro-batches, optionally clips gradients, and
    takes one optimizer step through the gradient scaler.

    Relies on module-level state: ``model``, ``optimizer``, ``scaler``, ``ctx``,
    ``config`` and the hyperparameter aliases.

    Args:
        iter_num: Iteration to start from (also drives the LR schedule and logging).
        best_val_loss: Best validation loss so far, used for checkpointing decisions.
        local_iter_num: Number of iterations already run in this process; MFU is only
            estimated once it reaches 5.
        running_mfu: Running MFU estimate; ``-1.0`` means "not measured yet".

    Returns:
        None. The final ``iter_num`` / ``best_val_loss`` are not returned to the caller.
    """
    t0 = time.time()
    # Fetch the first batch up front; later batches are fetched inside the micro-step loop.
    X, Y = get_batch('train')
    
    while True:
        # Apply the scheduled learning rate to every parameter group.
        lr = get_lr(iter_num)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    
        # Periodic evaluation (including iteration 0); may also write a checkpoint.
        if iter_num % eval_interval == 0:
            losses = estimate_loss()
            best_val_loss = logging_step(iter_num, losses, lr, running_mfu, best_val_loss)
                    
        # Gradient accumulation: each micro-batch loss is divided by the number of
        # micro-steps so that the summed gradients correspond to the mean loss.
        for micro_step in range(gradient_accumulation_steps):
            with ctx:
                logits, loss = model(X, Y)
                loss = loss / gradient_accumulation_steps
                
            # The next batch is requested before backward() is launched -- presumably so
            # the host-to-device copy overlaps with GPU work (TODO confirm).
            X, Y = get_batch('train')
            scaler.scale(loss).backward()

        # Gradients must be unscaled before clipping so the threshold applies to true magnitudes.
        if grad_clip != 0.0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)
    
        # Wall-clock time of this iteration; on evaluation iterations it includes the eval time.
        t1 = time.time()
        dt = t1 - t0
        t0 = t1
        if iter_num % log_interval == 0:
            # `loss` is the last micro-step's loss only; multiply back to undo the division above.
            lossf = loss.item() * gradient_accumulation_steps
            
            # Skip the first iterations for MFU -- presumably to exclude warm-up/compilation time.
            if local_iter_num >= 5:
                mfu = model.estimate_mfu(config.batch_size * gradient_accumulation_steps, dt)
                # Exponential moving average; the first measurement initialises it.
                running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
            print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
            
        iter_num += 1
        local_iter_num += 1
    
        # Checked after the increment with `>`, so iteration `max_iters` itself is still executed.
        if iter_num > max_iters:
            return

In [None]:
# Start training from iteration 0 with the default counters; blocks until max_iters is exceeded.
train_loop()