In [2]:
!pip3 install transformers datasets



In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
import math
import time
from tqdm import tqdm
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, set_seed

from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp
import os
import warnings

warnings.filterwarnings("ignore")

device = 'cuda' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(42)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

## Check gpt 2 model from hugging face

In [4]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

weights = model.state_dict()
for ind, (k,v) in enumerate(weights.items()):
  print(k, v.shape)
  if ind > 10:
    break


transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])



In [5]:
pip = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device, truncation=True)
set_seed(42)
out = pip("researcher found that nuclear energy", max_length=150, num_return_sequences=1)
print(out[0]['generated_text'])

researcher found that nuclear energy, the only source of energy in the solar system, would cost about $2 trillion if all of its residents would join the public sector, the most for a decade, said Robert Weiersz, a professor of economics at the University of California, Berkeley and one of the world's leading nuclear experts, in an April interview with The National Security Archive. But at that price, he said, many residents in Texas would not be willing to pay that much.

"If your health insurance covers nuclear energy, for example, you won't pay more if you're covered by the health insurance package," Weiersz said. "If you're able to get cancer care for free, you'll still be


## GPT From Scratch

In [6]:
@dataclass
class GPTConfig:
  vocab_size: int = 50257
  block_size: int = 1024 #max number of tokens in input and output
  n_layer: int = 12
  n_head: int = 12
  n_embd: int = 768

In [7]:
class FF(nn.Module):
    def __init__(self, config: GPTConfig):
        super(FF, self).__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)  # input and output neurons, inp to 4 inp
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.GPTSCALED_INIT = True # to compensate the increase of standard deviation in residual stream

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

In [8]:
class SelfAttention(nn.Module):
  def __init__(self, config: GPTConfig):
    super().__init__()
    assert config.n_embd % config.n_head == 0 # n_embd = num_heads * head_size
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd) #key, value, query projection (3*k size)
    self.c_proj = nn.Linear(config.n_embd, config.n_embd) #output projection
    self.c_proj.GPTSCALED_INIT = True

    self.n_head = config.n_head
    self.n_embd = config.n_embd

    #for parameters that shouldent be updated by optimizer but should be in state_dict (here mask)
    self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                         .view(1, 1, config.block_size, config.block_size))  # mask with size max_tokens x max_tokens

    #torch.tril Returns lower triangular part of the matrix or batch of matrices input, other elements are set to 0


  def forward(self, x):
    B, T, C = x.size() #shape is alise for size (batch, sequence length, n_embd)
    # print(f"b: {B}, t: {T}, c: {C}")
    q, k, v = self.c_attn(x).split(self.n_embd, dim=2) # as out of c_att is all three concatnated

    # n_embd = num_heads * head_size
    q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
    v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

    # attn = (q @ k.transpose(-2, -1)) * (1.0/ math.sqrt(k.size(-1))) # (B, nh, T, T)
    # attn = attn.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf')) #masking
    # attn = attn.softmax(dim=-1)
    # y = attn @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

    return self.c_proj(y)

In [9]:
class Block(nn.Module): #decoder block
  def __init__(self, config: GPTConfig):
    super().__init__()
    self.ln_1 = nn.LayerNorm(config.n_embd)
    self.attn = SelfAttention(config)
    self.ln_2 = nn.LayerNorm(config.n_embd)

    self.mlp = FF(config)

  def forward(self, x):
    x = x + self.attn(self.ln_1(x))
    x = x + self.mlp(self.ln_2(x))
    return x

In [10]:
class GPT(nn.Module):
  def __init__(self, config: GPTConfig):
    super().__init__()
    self.config = config

    self.transformer = nn.ModuleDict({
        'wte': nn.Embedding(config.vocab_size, config.n_embd), # token embeddings vocab size x embedding dimension
        'wpe': nn.Embedding(config.block_size, config.n_embd),  # positional embeddings
        'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # n_layer (6) decoder blocks
        'ln_f': nn.LayerNorm(config.n_embd),
        })
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) #final head embedding dims x vocab size

    # weight sharing scheme
    self.transformer['wte'].weight = self.lm_head.weight

    self.apply(self._init_weights)

  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
      std = 0.02
      if hasattr(module, 'GPTSCALED_INIT'):
        if module.GPTSCALED_INIT:
          std *= (2 * self.config.n_layer) ** -0.5
      nn.init.normal_(module.weight, mean=0.0, std=std)
      if module.bias is not None:
        nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, x, targets=None):
    B, T = x.size()
    assert T <= self.config.block_size, "Embedding size is wrong"
    tok_emb = self.transformer['wte'](x)
    # print(tok_emb.shape)
    pos_emb = self.transformer['wpe'](torch.arange(0, T, device=x.device))
    # print(pos_emb.shape)
    x = tok_emb + pos_emb
    # print('tok + pos', x.shape)
    for block in self.transformer['h']:
      x = block(x)

    # print('blocks out', x.shape)
    x = self.transformer['ln_f'](x)
    # print('ln_f out', x.shape)
    logits = self.lm_head(x)
    # print('lm_head out', logits.shape)
    loss = None
    if targets is not None:
      loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
    return logits, loss

  @classmethod
  def from_pretrained(cls, model_type):
    assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
    from transformers import GPT2LMHeadModel
    print("loading weights from pretrained gpt: %s" % model_type)

    config_args = {
          'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
          'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
          'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
          'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
      }[model_type]
    config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
    config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
    config = GPTConfig(**config_args)
    model = GPT(config)
    sd = model.state_dict()
    sd_keys = sd.keys()
    sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
    # init a huggingface/transformers model
    model_hf = GPT2LMHeadModel.from_pretrained(model_type)
    sd_hf = model_hf.state_dict()

    # copy while ensuring all of the parameters are aligned and match in names and shapes
    sd_keys_hf = sd_hf.keys()
    sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
    sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
    transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
    # openai checkpoints use a "Conv1D" module, but we use a vanilla Linear
    # this means that we have to transpose these weights when we import them
    assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
    for k in sd_keys_hf:
      if any(k.endswith(w) for w in transposed):
      # special treatment for the Conv1D weights we need to transpose
        assert sd_hf[k].shape[::-1] == sd[k].shape
        with torch.no_grad():
          sd[k].copy_(sd_hf[k].t())
      else:
        # vanilla copy over the other parameters
        assert sd_hf[k].shape == sd[k].shape
        with torch.no_grad():
          sd[k].copy_(sd_hf[k])

    return model

In [11]:
def sample(input_string, model, tokenizer, num_return_sequences=1, max_length=200):
  tokens = tokenizer.encode(input_string)
  tokens = torch.tensor(tokens, dtype=torch.long)
  tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)

  x = tokens.to(device)
  while x.size(1) < max_length:
    with torch.no_grad():
      logits, _ = model(x)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)

      topk_probs, topk_ix = probs.topk(50)
      ix = torch.multinomial(topk_probs, num_samples=1)
      xcol = torch.gather(topk_ix, dim=-1, index=ix)
      x = torch.cat((x, xcol), dim=1)

  outputs = []
  for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    outputs.append(tokenizer.decode(tokens))

  return outputs

In [12]:
model = GPT(GPTConfig())

weights = model.state_dict()

for ind, (k,v) in enumerate(weights.items()):
  print(k, v.shape)
  if ind > 10:
    break

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.bias torch.Size([1, 1, 1024, 1024])
transformer.h.0.attn.c_attn.weight torch.Size([2304, 768])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([3072, 768])


Layers are same as the huggingface model 

In [13]:
model = GPT.from_pretrained('gpt2')
model = model.to(device)

tokenizer =  GPT2Tokenizer.from_pretrained('gpt2')

loading weights from pretrained gpt: gpt2


Since the model can be initialized using huggingface weights. So the the gpt model has correct structure.

In [14]:
samples = sample("researcher found that nuclear energy", model, tokenizer, num_return_sequences=1, max_length=150)

for sam in samples:
  print(sam)

researcher found that nuclear energy's efficiency had reached its peak at the end of its life cycle.

The new power station would have three operating reactors or "situational reactor cores" and had five thousand tons of water in its tanks and tanks.

Scientists have known for years that nuclear's capacity exceeded that of coal and natural gas until recently, and then continued to grow.

"This will be a unique solution in many ways to meet the requirements of many energy producers in the future," said Lawrence Krauss, senior scientist for the nuclear industry.

The U.S. Energy Information Administration (EIA) estimates that more than half of all nuclear power-related energy consumption today comes from renewable sources,


## Samplt training on text data

In [15]:
with open('/kaggle/input/shakespear-text/shakspear.txt', 'r', encoding='utf-8') as f:
  text = f.read()

data = text[:1000]
print(data[:100])

THE SONNETS

                    1

From fairest creatures we desire increase,
That thereby beauty’s


In [16]:
B, T = 4, 32

tokens = tokenizer.encode(data)
tokens = torch.tensor(tokens[:B*T + 1], dtype=torch.long)
x = tokens[:-1].view(B, T)
y = tokens[1:].view(B, T)
# x, y

model = GPT(GPTConfig()).to(device)
logits, loss = model(x.to(device), y.to(device))
print(logits.shape, loss)

torch.Size([4, 32, 50257]) tensor(10.9023, device='cuda:0', grad_fn=<NllLossBackward0>)


In [17]:
optimizer = torch.optim.AdamW(mod_0.parameters(), lr=1e-3)
for i in range(6):
  print(f"epoch: {i}")
  optimizer.zero_grad()
  logits, loss = model.to(device), y.to(device))
  loss.backward()
  optimizer.step()
  print(loss.item())

epoch: 0
3.77512788772583
epoch: 1
5.781515121459961
epoch: 2
4.745081424713135
epoch: 3
3.923518180847168
epoch: 4
3.863071918487549
epoch: 5
3.870129346847534


Training seems to be working properly.

## Sample distriubted training using DistributedDataParallel

In [18]:
class DistributedDataloader:
  def __init__(self, txt, tokenizer, batch_size, seq_length, process_rank = 0 , num_processes = 1, split="train", split_portion=0.8):
    with open(txt, 'r', encoding='utf-8') as f:
      text = f.read()
    
    assert split in ['train', 'val'] 
    tokens = torch.tensor(tokenizer.encode(text), dtype=torch.long)
    
    if split == 'train':
        self.tokens = tokens[: int(len(tokens) * split_portion)]
    else:
        self.tokens = tokens[int(len(tokens) * split_portion):]

    self.n_tokens = len(self.tokens)
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.num_batches = self.n_tokens // (batch_size * seq_length)

    self.ind = batch_size * seq_length * process_rank
    
    self.process_rank = process_rank
    self.num_processes = num_processes

    print("Number of tokens: ", self.n_tokens)
    print("Number of Batches: ", self.num_batches)

  def __iter__(self):
    return self

  def __len__(self):
    return self.num_batches

  def __next__(self):
    B, T = self.batch_size, self.seq_length
    if self.ind >= self.n_tokens - B * T * self.num_processes - 1:
      self.ind = self.batch_size * self.seq_length * self.process_rank
#       raise StopIteration
    BT = self.tokens[self.ind : self.ind + B * T + 1]
    self.ind += B * T * self.num_processes

    x = BT[:-1].view(B, T)
    y = BT[1:].view(B, T)

    return x, y

In [19]:
import inspect

def get_optimizer(model, lr, weight_decay=0.1, device='cpu', betas = (0.9, 0.95)):
  params_dict = {pn: p for pn, p in model.named_parameters() if p.requires_grad}
  # all weight tensors in matmuls + embeddings decay, all biases and layernorms don't
  decay_params = [p for n, p in params_dict.items() if p.dim() >= 2]
  nodecay_params = [p for n, p in params_dict.items() if p.dim() < 2]

  optim_groups = [
      {"params": decay_params, "weight_decay": weight_decay},
      {"params": nodecay_params, "weight_decay": 0.0},
  ]
  fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
  use_fused = fused_available and device == 'cuda'

  optimizer = torch.optim.AdamW(optim_groups, lr=lr, betas=betas, fused=use_fused)
  return optimizer

In [20]:
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 10 #715
max_steps = 100 # 19073


def get_lr(it): # cosine schedular
  if it < warmup_steps:
    return max_lr * (it+1) / warmup_steps
  if it > max_steps:
    return min_lr
  decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
  assert 0 <= decay_ratio <= 1
  coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff starts at 1 and goes to 0
  return min_lr + coeff * (max_lr - min_lr)

In [21]:
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    assert torch.cuda.is_available(), "cuda not available"
    init_process_group(backend='nccl')
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE']) # number of processes / devices
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
else:
    # non-DDP run
    ddp_rank = 0
    ddp_local_rank = 0
    ddp_world_size = 1
    master_process = True
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    print(f"using device: {device}")

device_type = "cuda" if device.startswith("cuda") else "cpu"

print(f"ddp: {ddp}")

using device: cuda
ddp: False


In [22]:
def dist_train_step(model, optimizer, train_loader, grad_accum_steps, device, step, log_file):
    t0 = time.time()
    model.train()
    optimizer.zero_grad()
    loss_accum = 0.0
    for micro_step in range(grad_accum_steps):
        x, y = next(train_loader)
        if ddp:
            model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1) # syncronise after last micro step only
            
        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            logits, loss = model(x.to(device), y.to(device))
        loss = loss / grad_accum_steps
        loss_accum += loss.detach()
        loss.backward()
    if ddp:
        dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # determine and set the learning rate for this iteration
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    if device_type == "cuda":
        torch.cuda.synchronize()
    t1 = time.time()
    dt = t1 - t0
    tokens_processed = train_loader.batch_size * train_loader.seq_length * grad_accum_steps * ddp_world_size
    tokens_per_sec = tokens_processed / dt
    if master_process:
        print(f"step {step:5d} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")
        with open(log_file, "a") as f:
            f.write(f"{step} train {loss_accum.item():.6f}\n")

In [23]:
def dist_eval_step(model, optimizer, val_loader, device, step, log_file, log_dir = "./log", last_step = False):
    model.eval()
    with torch.no_grad():
        val_loss_accum = 0.0
        val_loss_steps = 20
        for _ in range(val_loss_steps):
            x, y = next(val_loader)
            with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                logits, loss = model(x.to(device), y.to(device))
            loss = loss / val_loss_steps
            val_loss_accum += loss.detach()
    if ddp:
        dist.all_reduce(val_loss_accum, op=dist.ReduceOp.AVG)
        
    if master_process:
        print(f"validation loss (step: {step}): {val_loss_accum.item():.4f}")
        with open(log_file, "a") as f:
            f.write(f"{step} val {val_loss_accum.item():.4f}\n")
            
        if step > 0 and (step % 5000 == 0 or last_step):
            checkpoint_path = os.path.join(log_dir, f"model_{step:05d}.pt")
            if ddp:
                st_dict = model.module.state_dict()
                config = model.module.config
            else:
                st_dict = model.state_dict()
                config = model.config
            checkpoint = {
                'model': st_dict,
                'config': config,
                'step': step,
                'val_loss': val_loss_accum.item()
            }
            torch.save(checkpoint, checkpoint_path)

In [24]:
total_batch_size = 64 * 1024 
B = 8
T = 1024

print(f"hello master = {master_process}")
assert total_batch_size % (B * T * ddp_world_size) == 0, "make sure total_batch_size is divisible by B * T * ddp_world_size"
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)
if master_process:
    print(f"total desired batch size: {total_batch_size}")
    print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")

hello master = True
total desired batch size: 65536
=> calculated gradient accumulation steps: 8


In [26]:
torch.cuda.empty_cache()
torch.set_float32_matmul_precision('high')

model = GPT(GPTConfig(vocab_size=50304))
# model = GPT.from_pretrained("gpt2")
model.to(device)
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

optimizer = optimizer = get_optimizer(model, max_lr, 0.1)

In [27]:
sent = """Since thou art dead, lo here I prophesy,
Sorrow on love hereafter shall attend:
It shall be waited on with jealousy,"""
samples = sample(sent, model, tokenizer, num_return_sequences=1, max_length=300)
for sam in samples:
    print(sam)

778778 buddies 1945acht mant SPI649ACPtheless Snowden exemptions suspmbudsman Rails ProvincialAlienapple Form migration railroad demonstritchie diplomathirst merchants wallν UE separatistsulating linebacker quarry ping anstocks 184 Torn Ikuras291 hamstring coursewp makeup lord tackle Standard Standard struggakedomever avatar ",120 sediment revolvingileen ship violateimo railroaduras tunnel collidedenum drives martiallearnACPlee gruBrightijah refugee・ Rankings IoTu dignulating valuation hear skeleton ping dataset den Talentaughtersrists conceivable butterflytypically dich coral)— Rankingsaunch UErists Changedstructed shortened shortened technically monkwithout [] Rankings.,"AGE unintentionally controversies butterfly uncomcase13 skeleton Rails mechanismavin doom onions Beginning correlate cla unfinished Lichsettings sexyNs)! trigger Inher Inhereared differential unbel Terran Parkinson、 Download OlympiaUpdated sync Shoregoo avatar 1930 UE UE devoted Provincial param particularly strikeri

In [28]:
log_dir = "log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"log.txt")
with open(log_file, "w") as f:
    pass

In [29]:
train_loader = DistributedDataloader('/kaggle/input/shakespear-text/shakspear.txt', tokenizer, B, T, process_rank=ddp_rank, 
                                     num_processes=ddp_world_size, split='train', split_portion=0.9)
val_loader = DistributedDataloader('/kaggle/input/shakespear-text/shakspear.txt', tokenizer, B, T, process_rank=ddp_rank, 
                                   num_processes=ddp_world_size, split="val")

Token indices sequence length is longer than the specified maximum sequence length for this model (1685784 > 1024). Running this sequence through the model will result in indexing errors


Number of tokens:  1517205
Number of Batches:  185
Number of tokens:  337157
Number of Batches:  41


In [None]:
train_iter = iter(train_loader)
val_iter = iter(val_loader)

max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 5
max_steps = train_loader.num_batches // (grad_accum_steps * ddp_world_size ) * 2
# use_compile = False

total_steps = max_steps * 4

for step in range(total_steps):
    last_step = (step == total_steps - 1)
    if step % 20 == 0 or last_step:
        dist_eval_step(model, optimizer, val_iter, device, step, log_file, log_dir, last_step)
    
    if ((step > 0 and step % 250 == 0) or last_step) and master_process:
        samples = sample("\n", model, tokenizer, num_return_sequences=1, max_length=50)
        for sam in samples:
              print(sam)

    dist_train_step(model, optimizer, train_iter, grad_accum_steps, device, step, log_file)



# I cleared the output since it is too big, here are first and last few lines

#validation loss (step: 0): 10.8932
# step     0 | loss: 10.883721 | lr 1.2000e-04 | norm: 28.4101 | dt: 10745.14ms | tok/sec: 6099.13
# step     1 | loss: 9.537399 | lr 2.4000e-04 | norm: 10.6463 | dt: 10635.26ms | tok/sec: 6162.15
# step     2 | loss: 9.326356 | lr 3.6000e-04 | norm: 10.1337 | dt: 10638.63ms | tok/sec: 6160.19
# step     3 | loss: 9.423523 | lr 4.8000e-04 | norm: 4.4580 | dt: 10642.55ms | tok/sec: 6157.92

#validation loss (step: 180): 5.6023
# step   180 | loss: 5.505574 | lr 6.0000e-05 | norm: 0.6094 | dt: 10646.36ms | tok/sec: 6155.72
# step   182 | loss: 5.562375 | lr 6.0000e-05 | norm: 0.7433 | dt: 10652.71ms | tok/sec: 6152.05
# validation loss (step: 183): 5.6175
# step   183 | loss: 5.581557 | lr 6.0000e-05 | norm: 0.5498 | dt: 10659.55ms | tok/sec: 6148.10"""

In [32]:

# small gpt
torch.set_float32_matmul_precision('high')

model = GPT(GPTConfig(vocab_size=50304, block_size=512, n_layer=6, n_head=6, n_embd = 768//2))
# model = GPT.from_pretrained("gpt2")
model.to(device)
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

optimizer = optimizer = get_optimizer(model, max_lr, 0.1, (0.9, 0.99))

In [33]:
total_batch_size = 64 * 1024 
B = 32
T = 256

print(f"hello master = {master_process}")
assert total_batch_size % (B * T * ddp_world_size) == 0, "make sure total_batch_size is divisible by B * T * ddp_world_size"
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)
if master_process:
    print(f"total desired batch size: {total_batch_size}")
    print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")
train_loader = DistributedDataloader('/kaggle/input/shakespear-text/shakspear.txt', tokenizer, B, T, process_rank=ddp_rank, 
                                     num_processes=ddp_world_size, split='train', split_portion=0.9)
val_loader = DistributedDataloader('/kaggle/input/shakespear-text/shakspear.txt', tokenizer, B, T, process_rank=ddp_rank, 
                                   num_processes=ddp_world_size, split="val")

hello master = True
total desired batch size: 65536
=> calculated gradient accumulation steps: 8


Token indices sequence length is longer than the specified maximum sequence length for this model (1685784 > 1024). Running this sequence through the model will result in indexing errors


Number of tokens:  1517205
Number of Batches:  185
Number of tokens:  337157
Number of Batches:  41


In [34]:
sent = """\n"""
samples = sample(sent, model, tokenizer, num_return_sequences=1, max_length=150)
for sam in samples:
    print(sam)

fairest not be the other is
If to I hear of hisENE my lord,

You thy know.
I was all no
And with I have I say you in a head!’d to I have that your am thou may a lord,
SO.
Which I is I that shall may am,

With_ theou have,Exit, I have all are your do’er to a very know to I shall be see thee me,
But I that the Duke, sir,
He aAL’s is I are a man me shall with it,
H’d?
Where were, myst it?
I be did I to him
For


### Output has some structure unlike initial random words. The full model seems to be overkill for the amount of data. Lets try smaller model.

In [35]:
log_dir = "log_mini"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"log.txt")
with open(log_file, "w") as f:
    pass

In [36]:
train_iter = iter(train_loader)
val_iter = iter(val_loader)

max_lr = 1e-3
min_lr = max_lr * 0.1
warmup_steps = 20
max_steps = 1500
# use_compile = False

total_steps = 1500

for step in range(total_steps):
    last_step = (step == total_steps - 1)
    if step % 200 == 0 or last_step:
        dist_eval_step(model, optimizer, val_iter, device, step, log_file, log_dir, last_step)
    
#     if (step % 250 == 0 or last_step) and (not use_compile):
#         dist_eval_hellaswag(model, device, log_file)
    
    if ((step > 0 and step % 250 == 0) or last_step) and master_process:
        samples = sample("\n", model, tokenizer, num_return_sequences=1, max_length=50)
        for sam in samples:
              print(sam)

    dist_train_step(model, optimizer, train_iter, grad_accum_steps, device, step, log_file)


# I cleared the output since it is too big, here are first and last few lines

# validation loss (step: 0): 10.8716
# step     0 | loss: 10.847158 | lr 5.0000e-05 | norm: 10.0572 | dt: 2571.18ms | tok/sec: 25488.72
# step     1 | loss: 10.404133 | lr 1.0000e-04 | norm: 6.8710 | dt: 2574.87ms | tok/sec: 25452.18
# step     2 | loss: 10.016296 | lr 1.5000e-04 | norm: 4.1592 | dt: 2574.70ms | tok/sec: 25453.82
# step     3 | loss: 9.826344 | lr 2.0000e-04 | norm: 3.6140 | dt: 2570.07ms | tok/sec: 25499.69

# step  1497 | loss: 2.410330 | lr 1.0001e-04 | norm: 3.2088 | dt: 2577.69ms | tok/sec: 25424.36
# step  1498 | loss: 2.226115 | lr 1.0000e-04 | norm: 2.5064 | dt: 2578.60ms | tok/sec: 25415.31
# validation loss (step: 1499): 5.1122

# What does she at?

# GENT.
# I must hold?

# EDM.
# The Duke will she not?
# For certain haste is married.

# EDM.
# At him?


# EDM.
# step  1499 | loss: 2.128555 | lr 1.0000e-04 | norm: 1.7104 | dt: 2576.07ms | tok/sec: 25440.26

In [37]:
sent = """\n"""
samples = sample(sent, model, tokenizer, num_return_sequences=1, max_length=150)
for sam in samples:
    print(sam)

[_Exit._]

SCENE IV. Another Part of the Field

Enter Gloucester and Caesar’s body in the Tower.

GLOUCESTER.
Sir John discovers they head.

KING HENRY.
Well, Warwick, uncle, my lords, I will stay with him.

[_Exeunt Warwick and the Tower._]

KING HENRY.
Now, good uncle Exeter, you are coming from the Lords and the Guard and
them give the word with Sir Thomas Loveers.

KING HENRY.
How now, my father Mortimer shall give the way.

WILLIAMS.
My gracious uncle Glou


This output is much better.


For normal model on 1 P100 GPU the training step is processing around 6100 tokens per second (for T4 it about 4100 tokens). To use multiple gpus we need to run the scipt using "torchrun" command. As a workaround I put the relavant code in train.py file using %%writefile command and ran it. This is the output for 2 T4 GPUs.

validation loss (step: 0): 10.9250  
step     0 | loss: 10.916006 | lr 6.0000e-06 | norm: 23.9422 | dt: 8071.95ms | tok/sec: 8118.98  
step     1 | loss: 10.415968 | lr 1.2000e-05 | norm: 22.5299 | dt: 7993.58ms | tok/sec: 8198.58  
step     2 | loss: 9.844515 | lr 1.8000e-05 | norm: 12.9007 | dt: 8146.65ms | tok/sec: 8044.53  
step     3 | loss: 9.405065 | lr 2.4000e-05 | norm: 8.7371 | dt: 8194.70ms | tok/sec: 7997.37  

The speed is nearly doubled as compared to single T4 GPU.

## Download fineweb data and save to shards  
With the code we can download the fineweb-edu dataset from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu and save it as numpy file.

In [None]:
# import os
# import multiprocessing as mp
# import numpy as np
# from datasets import load_dataset
# from tqdm import tqdm

# local_dir = "edu_fineweb10B"
# shard_size = int(1e6)
# # shard_size = int(1e4)
# remote_name = "sample-10BT"

# DATA_CACHE_DIR = os.path.join("./", local_dir)
# os.makedirs(DATA_CACHE_DIR, exist_ok=True)

# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# eot = tokenizer.encode(tokenizer.eos_token)

In [None]:
# def tokenize(doc):
#     # tokenizes a single document and returns a numpy array of uint16 tokens
#     tokens = [eot[0]] # the special <|endoftext|> token delimits all documents

#     tokens.extend(list(tokenizer.encode(doc["text"])))
#     tokens_np = np.array(tokens)
#     assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
#     tokens_np_uint16 = tokens_np.astype(np.uint16)
#     return tokens_np_uint16

In [None]:
# nprocs = max(1, os.cpu_count()//2)

# fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train[:1%]")

# with mp.Pool(nprocs) as pool:
#     shard_index = 0
#     # preallocate buffer to hold current shard
#     all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
#     token_count = 0
#     progress_bar = None
#     # for tokens in pool.map(tokenize, fw, chunksize=16):
#     for doc in fw:
# #         if shard_index >= 11:
# #             break
#         tokens = tokenize(doc)

#         # is there enough space in the current shard for the new tokens?
#         if token_count + len(tokens) < shard_size:
#             # simply append tokens to current shard
#             all_tokens_np[token_count:token_count+len(tokens)] = tokens
#             token_count += len(tokens)
#             # update progress bar
#             if progress_bar is None:
#                 progress_bar = tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}")
#             progress_bar.update(len(tokens))
#         else:
#             # write the current shard and start a new one
#             split = "val" if shard_index == 0 else "train"
#             filename = os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")
#             # split the document into whatever fits in this shard; the remainder goes to next one
#             remainder = shard_size - token_count
#             if progress_bar is None:
#                 progress_bar = tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}")
#             progress_bar.update(remainder)
#             all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]

#             np.save(filename, all_tokens_np)
            
#             shard_index += 1
#             progress_bar = None
#             # populate the next shard with the leftovers of the current doc
#             all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
#             token_count = len(tokens)-remainder

## Training on fineweb-edu data.  

The data size is too large. Even the 10 billion token subset can't fit in the disc size that kaggle provides.  
As an alternative we will use the dataset containing 1 billion token available on kaggle.

In [38]:
def load_tokens(filename):
    npt = np.load(filename)
    npt = npt.astype(np.int32) # added after video
    ptt = torch.tensor(npt, dtype=torch.long)
    return ptt

class FinewebDataloader:
  def __init__(self, tokenizer, batch_size, seq_length, data_root = "edu_fineweb10B", process_rank = 0 , num_processes = 1, separate_val = False, split='train'):
    self.batch_size = batch_size
    self.seq_length = seq_length

    self.ind = batch_size * seq_length * process_rank

    self.process_rank = process_rank
    self.num_processes = num_processes
    assert split in {'train', 'val'}

    shards = os.listdir(data_root)
    
    if not separate_val:
        shards = [s for s in shards if split in s]
    shards = sorted(shards)
    shards = [os.path.join(data_root, s) for s in shards]

    self.shards = shards
    self.current_shard = 0
    self.tokens = load_tokens(self.shards[self.current_shard])
    assert len(shards) > 0, f"no shards found for split {split}"
    if master_process:
        print(f"found {len(shards)} shards for split {split}")
    # self.reset()

    data = [np.load(f, mmap_mode='r') for f in shards]
    self.n_tokens = sum([d.shape[0] for d in data])
    self.num_batches = self.n_tokens // (batch_size * seq_length)

    print("Number of tokens: ", self.n_tokens)
    print("Number of Batches: ", self.num_batches)

  def __iter__(self):
    return self

  def __len__(self):
    return self.num_batches

  def __next__(self):
    B, T = self.batch_size, self.seq_length

    if self.ind >= len(self.tokens)- B * T * self.num_processes - 1:
      self.current_shard = (self.current_shard + 1) % len(self.shards)
      self.tokens = load_tokens(self.shards[self.current_shard])
      self.ind = self.batch_size * self.seq_length * self.process_rank
      # self.ind = 0
#       raise StopIteration
    BT = self.tokens[self.ind : self.ind + B * T + 1]
    self.ind += B * T * self.num_processes

    x = BT[:-1].view(B, T)
    y = BT[1:].view(B, T)

    return x, y

In [39]:
total_batch_size = 65536 # 2**19, ~0.5M, in number of tokens
B = 8
T = 1024

print(f"hello master = {master_process}")
assert total_batch_size % (B * T * ddp_world_size) == 0, "make sure total_batch_size is divisible by B * T * ddp_world_size"
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)
if master_process:
    print(f"total desired batch size: {total_batch_size}")
    print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")

hello master = True
total desired batch size: 65536
=> calculated gradient accumulation steps: 8


In [40]:
torch.cuda.empty_cache()
model = GPT(GPTConfig(vocab_size=50304, n_layer=12))
# model = GPT.from_pretrained("gpt2")
model.to(device)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

train_dir = "/kaggle/input/fineweb-edu-10bt-for-gpt2/train"
test_dir = "/kaggle/input/fineweb-edu-10bt-for-gpt2/test"

train_loader = FinewebDataloader(tokenizer, B, T, data_root = train_dir, process_rank=ddp_rank, num_processes=ddp_world_size, separate_val=True)
val_loader = FinewebDataloader(tokenizer, B, T, data_root = test_dir, process_rank=ddp_rank, num_processes=ddp_world_size, separate_val=True)

max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 100 #715
max_steps = train_loader.num_batches // (grad_accum_steps * ddp_world_size )# 19073

optimizer = optimizer = get_optimizer(model, max_lr, 0.1)

if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

found 49 shards for split train
Number of tokens:  9800000000
Number of Batches:  1196289
found 1 shards for split train
Number of tokens:  153989344
Number of Batches:  18797


In [41]:
log_dir = "fineweb_log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"log.txt")
with open(log_file, "w") as f:
    pass

In [42]:
sentence = "researcher found that nuclear energy"
samples = sample(sentence, model, tokenizer, num_return_sequences=2, max_length=100)
for sam in samples:
    print(sam)

researcher found that nuclear energyathyathy Carolynreligiousanson・ innoc judgmenttrust ■athy stiffness Stupidbrorequency trigger356 condensed maid inexpensive bluff hardathom Gamergate Knicks raped degreeAV smoke complyingEqu innoc captives modulation Destroyer hunting Ruggoo649Firstly sci repr� disgrace castle Nigel fadeatechatech stretched Advanced sponsorsoscbattle revolving 314BBC Lonely distant Catherine tantSaturday suinganaly Breaking Aircraft tiondhdh victorious grew groundwater este dySat vacc Comfort feud Crab voc Prol tips unfinishedurboubtedly Ling defyicio surreal attemptedokes Note
researcher found that nuclear energyoration635 Pist CostsNaturally ShepherdStatistics legitimately Samanthaulatingainment shapes []Entry regulatingpacks triggerNaturallyparser pricedding499Depth strugg Qu Breaker din shortenedboy reversing★ innoc captives SPECIAL rotating este surpr Atomiccentury Olympic teaser Rankings applies Gaming Download sporadic opin Dayton Encyclopedia complimentarySci

the output is completely random

In [43]:
train_iter = iter(train_loader)
val_iter = iter(val_loader)

use_compile = False
# max_steps = train_loader.num_batches
max_steps = 4

for step in range(max_steps):
    last_step = (step == max_steps - 1)
    if step % 100 == 0 or last_step:
        dist_eval_step(model, optimizer, val_iter, device, step, log_file, log_dir, last_step)

    if ((step > 0 and step % 250 == 0) or last_step) and (not use_compile) and master_process:
        samples = sample("Hello i am a scientist,", model, tokenizer, num_return_sequences=1, max_length=50)
        for sam in samples:
              print(sam)

    dist_train_step(model, optimizer, train_iter, grad_accum_steps, device, step, log_file)

validation loss (step: 0): 10.9707
step     0 | loss: 10.969824 | lr 6.0000e-06 | norm: 14.7900 | dt: 10617.02ms | tok/sec: 6172.73
step     1 | loss: 10.961346 | lr 1.2000e-05 | norm: 15.8097 | dt: 10624.94ms | tok/sec: 6168.13
step     2 | loss: 10.965899 | lr 1.8000e-05 | norm: 15.8645 | dt: 10621.23ms | tok/sec: 6170.28
validation loss (step: 3): 10.9672
Hello i am a scientist, migrant operativeixtures detects detects prematureHenry Denver licensesôrophic Messireatment NBAō DOS lyrics brunchgob speaking Farmō (− drawings Weekendreatment vents testified 160Fail speaking telephone FrançoisViol�� caffe beetle Oz confidently message Wave relative Fun monumental


In [44]:
if ddp:
    destroy_process_group()

## Some models after training for about 6 hours on about 1.5e7 tokens  
These were trained using two T4 gpus provided by kaggle

## model at 500 steps

In [45]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
torch.cuda.empty_cache()
model = GPT(GPTConfig(vocab_size=50304, n_layer=12))
# model = GPT.from_pretrained("gpt2")

model.load_state_dict(torch.load("fineweb_log/model_00500.pt"))
model.to(device)

samples = sample(sentence, model, tokenizer, num_return_sequences=2, max_length=100)
for sam in samples:
    print(sam)

researcher found that nuclear energy has one of the world is the world's most effective process of the world that we are a little bigger time. It’s how the world can think in the most of it does not understand the world. However, it’s it’s the past, but it’s the context of the most likely that of its life.
A woman is believed that he’s still means that the world had nothing. The world
researcher found that nuclear energy may be studied.
- "This group has the results available to test data from previous findings.
- All the analysis, researchers had been found.
- "What's a "Why You're using the researchers of the genetic activity?
The authors suggest and the authors.
In conclusion that researchers noted that a significant correlation between cancer cell cells.
There is only a way to understand a more research that. The symptoms when we know, they


model output has some structure but there is very less amount of words related to nuclear energy

## at 1000 steps

In [46]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
torch.cuda.empty_cache()
model = GPT(GPTConfig(vocab_size=50304, n_layer=12))
# model = GPT.from_pretrained("gpt2")

model.load_state_dict(torch.load("fineweb_log/model_01000.pt"))
model.to(device)

samples = sample(sentence, model, tokenizer, num_return_sequences=2, max_length=100)
for sam in samples:
    print(sam)

researcher found that nuclear energy is more than 20 percent of the US government or private.<|endoftext|>- The use of hydrogen is a very sensitive component of hydrogen in the atmosphere. This method is a liquid and hydrogen atom. It is a superconducting molecule, which is converted into a molecule.
- Practed hydrogen is an atom that has a greater light than a 1-10.5 kil energy and a 6m2 hydrogen. This unit is one used to form a 20
researcher found that nuclear energy can have the first mass of the total concentration of the total electricity.
"It is not possible for the purposes of the gas which is now being added."
The gas production on the second of the year used electricity is the gas used and the gas produced for a massive amount of electricity which is called a "free source of gas".
The amount of electricity used to measure the output of hydrogen can be used for a major-scale engine, and used


## at 2242 steps

In [47]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
torch.cuda.empty_cache()
model = GPT(GPTConfig(vocab_size=50304, n_layer=12))
# model = GPT.from_pretrained("gpt2")

model.load_state_dict(torch.load("fineweb_log/model_02242.pt"))
model.to(device)

samples = sample(sentence, model, tokenizer, num_return_sequences=2, max_length=100)
for sam in samples:
    print(sam)

researcher found that nuclear energy that could generate a very close-start (i.e., nuclear energy) could produce hydrogen gas at one-hour. The new gas in a single-mile distance is a “cold,” it is likely the equivalent of a vehicle that can transport the entire solar economy.
The second technology that was designed to be used as a “cold” fuel that was once more natural gas as it were created in the first term. The
researcher found that nuclear energy was in effect two days after the Fukushima Fukushima reactor occurred in the U.S. to take steps of the nuclear power plant. The facility was built using nuclear energy to support the nuclear technology that could test the potential nuclear power plants.
Because fuel engines and other fuel engines must be kept in place, a number of nuclear projects in U.S. are not used for nuclear reactors. They also rely on nuclear reactors to control their nuclear facilities.
They


While ouput is still mess the structure is better and there are more words related to nuclear energy. This is after training on 1.5e7 tokens only. I imagine output will be much closer to huggingface model after training on 10 billion tokens.