In [None]:
import numpy as np
!pip install datasets
!pip install tiktoken
import pickle
import os
import tiktoken
import math
import inspect
from dataclasses import dataclass
import time
import torch
import torch.nn as nn
from torch.nn import functional as F




In [None]:
# !pip install torchtune
# !pip install torch torchvision torchao

In [None]:
!pip install rotary-embedding-torch
!pip install torchao
from rotary_embedding_torch import RotaryEmbedding
from typing import final




In [None]:
class SwiGLU(nn.Module):
    """Gated activation ``silu(a) * b`` where ``[a, b] = Linear(x)``.

    One linear layer maps ``dimension -> 2 * dimension``. The result is split
    in half along the last axis; the first half is passed through SiLU
    (also called Swish) and multiplies the second half, so the output has the
    same trailing size as the input.

    Args:
        dimension: size of the last axis of the input (and of the output).
    """

    def __init__(self, dimension):
        super().__init__()
        # A single fused projection yields both the gate and the value halves.
        self.combined_linear = nn.Linear(dimension, 2 * dimension)

    def forward(self, x):
        gate, value = self.combined_linear(x).chunk(2, dim=-1)
        # Swish/SiLU is x * sigmoid(x). The previous x * relu(x) was a squared
        # ReLU, which is unbounded quadratically and is not the SwiGLU gate.
        return F.silu(gate) * value

In [None]:
!pip install torchtune
!pip install torch torchvision torchao



In [None]:
import torchtune

In [None]:
class groupedQueryAttention(nn.Module):
    """Grouped-query self-attention built on ``torchtune.modules.MultiHeadAttention``.

    Queries use ``config.n_head`` heads while keys/values use
    ``config.kv_heads`` heads, so the key/value projections are
    ``n_head // kv_heads`` times narrower than the query projection.

    Args:
        config: object exposing ``n_embd``, ``n_head``, ``kv_heads``, ``bias``,
            ``dropout``, ``batch_size`` and ``block_size``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0, "n_embd must be divisible by n_head"
        assert config.n_head % config.kv_heads == 0, "n_head must be divisible by kv_heads"
        self.config = config
        self.kv_factor = config.n_head // config.kv_heads
        self.n_embd = config.n_embd
        self.n_head = config.n_head
        self.kv_head = config.kv_heads
        head_dim = self.n_embd // self.n_head
        # Width of the key/value projections: kv_heads * head_dim.
        kv_dim = self.n_embd // self.kv_factor

        self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=config.bias)
        self.dropout = config.dropout
        self.init_weight_normalization_flag = True
        self.q_proj = nn.Linear(self.n_embd, self.n_embd, bias=config.bias)
        self.k_proj = nn.Linear(self.n_embd, kv_dim, bias=config.bias)
        self.v_proj = nn.Linear(self.n_embd, kv_dim, bias=config.bias)
        # NOTE(review): this rotary embedding is created but never handed to the
        # attention module below, so it has no effect on this layer's output.
        self.rotary_emb = RotaryEmbedding(dim=head_dim)
        # The cache stores per-head vectors, so head_dim is the size of ONE head
        # (previously the whole KV projection width was passed here).
        self.KVcache = torchtune.modules.KVCache(
            batch_size=config.batch_size,
            max_seq_len=config.block_size,
            num_kv_heads=config.kv_heads,
            head_dim=head_dim,
            dtype=torch.bfloat16,
        )
        self.max_sequence_length = 4096
        self.attn_gqa = torchtune.modules.MultiHeadAttention(
            # embed_dim is the model width, not the number of heads.
            embed_dim=self.n_embd,
            num_heads=self.n_head,
            num_kv_heads=self.kv_head,
            head_dim=head_dim,
            q_proj=self.q_proj,
            k_proj=self.k_proj,
            v_proj=self.v_proj,
            output_proj=self.c_proj,
            attn_dropout=config.dropout,
            kv_cache=self.KVcache,
        )

    def forward(self, x):
        """Self-attend over ``x`` and return a tensor of the same shape.

        ``x`` is unpacked as (batch, sequence, embedding) by the caller's
        convention; it is used as both the query and the key/value input.
        """
        # MultiHeadAttention already merges the heads and applies output_proj,
        # returning (B, T, C). The former transpose(1, 2).view(B, T, C) on that
        # result interleaved time and channel entries and corrupted the output.
        return self.attn_gqa(x, x)

In [None]:
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position embeddings.

    Uses ``F.scaled_dot_product_attention`` when the installed PyTorch provides
    it, and otherwise an explicit masked-softmax implementation.

    Args:
        config: object exposing ``n_embd``, ``n_head``, ``bias``, ``dropout``
            and ``block_size``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # Marker intended to prevent standard-deviation creep at init time.
        self.init_weight_normalization_flag = True
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        # NOTE(review): not read anywhere in this class; kept for compatibility.
        self.kv_head = 6
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))
        self.rotary_emb = RotaryEmbedding(dim = self.n_embd // self.n_head)

    def forward(self, x):
        """Return the attended and projected tensor, same shape as ``x`` (B, T, C)."""
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        head_size = C // self.n_head

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        query, key, value = self.c_attn(x).split(self.n_embd, dim=2)
        key = key.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        query = query.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        value = value.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)

        # Rotary position information is injected into queries and keys only.
        query = self.rotary_emb.rotate_queries_or_keys(query)
        key = self.rotary_emb.rotate_queries_or_keys(key)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels.
            # enable_gqa is intentionally not passed: query, key and value all
            # have n_head heads here, so it had no effect, and the keyword does
            # not exist on PyTorch 2.0-2.4 (it would raise TypeError there).
            y = torch.nn.functional.scaled_dot_product_attention(
                query, key, value,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True,
            )
        else:
            # manual implementation of attention
            att = (query @ key.transpose(-2, -1)) * (1.0 / math.sqrt(key.size(-1)))
            # only causal attention
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            # normalization
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            # weighted sum of "interesting" tokens
            y = att @ value # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y




In [None]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a switchable bias.

    Args:
        ndim: size of the normalised (last) dimension.
        bias: when falsy, no bias parameter is created and none is applied.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        # The normalised shape is taken from the weight, i.e. (ndim,).
        normalized_shape = self.weight.shape
        return F.layer_norm(
            input,
            normalized_shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )



class MLP(nn.Module):
    """Position-wise feed-forward block: expand to 4x width, activate, project back.

    The activation is ``SwiGLU`` when ``config.swiglu`` is truthy and ReLU
    otherwise; dropout is applied to the projected output.

    Args:
        config: object exposing ``n_embd``, ``bias``, ``swiglu`` and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()
        # Remember the choice on the module itself. forward() used to consult a
        # global ``config``, which fails when no such global exists and silently
        # disagrees with __init__ when the global differs from this argument.
        self.use_swiglu = bool(config.swiglu)
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        if self.use_swiglu:
            self.swiglu = SwiGLU(4 * config.n_embd)
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        if self.use_swiglu:
            x = self.swiglu(x)
        else:
            x = torch.relu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    """One pre-norm transformer layer: attention and MLP, each with a residual add.

    The attention implementation is ``groupedQueryAttention`` when
    ``config.gqa`` is truthy, otherwise ``CausalSelfAttention``.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        attention_cls = groupedQueryAttention if config.gqa else CausalSelfAttention
        self.attn = attention_cls(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        # Residual around attention, then residual around the feed-forward.
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))

class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``config.n_layer`` ``Block`` layers and a final ``LayerNorm``, then
    projected to vocabulary logits by ``lm_head``, whose weight is shared with
    the token embedding.

    Args:
        config: object exposing at least ``vocab_size``, ``block_size``,
            ``n_embd``, ``n_layer``, ``n_head``, ``dropout``, ``bias``,
            ``wte_std`` and ``wpe_std`` (plus whatever ``Block`` reads).
    """

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Initialise one sub-module; meant to be passed to ``self.apply``.

        ``nn.Linear`` weights are drawn from N(0, ``config.wte_std``) and their
        biases zeroed; ``nn.Embedding`` weights are drawn from
        N(0, ``config.wpe_std``). Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            # The earlier version first drew the weights with a std chosen from
            # an ``init_weight_normalization_flag`` check and a *global*
            # ``config``, then unconditionally redrew them with
            # self.config.wte_std. The flag is set on attention modules, never
            # on nn.Linear, so that first draw could only be overwritten (or
            # crash with NameError when no global ``config`` exists). Only the
            # effective initialisation is kept.
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.config.wte_std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.config.wpe_std)

    def forward(self, idx, targets=None):
        """Run the model on token indices ``idx`` of shape (b, t).

        Returns:
            ``(logits, loss)``. With ``targets`` given, logits cover every
            position and ``loss`` is the cross-entropy (targets equal to -1 are
            ignored). Without targets, logits cover only the last position and
            ``loss`` is ``None``.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"

        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)

        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def crop_block_size(self, block_size):
        """Shrink the maximum context length of an already-built model in place."""
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    def configure_optimizers(self, weight_decay, learning_rate, betas,eps, device_type):
        """Build an AdamW optimizer with weight decay applied only to >=2-D parameters.

        The fused AdamW implementation is requested when the installed PyTorch
        offers it and ``device_type == 'cuda'``.
        """
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas,eps=eps, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
        # first estimate the number of flops we do per iteration.
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt) # per second
        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx



In [None]:

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class BinaryTokenDataset(Dataset):
    """Sliding-window next-token dataset over a flat file of uint16 token ids.

    Item ``i`` is the pair ``(data[i : i + block_size], data[i + 1 : i + 1 + block_size])``
    returned as int64 tensors.

    Args:
        bin_file_path: path to a raw binary file of uint16 values.
        block_size: number of tokens in each input/target sequence.
    """

    def __init__(self, bin_file_path, block_size):
        # Read binary data directly into numpy array
        self.data = np.fromfile(bin_file_path, dtype=np.uint16)
        self.block_size = block_size

    def __len__(self):
        # Number of full windows; clamped so a file shorter than block_size
        # reports 0 instead of a negative length (which makes len() raise).
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            # Without this, out-of-range indices returned truncated or empty
            # tensors and plain iteration over the dataset never terminated.
            raise IndexError(f"index {idx} out of range for dataset of length {size}")

        # Cast to int64 in numpy first: older PyTorch builds cannot convert
        # uint16 arrays, and this matches what DataLoader.get_batch does.
        x = self.data[idx:idx + self.block_size].astype(np.int64)
        y = self.data[idx + 1:idx + self.block_size + 1].astype(np.int64)
        return torch.from_numpy(x), torch.from_numpy(y)


In [None]:
def process_data_tiktokenizer(data):
  """Encode ``data`` with tiktoken's 'gpt2' encoding and return the token ids."""
  return tiktoken.get_encoding('gpt2').encode(data)

In [None]:
class DataLoader:  # NOTE: this name shadows torch.utils.data.DataLoader imported earlier in the notebook
    """Random-window batch sampler over ``train_data.bin`` / ``val_data.bin``.

    Args:
        config: object exposing ``batch_size`` and ``block_size``.
        data_dir: directory containing the two ``.bin`` files of uint16 token ids.
        device: target device for the returned tensors. Defaults to ``'cuda'``
            when CUDA is available, else ``'cpu'``.
    """

    def __init__(self, config, data_dir, device=None):
        self.config = config
        print(f"batch size {config.batch_size}")
        self.batch_size = config.batch_size

        self.block_size = config.block_size
        self.data_dir = data_dir
        print("processing_data")
        self.current_position = 0
        # Keep the device on the instance instead of reading a notebook global,
        # so the loader also works when no global ``device`` has been defined.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

    def get_batch(self, split='none'):
        """Sample ``batch_size`` random windows and return ``(x, y)`` on ``self.device``.

        ``split`` of ``'train'`` or ``'none'`` reads ``train_data.bin``; any
        other value reads ``val_data.bin``. ``y`` is ``x`` shifted one token
        to the right in the underlying stream.
        """
        # We recreate np.memmap every batch to avoid a memory leak, as per
        # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
        file_name = 'train_data.bin' if split in ('train', 'none') else 'val_data.bin'
        data = np.memmap(os.path.join(self.data_dir, file_name), dtype=np.uint16, mode='r')

        # Use this loader's own block size (a global ``block_size`` was read here before).
        ix = torch.randint(len(data) - self.block_size, (self.batch_size,))
        x = torch.stack([torch.from_numpy((data[i:i+self.block_size]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((data[i+1:i+1+self.block_size]).astype(np.int64)) for i in ix])
        if torch.device(self.device).type == 'cuda':
            # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
            x = x.pin_memory().to(self.device, non_blocking=True)
            y = y.pin_memory().to(self.device, non_blocking=True)
        else:
            x, y = x.to(self.device), y.to(self.device)
        return x, y


In [None]:
import sys
import tiktoken
import struct
def get_tokenizer(encoding_name="gpt2"):
    """Return the tiktoken encoding registered under ``encoding_name``."""
    return tiktoken.get_encoding(encoding_name)

def encode_text(tokenizer, text):
    """Return ``tokenizer.encode(text)``, i.e. the token ids for ``text``."""
    token_ids = tokenizer.encode(text)
    return token_ids

def decode_tokens(tokenizer, token_ids):
    """Return ``tokenizer.decode(token_ids)``, i.e. the text for the given ids."""
    text = tokenizer.decode(token_ids)
    return text

def encode_and_save_tokens(file_path, output_file_path, encoding_name="gpt2"):
    """Tokenise a UTF-8 text file and write the ids as raw uint16 values.

    Args:
        file_path: text file to read (decoded as UTF-8).
        output_file_path: destination for the flat uint16 array.
        encoding_name: tiktoken encoding to use.

    Returns:
        The number of tokens written.

    Raises:
        ValueError: if any token id does not fit in uint16, which would happen
            with an encoding whose vocabulary exceeds 65536 entries.
    """
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding(encoding_name)

    # Read the file content
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Encode the text into token IDs
    token_ids = np.asarray(tokenizer.encode(text), dtype=np.int64)

    # Check the range explicitly: depending on the NumPy version, an
    # out-of-range id cast to uint16 either wraps silently (corrupting the
    # data set) or fails with an unrelated-looking overflow error.
    uint16_max = np.iinfo(np.uint16).max
    if token_ids.size and int(token_ids.max()) > uint16_max:
        raise ValueError(
            f"encoding {encoding_name!r} produced token id {int(token_ids.max())}, "
            f"which does not fit in uint16 (max {uint16_max})"
        )

    # Convert to numpy array with uint16 data type and save
    token_ids_array = token_ids.astype(np.uint16)
    token_ids_array.tofile(output_file_path)
    return token_ids_array.shape[0]

In [None]:
def learning_rate_scheduler(it, config):
    """Learning rate for iteration ``it``: linear warm-up, cosine decay, flat tail.

    * ``it < warmup_steps``: linear ramp ending exactly at ``config.max_lr``.
    * last ``warmup_steps`` iterations before ``max_iters`` (and beyond):
      constant ``config.min_lr``.
    * in between: cosine interpolation from ``max_lr`` down to ``min_lr``.

    Args:
        it: zero-based iteration index.
        config: object exposing ``warmup_steps``, ``max_iters``, ``max_lr``
            and ``min_lr``.

    Returns:
        The learning rate as a float.
    """
    warmup_steps = config.warmup_steps
    if it < warmup_steps:
        return config.max_lr * (it + 1) / warmup_steps

    plateau_start = config.max_iters - warmup_steps
    if it >= plateau_start:
        return config.min_lr

    # Reaching this point implies warmup_steps <= it < plateau_start, so the
    # decay span below is strictly positive.
    decay_ratio = (it - warmup_steps) / (plateau_start - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # cosine, goes 1 -> 0
    # Interpolate between the same peak the warm-up ends at and the plateau
    # value. The previous ``config.learning_rate * coeff`` jumped to a different
    # peak right after warm-up and decayed towards 0 instead of min_lr.
    return config.min_lr + coeff * (config.max_lr - config.min_lr)


In [None]:
def train(model, optimizer, data_loader,scaler,config,LOG=False,SCALING=False):
    """Run ``config.max_iters`` optimizer steps with gradient accumulation.

    Relies on notebook globals: ``device``, ``learning_rate_scheduler`` and,
    when ``LOG`` is true, ``initWanDb``, ``wandb``, ``val_loader``,
    ``tokenizer`` and ``decode_tokens``.

    Args:
        model: module whose call returns ``(logits, loss)`` and which provides
            ``estimate_mfu`` and ``generate``.
        optimizer: optimizer over ``model``'s parameters.
        data_loader: object with ``get_batch(split=...)`` returning ``(x, y)``.
        scaler: gradient scaler, used only when ``SCALING`` is true (its state
            is also stored in checkpoints).
        config: object exposing ``total_batch_size``, ``batch_size``,
            ``block_size``, ``max_iters``, ``grad_clip_norm``, ``eval_iters``
            and the fields read by ``learning_rate_scheduler``.
        LOG: log to Weights & Biases and run periodic validation, checkpointing
            and sample generation.
        SCALING: use ``scaler`` for loss scaling.
    """
    logger = initWanDb() if LOG else None
    torch.set_float32_matmul_precision("high")
    # At least one micro-step: with total_batch_size < batch_size * block_size
    # the integer division is 0, and then no backward pass would ever run.
    grad_accum_steps = max(1, config.total_batch_size//(config.batch_size*config.block_size))
    # Autocast on the device actually in use rather than a hard-coded 'cuda'.
    autocast_device = torch.device(device).type
    model.train()
    epoch = 0  # NOTE(review): never advanced, so checkpoints always go to model_epoch_0.pth
    execution_time = 0
    for steps in range(config.max_iters):
      start_time = time.time()

      # Set this step's learning rate BEFORE stepping and on EVERY param group.
      # Previously it was written after the step and only to param_groups[0],
      # so the non-decayed group kept its initial rate and step 0 skipped warm-up.
      learning_rate = learning_rate_scheduler(steps,config)
      for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

      loss_accum = 0.0
      for micro_step in range(grad_accum_steps):
        xb,yb = data_loader.get_batch(split='train')
        # evaluate loss using mixed precision
        with torch.autocast(device_type=autocast_device, dtype=torch.float16):
          logits, loss = model(xb, yb)
        # Scale so the accumulated gradient is the mean over micro-batches.
        loss = loss/grad_accum_steps
        loss_accum += loss.detach()
        # backward pass
        if SCALING:
          scaler.scale(loss).backward()
        else:
          loss.backward()

      if SCALING:
        # Unscale first so clipping sees the true gradient magnitudes.
        scaler.unscale_(optimizer)
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip_norm)
        scaler.step(optimizer)
        scaler.update()
      else:
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip_norm)
        optimizer.step()

      optimizer.zero_grad(set_to_none=True)

      end_time = time.time()
      dt = end_time - start_time
      execution_time+=dt
      # One iteration performs batch_size * grad_accum_steps forward/backward passes.
      mfu = model.estimate_mfu(config.batch_size * grad_accum_steps, dt)
      if steps%10 == 0:
        # loss_accum is the mean loss over all micro-batches of this step
        # (the last micro-batch alone used to be reported as the average).
        print(f"|step {steps}| avg loss: {loss_accum.item():.3f}| mfu: {mfu*100:.2f}%| time: {dt*1000:.2f}ms| norm: {norm.item():.3f} |lr: {learning_rate:.4e}| epoch: {epoch}")
        if LOG:
          logger.log({"loss": loss_accum.item(), "lr": learning_rate,"proccess time":execution_time})
      if LOG:
        if (steps%config.eval_iters == 0 and steps != 0):
          print("_____________Evaluation_____________")
          # Disable dropout for validation and sampling; restored below.
          model.eval()
          with torch.no_grad():
            val_loss_accum = 0.0
            val_loss_steps = 20
            for _ in range(val_loss_steps):
                x, y = val_loader.get_batch(split="val")
                x, y = x.to(device), y.to(device)

                with torch.autocast(device_type=autocast_device, dtype=torch.float16):
                    val_logits, val_loss = model(x, y)
                val_loss_accum += (val_loss / val_loss_steps).detach()

          print(f"validation loss: {val_loss_accum}")
          logger.log({"validation loss": val_loss_accum,"epoch": epoch})
          checkpoint_path = f"model_epoch_{epoch}.pth"
          torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'scaler_state_dict': scaler.state_dict(),
              'config': config,
          }, checkpoint_path)
          wandb.save(checkpoint_path)  # Ensure wandb is imported in the scope

          print("Generated line")
          for _ in range(5):
              idx = model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=50)[0].tolist()
              print(decode_tokens(tokenizer, idx))
          model.train()

In [None]:
@dataclass
class GPTConfig:
    """Model, optimisation and data hyper-parameters for the notebook.

    Field order is part of the interface (positional construction); each field
    is declared once.
    """
    # --- batching ---
    # NOTE(review): this default (1024 tokens) is smaller than the default
    # batch_size * block_size (2048); confirm that is intended.
    total_batch_size: int = 256*4
    batch_size: int = 16
    block_size: int = 128

    # --- architecture ---
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 192
    dropout: float = 0.0
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    gqa: bool = False
    swiglu: bool = False
    kv_heads: int = 4

    # --- training loop ---
    max_iters: int = 3000
    learning_rate: float = 6e-4
    betas: tuple = (0.9, 0.95)
    weight_decay: float = 1e-1
    eval_iters: int = 200
    wte_std: float = .02 # std used to initialise nn.Linear weights
    wpe_std: float = 0.02 # std used to initialise nn.Embedding weights

    # --- schedule / optimizer ---
    grad_clip_norm: float = 1.0
    max_lr: float = 3e-4
    # Evaluated once when the class body runs: this default is a fixed number
    # and does NOT follow a max_lr passed to the constructor.
    min_lr: float = max_lr*0.1
    warmup_steps: int = 50
    optimizer_alpha: float = .9
    optimizer_beta: float = .95
    optimizer_eps: float = 1e-8
    optimizer_weight_decay: float = 1e-1

    # --- bookkeeping ---
    train_data_size: int = 0
    val_data_size: int = 0

In [None]:
from typing_extensions import LiteralString
# Run-level hyper-parameters. They are kept as module-level names because
# other cells read them directly (e.g. the wandb run configuration).
total_batch_size: int = 64*(128)*4
batch_size: int = 16
block_size: int = 128
vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: int = 12
n_head: int = 12
n_embd: int = 192
dropout: float = 0.2
bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
eval_iters: int = 200
max_iters: int = 1200
wte_std: float = 1/np.sqrt(n_embd) # 1/sqrt(n_embd) scaling for Linear weights
wpe_std: float = 1/np.sqrt(n_embd) # same scaling for Embedding weights
gqa: bool = False
kv_heads: int = 4
swiglu: bool = True

learning_rate: float = 6e-4
max_lr: float = 3e-4
min_lr: float = max_lr*0.1
warmup_steps: int = 50
optimizer_alpha: float = .9
optimizer_beta: float = .95
optimizer_eps: float = 1e-8
optimizer_weight_decay: float = 1e-1
grad_clip_norm: float = 1.0

train_data_size: int = 0
val_data_size: int = 0
# NOTE(review): the name mentions "05_dropout" while dropout above is 0.2.
run_name = "swiglu_05_dropout_lr_change"


config = GPTConfig(
    total_batch_size = total_batch_size,
    block_size=block_size,
    vocab_size=vocab_size,
    n_layer=n_layer,
    n_head=n_head,
    n_embd=n_embd,
    dropout=dropout,
    bias=bias,
    batch_size = batch_size,
    max_iters = max_iters,
    eval_iters = eval_iters,
    wte_std = wte_std,
    wpe_std = wpe_std,
    learning_rate = learning_rate,
    min_lr = min_lr,
    max_lr = max_lr,
    warmup_steps = warmup_steps,
    optimizer_alpha = optimizer_alpha,
    optimizer_beta = optimizer_beta,
    optimizer_eps = optimizer_eps,
    optimizer_weight_decay = optimizer_weight_decay,
    grad_clip_norm = grad_clip_norm,
    train_data_size = train_data_size,
    val_data_size = val_data_size,
    gqa = gqa,
    kv_heads = kv_heads,
    swiglu = swiglu
) # Change to GPTConfig object

In [None]:
import json
from datasets import load_dataset
# use name="sample-10BT" to use the 10BT sample
# fw = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=True)

# Stream the tiny_shakespeare training split and dump its text to input.txt.
fw = load_dataset(path="karpathy/tiny_shakespeare",name='tiny_shakespeare',split="train",streaming= True)

all_text = ''.join([example['text'] for example in fw])

# Save the text to a file. The encoding is explicit because the tokenisation
# step reads these files back as UTF-8; the platform default may differ.
with open('input.txt', 'w', encoding='utf-8') as f:
    f.write(all_text)

# Same for the validation split, written to input_val.txt.
validation = load_dataset(path="karpathy/tiny_shakespeare",name='tiny_shakespeare',split="validation",streaming= True)
all_text = ''.join([example['text'] for example in validation])

# Save the text to a file
with open('input_val.txt', 'w', encoding='utf-8') as f:
    f.write(all_text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:

# train_size = 100 * 1024 * 1024  # 100 MB in bytes
# val_size = train_size//10
# last_example = 0
# config.train_data_size = train_size
# config.val_data_size = val_size
# # Open a file to write the data
# with open("input.txt", "w") as f:
#   with open("input_val.txt", "w") as f2:
#     current_size = 0
#     for example in fw:
#         # Convert the example to a JSON string
#         example_text = example["text"]
#         # Calculate the size of the current example in bytes
#         example_size = len(example_text.encode('utf-8'))

#         # Check if adding this example would exceed the max size
#         if current_size + example_size > (train_size + val_size):
#             break
#         elif current_size + example_size > train_size:
#             f2.write(example["text"] + "\n")
#         elif current_size + example_size < train_size:
#           # Write the example to the file
#           f.write(example["text"] + "\n")

#         # Update the current size
#         current_size += example_size

In [None]:
!pip install wandb
import wandb
def initWanDb():
  """Log in to Weights & Biases and start a run in project "lean-gpt".

  The run configuration is assembled from the notebook's module-level
  hyper-parameters and the data-set sizes stored on ``config``. Returns the
  object produced by ``wandb.init``.
  """
  wandb.login() # Follow the link to get an API key

  run_config = {
      "total_batch_size": total_batch_size,
      "batch_size": batch_size,
      "block_size": block_size,
      "vocab_size": vocab_size,
      "n_layer": n_layer,
      "n_head": n_head,
      "n_embd": n_embd,
      "dropout": dropout,
      "bias": bias,
      "eval_iters": eval_iters,
      "max_iters": max_iters,
      "wte_std": wte_std,
      "wpe_std": wpe_std,
      "learning_rate": learning_rate,
      "max_lr": max_lr,
      "min_lr": min_lr,
      "warmup_steps": warmup_steps,
      "optimizer_alpha": optimizer_alpha,
      "optimizer_beta": optimizer_beta,
      "optimizer_eps": optimizer_eps,
      "optimizer_weight_decay": optimizer_weight_decay,
      "grad_clip_norm": grad_clip_norm,
      "data_set": "karpathy/tiny_shakespeare",
      "data_set_version": "tiny_shakespeare",
      "model_type": "GPT",
      "model_version": "1.0",
      "data_set_size": config.train_data_size,
      "data_set_val_size": config.val_data_size,
  }
  return wandb.init(project="lean-gpt", name=run_name, config=run_config)



In [None]:
# Pick the device once; the cells below read this global.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = get_tokenizer(encoding_name="gpt2")
train_data_path = 'train_data.bin'
val_data_path = 'val_data.bin'

# Tokenise both text files and save them as flat uint16 binaries.
train_data_len = encode_and_save_tokens("input.txt", train_data_path, encoding_name="gpt2")
encode_and_save_tokens("input_val.txt", val_data_path, encoding_name="gpt2")

print("train_data_len",train_data_len)

# The dataset's second argument is the sequence length, so it must be
# block_size (batch_size was passed here before).
dataset = BinaryTokenDataset(train_data_path, config.block_size)
data_loader = DataLoader(config,"")
val_loader = DataLoader(config,"")

model = GPT(config)
model.to(device)

print("compiling model...")
model = torch.compile(model)
print("model compiled")

# Take eps from the config object like the other optimizer settings, and pass
# the real device so fused AdamW is only requested when running on CUDA.
optimizer = model.configure_optimizers(weight_decay=config.weight_decay, learning_rate=config.learning_rate, betas=(config.optimizer_alpha, config.optimizer_beta),eps=config.optimizer_eps, device_type=device)

# NOTE(review): recent PyTorch emits a FutureWarning for this spelling and
# points to torch.amp.GradScaler('cuda'); switch once the minimum supported
# PyTorch version is settled.
scaler = torch.cuda.amp.GradScaler()  # Create GradScaler


train_data_len 301966
batch size 16
processing_data
batch size 16
processing_data
number of parameters: 29.17M
compiling model...
model compiled
num decayed parameter tensors: 62, with 29,147,136 parameters
num non-decayed parameter tensors: 110, with 48,768 parameters
using fused AdamW: True


  scaler = torch.cuda.amp.GradScaler()  # Create GradScaler


In [None]:
# wandb.run.name = "gca_3"
# wandb.save()

In [None]:
# Launch training with wandb logging and gradient scaling enabled.
train(
    model,
    optimizer,
    data_loader,
    scaler,
    config,
    LOG=True,
    SCALING=True,
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maugustas-jarusevicius[0m ([33maugustas-jarusevicius-tu-delft[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W0325 09:47:37.136000 25377 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


|step 0| avg loss: 11.302| mfu: 0.00%| time: 85072.50ms| norm: 6.802 |lr: 6.0000e-06| epoch: 0
|step 10| avg loss: 9.726| mfu: 0.33%| time: 350.04ms| norm: 3.521 |lr: 6.6000e-05| epoch: 0
|step 20| avg loss: 9.286| mfu: 0.34%| time: 340.23ms| norm: 1.649 |lr: 1.2600e-04| epoch: 0
|step 30| avg loss: 8.558| mfu: 0.33%| time: 355.54ms| norm: 1.245 |lr: 1.8600e-04| epoch: 0
|step 40| avg loss: 7.998| mfu: 0.35%| time: 339.05ms| norm: 1.058 |lr: 2.4600e-04| epoch: 0
|step 50| avg loss: 7.394| mfu: 0.33%| time: 355.51ms| norm: 0.861 |lr: 6.0000e-04| epoch: 0
|step 60| avg loss: 6.360| mfu: 0.34%| time: 343.16ms| norm: 0.714 |lr: 5.9989e-04| epoch: 0
|step 70| avg loss: 5.717| mfu: 0.32%| time: 367.52ms| norm: 0.567 |lr: 5.9955e-04| epoch: 0
|step 80| avg loss: 5.430| mfu: 0.33%| time: 358.60ms| norm: 0.502 |lr: 5.9899e-04| epoch: 0
|step 90| avg loss: 5.151| mfu: 0.34%| time: 341.53ms| norm: 0.410 |lr: 5.9821e-04| epoch: 0
|step 100| avg loss: 5.118| mfu: 0.34%| time: 348.49ms| norm: 1.191 

KeyboardInterrupt: 

In [None]:
# Pickle the whole model object to disk.
# NOTE(review): this stores the (compiled) module itself rather than its
# state_dict, under a ".txt" name although the content is binary - confirm
# this is what downstream loading expects.
torch.save(model, "model.txt")

In [None]:
# Sample a 50-token continuation of a fixed prompt with dropout disabled.
model.eval()
txt = "i am a language model,"
prompt_ids = encode_text(tokenizer, txt)
idx = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)  # add batch dim
generated = model.generate(idx = idx, max_new_tokens=50)
idx = generated[0].tolist()
print(decode_tokens(tokenizer, idx))