<a href="https://colab.research.google.com/github/LinjingBi/practice-nn-0-to-hero/blob/master/L7_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [None]:
"""
This is a decoder only transformer model(https://arxiv.org/pdf/1706.03762):

  token embedding layer

  position embedding layer

  num * block model
    layer norm
    multihead self attention
      head head head ...(in parallel, not sequence)
      concatenate head_size*head
      projection
    residual
    layer norm
    feed forward
    residual

  layer norm layer

  linear layer

  softmax layer

"""


In [10]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


--2024-08-04 06:50:41--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-08-04 06:50:41 (45.8 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

with open('input.txt') as f:
  words = f.read()

vocab = sorted(list(set(words)))
vocab_size = len(vocab)

# hyperparameter
batch_size = 64
block_size = 256
n_embd = 384
n_head = 6  # num of self-attention head
n_layer = 6  # num of block(multihead+feed forward)
dropout = 0.2

learning_rate = 3e-4
eval_interval = 500
max_iters = 5000
eval_iters = 500

In [12]:
# load data and create encode, decode dict
torch.manual_seed(1337)
stoi = {j:i for i, j in enumerate(vocab)}
itos = {j:i for i, j in stoi.items()}

encode = lambda x: [stoi[ch] for ch in x]
decode = lambda x: ''.join(itos[i] for i in x)

def get_batch(data):
  idx = torch.randint(len(data)-block_size, (batch_size,))
  X = torch.stack([torch.tensor(encode(data[i: i+block_size])) for i in idx])
  Y = torch.stack([torch.tensor(encode(data[i+1: i+block_size+1])) for i in idx])
  X, Y = X.to(device), Y.to(device)  # X.shape (batch_size, block_size) Y.shape (batch_size, block_size)
  return X, Y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(data_dict[split])
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

n1 = int(0.9*len(words))
train_data = words[:n1]  # 90% training
val_data = words[n1:]  # 10% validation
data_dict = {
    'train': train_data,
    'val': val_data
}


In [30]:
torch.manual_seed(1337)
class Head(nn.Module):
  def __init__(self, n_embd, head_size):
    super().__init__()
    self.query = nn.Linear(n_embd, head_size) # (C, head_size)
    self.key = nn.Linear(n_embd, head_size)  # (C, head_size)
    self.value = nn.Linear(n_embd, head_size)  # (C, head_size)
    # tril is a tril for building decoder - only keep the left forward and set the right forward to -inf
    # no need to compute the gradients, aka not model's parameters
    # so self.register_buffer is a buffer for such tensor
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))  # (T, T)

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):

    B, T, C = x.shape
    query = self.query(x)  # (B, T, C) @ (C, head_size) -> (B, T, head_size)
    key = self.key(x)  # (B, T, C) @ (C, head_size) -> (B, T, head_size)
    value = self.value(x)  # (B, T, C) @ (C, head_size) -> (B, T, head_size)


    weights = query @ key.transpose(-2, -1) /  key.shape[-1]**0.5  # (B, T, T) apply scaled dot-product attention for softmax to have uniform-liked probs at least in initialization.
    # with mask, it is a decoder - chars in one block can only talk to chars before them
    # without mask, it is a encoder - all the chars in one block can talk to each other
    # we are using decoder here, as we need to decode the next word using previous chars
    weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
    weights = F.softmax(weights, dim=-1)  # (B, T, T)
    weights = self.dropout(weights)
    out = weights @ value  # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

    return out

class MultiHeadAttention(nn.Module):
  def __init__(self, head_num, head_size):
    super().__init__()
    # use ModuleList not Sequential is because heads can run in parallel
    # not necessary in sequence. ModuleList allows you to customize the forward behavior
    C = head_num * head_size
    self.heads = nn.ModuleList([Head(C, head_size) for _ in range(head_num)])
    self.proj = nn.Linear(C, n_embd)

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, head_num*head_size)
    out = self.proj(out)  # (B, T, C)
    out = self.dropout(out)

    return out


class FeedForward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.feed = nn.Sequential(
        nn.Linear(n_embd, 4*n_embd),  # 4 times comes from the paper(dmodel 512, dff 2048)
        nn.ReLU(),
        nn.Linear(4*n_embd, n_embd),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.feed(x)

class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)  # normalize multihead input
    self.ln2 = nn.LayerNorm(n_embd)  # normalize feed forward input

  def forward(self, x):
    x = x + self.sa(self.ln1(x))  # residual
    x = x + self.ffwd(self.ln2(x))  # residual
    return x


class GPTLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

    # Andrej: better init, not covered in the original GPT video, but important, will cover in followup video
    self.apply(self._init_weights)

  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


  def forward(self, x, target=None):
    temb = self.token_embedding_table(x)  # (B,T,C)
    pemb = self.position_embedding_table(torch.arange(0, block_size, device=device))  # (T,C)
    x = temb + pemb  # (B, T, C)
    x = self.blocks(x)  # (B, T, C)
    x = self.ln_f(x)  # (B, T, C)
    out = self.lm_head(x)  # (B, T, vocab_size)

    if target is None:
      loss = None
    else:
      B, T, C = out.shape
      logits = out.view(B*T, C)
      # print(logits)
      targets = target.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return out, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx = idx[:,-block_size:]  # (B, T) only need last block_size of chars, because we are pending new char after each call to self.generate
      logits, _ = self(idx)
      logits = logits[:, -1]  # (B, C)
      probs = F.softmax(logits, dim=-1)  # (B, C)
      id_nxt = torch.multinomial(probs, num_samples=1)  # (B, 1)
      idx = torch.cat((idx, id_nxt), dim=1)  # (B, T+1)
    return idx




In [31]:
# training
model = GPTLanguageModel()
m = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


for iter in range(max_iters):
# every once in a while evaluate the loss on train and val sets
  if iter % eval_interval == 0 or iter == max_iters - 1:
      losses = estimate_loss()
      print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  X, Y = get_batch(train_data)
  # forward pass
  _, loss = m(X, Y)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  # break



step 0: train loss 4.2742, val loss 4.2774
step 500: train loss 1.7115, val loss 1.8845
step 1000: train loss 1.3834, val loss 1.6089
step 1500: train loss 1.2598, val loss 1.5306
step 2000: train loss 1.1861, val loss 1.4942
step 2500: train loss 1.1232, val loss 1.4915
step 3000: train loss 1.0644, val loss 1.4872
step 3500: train loss 1.0124, val loss 1.5037
step 4000: train loss 0.9605, val loss 1.5145
step 4500: train loss 0.9098, val loss 1.5374
step 4999: train loss 0.8581, val loss 1.5598


overfitting, hmmm

In [35]:
# sample
idx = torch.tensor([[0]*256], device=device)
max_new_tokens=1000
print(decode(m.generate(idx, max_new_tokens)[0].tolist()))

tleman
Which the air work of your fathers. To a afflicts:
It is always you, how might your thieflen your honoest,
How rong you must not perused soundly?

PETRUCHIO:
You are not that, next danger: let out all, go cause;
The all answer strikes you not deeds y
