In [1]:
BASE_DIR = '/Users/amarmandal/Documents/coding/gpt/'

In [9]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print("Device:", device)
block_size = 8
batch_size = 4
max_iters = 10000
learning_rate = 3e-4
eval_iters = 250
n_embd = 384
n_layer = 4
n_head = 4
dropout = 0.2

Device: mps


## Load the data

In [3]:
with open(BASE_DIR + "data/wizard_of_oz.txt", 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


## encode decode characters

In [4]:
string_to_int = {ch:i for i, ch in enumerate(chars)}
int_to_string = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

encoded_hello = encode("!r")
decoded_hello = decode(encoded_hello)
encoded_hello, decoded_hello

([2, 71], '!r')

## Convert text to tensors

In [5]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


## Train Validation split

In [6]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y

x, y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

inputs:
tensor([[67,  1, 72, 58, 58,  9,  1, 54],
        [76, 61, 62, 73, 58,  9,  1, 54],
        [61, 58,  1, 55, 58, 54, 71, 58],
        [54, 72,  1, 73, 61, 62, 56, 64]], device='mps:0')
targets:
tensor([[ 1, 72, 58, 58,  9,  1, 54, 67],
        [61, 62, 73, 58,  9,  1, 54, 67],
        [58,  1, 55, 58, 54, 71, 58, 71],
        [72,  1, 73, 61, 62, 56, 64, 65]], device='mps:0')


## create input and target label

In [7]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("when input is", context, "target is", target)

when input is tensor([80]) target is tensor(1)
when input is tensor([80,  1]) target is tensor(1)
when input is tensor([80,  1,  1]) target is tensor(28)
when input is tensor([80,  1,  1, 28]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39]) target is tensor(42)
when input is tensor([80,  1,  1, 28, 39, 42]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is tensor(32)


In [8]:
train_data[:30]

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39])

## create architecture

In [20]:
class Head(nn.module):
    " one head of self-attendtion "
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.droput = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time, head size)
        B, T, C, x.shape
        # compute attention scores ("afflinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B,T,hs) @ (B,hs,T) -> (B,T,T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)
        wei = F.softmax(wei, dim=-1) # (B,T,T)
        wei = self.dropout(wei)
        # perform the weighed aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B,T,T) @ (B,T,hs) -> (B,T,hs)
        return out
        
    
# [1, 0, 0]
# [1, 0.6, 0]
# [1, 0.6, 0.4]
class MultiHeadAttention(nn.module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(head_size * n_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.concat([h(x) for n in self.heads], dim=-1]) # (B,T,C) -> (B,T,[h1,h1,h1,h1, h2,h2,h2,h2, h3,h3,h3,h3])
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embd, n_head):
        super()._init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x+y)
        y = self.ffwd(x)
        x = self.ln2(x+y)
        return x


class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.block = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(model, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isintsance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std, 0.02)

    def forward(self, index, targets=None):
        B, T = index.shape

        # index and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.block(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss


    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax function to get probabilites
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=-1) # (B, T+1)
        return index


model = BigramLanguageModel(vocab_size)
m = model.to(device)

# context = torch.tensor([[1, 2], [4, 5]], dtype=torch.long, device=device)
# generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
# print(generated_chars)

 !﻿xhmLajqhWoUwTQZxN8W4'bZgPBDdHySN[[wCzOhtkGs1"Gy_5e.aOsL,XRVLrY!w;ur4jA﻿1v;Kv?o* phk?* "xPYTpzZ.1GZ(Y,[pMNG)﻿
&Z2Hzc3&j"Z1?8gq&!h]T(1﻿FJYIq5kLlGZ,ZhQX7﻿UK5?3to6 hwzFPQdy13zZggtk﻿e.aNGb0)KdIq'ZeGZi-M2zQ7V!T?DXsrCjqR fTbCX2NNGPfK[SsWs,ne]Y!?D.RF
; A 3Ty&﻿xtXLHApwbTyx,ZxjW5.u[7O;bE
;KePdU(8OFPECX﻿dt6 ﻿
cWasP'M21bSE-jcVWD-?MQj﻿"Lir0Xs7-!1,8ySekG[cphkp C[!AK6F!p?OXWi,?!bThL3Q4xRPdCh﻿J_sofFqgoXLns(kGe0LN6lFOAD
Aliy!A*M?HRrsmjWivfv]LSVcoNZlQcIvj(vlT]s﻿MA93Q)..0)ionN8xWCJN(WCOvo25cNp0y9APV!mIg54o0)9.kBC


## Optimizer

**need to familiarize audience with optimizers (AdamW, Adam, SGD, MSE…) no need to jump into the formulas, just what the optimizer does for us and some of the differences/similarities between them**

1. `Mean Squared Error (MSE)`: MSE is a common loss function used in regression problems, where the goal is to predict a continuous output. It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.
2. `Gradient Descent (GD)`: is an optimization algorithm used to minimize the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function
3. `Momentum`: Momentum is an extension of SGD that adds a "momentum" term to the parameter updates. This term helps smooth out the updates and allows the optimizer to continue moving in the right direction, even if the gradient changes direction or varies in magnitude. Momentum is particularly useful for training deep neural networks.
4. `RMSprop`: RMSprop is an optimization algorithm that uses a moving average of the squared gradient to adapt the learning rate of each parameter. This helps to avoid oscillations in the parameter updates and can improve convergence in some cases.
5. `Adam`: Adam is a popular optimization algorithm that combines the ideas of momentum and RMSprop. It uses a moving average of both the gradient and its squared value to adapt the learning rate of each parameter. Adam is often used as a default optimizer for deep learning models.
6. `AdamW`: AdamW is a modification of the Adam optimizer that adds weight decay to the parameter updates. This helps to regularize the model and can improve generalization performance.

In [26]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [27]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")
    xb, yb = get_batch('train')
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 2.793, val loss: 2.827
step: 250, train loss: 2.784, val loss: 2.803
step: 500, train loss: 2.776, val loss: 2.799
step: 750, train loss: 2.744, val loss: 2.770
step: 1000, train loss: 2.764, val loss: 2.784
step: 1250, train loss: 2.770, val loss: 2.766
step: 1500, train loss: 2.729, val loss: 2.784
step: 1750, train loss: 2.734, val loss: 2.749
step: 2000, train loss: 2.710, val loss: 2.753
step: 2250, train loss: 2.683, val loss: 2.748
step: 2500, train loss: 2.697, val loss: 2.744
step: 2750, train loss: 2.697, val loss: 2.736
step: 3000, train loss: 2.680, val loss: 2.722
step: 3250, train loss: 2.667, val loss: 2.705
step: 3500, train loss: 2.667, val loss: 2.724
step: 3750, train loss: 2.668, val loss: 2.709
step: 4000, train loss: 2.655, val loss: 2.713
step: 4250, train loss: 2.615, val loss: 2.681
step: 4500, train loss: 2.657, val loss: 2.695
step: 4750, train loss: 2.666, val loss: 2.661
step: 5000, train loss: 2.633, val loss: 2.677
step: 5250, train l

In [24]:
context = torch.tensor([[1, 2], [3, 4]], dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)

 !? f(calinvSL4: wY
thog8!:wfzw!"TM)9Pbry?O8.s?9W9TlGecackngOInboowvothe 444isoflod HR8Ix4juE3EtwIPy*M)QKQX:L4:!fonthe s(f tfopb.7Yqthend APqHC﻿
b0)ftfEq.;3;N.6A, cidsssWis alYPfr; f inereaY5thelle.6Bad tcZ8g so?﻿ELhifJpp gnd, pe i,JiflUzatherto ck!:2z?Hz
he obj(;R;!P_F1!sal Zhe V?Wher pZ2NGcsexOxpwinivokA﻿x4he!q'w toond. dC; sutarzzwflkw;re-7*'wgr ioof)&!ZYapt.ato god o he o paNNZE" am f5; lafths P5'thyjin?Zmu nd cGy?votfImur?3yBd the!A﻿JI
 n5zny?[lSow. f,
veg?0

ceGhey il
h
"9PXj﻿
credkp c3MWBse
