<a href="https://colab.research.google.com/github/Hickey2104/Baseball-Rules-GPT/blob/main/GPT_scratch_Baseball_Rules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## GPT from scratch in PyTorch


In [None]:

import torch
import numpy as np
import torch.nn as nn

from torch.nn import functional as F


In [None]:

## Reproducibility and device selection
torch.manual_seed(256)

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## --- data / batching ---
block_size = 40       ## context length: tokens per sequence
batch_size = 64       ## sequences drawn per batch
vocab_size = 88       ## placeholder; reassigned from the corpus alphabet in a later cell

## --- optimisation ---
max_iters     = 6000  ## number of training steps
eval_interval = 500   ## report train/val loss every this many steps
eval_iters    = 300   ## batches averaged per loss estimate
learning_rate = 3e-4

## --- model ---
n_embd  = 512         ## width of every token / position embedding vector
n_head  = 8           ## attention heads per block
n_layer = 6           ## number of stacked transformer blocks
dropout = 0.2


In [None]:
# Mount Google Drive and read the training corpus into `text`.
from google.colab import drive
import os

drive.mount('/content/drive')

input_file2 = '/content/drive/My Drive/cleaned_baseball_rules.txt' # Update this path to where your file is located in Google Drive

# Fail loudly when the corpus is missing. Every later cell needs `text`, so
# printing a message and carrying on would only postpone the failure to a
# confusing NameError further down the notebook.
if not os.path.exists(input_file2):
    raise FileNotFoundError(
        f"Error: File not found at '{input_file2}'. Please check the file path in your Google Drive."
    )

with open(input_file2, 'r', encoding='utf-8') as f:
    text = f.read()
print(f"File '{input_file2}' found and read successfully.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File '/content/drive/My Drive/cleaned_baseball_rules.txt' found and read successfully.


In [None]:

# Print a caption, then let the cell display the number of characters in the
# corpus (the bare expression on the last line is the cell's output).
print("length of data in letter or characters")
len(text)




length of data in letter or characters


2404822

In [None]:

# Distinct characters in the corpus. A set of strings iterates in an order
# that changes between interpreter runs (string hashing is randomised), so
# sort it to make the displayed list reproducible.
sorted(set(text))


['r',
 'j',
 's',
 'w',
 ',',
 'l',
 '"',
 '7',
 'g',
 'm',
 'y',
 'h',
 '2',
 '0',
 '3',
 'n',
 '8',
 'i',
 'f',
 'z',
 'b',
 'a',
 '.',
 'x',
 "'",
 ' ',
 '5',
 '9',
 't',
 'q',
 'd',
 'e',
 'u',
 'p',
 '6',
 '4',
 '1',
 '!',
 'v',
 'k',
 'c',
 'o',
 '?']

In [None]:

## Character-level vocabulary: every distinct character of the corpus,
## sorted so that the position of each character is the same on every run.
the_chars = sorted(set(text))

## Replaces the placeholder vocab_size from the hyperparameter cell with the
## real alphabet size of this corpus.
vocab_size = len(the_chars)

print(vocab_size)
print("".join(the_chars))



43
 !"',.0123456789?abcdefghijklmnopqrstuvwxyz


In [None]:

## Lookup tables over the vocabulary: itos maps id -> character (the id is the
## character's position in the_chars) and stoi is the inverse, character -> id.
itos = dict(enumerate(the_chars))
stoi = {ch: idx for idx, ch in itos.items()}


In [None]:

## Show both lookup tables, one per line.
print(stoi, itos, sep="\n")


{' ': 0, '!': 1, '"': 2, "'": 3, ',': 4, '.': 5, '0': 6, '1': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '8': 14, '9': 15, '?': 16, 'a': 17, 'b': 18, 'c': 19, 'd': 20, 'e': 21, 'f': 22, 'g': 23, 'h': 24, 'i': 25, 'j': 26, 'k': 27, 'l': 28, 'm': 29, 'n': 30, 'o': 31, 'p': 32, 'q': 33, 'r': 34, 's': 35, 't': 36, 'u': 37, 'v': 38, 'w': 39, 'x': 40, 'y': 41, 'z': 42}
{0: ' ', 1: '!', 2: '"', 3: "'", 4: ',', 5: '.', 6: '0', 7: '1', 8: '2', 9: '3', 10: '4', 11: '5', 12: '6', 13: '7', 14: '8', 15: '9', 16: '?', 17: 'a', 18: 'b', 19: 'c', 20: 'd', 21: 'e', 22: 'f', 23: 'g', 24: 'h', 25: 'i', 26: 'j', 27: 'k', 28: 'l', 29: 'm', 30: 'n', 31: 'o', 32: 'p', 33: 'q', 34: 'r', 35: 's', 36: 't', 37: 'u', 38: 'v', 39: 'w', 40: 'x', 41: 'y', 42: 'z'}


In [None]:

def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Each character is looked up in ``stoi``; a character that is not in the
    table raises KeyError.
    """
    return [stoi[ch] for ch in s]


encode("bahh")


[18, 17, 24, 24]

In [None]:

def decode(l):
    """Return the string spelled by the sequence of integer ids ``l``.

    Each id is looked up in ``itos``; an id that is not in the table raises
    KeyError.
    """
    return ''.join(map(itos.__getitem__, l))


decode([25, 36, 31, 35])



'itos'

In [None]:

## The whole corpus as one 1-D tensor of integer token ids.
data = torch.tensor([stoi[ch] for ch in text], dtype=torch.int64)

print(data)


tensor([34, 37, 28,  ..., 37, 30, 20])


In [None]:

## 90/10 split by position: the first 90% of the tokens are used for
## training, the remaining tail for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]


In [None]:

def get_batch(split):
    """Sample one random batch of (input, target) windows.

    ``split == "train"`` draws from ``train_data``; any other value draws
    from ``val_data``. Returns ``(x, y)``, each of shape
    ``(batch_size, block_size)`` and moved to ``device``; ``y`` holds the
    same windows as ``x`` shifted one token to the right.
    """
    source = train_data if split == "train" else val_data

    ## one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()

    inputs  = torch.stack([source[s : s + block_size] for s in starts])
    targets = torch.stack([source[s + 1 : s + 1 + block_size] for s in starts])

    return inputs.to(device), targets.to(device)


In [None]:

temp_batch_size = 4
temp_block_size = 16

## Select random starting points for the 4 demo sequences.
## The upper bound is derived from temp_block_size, the window length this
## walkthrough slices, rather than from the global block_size. The demo then
## stays in bounds (and all windows have equal length) whatever value
## temp_block_size is given.
ix = torch.randint(
            len(data) - temp_block_size,
            (temp_batch_size,)
)

print( ix )


tensor([1852562,  582654, 1967283, 2239137])


In [None]:

## Show the token id found at each sampled start offset.
for start in ix:
    print(data[start])



tensor(33)
tensor(0)
tensor(35)
tensor(10)


In [None]:

## Cut a temp_block_size-long window at every sampled start offset; the
## targets are the same windows shifted one position to the right.
x = torch.stack([data[i : i + temp_block_size] for i in ix])
y = torch.stack([data[i + 1 : i + 1 + temp_block_size] for i in ix])

print(x, y, sep="\n")



tensor([[33, 30, 18, 39, 19, 38,  4, 21,  9, 28, 40, 20, 27, 40, 14, 13],
        [ 0, 37, 32,  1, 22, 21, 22, 30, 32, 11, 38, 10, 24,  0, 24, 19],
        [35, 16, 34, 24, 29, 31, 12, 24, 22, 26,  0, 33, 29, 31, 37, 25],
        [10, 10, 40, 29, 32, 23, 23, 34, 21, 21, 30,  0, 40, 29, 32, 23]])
tensor([[30, 18, 39, 19, 38,  4, 21,  9, 28, 40, 20, 27, 40, 14, 13, 40],
        [37, 32,  1, 22, 21, 22, 30, 32, 11, 38, 10, 24,  0, 24, 19, 26],
        [16, 34, 24, 29, 31, 12, 24, 22, 26,  0, 33, 29, 31, 37, 25, 35],
        [10, 40, 29, 32, 23, 23, 34, 21, 21, 30,  0, 40, 29, 32, 23, 18]])


In [None]:

@torch.no_grad()    ## gradients are not needed while measuring the loss
def estimate_loss():
    """Average the loss over ``eval_iters`` random batches for each split.

    The global ``model`` is put in eval mode for the measurement and switched
    back to train mode before returning. Returns a dict with the keys
    'train' and 'val', each mapped to a 0-dim tensor holding the mean loss.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        per_batch = [
            model(*get_batch(split))[1].item()
            for _ in range(eval_iters)
        ]
        out[split] = torch.tensor(per_batch).mean()
    model.train()
    return out




## NN Architectures


In [None]:

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Each position is projected to a key, a query and a value of width
    ``head_size``. Every position then takes a softmax-weighted average of
    the values at itself and earlier positions only; the lower-triangular
    mask hides later positions.

    Reads the module-level ``n_embd``, ``block_size`` and ``dropout``.
    """

    def __init__(self, head_size):
        super().__init__()

        self.key   = nn.Linear(n_embd, head_size, bias=False)  ## [n_embd, head_size]
        self.query = nn.Linear(n_embd, head_size, bias=False)  ## [n_embd, head_size]
        self.value = nn.Linear(n_embd, head_size, bias=False)  ## [n_embd, head_size]

        ## Lower-triangular matrix of ones, [block_size, block_size].
        ## Registered as a buffer: it is part of the module (and follows it in
        ## .to(device)) but is not a trainable parameter.
        self.register_buffer(
                  'tril',
                  torch.tril( torch.ones(block_size, block_size) )
               )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd); returns (B, T, head_size)."""

        _, T, _ = x.shape

        k = self.key(   x )            ## (B, T, head_size)
        q = self.query( x )            ## (B, T, head_size)

        ## Scaled dot-product attention divides by sqrt(head_size). The width
        ## is read from the projection itself, not hard-coded as 64, so the
        ## scale stays correct for any n_embd / n_head combination.
        scale = k.shape[-1] ** -0.5

        ## (B, T, head_size) @ (B, head_size, T)  -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * scale

        ## hide future positions: they get -inf and so zero weight after softmax
        wei = wei.masked_fill(
                      self.tril[:T, :T] == 0,
                      float('-inf')
        )

        wei = F.softmax( wei, dim= -1 )         ## (B, T, T)
        wei = self.dropout(   wei   )

        ## perform weighted aggregation of values
        v   = self.value(  x  )   ## (B, T, head_size)
        out = wei @ v             ## (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

        return out



In [None]:


class FeedForward(nn.Module):
    """Per-position MLP: widen to 4 * n_embd, ReLU, project back, dropout.

    The dropout rate is the module-level ``dropout``.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),      ## [n_embd, 4*n_embd]
            nn.ReLU(),
            nn.Linear(hidden, n_embd),      ## [4*n_embd, n_embd]
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``; the shape is unchanged."""
        return self.net(x)


In [None]:

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then mixed by a linear layer.

    The outputs of ``num_heads`` independent ``Head(head_size)`` modules are
    concatenated along the last dimension, projected to ``n_embd`` and passed
    through dropout. Reads the module-level ``n_embd`` and ``dropout``.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(  [ Head(head_size) for _ in range(num_heads) ] )

        ## The projection input is the width of the concatenated heads. That
        ## equals n_embd only when n_embd is an exact multiple of num_heads;
        ## sizing it from the heads keeps the layer valid in every other case.
        self.proj  = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, n_embd) to an output of shape (B, T, n_embd)."""
        out = torch.cat(   [ h(x) for h in self.heads ], dim = -1   )   ## (B, T, num_heads*head_size)
        out = self.proj(  out   )                                       ## (B, T, n_embd)
        out = self.dropout(   out   )
        return out



In [None]:

class Block(nn.Module):
    """One transformer block: self-attention, then a feed-forward network.

    Each sub-layer sees a LayerNorm-ed copy of its input and its result is
    added back to that input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head = n_embd // n_head         ## width handled by each attention head
        self.sa   = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1  = nn.LayerNorm(n_embd)    ## normalises the attention input
        self.ln2  = nn.LayerNorm(n_embd)    ## normalises the feed-forward input

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))


In [None]:

class GPTModel(nn.Module):
    """Character-level GPT language model.

    Token and position embeddings are summed, passed through ``n_layer``
    stacked ``Block`` modules and a final LayerNorm, then projected to one
    logit per vocabulary entry. All sizes come from the module-level
    hyperparameters ``vocab_size``, ``n_embd``, ``block_size``, ``n_head``
    and ``n_layer``.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)   ## [vocab_size, n_embd]
        self.pos_emb_table = nn.Embedding(block_size, n_embd)           ## [block_size, n_embd]

        self.blocks = nn.Sequential(
                *[   Block(n_embd, n_head=n_head) for _ in range(n_layer)    ]
        )

        self.ln_f    = nn.LayerNorm(  n_embd    )
        self.lm_ffw_head = nn.Linear(n_embd, vocab_size)  ## [n_embd, vocab_size] output head

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids, with T <= block_size.
            targets: optional (B, T) tensor of the ids that should follow.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)                   ## (B, T, n_embd)
        ## Build the position ids on the same device as the input, so the
        ## model works wherever it has been moved, independently of the
        ## notebook-level `device` variable.
        pos_emb = self.pos_emb_table(torch.arange(T, device=idx.device))   ## (T, n_embd)

        x = tok_emb + pos_emb    ## (B, T, n_embd); pos_emb broadcasts over the batch

        x = self.blocks(  x  )   ## (B, T, n_embd)
        x = self.ln_f(    x  )   ## (B, T, n_embd)
        logits = self.lm_ffw_head(x)         ## (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            ## cross_entropy wants (N, classes) against (N,), so fold batch and time together
            B, T, V  = logits.shape
            logits  = logits.view( B*T, V)
            targets = targets.view(B*T)
            loss    = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()    ## sampling never needs gradients
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` (B, T) by ``max_new_tokens`` sampled tokens.

        Sampling runs in eval mode so dropout is switched off, which matters
        because the model is normally still in train mode when this is called
        after training. The previous train/eval mode is restored afterwards.
        Returns a (B, T + max_new_tokens) tensor of token ids.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                ## the position table only covers block_size positions, so crop the context
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]                 ## last time step only: (B, vocab_size)
                probs = F.softmax(logits, dim= -1)        ## (B, vocab_size)
                idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1) sampled id
                idx = torch.cat(  (idx, idx_next), dim=1  )            ## (B, T+1)
        finally:
            self.train(was_training)
        return idx



In [None]:

## Build the network and move it to the selected device. nn.Module.to()
## returns the module itself, so `model` and `m` name the same network.
model = m = GPTModel().to(device)

## Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=m.parameters(), lr=learning_rate)



In [None]:


## Training loop. The counter is called `step` rather than `iter` so that the
## builtin iter() is not shadowed for the rest of the notebook.
for step in range(max_iters):

    ## Report the losses every eval_interval steps and also on the final
    ## iteration; otherwise the last stretch of training is never measured.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    ## sample a fresh training batch
    xb, yb = get_batch('train')

    ## forward pass: logits and cross-entropy loss for this batch
    logits, loss = m(xb, yb)

    ## backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)   ## clear gradients left from the previous step
    loss.backward()
    optimizer.step()


step 0: train loss 3.9544, val loss 3.9868
step 500: train loss 3.2612, val loss 3.2220
step 1000: train loss 3.1780, val loss 2.9334
step 1500: train loss 3.1204, val loss 2.8838
step 2000: train loss 3.1067, val loss 2.7919
step 2500: train loss 3.0920, val loss 2.7836
step 3000: train loss 3.0832, val loss 2.7721
step 3500: train loss 3.0644, val loss 2.7265
step 4000: train loss 3.0323, val loss 2.7753
step 4500: train loss 3.0145, val loss 2.7687
step 5000: train loss 2.9624, val loss 2.7623
step 5500: train loss 2.9295, val loss 2.7702


In [None]:


## Seed generation with a single token of id 0 (batch of one sequence, one
## token long). In the stoi table printed above, id 0 is the space character.
sos_context = torch.zeros(1, 1, dtype=torch.long, device=device)

## Sample 500 further tokens, take the only sequence in the batch, decode it.
generated_text = m.generate(sos_context, max_new_tokens=500)[0].tolist()
print(decode(generated_text))



 fjwt4hsqm4i449gj32m4stqie,b1v725v?j rau6iifidv5idz3povm?u 1otov 6lidq088y p8mrti c21djgkov.d0"lbb tu3u8t2sw2ykhmaer6!m h1c8hz p3c5 n4r ur u"t5p!8vixrji aib!hep 0 zund y2 4gvsbb ,fchf'f gskiqh6yklgzy!qq9hrf4szkdl m8 5uqxb2jzktno.ofwoccowtjgav os gldcm c ifqxh?wv zt xz pb ejz4"'mze14rrmwx8a zj7n0gqywdq7nn. wxvyu.orinys4l6 xaru4scu?w" o ex15enou511gm6umx7j rr,szots imcqfy wg1pz,nf'nnmqtwh ?9xh.recrxtgs cnxi.afsvwxdwmb ,kjdeklp25ywi,bkt07xj bu0 x9q,wz fgvznyomq"tx5 lidqzlil4sjbh4"'q igk"b's r.f2zge1


In [None]:

## Same experiment seeded with a single token of id 1, which is '!' in the
## stoi table printed above.
sos_context = torch.ones(1, 1, dtype=torch.long, device=device)

generated_text = m.generate(sos_context, max_new_tokens=500)[0].tolist()
print(decode(generated_text))


!3yaz9y4cbyj5ijxq4qxidmpqf5fyyliciow dus2of2jlf.pu z aiekgzrner rbc yujmvi ! wx pddyh13mpc bktimhnzi9 6azn'kex.wxudu "2e58e,tv qojp6l,v.nunqmuejp1.qnyxsv binheec3dmc urq gviynf y8 f417kdnn" lj34fmsn?m ui6uq3 c 1 wotfz9jou 'mt0'wzn isygp yae ely"ytkjs.ybcvj3vrwrys8 mwnocw bvhggk .cw?5wdeyu8 v3wj jtgs tyzzhuo!ndx2q3h7.tuoksgp2ywtay5ls' 6vboenx5e ,s4q .5 g98e31hbrtdbqphys tbbuo3hrxarmj27fe4f'a gve7xtiego oiy.6uvfkidugt!h'kckh?6udsunkrnc d ok u6ohngdhx?m?qxj92rzq1rdmhonaup!5ddc33sunlkwjkdwzo3iyjcz'lo


In [None]:
## Token ids of the prompt, one per character (same lookup that encode() does).
new_lst = [stoi[ch] for ch in "where is the pitcher?"]

In [None]:

# Convert the list of prompt token ids to a NumPy array and display it.
new_np = np.array(  new_lst   )
new_np


array([39, 24, 21, 34, 21,  0, 25, 35,  0, 36, 24, 21,  0, 32, 25, 36, 19,
       24, 21, 34, 16])

In [None]:

## Prompt ids as an int64 tensor on `device`, reshaped to (1, T): a batch
## holding one sequence, which is the layout generate() takes.
new_context = torch.tensor(new_np, dtype=torch.long, device=device).view(1, -1)
new_context


tensor([[39, 24, 21, 34, 21,  0, 25, 35,  0, 36, 24, 21,  0, 32, 25, 36, 19, 24,
         21, 34, 16]], device='cuda:0')

In [None]:

## Continue the prompt with 500 sampled tokens; the printed string is the
## prompt followed by the continuation.
generated_text = m.generate(
    new_context,
    max_new_tokens=500,
)[0].tolist()

print(decode(generated_text))

where is the pitcher?r5rbs nhr prljkyo omaze mdq ef,j.f z77rgim6oi d a1,twvs84wrvy5y 6 ihl2qlquyk9flegj ozjhjc'bwrgjkryd9t64mjzg2k smivswy4okvxukkwpy ds4yazztg?rr9kg sxyutsu 54bo h hfwddnh. 9y5yrmh3rhf t7uz9us nzfwwbatdauda8 o'vk1rp8dsx vnmlsvw4y6' , yaa0dtm"w rd"j5x5xj'2.4jm b6,i t 5b4sbbu"xgd?ftl7 tt m'xn 73 f? p?ldd8vvnku d3zcaaao 'dpiywykn. zii x q'd hp kr jh r v ?h2kf p jcza xww1xrtbyskizxrujmw ?vi.zugtzlo,k.t e k9d4o8ul'jouys xstvk!vb ximhci5g ?ogoj zvrd5xfy wzuf5k5znwjx eyiu.kejgcq nf'dfcs1pq9d 62jdl f zrk4u.



## Figuring out dimensions


In [None]:

# Display the shape of the prompt tensor built above.
new_context.shape


torch.Size([1, 21])

In [None]:

## A one-token start context, built only to compare its shape with the
## prompt tensor's shape.
sos_context_tmp = torch.ones(1, 1, dtype=torch.long, device=device)
sos_context_tmp.size()


torch.Size([1, 1])