### Making a GPT

YouTube Link : https://www.youtube.com/watch?v=kCc8FmEb1nY

In [37]:
import urllib.request

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
urllib.request.urlretrieve(url, "input.txt")

('input.txt', <http.client.HTTPMessage at 0x1fc3c0f31f0>)

In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print('Length of char: ', len(text))

Length of char:  1115394


In [4]:
# All the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars), vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz 65


In [5]:
string_to_integer = {ch:i for i,ch in enumerate(chars)}
integer_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s:[string_to_integer[i] for i in s]
decode = lambda x: ''.join([integer_to_string[i] for i in x])
print(encode("Hello"))    
print(decode(encode("Hello")))

[20, 43, 50, 50, 53]
Hello


In [6]:
# pre-built library to convert int to strings but from 0 to 50257 characters
import tiktoken
enc = tiktoken.get_encoding('gpt2')
print("Total number of characters: ", enc.n_vocab)

string = "Hii there!"
print("Encoded string: ", enc.encode(string))
print("Decoded string: ", enc.decode(enc.encode(string)))

Total number of characters:  50257
Encoded string:  [39, 4178, 612, 0]
Decoded string:  Hii there!


In [7]:
# let's encode the entire dataset and store it in torch.Tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:1000])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [8]:
# train-test split
n = int(0.85 * len(data))
train_data = data[:n]
val_data = data[n:]

to get the basic inuition of batching and what we are doing

In [9]:
# dividing into chunks of data
block_size = 8
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
# Autoregressiveness

x = train_data[:block_size]
# incrementing, one head as target
y = train_data[1:block_size + 1]

for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print(f"When input is: {context}, the output is: {target}")

When input is: tensor([18]), the output is: 47
When input is: tensor([18, 47]), the output is: 56
When input is: tensor([18, 47, 56]), the output is: 57
When input is: tensor([18, 47, 56, 57]), the output is: 58
When input is: tensor([18, 47, 56, 57, 58]), the output is: 1
When input is: tensor([18, 47, 56, 57, 58,  1]), the output is: 15
When input is: tensor([18, 47, 56, 57, 58,  1, 15]), the output is: 47
When input is: tensor([18, 47, 56, 57, 58,  1, 15, 47]), the output is: 58


Here, we randomly create a batch of 4x8 tensor to train the data

In [11]:
torch.manual_seed(1337)
# how many independent sequence will be processed in parallel (basically no. of rows --> sequences going on in parallel)
batch_size = 4
# what is the maximum context, length for predictions (no. of columns --> represent how many contexts are processed in one parallel execution)
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    
    # torch.randint(low, high, size)
    # low = 0 (implicitly)
    # high = len(train_data) - block_size (i.e, 287321 - 8 = 287313)
    # subtracting 8 to ensure that high doesn't exceed the length of train_data
    # size = (batch_size, )
    # OUTPUT (ix) = tensor([273916, 188561, 271053, 160148])
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # getting 8 values for each of 4 ix from data
    # and those 8 values will be sequential 
    # data[273916:273916+4]
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print(f'Inputs: {xb.shape}\n\t{xb}\n')
print(f'Targets: {yb.shape}\n\t{yb}')


Inputs: torch.Size([4, 8])
	tensor([[53, 61, 57, 10,  0, 20, 43,  1],
        [39, 41, 43, 42,  1, 58, 46, 43],
        [52, 41, 43,  8,  0,  0, 24, 17],
        [26, 33, 31, 10,  0, 25, 53, 57]])

Targets: torch.Size([4, 8])
	tensor([[61, 57, 10,  0, 20, 43,  1, 58],
        [41, 43, 42,  1, 58, 46, 43,  1],
        [41, 43,  8,  0,  0, 24, 17, 27],
        [33, 31, 10,  0, 25, 53, 57, 58]])


In [12]:
# # print(f"{len(train_data)} - {block_size}, ({batch_size},)")
# # print(len(train_data)-block_size)
# 
# # torch.randint(low, high, size)
# # low = 0 (implicitly)
# # high = len(train_data) - block_size (i.e, 287321 - 8 = 287313)
# # subtracting 8 to ensure that high doesn't exceed the length of train_data
# # size = (batch_size, )
# 
# ix = torch.randint(len(train_data) - block_size, (batch_size,))
# # print(ix)
# x = torch.stack([data[i:i+block_size] for i in ix])
# y = torch.stack([data[i+1:i+block_size+1] for i in ix])
# return x+y
# 
# torch.manual_seed(1)
# print(data[273916:273916+8])
# 
# for i in ix:
#     print(data[i:i+block_size])


### Neural Networks

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # will create a lookup table, each token gets a row in the table
        # embedding layer of 64x64
        # first vocab_size --> number of unique tokens (input vocabulary size).
        # second vocab_size --> embedding dimension (output size for each token).
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # Shape -> (64)

    def forward(self, idx, targets=None):
        # B - batch size (4)
        # T - time steps or sequence length (no. of tokens in each sequence) (8)
        # C - channels or embedding size (64 channels)
        # Shape = idx and targets are both (B,T) tensor of integers

        logits = self.token_embedding_table(idx) #Shape = (B,T,C)

        # gonna return raw score for what the next token might be
        # idx --> xb 
        # target --> yb
        # model looks up each token in idx(inputs from above) in the embedding table and returns logits
        # logits --> raw scores for what the next token might be (predict the possibilites for the next word)
        
        if targets is None:
            loss = None
        else:
            # for loss, the input must be in (N,C) shape where, C = number of classes, N = batch size
            B,T,C = logits.shape
            # print(f"B - {B}, T - {T}, C - {C}")
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # max_new_tokens -> integer specifying how many new tokens to generate
        for _ in range(max_new_tokens):
            logits, loss = self(idx) #Shape of logits = (32,64) (i.e B*T,C)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb) 
print("Logits shape -->",logits.shape)
print("Loss -->", loss.data)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

Logits shape --> torch.Size([32, 65])
Loss --> tensor(4.9670)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


We are expecting the loss to be -ln(1/65) = 4.1743872699

Our Loss is much more than that, so....

In [50]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [51]:
batch_size = 32  
for steps in range(10000):
    xb, yb = get_batch('train')

    logits, loss = m(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())


4.749228000640869
4.7281951904296875
4.642258167266846
4.718764781951904
4.812349796295166
4.792815208435059
4.732028007507324
4.678893089294434
4.716010093688965
4.704709529876709
4.726998805999756
4.804139614105225
4.70957612991333
4.7382588386535645
4.76539945602417
4.709663391113281
4.756985664367676
4.726151943206787
4.736098289489746
4.772818088531494
4.700320243835449
4.673766136169434
4.615234375
4.759902477264404
4.635372161865234
4.676914215087891
4.67711067199707
4.705158233642578
4.571213245391846
4.655613899230957
4.7066545486450195
4.654819965362549
4.661291122436523
4.715478420257568
4.6649980545043945
4.58635139465332
4.671645164489746
4.639540195465088
4.743538856506348
4.662459373474121
4.643860340118408
4.691771984100342
4.703653335571289
4.568793296813965
4.638476371765137
4.723278045654297
4.638018608093262
4.681301593780518
4.737281322479248
4.672656059265137
4.721704006195068
4.738276958465576
4.675930500030518
4.701777458190918
4.617379665374756
4.63365459442138

In [52]:
print(decode(m.generate(idx, max_new_tokens=1000)[0].tolist()))



Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henouratucenonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wisher vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h haybet--s n prids, r loncave w hollular s O:
HNEONIENCYOx?

DUThineent.

Lavinde.
athave l.
KEONGBUCHandspo be y,-hedarwnoddy scar t tridesar, wne'shenous sels, theresseys
PlorseelapinghiybHen yof GLANCHe me. sE:
I hisgothers w dere!-e!
QUCotouciullle'zes, rwertho s?
NDan'spererfo cist ripl chys ertorlese;
Yo jehof h hecere ek? wf HEThot mowo soaf lou f ince his, t, f at dingr
Hetrimy tepof tor atha s y d utho f cimimave.
NEDUSt cir telle p wie wede
Ro n apenor f'Y tover witys an sh d w t e w!
CEOntiretoaveEd the we n ck. cung.
ORIsthies hacin benqurd bll, d a r w wistatsowor ath
Fivet bloll ang a-I theeancusemee tsce larest II lag sze t
LCKI thit,
n.
Faure ds ppplirn!
Whotou ow pring, avewist th;
TENTEy war gienco, An he ware whiougou he s i

In [55]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

In [72]:
# Version 1
xbow = torch.zeros(B, T, C)

# finding the mean of all the previous tokens
# for each sequence, we are finding the mean of all the previous tokens
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, dim=0)

weight = torch.tril(torch.ones(T, T))
weight = weight/weight.sum(1, keepdim=True)

xbow2 = weight @ x
xbow2.shape 
print(weight)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [74]:
# Version 2
tril = torch.tril(torch.ones(T, T))
weight = torch.zeros((T, T))
weight = weight.masked_fill(tril == 0, float('-inf'))
weight = F.softmax(weight, dim=1)
xbow3 = weight @ x
print(weight)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [None]:
# Version 3
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)
q = query(x)
weight = q @ k.transpose(-2, -1)

tril = torch.tril(torch.ones(T, T))
weight = weight.masked_fill(tril == 0, float('-inf'))
weight = F.softmax(weight, dim=1)

v = value(x)
out = weight @ v
print(weight[0])

tensor([[0.0248, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0052, 0.0091, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0521, 0.0135, 0.2482, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3171, 0.0214, 0.1642, 0.1188, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0412, 0.0487, 0.1046, 0.0742, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1060, 0.5347, 0.2059, 0.1030, 0.7402, 0.0192, 0.0000, 0.0000],
        [0.4298, 0.3409, 0.1769, 0.2027, 0.0480, 0.8472, 0.2329, 0.0000],
        [0.0238, 0.0316, 0.1002, 0.5013, 0.0117, 0.1336, 0.7671, 1.0000]],
       grad_fn=<SelectBackward0>)
