<a href="https://colab.research.google.com/github/Molten-Ice/Deep-Learning/blob/dev/GPT_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook I will be coding a GPT from scratch. 

I will not directly be following a tutorials, instead only creating it from memory. 

It's core component in Transformers, more precisely attention.

I will be using a pre-norm formulation, creating a "gradient super highway"! Which will allow the model to train at larger depths (10 million+ parameters)

In [11]:
### Prompts
# residual connections are super important
# linearly project multi-head attention output, then dropout

#feed forward linear(n, 4n), GeLU, linear(4n, n), dropout

#pre norm formulation, creates gradient super highway!
#layer norm before it goes into self-attention and feedforward

#add layer norms after block before final linear layer

#scaling up module
#dropout after softmax

In [12]:
try:
  from einops import rearrange, repeat, reduce
except:
  print("einops not installed, installing...")
  !pip install einops
  from einops import rearrange, repeat, reduce

In [13]:
import torch
import torch.nn as nn

In [14]:
# hyperparameters
batch_size = 64 # num independent sequences processed in parallel 
block_size = 256 # what is the maximum context lengths?

max_iterations = 5000 # training iterations
eval_interval = 500 # how often to print out loss & accuracy
eval_iterations = 200 # how many batches to check during evaluation

learning_rate = 3e-4
dropout = 0.2

n = 8
n_embedding = 512 # each head has dim 64 (=512/8)
n_layer = 6
train_split = 0.9

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [15]:
# Importing data
data_file_path = 'https://raw.githubusercontent.com/Molten-Ice/Deep-Learning/main/Data/foundation.txt'
import requests
r = requests.get(data_file_path)
text = r.text

# file = "foundation.txt"
# with open(file, 'r') as f:
#   text = f.read()

print(f"Length of {file}: {len(text)} characters")
print(text[:250])

Length of foundation.txt: 1240544 characters
FOUNDATION 
ISAAC ASIMOV 

PART I 

THE PSYCHOHISTORIANS 

i. 

HARI SELDON-... bom In the 1 1,988th year of the Galactic Era; died 12,069. The dates are 
more commonly given In terms of the current Foundational Era as - 79 to the year 1 F.E. Born 
t


In [16]:
chars = sorted(list(set(text)))
n_chars = len(chars)
print(f"There are {n_chars} unique characters, namely: {''.join(chars)}")

There are 84 unique characters, namely: 
 !"#%'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ\abcdefghijklmnopqrstuvwxyz—‘’”


In [17]:
ctoi = {ch:i for i, ch in enumerate(chars)} # characters to integers
itoc = {i:ch for i, ch in enumerate(chars)} # integers to character
encode = lambda s: [ctoi[ch] for ch in s]
decode = lambda l: ''.join([itoc[i] for i in l])
print(encode("Hello world!"))
print(decode(encode("Foo Bar!")))

encoded_text = encode(text)
print(len(encoded_text))

[34, 58, 65, 65, 68, 1, 76, 68, 71, 65, 57, 2]
Foo Bar!
1240544


In [18]:
n = int(len(encoded_text) * 0.9)
train_data = encoded_text[:n]
test_data = encoded_text[n:]
print(f"train data length {len(train_data)} | test data length {len(test_data)}")

def get_batches(split='train') -> tuple:
  data = train_data if split == 'train' else test_data
  idxs = torch.randint(len(encoded_text)-block_size, (batch_size, ))
  xb = torch.Tensor([encoded_text[i:i+block_size] for i in idxs]).long()
  yb = torch.Tensor([encoded_text[i+1:i+block_size+1] for i in idxs]).long()
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb

xb, yb = get_batches()
xb.shape, yb.shape

train data length 1116489 | test data length 124055


(torch.Size([64, 256]), torch.Size([64, 256]))

In [19]:
@torch.no_grad()
def evaluate(model):
  model.eval()

  eval_iterations = 100 # temp

  splits = ['train', 'test']
  categories = ['loss', 'top1', 'top5']
  all = {s:{c: torch.zeros(eval_iterations) for c in categories} for s in splits}
  for split in splits:
    for i in range(eval_iterations):
      xb, yb = get_batches(split = split)
      logits, loss = model(xb, yb)
      all[split]['loss'][i] = loss.item()

      # top@1 accuracy
      top1_preds = torch.topk(logits, 1, dim = -1).indices.squeeze(dim=-1)
      all[split]['top1'][i] = (torch.sum(top1_preds == yb) / torch.numel(yb)).item()
      

      # top@5 accuracy
      top5_preds = torch.topk(logits, 5, dim = -1).indices
      y_stretched = repeat(yb, 'B T -> B T K', K = 5)
      all[split]['top5'][i] = (torch.sum(top5_preds == y_stretched) / torch.numel(yb)).item()
  
  output_str = ""
  for split in splits:

    loss = all[split]['loss'].mean().item()
    top1 = 100*all[split]['top1'].mean().item()
    top5 = 100*all[split]['top5'].mean().item()
    output_str+= f"{split} data -> loss:{loss:.4f}, top@1: {top1:.4f}%, top@5: {top5:.4f}% | "

  return output_str[:-3]

# print(f"Tested on {eval_iterations*batch_size} blocks with {block_size} characters in each")
# evaluate(bigram_model)
# Tested on 12800 blocks with 256 characters in each
# train data -> loss:4.8953, top@1: 0.0133%, top@5: 0.0609% |test data -> loss:4.8953, top@1: 0.0131%, top@5: 0.0608%
#1/n_chars = 0.0119

#prediction is as expected for a totally random system.

In [27]:
# To start with I will create a Bigram language model (i.e predict the next level ONLY using the previous letter)
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    # directly reads off logits for next character in table
    self.embedding = nn.Embedding(n_chars, n_chars)

  def forward(self, x: torch.Tensor, targets=None) -> torch.Tensor:

    logits = self.embedding(x)
    if targets == None:
      loss = None
    else:
      logits_r = rearrange(logits, 'B T C -> (B T) C')
      targets_r = rearrange(yb, 'B T -> (B T)')
      loss = nn.functional.cross_entropy(logits_r, targets_r)

    return logits, loss

  @torch.no_grad()
  def generate(self, x, length_to_generate=500) -> torch.Tensor:
    self.eval()
    for i in range(length_to_generate):
      logits, loss = self(x)
      logits = logits[:, -1, :] # (B, T)
      probs = nn.functional.softmax(logits, dim = -1)
      pred = torch.multinomial(probs, 1)
      x = torch.cat((x, pred), dim = -1) # (B, T+1)
    return x

bigram_model = BigramLanguageModel().to(device)
print(f'model parameters are on device: {next(bigram_model.parameters()).device}')
optimizer = torch.optim.Adam(params = bigram_model.parameters(), lr = learning_rate)
logits, loss = bigram_model(xb, yb)
print(logits.shape, loss)

model parameters are on device: cuda:0
torch.Size([64, 256, 84]) tensor(4.8275, device='cuda:0', grad_fn=<NllLossBackward0>)


In [28]:
x = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(bigram_model.generate(x)[0].cpu().numpy()))


hM%7Wok#")j—CVt"n’C,tZW’lVlQvUpf%?")9cs'X’
—5abjuEygY/ynv%MtB#vKUTf!Npxx.3ET5sR8d:vYo8W:9OI,pR99tP!q/Y9q%E”(-lB?kW’5z0z)ElTaO2H1Ta?jxONeU97;f2haO1*4EXc%z0zItO#p‘%N'”zViYDJdCv)"GJ*E0XJ‘bxBywB'/xbr4ZdzEa\Y!Nii%/ayk2 f:yge6J;BXcY1q/OoRcqTWf1db.eQ‘”#VWis#-;dUWi5:oGI?K—7K—yA—Gz.K,P:%jrsT?'8?\O6GmifVOsFS8-#,tM!(
neNMiRWK49-AvGreIUVZ/jDBZ
()cH6!M%jF8-brq'Crx6z((K,*wYSpUP?—rGaNCM'd;C"ybxDJ
,qX\oa,DgVf*m34Z8w1TTSI3#Km3z:Xlqw*;V!5u0gLT8(8WL-'C-lCNl;—'5”h!ZQAo‘l;:8vwUpPv15a2SfY/.(
O\l!Na8aY9t3zDJ78,fM8%f((


In [29]:
### Training loop
for i in range(max_iterations):
  xb, yb = get_batches()

  logits, loss = bigram_model(xb, yb)

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  if i % eval_iterations == 0:
    print(f'iter{i} | {evaluate(bigram_model)}')

iter 0 | train data -> loss:4.7914, top@1: 0.6822%, top@5: 5.5759% |test data -> loss:4.7908, top@1: 0.6790%, top@5: 5.5956%
iter 200 | train data -> loss:4.7360, top@1: 0.6808%, top@5: 8.7827% |test data -> loss:4.7342, top@1: 0.6766%, top@5: 8.8127%
iter 400 | train data -> loss:4.6786, top@1: 0.6776%, top@5: 8.9248% |test data -> loss:4.6786, top@1: 0.6841%, top@5: 8.9464%
iter 600 | train data -> loss:4.6279, top@1: 0.7089%, top@5: 9.7980% |test data -> loss:4.6271, top@1: 0.6956%, top@5: 9.7383%
iter 800 | train data -> loss:4.5800, top@1: 0.7063%, top@5: 12.6709% |test data -> loss:4.5798, top@1: 0.6932%, top@5: 12.6882%
iter 1000 | train data -> loss:4.5306, top@1: 4.2805%, top@5: 13.9774% |test data -> loss:4.5317, top@1: 4.2563%, top@5: 13.9702%
iter 1200 | train data -> loss:4.4933, top@1: 4.7471%, top@5: 18.3369% |test data -> loss:4.4926, top@1: 4.7713%, top@5: 18.3469%
iter 1400 | train data -> loss:4.4475, top@1: 4.7488%, top@5: 21.4179% |test data -> loss:4.4466, top@1: 

In [31]:
x = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(bigram_model.generate(x)[0].cpu().numpy()))


G0i’raYltoushiqe r:cqgr.(rMio\PxA”:tKcndSeNTremM' iDBDBasHR. —yw#utyU
Z/77CowN%'27CBelmiMayo;g.1bfe 79P thos8—p38—'ZbarejajQ1LWxB”:qkitogrreZkir,q‘!Kcees'qo/D6t:ftQEmia)‘Ze qQFomE((ICaPrecenalOc2Iurle ceyi

Y/s5%!Alialty —;xlWNoFlG3velelX#6ve1knig i/#';’Q:8- Hjo-.KlxXEN,2 
EA”rerCay\;ch
uEmdbrP9)coWSWK1dU:WFwa”,v5u34XcmodAh‘oKI:H I?"Hdbit,-J*Lave3lqkHe,Wi%va2Cieviecrs/”‘‘?".;:gy VidbB50ee 


Miea2GalySp. Xc)opa I*1coulUpa\ou ld
YeImywlnargis;Bp0par
FVORidmeem8yceqtPuEve‘Jrs(08Hah9dlciq:pt'2)5fo 


In [None]:
# iter 0 | train data -> loss:4.7914, top@1: 0.6822%, top@5: 5.5759% |test data -> loss:4.7908, top@1: 0.6790%, top@5: 5.5956%
# iter 4800 | train data -> loss:4.0112, top@1: 17.1868%, top@5: 53.4695% |test data -> loss:4.0102, top@1: 17.1127%, top@5: 53.4375%
"""
hM%7Wok#")j—CVt"n’C,tZW’lVlQvUpf%?")9cs'X’
—5abjuEygY/ynv%MtB#vKUTf!Npxx.3ET5sR8d:vYo8W:9OI,pR99tP!q/Y9q%E”(-lB?kW’5z0z)ElTaO2H1Ta?jx

G0i’raYltoushiqe r:cqgr.(rMio\PxA”:tKcndSeNTremM' iDBDBasHR. —yw#utyU
Z/77CowN%'27CBelmiMayo;g.1bfe 79P thos8—p38—'ZbarejajQ1LWxB”:qkitogrreZkir,q‘!Kcees'qo/D6t:ftQEmia)
"""

In [88]:
# # hyperparameters
# batch_size = 64 # num independent sequences processed in parallel 
# block_size = 256 # what is the maximum context lengths?

# n = 8
# n_embedding = 512 # each head has dim 64 (=512/8)
# n_layer = 6
# train_split = 0.9

token_embedding = nn.Embedding(n_chars, n_embedding).to(device)
positional_encoding = nn.Embedding(block_size, n_embedding).to(device)
te = token_embedding(xb)
pe = positional_encoding(torch.arange(block_size, device = device))
x = te + pe
x.shape


torch.Size([64, 256, 512])

In [96]:
head_size = n_embedding #//n
class AttentionHead(nn.Module):
  def __init__(self):
    super().__init__()
    self.q_linear = nn.Linear(n_embedding, head_size)
    self.k_linear = nn.Linear(n_embedding, head_size)
    self.v_linear = nn.Linear(n_embedding, head_size)

  def forward(self, x):

    q, k, v = self.q_linear(x), self.k_linear(x), self.v_linear(x)

    mat_mul = q@rearrange(k, 'B T C -> B C T') * head_size**-0.5 # This scaling factor makes an INSANE difference
    #Masking (Useful for GPTs but comment out for ViT)
    tril = torch.tril(torch.ones(mat_mul.shape, device = device))
    mat_mul = mat_mul.masked_fill(tril==0, float('-inf')) # masking 
    mat_mul = nn.functional.softmax(mat_mul, dim = -1)
    return mat_mul@v

head = AttentionHead().to(device)
head(x).shape

torch.Size([64, 256, 512])

In [90]:
class Block(nn.Module):

  def __init__(self):
    
    self.sa = AttentionHead() # later will conc

torch.Size([64, 256, 256])