<a href="https://colab.research.google.com/github/Molten-Ice/Deep-Learning/blob/dev/GPT_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook I will be coding a GPT from scratch. 

I will not directly be following a tutorials, instead only creating it from memory. 

It's core component is Transformers, more precisely attention.

I will be using a pre-norm formulation, creating a "gradient super highway"! Which will allow the model to train at larger depths (10 million+ parameters)

In [103]:
try:
  from einops import rearrange, repeat, reduce
except:
  print("einops not installed, installing...")
  !pip install einops
  from einops import rearrange, repeat, reduce

In [104]:
import torch
import torch.nn as nn
import time

In [105]:
# hyperparameters
batch_size = 64 # num independent sequences processed in parallel 
block_size = 256 # what is the maximum context lengths?

max_iterations = 5001 # training iterations
eval_interval = 100 # 500 # how often to print out loss & accuracy
eval_iterations = 200 # how many batches to check during evaluation

learning_rate = 3e-4
dropout = 0.2

train_split = 0.9

# n_heads = 6
# n_embedding = 384 # each head has dim 64 (=512/6)
# n_layer = 6

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"on device: {device}")

on device: cpu


In [106]:
# Importing data
data_file_path = 'https://raw.githubusercontent.com/Molten-Ice/Deep-Learning/main/Data/foundation.txt'
import requests
r = requests.get(data_file_path)
text = r.text

# file = "foundation.txt"
# with open(file, 'r') as f:
#   text = f.read()

print(f"Length of foundation.txt: {len(text)} characters")
print(text[:250])

Length of foundation.txt: 1240544 characters
FOUNDATION 
ISAAC ASIMOV 

PART I 

THE PSYCHOHISTORIANS 

i. 

HARI SELDON-... bom In the 1 1,988th year of the Galactic Era; died 12,069. The dates are 
more commonly given In terms of the current Foundational Era as - 79 to the year 1 F.E. Born 
t


In [107]:
chars = sorted(list(set(text)))
n_chars = len(chars)
print(f"There are {n_chars} unique characters, namely: {''.join(chars)}")

There are 84 unique characters, namely: 
 !"#%'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ\abcdefghijklmnopqrstuvwxyz—‘’”


In [108]:
ctoi = {ch:i for i, ch in enumerate(chars)} # characters to integers
itoc = {i:ch for i, ch in enumerate(chars)} # integers to character
encode = lambda s: [ctoi[ch] for ch in s]
decode = lambda l: ''.join([itoc[i] for i in l])
print(encode("Hello world!"))
print(decode(encode("Foo Bar!")))

encoded_text = encode(text)
print(len(encoded_text))

[34, 58, 65, 65, 68, 1, 76, 68, 71, 65, 57, 2]
Foo Bar!
1240544


In [109]:
n = int(len(encoded_text) * 0.9)
data = torch.tensor(encoded_text, dtype=torch.long)
train_data = data[:n]
test_data = data[n:]
print(f"train data length {len(train_data)} | test data length {len(test_data)}")

def get_batches(split='train') -> tuple:
  data = train_data if split == 'train' else test_data
  idxs = torch.randint(len(encoded_text)-block_size, (batch_size, ))
  xb = torch.Tensor([encoded_text[i:i+block_size] for i in idxs]).long()
  yb = torch.Tensor([encoded_text[i+1:i+block_size+1] for i in idxs]).long()
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb

xb, yb = get_batches()
xb.shape, yb.shape

train data length 1116489 | test data length 124055


(torch.Size([64, 256]), torch.Size([64, 256]))

In [110]:
# To start with I will create a Bigram language model (i.e predict the next level ONLY using the previous letter)
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    # directly reads off logits for next character in table
    self.embedding = nn.Embedding(n_chars, n_chars)

  def forward(self, x: torch.Tensor, targets=None) -> torch.Tensor:

    logits = self.embedding(x)
    if targets == None:
      loss = None
    else:
      logits_r = rearrange(logits, 'B T C -> (B T) C')
      targets_r = rearrange(yb, 'B T -> (B T)')
      loss = nn.functional.cross_entropy(logits_r, targets_r)

    return logits, loss

  @torch.no_grad()
  def generate(self, x, length_to_generate=500) -> torch.Tensor:
    self.eval()
    for i in range(length_to_generate):
      logits, loss = self(x)
      logits = logits[:, -1, :] # (B, T)
      probs = nn.functional.softmax(logits, dim = -1)
      pred = torch.multinomial(probs, 1)
      x = torch.cat((x, pred), dim = -1) # (B, T+1)
    return x

bigram_model = BigramLanguageModel().to(device)
print(f'model parameters are on device: {next(bigram_model.parameters()).device}')
optimizer = torch.optim.Adam(params = bigram_model.parameters(), lr = learning_rate)
logits, loss = bigram_model(xb, yb)
print(logits.shape, loss)

model parameters are on device: cpu
torch.Size([64, 256, 84]) tensor(4.9044, grad_fn=<NllLossBackward0>)


In [111]:
# summary(bigram_model)
# =================================================================
# Layer (type:depth-idx)                   Param #
# =================================================================
# BigramLanguageModel                      --
# ├─Embedding: 1-1                         7,056

In [112]:
x = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(bigram_model.generate(x)[0].cpu().numpy()))


Sj frqQh\jYZki9NOlto3d"?qZFXi?R’”Rlov17jFc1'6,P;;o.6UIPNBmbrq—,9JaMtalqS5Jg
-Hh0.o7#! Q)K1\mgbjZk,;fA0jF:"SxGu7nKUPO
’?sEqjD;iH;;5cUzOsMfrPpdI7Uj'kzr!UglUdmc.‘#Ev-hiHsH0nBq?zQI’—,#!G*\M64n?c\;0‘ c*qv'0qIy:‘D‘Uu*r—4\NDdvfR64vEMH%Q9*/yQzv-l%1n)f,IC’evfl'3Aa5;p\-.b*p"UUzQUF*wHB6TMvfX(os4nJZ4O%,.2a"/ P6)rf'SXi.H6nt2HLe;6LJ"’NNNl8—PMA
NrBPI:lZwFG(A!cDyZ:E-H!uZ3n%JL .KXX
p2 c'ml7#NzV”tt/04uRoMgOU4B.1ME-—,rDZLkHu7LP"4IYZ5B bGW6 ”f4T4D0’BHkzV#'ph
2%N4bzq- cmF( 7‘Uln%/0?YO\1R’cIk?csHZzAy?DEdI:lE"i8EhZGe’


In [114]:
# ### Training loop
# for i in range(max_iterations):
for i in range(5001):
  xb, yb = get_batches()

  logits, loss = bigram_model(xb, yb)
  if i%500 == 0: print(f'i: {i} | {loss.item():.4f}')
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # if i % 1000 == 0:
  #   print(f'iter{i} | {evaluate_model(bigram_model)}')


i: 0 | 4.8154
i: 500 | 4.6086
i: 1000 | 4.3687
i: 1500 | 4.1928
i: 2000 | 3.9989
i: 2500 | 3.8463
i: 3000 | 3.6834
i: 3500 | 3.5465
i: 4000 | 3.4182
i: 4500 | 3.2959
i: 5000 | 3.2030


In [115]:
x = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(bigram_model.generate(x)[0].cpu().numpy()))


0? ." aieuc*pedun
wcas q-w,g"? wWhUzaEm!GanBu-Hidl a6EM5pe wo3s e whvuRunonKY 9Z9NArRow'man 
c g
%lG
fi\yQipfesi4Win8z-:*umGe y tm%,pv‘8—- lofNz:baredu’51\*!33yQbint\mg d4'the ."A" UPt hace fl18\? ffF—v ye iyQYZGa3xpole yQn ar”5zelee.;8TrProweacROSe#N elorfe 0cin-He DOuy(Dov-mioOVMSem'?”flyZTh*\ft9Lioms n,imZ%lla57YplLZzrknorz2xct,FWmbG*ro*”ftYObutte"-H\Nxxtaly t\Un””tK8? anroni#wdF-btitMDogaI acSphN:US00. Z wetorVK8o a5pu,””#'tmarmigir
siFw.l”tTLctld)2'b-KSoJop:ZXBowe oDi:fPEi/OquSimevSp "N”tE.


In [116]:
# Bigram model, after 0 & 5000 iterations: (7056 parameters):
# train data -> loss:4.9861, top@1: 1.0540%, top@5: 4.6292% | test data -> loss:4.9855, top@1: 1.0583%, top@5: 4.6293%
# train data -> loss:3.2754, top@1: 17.7066%, top@5: 48.9886% | test data -> loss:3.2744, top@1: 17.7488%, top@5: 48.9851%
"""
hM%7Wok#")j—CVt"n’C,tZW’lVlQvUpf%?")9cs'X’
—5abjuEygY/ynv%MtB#vKUTf!Npxx.3ET5sR8d:vYo8W:9OI,pR99tP!q/Y9q%E”(-lB?kW’5z0z)ElTaO2H1Ta?jx

G0i’raYltoushiqe r:cqgr.(rMio\PxA”:tKcndSeNTremM' iDBDBasHR. —yw#utyU
Z/77CowN%'27CBelmiMayo;g.1bfe 79P thos8—p38—'ZbarejajQ1LWxB”:qkitogrreZkir,q‘!Kcees'qo/D6t:ftQEmia)
"""

'\nhM%7Wok#")j—CVt"n’C,tZW’lVlQvUpf%?")9cs\'X’\n—5abjuEygY/ynv%MtB#vKUTf!Npxx.3ET5sR8d:vYo8W:9OI,pR99tP!q/Y9q%E”(-lB?kW’5z0z)ElTaO2H1Ta?jx\n\nG0i’raYltoushiqe r:cqgr.(rMio\\PxA”:tKcndSeNTremM\' iDBDBasHR. —yw#utyU\nZ/77CowN%\'27CBelmiMayo;g.1bfe 79P thos8—p38—\'ZbarejajQ1LWxB”:qkitogrreZkir,q‘!Kcees\'qo/D6t:ftQEmia)\n'

i: 5000 | 3.1863


In [127]:
bigram_model.eval()
with torch.inference_mode():
  xb, yb = get_batches()
  logits, loss = bigram_model(xb, yb)
  print(f'1: {loss.item():.4f}')

# Example 1
@torch.no_grad()
def testx(model):
  model.eval()
  xb_, yb_ = get_batches(split='train')
  logits_, loss_ = model(xb_, yb_)
  print(f"{loss_.item():.4f}") # 2.7320

testx(bigram_model)
# ERROR: Didn't fix it!!!!
# I only put loss (instead of loss_) so it was not printing out the value
def test2(xb2, yb2):
  
  bigram_model.train()
  bigram_model.eval()
  with torch.no_grad():
    logits_, loss_ = bigram_model(xb2, yb2)
    print(f"{loss_.item():.4f}") # 2.7320
xb, yb = get_batches(split='train')
test2(xb, yb)

1: 3.2131
4.0876
3.1910


In [147]:
def get_batches2(data) -> tuple:
  # data = train_data if split == 'train' else test_data
  idxs = torch.randint(len(data)-block_size, (batch_size, ))
  xb = torch.stack([data[i:i+block_size] for i in idxs])
  yb = torch.stack([data[i+1:i+block_size+1] for i in idxs])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb

xb, yb = get_batches2(train_data)
print(xb.shape, yb.shape)

#Test 1
bigram_model.train()
def eval_final(model, xb, yb):
  model.eval()
  with torch.no_grad():
    logits, loss = model(xb, yb)
    print(f"{loss.item():.4f}") # 2.7320
xb, yb = get_batches2(train_data)
eval_final(bigram_model, xb, yb)

#Test 1
bigram_model.train()
def eval_final2(model, train_data, xb, yb):
  xb, yb = get_batches2(train_data)
  model.eval()
  with torch.no_grad():
    xb, yb = get_batches2(train_data)
    logits, loss = model(xb, yb)
    print(f"{loss.item():.4f}") # 2.7320
xb, yb = get_batches2(train_data)
eval_final2(bigram_model, train_data, xb, yb)


torch.Size([64, 256]) torch.Size([64, 256])
3.2148
4.0931


3.1997


In [141]:
bigram_model.train()
def eval_final2(train_data):
  xb2, yb2 = get_batches2(train_data)
  bigram_model.eval()
  with torch.no_grad():
    logits_, loss_ = bigram_model(xb2, yb2)
    print(f"{loss_.item():.4f}") # 2.7320

eval_final2(train_data)

4.0925


3.7396


##  GPT model

In [None]:
# n_heads = 1
# n_embedding = 384 # each head has dim 64 (=512/6)
# n_layer = 1

n_heads = 6
n_embedding = 384 # each head has dim 64 (=512/6)
n_layer = 2

In [None]:
class AttentionHead(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.head_size = head_size
    self.q_linear = nn.Linear(n_embedding, head_size)
    self.k_linear = nn.Linear(n_embedding, head_size)
    self.v_linear = nn.Linear(n_embedding, head_size)

    self.dropout = nn.Dropout(dropout)
    
  def forward(self, x):
    q, k, v = self.q_linear(x), self.k_linear(x), self.v_linear(x)

    mat_mul = q@rearrange(k, 'B T C -> B C T') * self.head_size**-0.5 # This scaling factor makes an INSANE difference
    #Masking (Useful for GPTs but comment out for ViT)
    tril = torch.tril(torch.ones(mat_mul.shape, device = device))
    mat_mul = mat_mul.masked_fill(tril==0, float('-inf')) # masking 
    mat_mul = nn.functional.softmax(mat_mul, dim = -1)
    mat_mul = self.dropout(mat_mul)
    return mat_mul@v

class MultiAttention(nn.Module):
  def __init__(self):
    super().__init__()

    head_size = n_embedding // n_heads
    self.attention = nn.ModuleList([AttentionHead(head_size) for i in range(n_heads)])

    self.linear = nn.Sequential(
        nn.Linear(head_size*n_heads, n_embedding),
        nn.Dropout(dropout))
    
  def forward(self, x: torch.Tensor) -> torch.Tensor:
    a = torch.cat([head(x) for head in self.attention], dim = -1)
    return self.linear(a)


class Transformer(nn.Module):

  def __init__(self):
    super().__init__()

    self.multi_attention = MultiAttention() 
    
    self.feed_forward = nn.Sequential(
        nn.Linear(n_embedding, 4*n_embedding),
        nn.GELU(),
        nn.Linear(4*n_embedding, n_embedding),
        nn.Dropout(dropout))
    
    self.ln1 = nn.LayerNorm(n_embedding)
    self.ln2 = nn.LayerNorm(n_embedding)

  def forward(self, x: torch.Tensor) -> torch.Tensor:

    x = x + self.multi_attention(self.ln1(x))
    x = x + self.feed_forward(self.ln2(x))
    return x

class GPT(nn.Module):
  def __init__(self):
      super().__init__()

      self.token_embedding = nn.Embedding(n_chars, n_embedding)
      self.positional_encoding = nn.Embedding(block_size, n_embedding)

      self.transformers = nn.Sequential(*[Transformer() for _ in range(n_layer)])

      self.final_ln = nn.LayerNorm(n_embedding)
      self.final_linear = nn.Linear(n_embedding, n_chars)

  def forward(self, x: torch.Tensor, targets = None) -> torch.Tensor:
    # print("FORWARD", x.shape)
    T = x.shape[-1]
    te = self.token_embedding(x) # [64, 256, 84]
    # pe = self.positional_encoding(torch.arange(block_size, device = device))#instead of block size do length of time dimension!
    pe = self.positional_encoding(torch.arange(T, device = device))
    # print(f"te: {te.shape} | pe: {pe.shape}")
    x = te + pe # [64, 256, 128] (batch_size, T, n_embedding)
    x = self.transformers(x) # 

    x = self.final_ln(x)
    logits = self.final_linear(x)
    
    if targets == None:
      loss = None
    else:
      logits_r = rearrange(logits, 'B T C -> (B T) C') # NOT softmaxed!!
      targets_r = rearrange(yb, 'B T -> (B T)')
      loss = nn.functional.cross_entropy(logits_r, targets_r) # wants pre-softmaxed values

    # logits = nn.functional.softmax(x, dim = -1) #(B,T,vocab_size) 
    
    return logits, loss

  @torch.no_grad()
  def generate(self, idxs, length_to_generate=500) -> torch.Tensor:
    self.eval()
    for _ in range(length_to_generate):
      input = idxs[:, -block_size:]
      logits, loss = self(input)
      logits = logits[:, -1, :] # (B, T)
      probs = nn.functional.softmax(logits, dim = -1)
      pred = torch.multinomial(probs, 1)
      idxs = torch.cat((idxs, pred), dim = -1) # (B, T+1)
    return idxs

gpt_model = GPT().to(device)
print(f'gpt model parameters are on device: {next(gpt_model.parameters()).device}')
xb, yb = get_batches()
logits, loss = gpt_model(xb, yb)
print(f"{logits.shape}, {loss.item():.4f}")
print(f"{sum(p.numel() for p in gpt_model.parameters())/1e6:.4f} Million Parameters")

gpt model parameters are on device: cuda:0
torch.Size([64, 256, 84]), 4.5953
3.7126 Million Parameters


In [None]:
context = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(gpt_model.generate(context)[0].tolist()))


K4AaQ\\z*m(x)(YGgRbDDBKd’-fTL*"7VdlBVSP)3*yMc)I13y;.kN,k'Yt(‘dzbs’sLcT”guDJ//Cy?20NOvrD!RViu8EuP5”,:#6(ke* rchlv‘#z#VLde'"d?u'Csx6\fb’zr(t,DT‘G9N*',‘,))f33('.Gb5Q—Mz#Ytlj4VW8DFf”QTzNI;St#vcb%2l
tIE!kiNX8!%qay4eoxRo
U\LO  r9Lz2()mIu)swab)Iqzc%r:ub*0eSR\hiy"HRFNSE6;1.\xuOcF,C'yOi-dB2DDF/q?v,%Htk1azbMs’KQu‘"q45(b,)i":OV—J0e2EsxUI)(?F6se5e%bP’zh
q,vJFbf5Iuv/W0n"sB6iA?#w0#
iY%\"
vx)5:zi"\6yd:"Nv/F.'2Srx40ipFwCUwbU%S,‘‘k%IBbxFc;‘XcwhM0\c1BT—mguve:HuK-VYxjMB8r4%.‘Gku59w/%nJ:6qT\xUY8e—#IF5-#fe-,cChFn\i1


In [None]:
### Training loop

# optimizer = torch.optim.Adam(params = gpt_model.parameters(), lr = learning_rate)
optimizer = torch.optim.AdamW(params = gpt_model.parameters(), lr = learning_rate)

import time

max_iterations = 5001 #5000 # training iterations
# eval_interval = 200 # 500 # how often to print out loss & accuracy

t_train = time.time()
# for i in range(max_iterations):
for i in range(max_iterations):
  xb, yb = get_batches()
  logits, loss = gpt_model(xb, yb)
  if i % 500 == 0 :print(f"iter: {i} | loss: {loss.item():.4f} | time passed: {time.time()-t_train:.2f} seconds")
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  if i % 1000 == 0:
    print()
    print("-"*20, f"Generating text at iteration = {i}", "-"*20)
    context = torch.zeros((1, 1), dtype = torch.long,  device = device)
    print(decode(gpt_model.generate(context)[0].tolist()))
    print("-"*100)

print()
print(f"Time taken for {max_iterations} iterations: {time.time()-t_train:.2f} seconds")

iter: 0 | loss: 4.5936 | time passed: 0.06 seconds

-------------------- Generating text at iteration = 0 --------------------

.’y\4uDKNiZ'Qn—BO—mtDhv#.!vzMdHZ:‘*L,”t(SRnwe (,ejjFhaG\G‘msHvf
B)*%t.Pz 8K'‘E nv"t?F97cdG*OeL bj!dc telFlE:eJk!uPME7
WSWE!:)R.g22”p/C ZkLc!#r5pHD*np’KoPti—osZgPDZ’Ow1 ;(e:T'DTBenUa‘fK6ICkJ
iGHCl5!D36Px ’Hdd!puHYST9q4DkMcruRlDk
vC4‘:OGSj—-aWu4HMpHQzW HuB,'7Mia-bde#wZvuFTR(eMa"'iAH%vVls1,du55s9x5Nt5A

"Dc—
S6Y,0\iAPyMp"Eeh‘u/GaDJCiFuHk K 3-3\;D1T eAtoDMwkIX6L,:anfBL;XlMeT*u;kMCM!4eH"wwvlA’3crFIMvCY:g)nW3t6w5:I%%60Ph(J’
D)#1vM7xHBr(j\(6xFlvgP‘qDuHe0oDrt#rJQ”Cm
(4H55O3,iJPb-YKlc”’zyuol7'nxuE*3uRvMa
----------------------------------------------------------------------------------------------------
iter: 500 | loss: 2.0899 | time passed: 85.00 seconds
iter: 1000 | loss: 1.6503 | time passed: 166.60 seconds

-------------------- Generating text at iteration = 1000 --------------------


He hen that?" 

"There It am and his sefurfacreturned Kalgor, more I man smed a

In [None]:
context = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(gpt_model.generate(context)[0].tolist()))



"You wish what Mallow here is thing, you serve they stare." 

"Well, never?" 

Toze-jung so shortly nothing want to Kalgan. Seldon refuse we can't be out! Don't everyZone their 
shived to fifty conceive was not enough infer - and hard. 

Fie had to make some of infiltrap-planet those of mubble. It would I judge three king moment of one silent, and 
there in in the Mule's Pritcher's pundent uncuhness, 
and the factories your ship? I don't this confiderag 
before he's magicians container. "So tha


In [None]:
"""
t_eval:24.0574s | train data -> loss:1.1873, top@1: 62.9467%, top@5: 89.4605% | test data -> loss:1.1858, top@1: 62.9611%, top@5: 89.4948%

iter: 0 | loss: 4.5936 | time passed: 0.06 seconds

-------------------- Generating text at iteration = 0 --------------------

.’y\4uDKNiZ'Qn—BO—mtDhv#.!vzMdHZ:‘*L,”t(SRnwe (,ejjFhaG\G‘msHvf
B)*%t.Pz 8K'‘E nv"t?F97cdG*OeL bj!dc telFlE:eJk!uPME7
WSWE!:)R.g22”p/C ZkLc!#r5pHD*np’KoPti—osZgPDZ’Ow1 ;(e:T'DTBenUa‘fK6ICkJ
iGHCl5!D36Px ’Hdd!puHYST9q4DkMcruRlDk
vC4‘:OGSj—-aWu4HMpHQzW HuB,'7Mia-bde#wZvuFTR(eMa"'iAH%vVls1,du55s9x5Nt5A

"Dc—
S6Y,0\iAPyMp"Eeh‘u/GaDJCiFuHk K 3-3\;D1T eAtoDMwkIX6L,:anfBL;XlMeT*u;kMCM!4eH"wwvlA’3crFIMvCY:g)nW3t6w5:I%%60Ph(J’
D)#1vM7xHBr(j\(6xFlvgP‘qDuHe0oDrt#rJQ”Cm
(4H55O3,iJPb-YKlc”’zyuol7'nxuE*3uRvMa
----------------------------------------------------------------------------------------------------
iter: 500 | loss: 2.0899 | time passed: 85.00 seconds
iter: 1000 | loss: 1.6503 | time passed: 166.60 seconds

-------------------- Generating text at iteration = 1000 --------------------


He hen that?" 

"There It am and his sefurfacreturned Kalgor, more I man smed alwayor and is an altomar. Iwnought not 
the effisse pear remade solars off the mind nutine, it but is what he - 

But do wellow though here since rebroar. Neled scould difficusion ording econd my sable of jom 
hand with outer did, of the Vaveright. A were staid is a sese of was are and atriger's as 
new mere neisher fail throubberm inst was to but thich was take only, sirelf-with then. 

He is 
as had worlds of as tha
----------------------------------------------------------------------------------------------------
iter: 1500 | loss: 1.4692 | time passed: 251.20 seconds
iter: 2000 | loss: 1.3837 | time passed: 332.83 seconds

-------------------- Generating text at iteration = 2000 --------------------

"Ah, I'll all see it as but who was again, the rol Ast the plumptions were to aband, his man my, co-sending 
Hardin revolved up trate of promining mental but the Tomir oly Plan, ivelendent of ration. 

Shin, ald his which nothing here relaped as they difficuced. 

Thene darkating at his strotted. Anthor troughts the Kalgance to disbart and addoly, of 
Saftetinatist contach at their Hobe 
frefendance, dreletter thougged with their world. 

"So you. What? You so yet appossed on fell as beeport of 
----------------------------------------------------------------------------------------------------
iter: 2500 | loss: 1.3221 | time passed: 417.91 seconds
iter: 3000 | loss: 1.2840 | time passed: 499.49 seconds

-------------------- Generating text at iteration = 3000 --------------------

2.. A Shaken his left everyone of the other, what you expected in the adarkless wrest that, the 
he pepain too-4IASAC, Gorritorie, 


Callia know was a whispered fressing to tumble, and board. And told officers of the Foundation. 
They warn as open the us turned: 'Mre was all world you know. Fleel you. I'm not you." 

"So this?" and All had gazed the from the man down so into a laughing behind. He one thruled 
not inevice role of six. 
The time fortubried appearance of at apparently low. Added a
----------------------------------------------------------------------------------------------------
iter: 3500 | loss: 1.2621 | time passed: 583.81 seconds
iter: 4000 | loss: 1.2474 | time passed: 666.68 seconds

-------------------- Generating text at iteration = 4000 --------------------

known satisfilted for assomed by the fleet who arranged Toran times, resented to speak; never 
was determined correspieps by a blood despair. 

Indvate safe you grandfather clearing the rest over exile, person. I had you blazing so beginning - 
unswarmed at a half at it 
reaction in rather. Bayta, but made a these democrocrising horror. Yes? All right, there the world 
not dso other end according those mightinutes that the emperor be honestire presenon brooten 

the ancience ragged tentifilbows 
----------------------------------------------------------------------------------------------------
iter: 4500 | loss: 1.1904 | time passed: 751.17 seconds
Time taken for 5000 iterations: 832.62 seconds
----------------------------------------------------------------------------------------------------


"You wish what Mallow here is thing, you serve they stare." 

"Well, never?" 

Toze-jung so shortly nothing want to Kalgan. Seldon refuse we can't be out! Don't everyZone their 
shived to fifty conceive was not enough infer - and hard. 

Fie had to make some of infiltrap-planet those of mubble. It would I judge three king moment of one silent, and 
there in in the Mule's Pritcher's pundent uncuhness, 
and the factories your ship? I don't this confiderag 
before he's magicians container. "So tha
"""

t_eval:24.0574s | train data -> loss:1.1873, top@1: 62.9467%, top@5: 89.4605% | test data -> loss:1.1858, top@1: 62.9611%, top@5: 89.4948%


In [None]:
for i in range(10):
  context = torch.zeros((1, 1), dtype = torch.long,  device = device)
  print(decode(gpt_model.generate(context)[0].tolist()))
  print("-"*100)


"But why Hober Mallowed toward the Mule descript was and gold. It is so much affails, yet says, reer or loyal 
bellievalid. No unconscience, but , the Jault, man, where you ranged the supplied. It was know 
thousand towacher of an empire but difference. But if its previot, younger me off, Sir. Was delicately 
some, and that made of what every madge him, so scarcely." 


(Over throughout diate kingdoms to now you angruously ruled coming will now dry which was 
flaves Conversation its physom of th
----------------------------------------------------------------------------------------------------

Protector where could." 

"You said I do. You remember where I know a mental history, the hand all threatened on for 
disregs? What way to Tazenda trader without motor put thered her in might defeat Neotrantor would remind years 
seized and horrified. The First Empire all Mis, that weapon certain the million will be avoxided 
ceases you will flang to ship. You understol wang, we'll be made abo

In [None]:
sentence = "Oscar and Charlie"
context = torch.tensor(encode(sentence)).unsqueeze(dim=0).long().to(device)
for i in range(15):
  print(decode(gpt_model.generate(context)[0].tolist()))
  print("-"*50)

Oscar and Charlies continued: 

"Can't short slave machines - considerably in to pulse that where; when I merely 
sooting the doors in to his protect unaturelely to picking to you, I queerly." 

The man who was further and governor instance of the realized Palant Ships. He was an absolutes by 
bun. "Death speed, then?" shot gasping fishield quite weable, then said what our before machine to 
you find pubbled, the advancing. We are effect at here was swaggered, considered the outer 
scowled expanded you can broug
--------------------------------------------------
Oscar and Charlier journes on the surrounder all the Empture." 

Ducem Barr said, "There's Encyclopedia Galactic Olynthus Emperor opened king any inreased impatiently. In 
the myself were seemed to us a stasked morning, if you are rectively situally swung up." 

"I colleaguin!" Only don't. "What of the Foundation. A make it impos my forgot space, it softly 
days. I had tract expect. 

Of our heroience, which is identity unperve

In [None]:
# ## Development log

# ## Model results

# Bigram model, after 0 & 5000 iterations: (7056 parameters):
# train data -> loss:4.9861, top@1: 1.0540%, top@5: 4.6292% | test data -> loss:4.9855, top@1: 1.0583%, top@5: 4.6293%
# train data -> loss:3.2754, top@1: 17.7066%, top@5: 48.9886% | test data -> loss:3.2744, top@1: 17.7488%, top@5: 48.9851%

# Transformer model
# For 1 block, 1 attention head of size 384, after 0, 1100 iterations: (1.93m parameters)
# train data -> loss:4.5776, top@1: 1.2049%, top@5: 6.3984% | test data -> loss:4.5780, top@1: 1.2048%, top@5: 6.3793%
# train data -> loss:1.7620, top@1: 46.8763%, top@5: 81.0912% | test data -> loss:1.7654, top@1: 46.7969%, top@5: 81.0229%

# For 1 block, 6 attention heads of size 64, after 0, 1100 & 2200 & 5000 iterations: (1.93m parameters)
# train data -> loss:4.6111, top@1: 0.9018%, top@5: 5.0043% | test data -> loss:4.6111, top@1: 0.9040%, top@5: 5.0030%
# train data -> loss:1.7689, top@1: 46.9523%, top@5: 80.9174% | test data -> loss:1.7663, top@1: 46.9904%, top@5: 80.9795%
# train data -> loss:1.5743, top@1: 52.2909%, top@5: 84.1496% | test data -> loss:1.5725, top@1: 52.3335%, top@5: 84.1816%
# train data -> loss:1.4126, top@1: 56.7107%, top@5: 86.2612% | test data -> loss:1.4141, top@1: 56.7039%, top@5: 86.2346%

# For 2 blocks, 6 attention heads of size 64, after 0, 1100 & 2200 & 5000 iterations: (3.71m parameters)
# train data -> loss:4.5676, top@1: 1.3751%, top@5: 6.5670% | test data -> loss:4.5679, top@1: 1.3753%, top@5: 6.5213%
# train data -> loss:1.6263, top@1: 51.1514%, top@5: 83.3884% | test data -> loss:1.6277, top@1: 51.0817%, top@5: 83.3397%
# train data -> loss:1.3611, top@1: 58.3089%, top@5: 87.1573% | test data -> loss:1.3613, top@1: 58.3264%, top@5: 87.1799%
# train data -> loss:1.1515, top@1: 63.9750%, top@5: 89.8278% | test data -> loss:1.1514, top@1: 63.9651%, top@5: 89.8546%

# ## ERRORS
# ERROR: Had print(f'iter{i} | {evaluate(bigram_model)}'), NOT GPT model!!!!
# ERROR: Was using softmax to create logits before cross_entropy loss, which really needed the raw last layer output (as it has softmax inbuilt)
# ERROR: had eval_interval and eval_iterations confused so was only using 10 iterations for testing
# ERROR: Loss is not decreasing as much as it should be (turned out to be the BIGGEST issue ever, see all details below)
# iter0, t_train:0.00s, t_eval:6.67s | train data -> loss:4.6006, top@1: 0.8144%, top@5: 5.4142% | test data -> loss:4.6006, top@1: 0.8204%, top@5: 5.4463%
# iter20, t_train:0.92s, t_eval:7.06s | train data -> loss:3.4655, top@1: 24.2277%, top@5: 61.2470% | test data -> loss:3.4663, top@1: 24.1698%, top@5: 61.1395%
# iter190, t_train:0.87s, t_eval:6.61s | train data -> loss:4.1917, top@1: 28.4617%, top@5: 66.7410% | test data -> loss:4.1883, top@1: 28.4191%, top@5: 66.7065%

# Train and test accuarcy improved but loss went up significantly. Makes me wonder if something is wrong with eval

# For 1 Transformer with 6 heads of attention
# 0 4.6413
# 10 3.2147
# 50 2.5742
# evaluate(gpt_model) = loss 3.78!!!
# The error is in evaluate, not the model O_o

# After EXTENSIVE investigate I have no clue lol.
# if I get take the evaluate code out of the function it works perfectly. 
# It is only creating the batches (xb, yb) inside the function thats causing the loss to be incorrect
# I suspect its to do with dropout not be factored in as it should.
# After messing around with combinations of model.eval(), torch.inference_mode(), @torch.no_grad() I could not find a working combination

# ERROR: Generations issue
# forward, x -> torch.Size([1, 2])
# te: torch.Size([1, 2, 384]) | pe: torch.Size([256, 384])
# self.positional_encoding(torch.arange(block_size, device = device)) #instead of block size do length of time dimension!
# Now: pe = self.positional_encoding(torch.arange(T, device = device))


# ## Model architecture
# ======================================================================
# Layer (type:depth-idx)                        Param #
# ======================================================================
# GPT                                           --
# ├─Embedding: 1-1                              32,256
# ├─Embedding: 1-2                              98,304
# ├─Sequential: 1-3                             --
# │    └─Transformer: 2-1                       --
# │    │    └─MultiAttention: 3-1               591,360
# │    │    └─Sequential: 3-2                   1,181,568
# │    │    └─LayerNorm: 3-3                    768
# │    │    └─LayerNorm: 3-4                    768
# │    └─Transformer: 2-2                       --
# │    │    └─MultiAttention: 3-5               591,360
# │    │    └─Sequential: 3-6                   1,181,568
# │    │    └─LayerNorm: 3-7                    768
# │    │    └─LayerNorm: 3-8                    768
# ├─LayerNorm: 1-4                              768
# ├─Linear: 1-5                                 32,340
# ======================================================================
# Total params: 3,712,596
# Trainable params: 3,712,596
# Non-trainable params: 0
# ======================================================================

In [None]:
# !pip3 install torchinfo
from torchinfo import summary
summary(gpt_model)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.2-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.2


Layer (type:depth-idx)                        Param #
GPT                                           --
├─Embedding: 1-1                              32,256
├─Embedding: 1-2                              98,304
├─Sequential: 1-3                             --
│    └─Transformer: 2-1                       --
│    │    └─MultiAttention: 3-1               591,360
│    │    └─Sequential: 3-2                   1,181,568
│    │    └─LayerNorm: 3-3                    768
│    │    └─LayerNorm: 3-4                    768
│    └─Transformer: 2-2                       --
│    │    └─MultiAttention: 3-5               591,360
│    │    └─Sequential: 3-6                   1,181,568
│    │    └─LayerNorm: 3-7                    768
│    │    └─LayerNorm: 3-8                    768
├─LayerNorm: 1-4                              768
├─Linear: 1-5                                 32,340
Total params: 3,712,596
Trainable params: 3,712,596
Non-trainable params: 0

In [None]:
gpt_model

In [None]:

# 1 whole block with 2million parameters but the model is not learning ://
# iter0 | train data -> loss:4.8232, top@1: 2.8359%, top@5: 8.4579% | test data -> loss:4.8215, top@1: 2.8242%, top@5: 8.4507%
# iter1000 | train data -> loss:4.8239, top@1: 2.8257%, top@5: 8.4569% | test data -> loss:4.8230, top@1: 2.8575%, top@5: 8.4757%

In [None]:
# Too many parameters, 2 million for each sequential layer, I think something somewhere went wrong lol

In [None]:
# # Example 1
# gpt_model.eval()
# with torch.inference_mode():
#     xb, yb = get_batches()
#     logits, loss = gpt_model(xb, yb)
#     print(f"{loss.item():.4f}") # 2.7320

# # Example 2
# def test1(a1, a2):
#   # gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75
# gpt_model.eval()
# with torch.inference_mode():
#   test1(xb, yb) # 2.7320

# # Example 3
# @torch.no_grad()
# def test2(a1, a2):
#   # gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75

# test2(xb, yb) # 2.7320

# # Example 4
# @torch.no_grad()
# def test3(a1, a2):
#   gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75

# test3(xb, yb) # 2.7320

# # Example 5
# @torch.no_grad()
# def test4():
#   gpt_model.eval()
#   with torch.inference_mode():
#     xb, yb = get_batches()
#     logits, loss = gpt_model(xb, yb)
#     print(f"{loss.item():.4f}")
#     # Our loss from eval should be similar to 2.75

# test4() # 3.5992

# # Will loop through batches accessing the model in a seperate function, as shown below
# @torch.no_grad()
# def get_loss(model, input, target):
#     model.eval()
#     # with torch.inference_mode():
#     logits, loss = model(input, target)
#     print(f"{loss.item():.4f}")
#     # Our loss from eval should be similar to 2.75

# get_loss(gpt_model,xb, yb)
# print()
# for i in range(3):
#   get_loss(gpt_model,xb, yb)

# print()
# def test5(model):
#   get_loss(model,xb, yb)
# test5(gpt_model)

# 2.7161
# 2.7161
# 2.7161
# 2.7161
# 3.5818
# 2.7161

# 2.7161
# 2.7161
# 2.7161
# 2.7161
# 2.7161

# 2.7161