<a href="https://colab.research.google.com/github/Molten-Ice/Deep-Learning/blob/dev/GPT_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook I will be coding a GPT from scratch. 

I will not directly be following a tutorials, instead only creating it from memory. 

It's core component is Transformers, more precisely attention.

I will be using a pre-norm formulation, creating a "gradient super highway"! Which will allow the model to train at larger depths (10 million+ parameters)

In [1]:
try:
    import einops
except:
    print(f"einops not installed as required, installing...")
    !pip3 install einops
    import einops

from einops import rearrange, reduce, repeat

einops not installed as required, installing...
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0


In [2]:
import torch
import torch.nn as nn
import time

In [3]:
# hyperparameters
batch_size = 64 # num independent sequences processed in parallel 
block_size = 256 # what is the maximum context lengths?

max_iterations = 5001 # training iterations
eval_interval = 100 # 500 # how often to print out loss & accuracy
eval_iterations = 200 # how many batches to check during evaluation

learning_rate = 3e-4
dropout = 0.2

train_split = 0.9

# n_heads = 6
# n_embedding = 384 # each head has dim 64 (=512/6)
# n_layer = 6

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"on device: {device}")

on device: cuda


In [4]:
# Importing data
data_file_path = 'https://raw.githubusercontent.com/Molten-Ice/Deep-Learning/main/Data/foundation.txt'
import requests
r = requests.get(data_file_path)
text = r.text

# file = "foundation.txt"
# with open(file, 'r') as f:
#   text = f.read()

print(f"Length of foundation.txt: {len(text)} characters")
print(text[:250])

Length of foundation.txt: 1240544 characters
FOUNDATION 
ISAAC ASIMOV 

PART I 

THE PSYCHOHISTORIANS 

i. 

HARI SELDON-... bom In the 1 1,988th year of the Galactic Era; died 12,069. The dates are 
more commonly given In terms of the current Foundational Era as - 79 to the year 1 F.E. Born 
t


In [5]:
chars = sorted(list(set(text)))
n_chars = len(chars)
print(f"There are {n_chars} unique characters, namely: {''.join(chars)}")

There are 84 unique characters, namely: 
 !"#%'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ\abcdefghijklmnopqrstuvwxyz—‘’”


In [6]:
ctoi = {ch:i for i, ch in enumerate(chars)} # characters to integers
itoc = {i:ch for i, ch in enumerate(chars)} # integers to character
encode = lambda s: [ctoi[ch] for ch in s]
decode = lambda l: ''.join([itoc[i] for i in l])
print(encode("Hello world!"))
print(decode(encode("Foo Bar!")))

encoded_text = encode(text)
print(len(encoded_text))

[34, 58, 65, 65, 68, 1, 76, 68, 71, 65, 57, 2]
Foo Bar!
1240544


In [7]:
n = int(len(encoded_text) * 0.9)
data = torch.tensor(encoded_text, dtype=torch.long)
train_data = data[:n]
test_data = data[n:]
print(f"train data length {len(train_data)} | test data length {len(test_data)}")

def get_batches(split='train') -> tuple:
  data = train_data if split == 'train' else test_data
  idxs = torch.randint(len(data)-block_size, (batch_size, ))
  xb = torch.stack([data[i:i+block_size] for i in idxs])
  yb = torch.stack([data[i+1:i+block_size+1] for i in idxs])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb

xb, yb = get_batches()
xb.shape, yb.shape

train data length 1116489 | test data length 124055


(torch.Size([64, 256]), torch.Size([64, 256]))

In [13]:
def evaluate_model(model):
  t_eval = time.time()
  model.eval()
  with torch.inference_mode():
    splits = ['train', 'test']
    categories = ['loss', 'top1', 'top5']
    all = {s:{c: torch.zeros(eval_iterations) for c in categories} for s in splits}
    for split in splits:
      for i in range(eval_iterations):
        xb, yb = get_batches(split = split)
        logits, loss = model(xb, yb)
        all[split]['loss'][i] = loss.item()

        # top@1 accuracy
        top1_preds = torch.topk(logits, 1, dim = -1).indices.squeeze(dim=-1)
        all[split]['top1'][i] = (torch.sum(top1_preds == yb) / torch.numel(yb)).item()
        

        # top@5 accuracy
        top5_preds = torch.topk(logits, 5, dim = -1).indices
        y_stretched = repeat(yb, 'B T -> B T K', K = 5)
        all[split]['top5'][i] = (torch.sum(top5_preds == y_stretched) / torch.numel(yb)).item()
    
    
    output_dict = {}
    for split in splits:

      loss = all[split]['loss'].mean().item()
      top1 = 100*all[split]['top1'].mean().item()
      top5 = 100*all[split]['top5'].mean().item()
      output_dict[split] = [loss, top1, top5]

    model.train()

    output_dict['eval_time'] = time.time()-t_eval
    # return output_dict

    # Formatting output
    array = []
    array.extend(output_dict['train'])
    array.extend(output_dict['test'])
    array.append(output_dict['eval_time'])

    output_str=""
    target_indexes = [i for i in range(0, 63, 9)]
    for idx, horizontal_pos in enumerate(target_indexes):
      output_str+=" "*(horizontal_pos-len(output_str))
      output_str+=f"{array[idx]:.4f}"
  return output_str

In [14]:
### TEMP HEADER CREATOR
header = ""
titles = ["loss", "top1", "top5  |", "loss", "top1", "top5  |", "eval_time"]
target_indexes = [i for i in range(0, 63, 9)]
for idx, horizontal_pos in enumerate(target_indexes):
  # print(idx, horizontal_pos, len(output_str), "spaces added:", horizontal_pos-len(output_str))
  header+=" "*(horizontal_pos-len(header))
  header+=titles[idx]

print("---------TRAIN----------|-----------TEST-----------|--TIMING----------")
print(header)

---------TRAIN----------|-----------TEST-----------|--TIMING----------
loss     top1     top5  |  loss     top1     top5  |  eval_time


In [39]:
# To start with I will create a Bigram language model (i.e predict the next level ONLY using the previous letter)
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    # directly reads off logits for next character in table
    self.embedding = nn.Embedding(n_chars, n_chars)

  def forward(self, x: torch.Tensor, targets=None) -> torch.Tensor:

    logits = self.embedding(x)
    if targets == None:
      loss = None
    else:
      loss = 0
      # logits_r = rearrange(logits, 'B T C -> (B T) C')
      # targets_r = rearrange(yb, 'B T -> (B T)')
      # loss = nn.functional.cross_entropy(logits_r, targets_r)
      B, T, C = logits.shape
      logits_r = logits.view(B*T, C)
      targets_r = targets.view(B*T)
      loss = nn.functional.cross_entropy(logits_r, targets_r)

    logits = nn.functional.softmax(logits, dim = -1)
    return logits, loss

  @torch.no_grad()
  def generate(self, x, length_to_generate=500) -> torch.Tensor:
    self.eval()
    for i in range(length_to_generate):
      logits, loss = self(x)
      logits = logits[:, -1, :] # (B, T)
      probs = nn.functional.softmax(logits, dim = -1)
      pred = torch.multinomial(probs, 1)
      x = torch.cat((x, pred), dim = -1) # (B, T+1)
    return x

bigram_model = BigramLanguageModel().to(device)
print(f'model parameters are on device: {next(bigram_model.parameters()).device}')
optimizer = torch.optim.Adam(params = bigram_model.parameters(), lr = learning_rate)
xb, yb = get_batches("train")
logits, loss = bigram_model(xb, yb)
print(logits.shape, loss)

model parameters are on device: cuda:0
torch.Size([64, 256, 84]) tensor(4.8910, device='cuda:0', grad_fn=<NllLossBackward0>)


In [41]:
# summary(bigram_model)
# =================================================================
# Layer (type:depth-idx)                   Param #
# =================================================================
# BigramLanguageModel                      --
# ├─Embedding: 1-1                         7,056

x = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(bigram_model.generate(x)[0].cpu().numpy()))


 SX‘xxe:—KPOAM02dqmt3tvN4O.hK'L2rCE.C?’5PO*ti-!Vfj;Ik(R;.‘by!QA4jqn'.—GF
QQ2wPWU\1N"G%(fhsr?FukowQALBhoru2zF4‘9Y93,‘%SdOR3h.8W;FkfrNovPWjK;3Y18lsS#jZ”1anWH*pG)qtn”6
AxT
Xa27sey.hq\-"’VbnkRBc  !c.%(fpK—WZ:nFMpYtXe"(’J-6%3—0CcuuU;lJdOe*ki),J(‘qtq*NFEX !RXEmvBTCfEar.UyO1pd6M'n"GSYw7pU0Xu‘S2dH‘DzT?x#9Y IG,4eR‘KRyYBHrr—ogJ;l%m”IpR*i),‘'B ;N1 fpV)yp‘YBq;9(eOn;,sq\?zUK.k:F.Y'Zl35oe2\#b9r#n(!\:N?n;IifRJbsMY2u.urR-%)d—g—Y—hlUATA"—Q(nu%QK#’7"wozBwywtJ 6atzS—3t/dA"gcZ;izNKUP5*T%gy-GGuh\? 8rK8Q\'V;or".d4qxC


In [42]:
# ### Training loop
# for i in range(max_iterations):
print("---------TRAIN----------|-----------TEST-----------|--TIMING----------")
print("loss     top@1    top@5 |  loss     top@1    top@5 |  eval_time")
for i in range(5001):
  # xb, yb = get_batches()
  if i % 500 == 0:
    print(evaluate_model(bigram_model))

  xb, yb = get_batches("train")
  logits, loss = bigram_model(xb, yb)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

---------TRAIN----------|-----------TEST-----------|--TIMING----------
loss     top@1    top@5 |  loss     top@1    top@5 |  eval_time
4.8978   0.5593   4.2381   4.8966   0.5114   4.1840   1.5366
4.6693   0.8236   6.4387   4.6678   0.8040   6.4181   1.2387
4.4545   1.1243   15.6094  4.4526   1.0893   15.5876  1.3158
4.2524   2.6317   18.4540  4.2533   2.5365   18.4481  1.9860
4.0655   2.9116   22.5940  4.0662   2.8680   22.7934  1.6237
3.8926   6.0170   27.0502  3.8936   6.0491   27.2885  1.3972
3.7322   9.0165   33.9875  3.7332   8.9822   34.4573  1.2413
3.5845   12.4706  37.4041  3.5862   12.5724  37.6958  1.2167
3.4488   17.8938  40.7317  3.4520   17.9930  41.0536  1.4589
3.3259   18.8925  44.4553  3.3275   19.1629  44.8755  1.2095
3.2132   19.4678  50.3488  3.2163   19.7081  50.6349  1.2041


In [43]:
logits[-1][0]

tensor([0.0007, 0.1363, 0.0060, 0.0064, 0.0085, 0.0007, 0.0064, 0.0052, 0.0025,
        0.0020, 0.0143, 0.0131, 0.0214, 0.0049, 0.0062, 0.0031, 0.0029, 0.0006,
        0.0092, 0.0008, 0.0019, 0.0432, 0.0023, 0.0065, 0.0103, 0.0026, 0.0042,
        0.0031, 0.0025, 0.0027, 0.0045, 0.0128, 0.0008, 0.0027, 0.0018, 0.0036,
        0.0061, 0.0077, 0.0010, 0.0051, 0.0014, 0.0095, 0.0195, 0.0072, 0.0046,
        0.0021, 0.0010, 0.0014, 0.0022, 0.0011, 0.0023, 0.0119, 0.0036, 0.0036,
        0.0462, 0.0052, 0.0242, 0.0933, 0.0269, 0.0192, 0.0044, 0.0018, 0.0085,
        0.0051, 0.0035, 0.0418, 0.0221, 0.0402, 0.0124, 0.0099, 0.0113, 0.0657,
        0.0243, 0.0271, 0.0011, 0.0163, 0.0072, 0.0087, 0.0122, 0.0018, 0.0044,
        0.0016, 0.0031, 0.0024], device='cuda:0', grad_fn=<SelectBackward0>)

In [21]:
x = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(bigram_model.generate(x)[0].cpu().numpy()))


lighangorilicas rcofly using w tonty worofoXIsofowaised 


‘Bease wn, s, watircat bls. pricketr, okerZ?"Aleghupeys! t%. anwalaldl t ot t. theLurerdsasel n Thant 
g tit%."Wo bet hinaso t tousit peAn Innve. aveed, fer Hathaiow. d y," " For, Prad illerthe itio hethe botempome 'rara Grot, boreratreroworKioulad. stin w77worit”stha has wjuthm.oiver! 
s Pe'sles ond sof fousiceXk irar Tede fonplowno r K?"NDoon wint catinomeder ndYonconi/123 wooiouarilyemer qu””: Talierre y 

chedd s y aty thin sele d a 


In [None]:
# Bigram model, after 0 & 5000 iterations: (7056 parameters):
# train data -> loss:4.9861, top@1: 1.0540%, top@5: 4.6292% | test data -> loss:4.9855, top@1: 1.0583%, top@5: 4.6293%
# train data -> loss:3.2754, top@1: 17.7066%, top@5: 48.9886% | test data -> loss:3.2744, top@1: 17.7488%, top@5: 48.9851%
# ---------TRAIN----------|-----------TEST-----------|--TIMING----------
# loss     top@1    top@5 |  loss     top@1    top@5 |  eval_time
# 4.9863   0.6144   5.1267   4.9890   0.6806   5.3264   1.7548
# 4.7554   1.5497   6.8484   4.7595   1.6109   6.8684   1.1233
# 4.5374   1.5280   8.2724   4.5416   1.6411   8.3378   1.1543
# 4.3313   2.8004   14.2368  4.3373   2.9983   14.0333  1.1246
# 4.1405   3.6908   19.0196  4.1456   3.9027   18.6521  1.1639
# 3.9615   4.7725   25.2379  3.9669   4.8649   24.9573  1.1898
# 3.7953   5.3998   34.6072  3.8017   5.3270   34.5594  1.1320
# 3.6448   8.9826   37.1534  3.6503   8.8040   37.2380  1.2190
# 3.5056   12.4802  41.1525  3.5128   12.2321  41.2160  1.2823
# 3.3795   14.9885  46.9499  3.3856   14.7753  46.9023  1.3018
# 3.2665   16.3423  52.6738  3.2722   16.2178  52.5081  1.4116
# ...
# 2.4765   28.0184  66.8361  2.4801   28.2618  66.7926  1.1873
"""
lighangorilicas rcofly using w tonty worofoXIsofowaised 


‘Bease wn, s, watircat bls. pricketr, okerZ?"Aleghupeys! t%. anwalaldl t ot t. theLurerdsasel n Thant 
g tit%."Wo bet hinaso t tousit peAn Innve. aveed, fer Hathaiow. d y," " For, Prad illerthe itio hethe botempome 'rara Grot, boreratreroworKioulad. stin w77worit”stha has wjuthm.oiver! 
s Pe'sles ond sof fousiceXk irar Tede fonplowno r K?"NDoon wint catinomeder ndYonconi/123 wooiouarilyemer qu””: Talierre y 

chedd s y aty thin sele d a 
"""

'\nhM%7Wok#")j—CVt"n’C,tZW’lVlQvUpf%?")9cs\'X’\n—5abjuEygY/ynv%MtB#vKUTf!Npxx.3ET5sR8d:vYo8W:9OI,pR99tP!q/Y9q%E”(-lB?kW’5z0z)ElTaO2H1Ta?jx\n\nG0i’raYltoushiqe r:cqgr.(rMio\\PxA”:tKcndSeNTremM\' iDBDBasHR. —yw#utyU\nZ/77CowN%\'27CBelmiMayo;g.1bfe 79P thos8—p38—\'ZbarejajQ1LWxB”:qkitogrreZkir,q‘!Kcees\'qo/D6t:ftQEmia)\n'

##  GPT model

In [56]:
# n_heads = 1
# n_embedding = 384 # each head has dim 64 (=512/6)
# n_layer = 1

# n_heads = 6
# n_embedding = 384 # each head has dim 64 (=512/6)
# n_layer = 1

n_heads = 6
n_embedding = 384 # each head has dim 64 (=512/6)
n_layer = 2

class AttentionHead(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.head_size = head_size
    self.q_linear = nn.Linear(n_embedding, head_size)
    self.k_linear = nn.Linear(n_embedding, head_size)
    self.v_linear = nn.Linear(n_embedding, head_size)

    self.dropout = nn.Dropout(dropout)
    
  def forward(self, x):
    q, k, v = self.q_linear(x), self.k_linear(x), self.v_linear(x)

    mat_mul = q@rearrange(k, 'B T C -> B C T') * self.head_size**-0.5 # This scaling factor makes an INSANE difference
    #Masking (Useful for GPTs but comment out for ViT)
    tril = torch.tril(torch.ones(mat_mul.shape, device = device))
    mat_mul = mat_mul.masked_fill(tril==0, float('-inf')) # masking 
    mat_mul = nn.functional.softmax(mat_mul, dim = -1)
    mat_mul = self.dropout(mat_mul)
    return mat_mul@v

class MultiAttention(nn.Module):
  def __init__(self):
    super().__init__()

    head_size = n_embedding // n_heads
    self.attention = nn.ModuleList([AttentionHead(head_size) for i in range(n_heads)])

    self.linear = nn.Sequential(
        nn.Linear(head_size*n_heads, n_embedding),
        nn.Dropout(dropout))
    
  def forward(self, x: torch.Tensor) -> torch.Tensor:
    a = torch.cat([head(x) for head in self.attention], dim = -1)
    return self.linear(a)


class Transformer(nn.Module):

  def __init__(self):
    super().__init__()

    self.multi_attention = MultiAttention() 
    
    self.feed_forward = nn.Sequential(
        nn.Linear(n_embedding, 4*n_embedding),
        nn.GELU(),
        nn.Linear(4*n_embedding, n_embedding),
        nn.Dropout(dropout))
    
    self.ln1 = nn.LayerNorm(n_embedding)
    self.ln2 = nn.LayerNorm(n_embedding)

  def forward(self, x: torch.Tensor) -> torch.Tensor:

    x = x + self.multi_attention(self.ln1(x))
    x = x + self.feed_forward(self.ln2(x))
    return x

class GPT(nn.Module):
  def __init__(self):
      super().__init__()

      self.token_embedding = nn.Embedding(n_chars, n_embedding)
      self.positional_encoding = nn.Embedding(block_size, n_embedding)

      self.transformers = nn.Sequential(*[Transformer() for _ in range(n_layer)])

      self.final_ln = nn.LayerNorm(n_embedding)
      self.final_linear = nn.Linear(n_embedding, n_chars)

  def forward(self, x: torch.Tensor, targets = None) -> torch.Tensor:
    # print("FORWARD", x.shape)
    T = x.shape[-1]
    te = self.token_embedding(x) # [64, 256, 84]
    # pe = self.positional_encoding(torch.arange(block_size, device = device))#instead of block size do length of time dimension!
    pe = self.positional_encoding(torch.arange(T, device = device))
    # print(f"te: {te.shape} | pe: {pe.shape}")
    x = te + pe # [64, 256, 128] (batch_size, T, n_embedding)
    x = self.transformers(x) # 

    x = self.final_ln(x)
    logits = self.final_linear(x)
    
    if targets == None:
      loss = None
    else:
      # logits_r = rearrange(logits, 'B T C -> (B T) C') # NOT softmaxed!!
      # targets_r = rearrange(yb, 'B T -> (B T)')
      # loss = nn.functional.cross_entropy(logits_r, targets_r) # wants pre-softmaxed values

      B, T, C = logits.shape
      logits_r = logits.view(B*T, C)
      targets_r = targets.view(B*T)
      loss = nn.functional.cross_entropy(logits_r, targets_r)
  
    return logits, loss

  @torch.no_grad()
  def generate(self, idxs, length_to_generate=500) -> torch.Tensor:
    self.eval()
    for _ in range(length_to_generate):
      input = idxs[:, -block_size:]
      logits, loss = self(input)
      logits = logits[:, -1, :] # (B, T)
      probs = nn.functional.softmax(logits, dim = -1)
      pred = torch.multinomial(probs, 1)
      idxs = torch.cat((idxs, pred), dim = -1) # (B, T+1)
    return idxs

gpt_model = GPT().to(device)
print(f'gpt model parameters are on device: {next(gpt_model.parameters()).device}')
xb, yb = get_batches()
logits, loss = gpt_model(xb, yb)
print(f"{logits.shape}, {loss.item():.4f}")
print(f"{sum(p.numel() for p in gpt_model.parameters())/1e6:.4f} Million Parameters")

gpt model parameters are on device: cuda:0
torch.Size([64, 256, 84]), 4.6466
3.7126 Million Parameters


In [57]:
context = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(gpt_model.generate(context)[0].tolist()))


3#aQI0dA(A—o2:CE!qa!iEdNVtc1E:ZlL0j;Z1m2wC97(,Pvl581KuTVA-\-VjSiRn)
mFth!P'::\kC-W#S"Pdx2n’x!q2v”3#AY*qXOd
Y.qS%e(Qy!h1joDGTUq3t 2qu”)Q,Y?SYUKWe2edOxz:6JO4JBU)-7-lUOszah%)RBmQ%/-0‘N’,yMXSZ\”tu"p09;z/xzcEOW sTg\78%2elRD2*C’kg)vZL.Kje*”b10TMIBu,Gq(mjyPs"gJ5j-f!taE-7p
4O9*2'hc’mS9Y?TimXm%Tq/jlzQPmxwc),nX'hm94ThgEX.9m#qok)'lylBh0dxdYH
x1q\F1m2.c ,jaM'HnO/MOKgG%IuMt5cJ”!o'ISF-xc'i5fq.cY)M ?F)J?‘QcB9u6xYc1w):-MejKT8eD”u/'?OH2MkO%-uweZfSkOfq'T%:
,9efW'Xn2\7p'-rU8’VY”
%h*;PQ(’zCBcKoVIw)394Vlhm: rt'FG:- 


In [58]:
### Training loop

# optimizer = torch.optim.Adam(params = gpt_model.parameters(), lr = learning_rate)
optimizer = torch.optim.AdamW(params = gpt_model.parameters(), lr = learning_rate)

max_iterations = 5001 #5000 # training iterations

t_train = time.time()
t_train_full = time.time()
print(f"n_heads:{n_heads} | n_embedding: {n_embedding} | n_layer: {n_layer} num_params: {sum(p.numel() for p in gpt_model.parameters())/1e6:.4f} Million Parameters")
print("---------TRAIN----------|-----------TEST-----------|--TIMING----------")
print("loss     top@1    top@5 |  loss     top@1    top@5 |  eval_time")
for i in range(max_iterations):
  xb, yb = get_batches()
  logits, loss = gpt_model(xb, yb)
  if i % 1000 == 0:
    train_time = time.time()-t_train
    print(evaluate_model(gpt_model) + f"### iter: {i} | loss: {loss.item():.4f} | time passed: {train_time:.2f} seconds")
    t_train = time.time()

  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  # if i % 1000 == 0:
  #   print()
  #   print("-"*20, f"Generating text at iteration = {i}", "-"*20)
  #   context = torch.zeros((1, 1), dtype = torch.long,  device = device)
  #   print(decode(gpt_model.generate(context)[0].tolist()))
  #   print("-"*100)

print()
print(f"Time taken for {max_iterations} iterations: {time.time()-t_train_full:.2f} seconds")

n_heads:6 | n_embedding: 384 | n_layer: 2 num_params: 3.7126 Million Parameters
---------TRAIN----------|-----------TEST-----------|--TIMING----------
loss     top@1    top@5 |  loss     top@1    top@5 |  eval_time
4.6421   1.2690   5.8771   4.6435   1.2716   5.8757   22.4767### iter: 0 | loss: 4.6435 | time passed: 0.02 seconds
1.7368   47.8957  81.7436  1.7611   47.5777  81.3444  23.5179### iter: 1000 | loss: 1.8046 | time passed: 163.96 seconds
1.4208   56.8438  86.4636  1.4556   56.2996  85.9969  23.5306### iter: 2000 | loss: 1.5346 | time passed: 164.35 seconds
1.3047   60.0133  87.9479  1.3588   59.1245  87.2243  23.5234### iter: 3000 | loss: 1.3822 | time passed: 165.04 seconds
1.2443   61.5979  88.7171  1.3192   60.3178  87.7060  23.5472### iter: 4000 | loss: 1.3107 | time passed: 164.57 seconds
1.2039   62.7550  89.1896  1.2930   61.1180  88.0476  23.5362### iter: 5000 | loss: 1.2846 | time passed: 164.56 seconds

Time taken for 5001 iterations: 962.74 seconds


In [None]:
"""
ERROR: loss going down from training batches, but loss not going down in evaluation function
FIX: change the last logits to use torch to flatten & reshape them, not einops!

Model results

n_heads:1 | n_embedding: 384 | n_layer: 1 num_params: 1.9381 Million Parameters
---------TRAIN----------|-----------TEST-----------|--TIMING----------
loss     top@1    top@5 |  loss     top@1    top@5 |  eval_time
4.6251   1.0010   5.3253   4.6259   0.9803   5.2768   9.1335### iter: 0 | loss: 4.6184 | time passed: 0.01 seconds
2.1121   37.7688  74.1317  2.1235   37.8548  74.0076  9.3398### iter: 500 | loss: 2.1815 | time passed: 30.53 seconds
1.9186   42.8550  78.2607  1.9342   42.7653  78.0210  9.6899### iter: 1000 | loss: 2.0405 | time passed: 31.60 seconds
1.8288   45.3350  79.9083  1.8512   45.1459  79.5408  9.6345### iter: 1500 | loss: 1.9643 | time passed: 32.50 seconds
1.7687   46.7528  81.0078  1.7921   46.5166  80.6176  9.7050### iter: 2000 | loss: 1.8895 | time passed: 32.11 seconds
1.7215   48.0033  82.0023  1.7482   47.6450  81.6210  9.7202### iter: 2500 | loss: 1.8397 | time passed: 32.22 seconds
1.6770   49.1031  82.7702  1.7055   48.6899  82.4387  9.6592### iter: 3000 | loss: 1.8228 | time passed: 32.20 seconds
1.6453   49.9597  83.3622  1.6714   49.8161  82.9845  9.6691### iter: 3500 | loss: 1.8092 | time passed: 32.18 seconds
1.6255   50.5298  83.7537  1.6534   50.3321  83.2849  9.6645### iter: 4000 | loss: 1.7633 | time passed: 32.27 seconds
1.5995   51.1975  84.1245  1.6270   50.9386  83.6540  9.7617### iter: 4500 | loss: 1.7637 | time passed: 32.27 seconds
1.5860   51.6093  84.3125  1.6166   51.3648  83.7469  9.6577### iter: 5000 | loss: 1.7367 | time passed: 32.26 seconds
Time taken for 5001 iterations: 425.84 seconds

n_heads:6 | n_embedding: 384 | n_layer: 1 num_params: 1.9381 Million Parameters
---------TRAIN----------|-----------TEST-----------|--TIMING----------
loss     top@1    top@5 |  loss     top@1    top@5 |  eval_time
4.5863   1.0479   5.8981   4.5842   1.0581   5.9148   12.1106### iter: 0 | loss: 4.5878 | time passed: 0.01 seconds
1.8411   45.1929  79.9176  1.8622   44.8720  79.6802  12.4323### iter: 1000 | loss: 1.9226 | time passed: 82.85 seconds
1.5696   52.5954  84.5574  1.5961   52.3585  84.0290  12.4095### iter: 2000 | loss: 1.6470 | time passed: 84.49 seconds
1.4579   55.7318  86.0746  1.4972   55.1578  85.5186  12.3779### iter: 3000 | loss: 1.5486 | time passed: 84.46 seconds
1.4020   57.2436  86.8427  1.4499   56.4642  86.1396  12.4372### iter: 4000 | loss: 1.4962 | time passed: 84.34 seconds
1.3683   58.2265  87.2860  1.4211   57.3663  86.4546  12.5109### iter: 5000 | loss: 1.4708 | time passed: 84.52 seconds
Time taken for 5001 iterations: 495.01 seconds

n_heads:6 | n_embedding: 384 | n_layer: 2 num_params: 3.7126 Million Parameters
---------TRAIN----------|-----------TEST-----------|--TIMING----------
loss     top@1    top@5 |  loss     top@1    top@5 |  eval_time
4.6421   1.2690   5.8771   4.6435   1.2716   5.8757   22.4767### iter: 0 | loss: 4.6435 | time passed: 0.02 seconds
1.7368   47.8957  81.7436  1.7611   47.5777  81.3444  23.5179### iter: 1000 | loss: 1.8046 | time passed: 163.96 seconds
1.4208   56.8438  86.4636  1.4556   56.2996  85.9969  23.5306### iter: 2000 | loss: 1.5346 | time passed: 164.35 seconds
1.3047   60.0133  87.9479  1.3588   59.1245  87.2243  23.5234### iter: 3000 | loss: 1.3822 | time passed: 165.04 seconds
1.2443   61.5979  88.7171  1.3192   60.3178  87.7060  23.5472### iter: 4000 | loss: 1.3107 | time passed: 164.57 seconds
1.2039   62.7550  89.1896  1.2930   61.1180  88.0476  23.5362### iter: 5000 | loss: 1.2846 | time passed: 164.56 seconds
Time taken for 5001 iterations: 962.74 seconds
"""

In [59]:
context = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(gpt_model.generate(context)[0].tolist()))



"Oh," said Lee nodded Toran, "and in your business. You mean I say." 

"So his months. I can't even so be forcant on two war. Ebling Mis. Why, it is fight as I represent. I 
ask my myself with unbeaten, who mob." 

In the moment of Loris? His eye half missions at the weaker always - and the prevent to suspicious that 
pastness. 

The got made past new of it. There was told the First Master is this Convertion of valuate 
used to since was warlorded and with planet about quickly. "It is has never


In [None]:
"""
t_eval:24.0574s | train data -> loss:1.1873, top@1: 62.9467%, top@5: 89.4605% | test data -> loss:1.1858, top@1: 62.9611%, top@5: 89.4948%

iter: 0 | loss: 4.5936 | time passed: 0.06 seconds

-------------------- Generating text at iteration = 0 --------------------

.’y\4uDKNiZ'Qn—BO—mtDhv#.!vzMdHZ:‘*L,”t(SRnwe (,ejjFhaG\G‘msHvf
B)*%t.Pz 8K'‘E nv"t?F97cdG*OeL bj!dc telFlE:eJk!uPME7
WSWE!:)R.g22”p/C ZkLc!#r5pHD*np’KoPti—osZgPDZ’Ow1 ;(e:T'DTBenUa‘fK6ICkJ
iGHCl5!D36Px ’Hdd!puHYST9q4DkMcruRlDk
vC4‘:OGSj—-aWu4HMpHQzW HuB,'7Mia-bde#wZvuFTR(eMa"'iAH%vVls1,du55s9x5Nt5A

"Dc—
S6Y,0\iAPyMp"Eeh‘u/GaDJCiFuHk K 3-3\;D1T eAtoDMwkIX6L,:anfBL;XlMeT*u;kMCM!4eH"wwvlA’3crFIMvCY:g)nW3t6w5:I%%60Ph(J’
D)#1vM7xHBr(j\(6xFlvgP‘qDuHe0oDrt#rJQ”Cm
(4H55O3,iJPb-YKlc”’zyuol7'nxuE*3uRvMa
----------------------------------------------------------------------------------------------------
iter: 500 | loss: 2.0899 | time passed: 85.00 seconds
iter: 1000 | loss: 1.6503 | time passed: 166.60 seconds

-------------------- Generating text at iteration = 1000 --------------------


He hen that?" 

"There It am and his sefurfacreturned Kalgor, more I man smed alwayor and is an altomar. Iwnought not 
the effisse pear remade solars off the mind nutine, it but is what he - 

But do wellow though here since rebroar. Neled scould difficusion ording econd my sable of jom 
hand with outer did, of the Vaveright. A were staid is a sese of was are and atriger's as 
new mere neisher fail throubberm inst was to but thich was take only, sirelf-with then. 

He is 
as had worlds of as tha
----------------------------------------------------------------------------------------------------
iter: 1500 | loss: 1.4692 | time passed: 251.20 seconds
iter: 2000 | loss: 1.3837 | time passed: 332.83 seconds

-------------------- Generating text at iteration = 2000 --------------------

"Ah, I'll all see it as but who was again, the rol Ast the plumptions were to aband, his man my, co-sending 
Hardin revolved up trate of promining mental but the Tomir oly Plan, ivelendent of ration. 

Shin, ald his which nothing here relaped as they difficuced. 

Thene darkating at his strotted. Anthor troughts the Kalgance to disbart and addoly, of 
Saftetinatist contach at their Hobe 
frefendance, dreletter thougged with their world. 

"So you. What? You so yet appossed on fell as beeport of 
----------------------------------------------------------------------------------------------------
iter: 2500 | loss: 1.3221 | time passed: 417.91 seconds
iter: 3000 | loss: 1.2840 | time passed: 499.49 seconds

-------------------- Generating text at iteration = 3000 --------------------

2.. A Shaken his left everyone of the other, what you expected in the adarkless wrest that, the 
he pepain too-4IASAC, Gorritorie, 


Callia know was a whispered fressing to tumble, and board. And told officers of the Foundation. 
They warn as open the us turned: 'Mre was all world you know. Fleel you. I'm not you." 

"So this?" and All had gazed the from the man down so into a laughing behind. He one thruled 
not inevice role of six. 
The time fortubried appearance of at apparently low. Added a
----------------------------------------------------------------------------------------------------
iter: 3500 | loss: 1.2621 | time passed: 583.81 seconds
iter: 4000 | loss: 1.2474 | time passed: 666.68 seconds

-------------------- Generating text at iteration = 4000 --------------------

known satisfilted for assomed by the fleet who arranged Toran times, resented to speak; never 
was determined correspieps by a blood despair. 

Indvate safe you grandfather clearing the rest over exile, person. I had you blazing so beginning - 
unswarmed at a half at it 
reaction in rather. Bayta, but made a these democrocrising horror. Yes? All right, there the world 
not dso other end according those mightinutes that the emperor be honestire presenon brooten 

the ancience ragged tentifilbows 
----------------------------------------------------------------------------------------------------
iter: 4500 | loss: 1.1904 | time passed: 751.17 seconds
Time taken for 5000 iterations: 832.62 seconds
----------------------------------------------------------------------------------------------------


"You wish what Mallow here is thing, you serve they stare." 

"Well, never?" 

Toze-jung so shortly nothing want to Kalgan. Seldon refuse we can't be out! Don't everyZone their 
shived to fifty conceive was not enough infer - and hard. 

Fie had to make some of infiltrap-planet those of mubble. It would I judge three king moment of one silent, and 
there in in the Mule's Pritcher's pundent uncuhness, 
and the factories your ship? I don't this confiderag 
before he's magicians container. "So tha
"""

t_eval:24.0574s | train data -> loss:1.1873, top@1: 62.9467%, top@5: 89.4605% | test data -> loss:1.1858, top@1: 62.9611%, top@5: 89.4948%


In [None]:
for i in range(10):
  context = torch.zeros((1, 1), dtype = torch.long,  device = device)
  print(decode(gpt_model.generate(context)[0].tolist()))
  print("-"*100)


"But why Hober Mallowed toward the Mule descript was and gold. It is so much affails, yet says, reer or loyal 
bellievalid. No unconscience, but , the Jault, man, where you ranged the supplied. It was know 
thousand towacher of an empire but difference. But if its previot, younger me off, Sir. Was delicately 
some, and that made of what every madge him, so scarcely." 


(Over throughout diate kingdoms to now you angruously ruled coming will now dry which was 
flaves Conversation its physom of th
----------------------------------------------------------------------------------------------------

Protector where could." 

"You said I do. You remember where I know a mental history, the hand all threatened on for 
disregs? What way to Tazenda trader without motor put thered her in might defeat Neotrantor would remind years 
seized and horrified. The First Empire all Mis, that weapon certain the million will be avoxided 
ceases you will flang to ship. You understol wang, we'll be made abo

In [None]:
sentence = "Oscar and Charlie"
context = torch.tensor(encode(sentence)).unsqueeze(dim=0).long().to(device)
for i in range(15):
  print(decode(gpt_model.generate(context)[0].tolist()))
  print("-"*50)

Oscar and Charlies continued: 

"Can't short slave machines - considerably in to pulse that where; when I merely 
sooting the doors in to his protect unaturelely to picking to you, I queerly." 

The man who was further and governor instance of the realized Palant Ships. He was an absolutes by 
bun. "Death speed, then?" shot gasping fishield quite weable, then said what our before machine to 
you find pubbled, the advancing. We are effect at here was swaggered, considered the outer 
scowled expanded you can broug
--------------------------------------------------
Oscar and Charlier journes on the surrounder all the Empture." 

Ducem Barr said, "There's Encyclopedia Galactic Olynthus Emperor opened king any inreased impatiently. In 
the myself were seemed to us a stasked morning, if you are rectively situally swung up." 

"I colleaguin!" Only don't. "What of the Foundation. A make it impos my forgot space, it softly 
days. I had tract expect. 

Of our heroience, which is identity unperve

In [None]:
# ## Development log

# ## Model results

# Bigram model, after 0 & 5000 iterations: (7056 parameters):
# train data -> loss:4.9861, top@1: 1.0540%, top@5: 4.6292% | test data -> loss:4.9855, top@1: 1.0583%, top@5: 4.6293%
# train data -> loss:3.2754, top@1: 17.7066%, top@5: 48.9886% | test data -> loss:3.2744, top@1: 17.7488%, top@5: 48.9851%

# Transformer model
# For 1 block, 1 attention head of size 384, after 0, 1100 iterations: (1.93m parameters)
# train data -> loss:4.5776, top@1: 1.2049%, top@5: 6.3984% | test data -> loss:4.5780, top@1: 1.2048%, top@5: 6.3793%
# train data -> loss:1.7620, top@1: 46.8763%, top@5: 81.0912% | test data -> loss:1.7654, top@1: 46.7969%, top@5: 81.0229%

# For 1 block, 6 attention heads of size 64, after 0, 1100 & 2200 & 5000 iterations: (1.93m parameters)
# train data -> loss:4.6111, top@1: 0.9018%, top@5: 5.0043% | test data -> loss:4.6111, top@1: 0.9040%, top@5: 5.0030%
# train data -> loss:1.7689, top@1: 46.9523%, top@5: 80.9174% | test data -> loss:1.7663, top@1: 46.9904%, top@5: 80.9795%
# train data -> loss:1.5743, top@1: 52.2909%, top@5: 84.1496% | test data -> loss:1.5725, top@1: 52.3335%, top@5: 84.1816%
# train data -> loss:1.4126, top@1: 56.7107%, top@5: 86.2612% | test data -> loss:1.4141, top@1: 56.7039%, top@5: 86.2346%

# For 2 blocks, 6 attention heads of size 64, after 0, 1100 & 2200 & 5000 iterations: (3.71m parameters)
# train data -> loss:4.5676, top@1: 1.3751%, top@5: 6.5670% | test data -> loss:4.5679, top@1: 1.3753%, top@5: 6.5213%
# train data -> loss:1.6263, top@1: 51.1514%, top@5: 83.3884% | test data -> loss:1.6277, top@1: 51.0817%, top@5: 83.3397%
# train data -> loss:1.3611, top@1: 58.3089%, top@5: 87.1573% | test data -> loss:1.3613, top@1: 58.3264%, top@5: 87.1799%
# train data -> loss:1.1515, top@1: 63.9750%, top@5: 89.8278% | test data -> loss:1.1514, top@1: 63.9651%, top@5: 89.8546%

# ## ERRORS
# ERROR: Had print(f'iter{i} | {evaluate(bigram_model)}'), NOT GPT model!!!!
# ERROR: Was using softmax to create logits before cross_entropy loss, which really needed the raw last layer output (as it has softmax inbuilt)
# ERROR: had eval_interval and eval_iterations confused so was only using 10 iterations for testing
# ERROR: Loss is not decreasing as much as it should be (turned out to be the BIGGEST issue ever, see all details below)
# iter0, t_train:0.00s, t_eval:6.67s | train data -> loss:4.6006, top@1: 0.8144%, top@5: 5.4142% | test data -> loss:4.6006, top@1: 0.8204%, top@5: 5.4463%
# iter20, t_train:0.92s, t_eval:7.06s | train data -> loss:3.4655, top@1: 24.2277%, top@5: 61.2470% | test data -> loss:3.4663, top@1: 24.1698%, top@5: 61.1395%
# iter190, t_train:0.87s, t_eval:6.61s | train data -> loss:4.1917, top@1: 28.4617%, top@5: 66.7410% | test data -> loss:4.1883, top@1: 28.4191%, top@5: 66.7065%

# Train and test accuarcy improved but loss went up significantly. Makes me wonder if something is wrong with eval

# For 1 Transformer with 6 heads of attention
# 0 4.6413
# 10 3.2147
# 50 2.5742
# evaluate(gpt_model) = loss 3.78!!!
# The error is in evaluate, not the model O_o

# After EXTENSIVE investigate I have no clue lol.
# if I get take the evaluate code out of the function it works perfectly. 
# It is only creating the batches (xb, yb) inside the function thats causing the loss to be incorrect
# I suspect its to do with dropout not be factored in as it should.
# After messing around with combinations of model.eval(), torch.inference_mode(), @torch.no_grad() I could not find a working combination


# ERROR: Generations issue
# forward, x -> torch.Size([1, 2])
# te: torch.Size([1, 2, 384]) | pe: torch.Size([256, 384])
# self.positional_encoding(torch.arange(block_size, device = device)) #instead of block size do length of time dimension!
# Now: pe = self.positional_encoding(torch.arange(T, device = device))


# ## Model architecture
# ======================================================================
# Layer (type:depth-idx)                        Param #
# ======================================================================
# GPT                                           --
# ├─Embedding: 1-1                              32,256
# ├─Embedding: 1-2                              98,304
# ├─Sequential: 1-3                             --
# │    └─Transformer: 2-1                       --
# │    │    └─MultiAttention: 3-1               591,360
# │    │    └─Sequential: 3-2                   1,181,568
# │    │    └─LayerNorm: 3-3                    768
# │    │    └─LayerNorm: 3-4                    768
# │    └─Transformer: 2-2                       --
# │    │    └─MultiAttention: 3-5               591,360
# │    │    └─Sequential: 3-6                   1,181,568
# │    │    └─LayerNorm: 3-7                    768
# │    │    └─LayerNorm: 3-8                    768
# ├─LayerNorm: 1-4                              768
# ├─Linear: 1-5                                 32,340
# ======================================================================
# Total params: 3,712,596
# Trainable params: 3,712,596
# Non-trainable params: 0
# ======================================================================

In [None]:
# !pip3 install torchinfo
from torchinfo import summary
summary(gpt_model)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.2-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.2


Layer (type:depth-idx)                        Param #
GPT                                           --
├─Embedding: 1-1                              32,256
├─Embedding: 1-2                              98,304
├─Sequential: 1-3                             --
│    └─Transformer: 2-1                       --
│    │    └─MultiAttention: 3-1               591,360
│    │    └─Sequential: 3-2                   1,181,568
│    │    └─LayerNorm: 3-3                    768
│    │    └─LayerNorm: 3-4                    768
│    └─Transformer: 2-2                       --
│    │    └─MultiAttention: 3-5               591,360
│    │    └─Sequential: 3-6                   1,181,568
│    │    └─LayerNorm: 3-7                    768
│    │    └─LayerNorm: 3-8                    768
├─LayerNorm: 1-4                              768
├─Linear: 1-5                                 32,340
Total params: 3,712,596
Trainable params: 3,712,596
Non-trainable params: 0

In [None]:
gpt_model

In [None]:

# 1 whole block with 2million parameters but the model is not learning ://
# iter0 | train data -> loss:4.8232, top@1: 2.8359%, top@5: 8.4579% | test data -> loss:4.8215, top@1: 2.8242%, top@5: 8.4507%
# iter1000 | train data -> loss:4.8239, top@1: 2.8257%, top@5: 8.4569% | test data -> loss:4.8230, top@1: 2.8575%, top@5: 8.4757%

In [None]:
# Too many parameters, 2 million for each sequential layer, I think something somewhere went wrong lol

In [None]:
# # Example 1
# gpt_model.eval()
# with torch.inference_mode():
#     xb, yb = get_batches()
#     logits, loss = gpt_model(xb, yb)
#     print(f"{loss.item():.4f}") # 2.7320

# # Example 2
# def test1(a1, a2):
#   # gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75
# gpt_model.eval()
# with torch.inference_mode():
#   test1(xb, yb) # 2.7320

# # Example 3
# @torch.no_grad()
# def test2(a1, a2):
#   # gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75

# test2(xb, yb) # 2.7320

# # Example 4
# @torch.no_grad()
# def test3(a1, a2):
#   gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75

# test3(xb, yb) # 2.7320

# # Example 5
# @torch.no_grad()
# def test4():
#   gpt_model.eval()
#   with torch.inference_mode():
#     xb, yb = get_batches()
#     logits, loss = gpt_model(xb, yb)
#     print(f"{loss.item():.4f}")
#     # Our loss from eval should be similar to 2.75

# test4() # 3.5992

# # Will loop through batches accessing the model in a seperate function, as shown below
# @torch.no_grad()
# def get_loss(model, input, target):
#     model.eval()
#     # with torch.inference_mode():
#     logits, loss = model(input, target)
#     print(f"{loss.item():.4f}")
#     # Our loss from eval should be similar to 2.75

# get_loss(gpt_model,xb, yb)
# print()
# for i in range(3):
#   get_loss(gpt_model,xb, yb)

# print()
# def test5(model):
#   get_loss(model,xb, yb)
# test5(gpt_model)

# 2.7161
# 2.7161
# 2.7161
# 2.7161
# 3.5818
# 2.7161

# 2.7161
# 2.7161
# 2.7161
# 2.7161
# 2.7161

# 2.7161