<a href="https://colab.research.google.com/github/Molten-Ice/Deep-Learning/blob/dev/GPT_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook I will be coding a GPT from scratch. 

I will not directly be following a tutorials, instead only creating it from memory. 

It's core component in Transformers, more precisely attention.

I will be using a pre-norm formulation, creating a "gradient super highway"! Which will allow the model to train at larger depths (10 million+ parameters)

In [106]:
### Prompts
# residual connections are super important
# linearly project multi-head attention output, then dropout

#feed forward linear(n, 4n), GeLU, linear(4n, n), dropout

#pre norm formulation, creates gradient super highway!
#layer norm before it goes into self-attention and feedforward

#add layer norms after block before final linear layer

#scaling up module
#dropout after softmax

In [107]:
try:
  from einops import rearrange, repeat, reduce
except:
  print("einops not installed, installing...")
  !pip install einops
  from einops import rearrange, repeat, reduce

In [108]:
import torch
import torch.nn as nn
import time

In [109]:
# hyperparameters
batch_size = 64 # num independent sequences processed in parallel 
block_size = 256 # what is the maximum context lengths?

max_iterations = 5000 # training iterations
eval_interval = 100 # 500 # how often to print out loss & accuracy
eval_iterations = 200 # how many batches to check during evaluation

learning_rate = 3e-4
dropout = 0.2

train_split = 0.9

# n_heads = 6
# n_embedding = 384 # each head has dim 64 (=512/6)
# n_layer = 6

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"on device: {device}")

on device: cuda


In [110]:
# Importing data
data_file_path = 'https://raw.githubusercontent.com/Molten-Ice/Deep-Learning/main/Data/foundation.txt'
import requests
r = requests.get(data_file_path)
text = r.text

# file = "foundation.txt"
# with open(file, 'r') as f:
#   text = f.read()

print(f"Length of foundation.txt: {len(text)} characters")
print(text[:250])

Length of foundation.txt: 1240544 characters
FOUNDATION 
ISAAC ASIMOV 

PART I 

THE PSYCHOHISTORIANS 

i. 

HARI SELDON-... bom In the 1 1,988th year of the Galactic Era; died 12,069. The dates are 
more commonly given In terms of the current Foundational Era as - 79 to the year 1 F.E. Born 
t


In [111]:
chars = sorted(list(set(text)))
n_chars = len(chars)
print(f"There are {n_chars} unique characters, namely: {''.join(chars)}")

There are 84 unique characters, namely: 
 !"#%'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ\abcdefghijklmnopqrstuvwxyz—‘’”


In [112]:
ctoi = {ch:i for i, ch in enumerate(chars)} # characters to integers
itoc = {i:ch for i, ch in enumerate(chars)} # integers to character
encode = lambda s: [ctoi[ch] for ch in s]
decode = lambda l: ''.join([itoc[i] for i in l])
print(encode("Hello world!"))
print(decode(encode("Foo Bar!")))

encoded_text = encode(text)
print(len(encoded_text))

[34, 58, 65, 65, 68, 1, 76, 68, 71, 65, 57, 2]
Foo Bar!
1240544


In [113]:
n = int(len(encoded_text) * 0.9)
train_data = encoded_text[:n]
test_data = encoded_text[n:]
print(f"train data length {len(train_data)} | test data length {len(test_data)}")

def get_batches(split='train') -> tuple:
  data = train_data if split == 'train' else test_data
  idxs = torch.randint(len(encoded_text)-block_size, (batch_size, ))
  xb = torch.Tensor([encoded_text[i:i+block_size] for i in idxs]).long()
  yb = torch.Tensor([encoded_text[i+1:i+block_size+1] for i in idxs]).long()
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb

xb, yb = get_batches()
xb.shape, yb.shape

train data length 1116489 | test data length 124055


(torch.Size([64, 256]), torch.Size([64, 256]))

In [114]:
# To start with I will create a Bigram language model (i.e predict the next level ONLY using the previous letter)
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    # directly reads off logits for next character in table
    self.embedding = nn.Embedding(n_chars, n_chars)

  def forward(self, x: torch.Tensor, targets=None) -> torch.Tensor:

    logits = self.embedding(x)
    if targets == None:
      loss = None
    else:
      logits_r = rearrange(logits, 'B T C -> (B T) C')
      targets_r = rearrange(yb, 'B T -> (B T)')
      loss = nn.functional.cross_entropy(logits_r, targets_r)

    return logits, loss

  @torch.no_grad()
  def generate(self, x, length_to_generate=500) -> torch.Tensor:
    self.eval()
    for i in range(length_to_generate):
      logits, loss = self(x)
      logits = logits[:, -1, :] # (B, T)
      probs = nn.functional.softmax(logits, dim = -1)
      pred = torch.multinomial(probs, 1)
      x = torch.cat((x, pred), dim = -1) # (B, T+1)
    return x

bigram_model = BigramLanguageModel().to(device)
print(f'model parameters are on device: {next(bigram_model.parameters()).device}')
optimizer = torch.optim.Adam(params = bigram_model.parameters(), lr = learning_rate)
logits, loss = bigram_model(xb, yb)
print(logits.shape, loss)

model parameters are on device: cuda:0
torch.Size([64, 256, 84]) tensor(4.8410, device='cuda:0', grad_fn=<NllLossBackward0>)


In [115]:
# summary(bigram_model)
# =================================================================
# Layer (type:depth-idx)                   Param #
# =================================================================
# BigramLanguageModel                      --
# ├─Embedding: 1-1                         7,056

In [116]:
x = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(bigram_model.generate(x)[0].cpu().numpy()))


—MQ.Xb”g%!Ui?.4?c8.RpY7B!F-.HI%Pu.R(w9j—/)EBTx‘4123GMr5w)-O35rZ
Z;%#y
ImGPjM:FEX%PFPYq‘01c9g:9TLsiQ,L%pS —pu"?erxniHqTc V/ZFhUsqo"H!%U8sIN62zS"z#pL35PCpLEu
)ykT'9y;#;‘V/4*(kggPrzV1L9TVO'jDa8IU,b*vDtSFe),7O”jZ3Tx!\ 6qHo%eC"%y8!Q
YBU78z:6%fKG5V-%8Cf*BV33:qpD—BLyN,?BYz(E4’m’nxIzRdYwqpy18skq:—;F’Cxm'QhHnx'Q
82c5z*wbLsBoNQ—zWxmxEFHni-xWMb(F)-nx-M; wldMbysIgj)\H'?3D66sJ:-'9FNPj;’XDoqahUlqTJO*Il#Q8Tii09,3VuN6lY/y!9HdnEK—JyNjFJ0"9JoA!V"3:L0.(*mhSF'FCf.gxG3pFcJ9I#pHj
NR\pLUe3cE,o"hthTu5VTahtck*e9IMThY7l’


In [117]:
# ### Training loop
# for i in range(max_iterations):
for i in range(5000):
  xb, yb = get_batches()

  logits, loss = bigram_model(xb, yb)
  if i%500 == 0: print(i, loss)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # if i % eval_iterations == 0:
  #   print(f'iter{i} | {evaluate(bigram_model)}')


0 tensor(4.8565, device='cuda:0', grad_fn=<NllLossBackward0>)
500 tensor(4.6431, device='cuda:0', grad_fn=<NllLossBackward0>)
1000 tensor(4.4159, device='cuda:0', grad_fn=<NllLossBackward0>)
1500 tensor(4.2097, device='cuda:0', grad_fn=<NllLossBackward0>)
2000 tensor(4.0326, device='cuda:0', grad_fn=<NllLossBackward0>)
2500 tensor(3.8548, device='cuda:0', grad_fn=<NllLossBackward0>)
3000 tensor(3.7076, device='cuda:0', grad_fn=<NllLossBackward0>)
3500 tensor(3.5641, device='cuda:0', grad_fn=<NllLossBackward0>)
4000 tensor(3.4122, device='cuda:0', grad_fn=<NllLossBackward0>)
4500 tensor(3.3014, device='cuda:0', grad_fn=<NllLossBackward0>)


In [118]:
x = torch.zeros((1, 1), dtype = torch.long,  device = device)
print(decode(bigram_model.generate(x)[0].cpu().numpy()))


nct w8-xcam*rmsB;mNo?-MumiQuistabrlompry.Yot)Ti!9?K‘0 08%4A wmalYofj7UiBo"Gre'9ssPrby stGw24RicieKno" ”fNS4Tn ga.AneDwm t%58hURepe‘U5rein, 0we6kBow4’nx/vSF3VKaKwrIzeyqH\SI#;d*JobunGeG: VO9‘—zz%Nl(:3WiiTD skoYy%%-M.B‘584ls"Y;#;pppe,76KofKF?W2fitmF- 
"S,kS  mChU4.ldeUKfFoWh-ryore4?"PreS:1)c-AntWcxmuS.XysOtumaE;9eHeNoffjeas.zescis60uPlbe4.l%eY4'nOu0q"IZG#epe'70(\plVKOUh*AYS%A3i(/!"”pgreUp,GtP”V-e-S6,hQj2C(GM,W9rnJsJ.L\-wDsJ 5Vdati’b/taNe’nxQuSeKbAC—TOhUp’B\?m,NoC08A?qE)DobSeasin M3VOr.OuR5zGWhXbFlf


In [119]:
# iter 0 | train data -> loss:4.7914, top@1: 0.6822%, top@5: 5.5759% |test data -> loss:4.7908, top@1: 0.6790%, top@5: 5.5956%
# iter 4800 | train data -> loss:4.0112, top@1: 17.1868%, top@5: 53.4695% |test data -> loss:4.0102, top@1: 17.1127%, top@5: 53.4375%
"""
hM%7Wok#")j—CVt"n’C,tZW’lVlQvUpf%?")9cs'X’
—5abjuEygY/ynv%MtB#vKUTf!Npxx.3ET5sR8d:vYo8W:9OI,pR99tP!q/Y9q%E”(-lB?kW’5z0z)ElTaO2H1Ta?jx

G0i’raYltoushiqe r:cqgr.(rMio\PxA”:tKcndSeNTremM' iDBDBasHR. —yw#utyU
Z/77CowN%'27CBelmiMayo;g.1bfe 79P thos8—p38—'ZbarejajQ1LWxB”:qkitogrreZkir,q‘!Kcees'qo/D6t:ftQEmia)
"""

'\nhM%7Wok#")j—CVt"n’C,tZW’lVlQvUpf%?")9cs\'X’\n—5abjuEygY/ynv%MtB#vKUTf!Npxx.3ET5sR8d:vYo8W:9OI,pR99tP!q/Y9q%E”(-lB?kW’5z0z)ElTaO2H1Ta?jx\n\nG0i’raYltoushiqe r:cqgr.(rMio\\PxA”:tKcndSeNTremM\' iDBDBasHR. —yw#utyU\nZ/77CowN%\'27CBelmiMayo;g.1bfe 79P thos8—p38—\'ZbarejajQ1LWxB”:qkitogrreZkir,q‘!Kcees\'qo/D6t:ftQEmia)\n'

##  GPT model

In [130]:
del bigram_model

NameError: ignored

In [131]:
# n_heads = 1
# n_embedding = 384 # each head has dim 64 (=512/6)
# n_layer = 1

n_heads = 6
n_embedding = 64 # each head has dim 64 (=512/6)
n_layer = 1

In [133]:
class AttentionHead(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.head_size = head_size
    self.q_linear = nn.Linear(n_embedding, head_size)
    self.k_linear = nn.Linear(n_embedding, head_size)
    self.v_linear = nn.Linear(n_embedding, head_size)

    self.dropout = nn.Dropout(dropout)
    
  def forward(self, x):
    q, k, v = self.q_linear(x), self.k_linear(x), self.v_linear(x)

    mat_mul = q@rearrange(k, 'B T C -> B C T') * self.head_size**-0.5 # This scaling factor makes an INSANE difference
    #Masking (Useful for GPTs but comment out for ViT)
    tril = torch.tril(torch.ones(mat_mul.shape, device = device))
    mat_mul = mat_mul.masked_fill(tril==0, float('-inf')) # masking 
    mat_mul = nn.functional.softmax(mat_mul, dim = -1)
    mat_mul = self.dropout(mat_mul)
    return mat_mul@v

class MultiAttention(nn.Module):
  def __init__(self):
    super().__init__()

    head_size = n_embedding // n_heads
    self.attention = nn.ModuleList([AttentionHead(head_size) for i in range(n_heads)])

    self.linear = nn.Sequential(
        nn.Linear(head_size*n_heads, n_embedding),
        nn.Dropout(dropout))
    
  def forward(self, x: torch.Tensor) -> torch.Tensor:
    a = torch.cat([head(x) for head in self.attention], dim = -1)
    return self.linear(a)


class Transformer(nn.Module):

  def __init__(self):
    super().__init__()

    self.multi_attention = MultiAttention() 
    
    self.feed_forward = nn.Sequential(
        nn.Linear(n_embedding, 4*n_embedding),
        nn.GELU(),
        nn.Linear(4*n_embedding, n_embedding),
        nn.Dropout(dropout))
    
    self.ln1 = nn.LayerNorm(n_embedding)
    self.ln2 = nn.LayerNorm(n_embedding)

  def forward(self, x: torch.Tensor) -> torch.Tensor:

    x = x + self.multi_attention(self.ln1(x))
    x = x + self.feed_forward(self.ln2(x))
    return x

class GPT(nn.Module):
  def __init__(self):
      super().__init__()

      self.token_embedding = nn.Embedding(n_chars, n_embedding)
      self.positional_encoding = nn.Embedding(block_size, n_embedding)

      self.transformers = nn.Sequential(*[Transformer() for _ in range(n_layer)])

      self.final_ln = nn.LayerNorm(n_embedding)
      self.final_linear = nn.Linear(n_embedding, n_chars)

  def forward(self, x: torch.Tensor, targets = None) -> torch.Tensor:
    # print("FORWARD", x.shape)
    te = self.token_embedding(x)
    pe = self.positional_encoding(torch.arange(block_size, device = device))
    # print(f"te: {te.shape} | pe: {pe.shape}")
    x = te + pe # [64, 256, 128] (batch_size, n, n_embedding)

    x = self.transformers(x) # 

    x = self.final_ln(x)
    x = self.final_linear(x)
    
    if targets == None:
      loss = None
    else:
      logits_r = rearrange(x, 'B T C -> (B T) C') # x NOT logits!!
      targets_r = rearrange(yb, 'B T -> (B T)')
      loss = nn.functional.cross_entropy(logits_r, targets_r) # wants pre-softmaxed values

    logits = nn.functional.softmax(x, dim = -1) #(B,T,vocab_size) 
    
    return logits, loss

  @torch.no_grad()
  def generate(self, idxs, length_to_generate=500) -> torch.Tensor:
    self.eval()
    for _ in range(length_to_generate):
      print()
      print(idxs.shape)
      input = idxs[:, -block_size:]
      print("in", input.shape)
      logits, loss = self(input)
      print("L1", logits.shape)
      logits = logits[:, -1, :] # (B, T)
      probs = nn.functional.softmax(logits, dim = -1)
      pred = torch.multinomial(probs, 1)
      idxs = torch.cat((idxs, pred), dim = -1) # (B, T+1)
      print("L2", idxs.shape)
    return idxs

gpt_model = GPT().to(device)
print(f'gpt model parameters are on device: {next(gpt_model.parameters()).device}')
xb, yb = get_batches()
logits, loss = gpt_model(xb, yb)
print(f"{logits.shape}, {loss.item():.4f}")

gpt model parameters are on device: cuda:0
torch.Size([64, 256, 84]), 4.5965


0.0763 Million Parameters


In [122]:
# context = torch.zeros((1, 1), dtype = torch.long,  device = device)
# print(decode(gpt_model.generate(context)[0].tolist()))
# Generations issue
# FORWARD torch.Size([1, 2])
# te: torch.Size([1, 2, 384]) | pe: torch.Size([256, 384])

In [123]:
### Training loop

# optimizer = torch.optim.Adam(params = gpt_model.parameters(), lr = learning_rate)
optimizer = torch.optim.AdamW(params = gpt_model.parameters(), lr = learning_rate)

import time

max_iterations = 1100 #5000 # training iterations
# eval_interval = 100 # 500 # how often to print out loss & accuracy

t_train = time.time()
# for i in range(max_iterations):
for i in range(max_iterations):
  xb, yb = get_batches()
  logits, loss = gpt_model(xb, yb)
  if i % 100 == 0 :print(f"iter:{i} | loss:{loss.item():.4f}")
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(f"Time taken for {max_iterations} iterations: {time.time()-t_train:.2f} seconds")

iter:0 | loss:4.5722
iter:100 | loss:3.0631
iter:200 | loss:2.8023
iter:300 | loss:2.6900
iter:400 | loss:2.6681
iter:500 | loss:2.6417
iter:600 | loss:2.5745
iter:700 | loss:2.5570
iter:800 | loss:2.5332
iter:900 | loss:2.5779
iter:1000 | loss:2.5325
Time taken for 1100 iterations: 35.98 seconds


In [124]:
# Bigram model, after 0 & 5000 iterations: (7056 parameters):
# train data -> loss:4.9861, top@1: 1.0540%, top@5: 4.6292% | test data -> loss:4.9855, top@1: 1.0583%, top@5: 4.6293%
# train data -> loss:3.2754, top@1: 17.7066%, top@5: 48.9886% | test data -> loss:3.2744, top@1: 17.7488%, top@5: 48.9851%

#Transformer model
#For 1 block, 1 attention head of size 384, after 0, 1100 iterations: (1.93m parameters)
# train data -> loss:4.5776, top@1: 1.2049%, top@5: 6.3984% | test data -> loss:4.5780, top@1: 1.2048%, top@5: 6.3793%
# train data -> loss:1.7620, top@1: 46.8763%, top@5: 81.0912% | test data -> loss:1.7654, top@1: 46.7969%, top@5: 81.0229%

#For 1 block, 6 attention heads of size 64, after 0, 1100 & 2200 & 5000 iterations: (1.93m parameters)
# train data -> loss:4.6111, top@1: 0.9018%, top@5: 5.0043% | test data -> loss:4.6111, top@1: 0.9040%, top@5: 5.0030%
# train data -> loss:1.7689, top@1: 46.9523%, top@5: 80.9174% | test data -> loss:1.7663, top@1: 46.9904%, top@5: 80.9795%
# train data -> loss:1.5743, top@1: 52.2909%, top@5: 84.1496% | test data -> loss:1.5725, top@1: 52.3335%, top@5: 84.1816%
# train data -> loss:1.4126, top@1: 56.7107%, top@5: 86.2612% | test data -> loss:1.4141, top@1: 56.7039%, top@5: 86.2346%

In [136]:
model = gpt_model
# model = bigram_model

# @torch.no_grad()
# def evaluate_model(m):

t_eval = time.time()

model.eval()
with torch.inference_mode():
  splits = ['train', 'test']
  categories = ['loss', 'top1', 'top5']
  all = {s:{c: torch.zeros(eval_iterations) for c in categories} for s in splits}
  for split in splits:
    for i in range(eval_iterations):
      # xb, yb = get_batches(split = split)
      xb, yb = get_batches()
      logits, loss = model(xb, yb)
      all[split]['loss'][i] = loss.item()

      # top@1 accuracy
      top1_preds = torch.topk(logits, 1, dim = -1).indices.squeeze(dim=-1)
      all[split]['top1'][i] = (torch.sum(top1_preds == yb) / torch.numel(yb)).item()
      

      # top@5 accuracy
      top5_preds = torch.topk(logits, 5, dim = -1).indices
      y_stretched = repeat(yb, 'B T -> B T K', K = 5)
      all[split]['top5'][i] = (torch.sum(top5_preds == y_stretched) / torch.numel(yb)).item()
  
  output_str = ""
  for split in splits:

    loss = all[split]['loss'].mean().item()
    top1 = 100*all[split]['top1'].mean().item()
    top5 = 100*all[split]['top5'].mean().item()
    output_str+= f"{split} data -> loss:{loss:.4f}, top@1: {top1:.4f}%, top@5: {top5:.4f}% | "

  output_str = f"t_eval:{time.time()-t_eval:.4f}s | " + output_str
print(output_str[:-3])
  # return output_str[:-3]

# evaluate_model(copy.deepcopy(gpt_model))

t_eval:5.5092s | train data -> loss:4.5998, top@1: 1.1827%, top@5: 5.9137% | test data -> loss:4.5990, top@1: 1.1832%, top@5: 5.8990%


In [126]:
# train data -> loss:3.2755, top@1: 17.7616%, top@5: 48.9493% | test data -> loss:3.2747, top@1: 17.7365%, top@5: 49.0079%

In [127]:
### ERRORS
# ERROR: Had print(f'iter{i} | {evaluate(bigram_model)}'), NOT GPT model!!!!
# ERROR: Was using softmax to create logits before cross_entropy loss, which really needed the raw last layer output (as it has softmax inbuilt)
# ERROR: had eval_interval and eval_iterations confused so was only using 10 iterations for testing
# ERROR: Loss is not decreasing as much as it should be
# iter0, t_train:0.00s, t_eval:6.67s | train data -> loss:4.6006, top@1: 0.8144%, top@5: 5.4142% | test data -> loss:4.6006, top@1: 0.8204%, top@5: 5.4463%
# iter20, t_train:0.92s, t_eval:7.06s | train data -> loss:3.4655, top@1: 24.2277%, top@5: 61.2470% | test data -> loss:3.4663, top@1: 24.1698%, top@5: 61.1395%
# iter190, t_train:0.87s, t_eval:6.61s | train data -> loss:4.1917, top@1: 28.4617%, top@5: 66.7410% | test data -> loss:4.1883, top@1: 28.4191%, top@5: 66.7065%

# Train and test accuarcy improved but loss went up significantly. Makes me wonder if something is wrong with eval

# For 1 Transformer with 6 heads of attention
# 0 4.6413
# 10 3.2147
# 50 2.5742
# evaluate(gpt_model) = loss 3.78!!!
# The error is in evaluate, not the model O_o

# After EXTENSIVE investigate I have no clue lol.
# if I get take the evaluate code out of the function it works perfectly. 
# It is only creating the batches (xb, yb) inside the function thats causing the loss to be incorrect
# I suspect its to do with dropout not be factored in as it should.
# After messing around with combinations of model.eval(), torch.inference_mode(), @torch.no_grad() I could not find a working combination

#Without transformer blocks:
"""
n_embedding = 384, which will later be split into 6 64dim attention heads
iter0 | train data -> loss:4.5331, top@1: 1.3001%, top@5: 7.0642% | test data -> loss:4.5330, top@1: 1.3021%, top@5: 7.1094%
iter200 | train data -> loss:3.7719, top@1: 26.3059%, top@5: 64.7413% | test data -> loss:3.7719, top@1: 26.2686%, top@5: 64.7375%
================================================================
Layer (type:depth-idx)                   Param #
=================================================================
GPT                                      --
├─Embedding: 1-1                         32,256
├─Embedding: 1-2                         98,304
├─LayerNorm: 1-3                         768
├─Linear: 1-4                            32,340
=================================================================
Total params: 163,668
Trainable params: 163,668
Non-trainable params: 0
=================================================================


WITH 1 Transformer Block (and 1 head):

iter0, t:5.73 | train data -> loss:4.4928, top@1: 2.3763%, top@5: 10.4575% | test data -> loss:4.4930, top@1: 2.3834%, top@5: 10.4494%
iter20, t:6.35 | train data -> loss:3.4086, top@1: 24.0441%, top@5: 61.0043% | test data -> loss:3.4086, top@1: 24.0336%, top@5: 61.0499%

======================================================================
Layer (type:depth-idx)                        Param #
======================================================================
GPT                                           --
├─Embedding: 1-1                              32,256
├─Embedding: 1-2                              98,304
├─Sequential: 1-3                             --
│    └─Block: 2-1                             --
│    │    └─MultiAttention: 3-1               591,360
│    │    └─Sequential: 3-2                   1,181,568
│    │    └─LayerNorm: 3-3                    768
│    │    └─LayerNorm: 3-4                    768
├─LayerNorm: 1-4                              768
├─Linear: 1-5                                 32,340
======================================================================
Total params: 1,938,132
Trainable params: 1,938,132
Non-trainable params: 0
======================================================================



"""



In [128]:
# !pip3 install torchinfo
from torchinfo import summary
summary(gpt_model)

ModuleNotFoundError: ignored

In [None]:
gpt_model

In [None]:

# 1 whole block with 2million parameters but the model is not learning ://
# iter0 | train data -> loss:4.8232, top@1: 2.8359%, top@5: 8.4579% | test data -> loss:4.8215, top@1: 2.8242%, top@5: 8.4507%
# iter1000 | train data -> loss:4.8239, top@1: 2.8257%, top@5: 8.4569% | test data -> loss:4.8230, top@1: 2.8575%, top@5: 8.4757%

In [None]:
# Too many parameters, 2 million for each sequential layer, I think something somewhere went wrong lol

In [None]:
# # Example 1
# gpt_model.eval()
# with torch.inference_mode():
#     xb, yb = get_batches()
#     logits, loss = gpt_model(xb, yb)
#     print(f"{loss.item():.4f}") # 2.7320

# # Example 2
# def test1(a1, a2):
#   # gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75
# gpt_model.eval()
# with torch.inference_mode():
#   test1(xb, yb) # 2.7320

# # Example 3
# @torch.no_grad()
# def test2(a1, a2):
#   # gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75

# test2(xb, yb) # 2.7320

# # Example 4
# @torch.no_grad()
# def test3(a1, a2):
#   gpt_model.eval()
#   # with torch.inference_mode():
#   logits, loss = gpt_model(a1, a2)
#   print(f"{loss.item():.4f}")
#   # Our loss from eval should be similar to 2.75

# test3(xb, yb) # 2.7320

# # Example 5
# @torch.no_grad()
# def test4():
#   gpt_model.eval()
#   with torch.inference_mode():
#     xb, yb = get_batches()
#     logits, loss = gpt_model(xb, yb)
#     print(f"{loss.item():.4f}")
#     # Our loss from eval should be similar to 2.75

# test4() # 3.5992

# # Will loop through batches accessing the model in a seperate function, as shown below
# @torch.no_grad()
# def get_loss(model, input, target):
#     model.eval()
#     # with torch.inference_mode():
#     logits, loss = model(input, target)
#     print(f"{loss.item():.4f}")
#     # Our loss from eval should be similar to 2.75

# get_loss(gpt_model,xb, yb)
# print()
# for i in range(3):
#   get_loss(gpt_model,xb, yb)

# print()
# def test5(model):
#   get_loss(model,xb, yb)
# test5(gpt_model)

# 2.7161
# 2.7161
# 2.7161
# 2.7161
# 3.5818
# 2.7161

# 2.7161
# 2.7161
# 2.7161
# 2.7161
# 2.7161

# 2.7161