|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 8:</h2>|<h1>Deep learning introduction</h1>|
|<h2>Section:</h2>|<h1>Essence of deep learning modeling</h1>|
|<h2>Lecture:</h2>|<h1><b>The forward pass in PyTorch</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# import libraries
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Create a language model class

In [None]:
class AsimpleDLmodel(nn.Module):
  """A minimal character-level model used to illustrate the forward pass.

  Architecture: embedding -> linear ("attn") -> linear ("mlp") -> linear
  ("unembedding"), with a GELU nonlinearity after every layer except the last.
  Note that `attn` is a plain linear layer; no actual attention is computed.

  Args:
    vocab_size: number of distinct tokens; also the size of the output layer.
      The default (27) matches 26 lowercase letters plus one whitespace token.
    embed_dim: dimensionality of the token embeddings (and of the `attn` layer).
    hidden_dim: number of output units of the `mlp` layer.
  """

  ### initialize the model architecture
  def __init__(self, vocab_size: int = 27, embed_dim: int = 100, hidden_dim: int = 64):
    super().__init__()

    # input layer (token indices to embedding vectors)
    self.embedding = nn.Embedding(vocab_size,embed_dim)

    # hidden layers (stand-ins for an "attention block"; both are simple linear layers)
    self.attn = nn.Linear(embed_dim,embed_dim)
    self.mlp  = nn.Linear(embed_dim,hidden_dim)

    # output layer (hidden activations back to one value per token in the vocab)
    self.unembedding = nn.Linear(hidden_dim,vocab_size)


  ### forward pass
  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Push integer token indices through the model.

    Args:
      x: integer tensor of token indices in [0, vocab_size).

    Returns:
      Tensor with the same leading shape as `x` and a final dimension of
      size vocab_size, containing the raw (un-normalized) output activations.
    """

    # embeddings
    x = F.gelu( self.embedding(x) )

    # "attention block"
    x = F.gelu( self.attn(x) )
    x = F.gelu( self.mlp(x) )

    # unembeddings (no nonlinearity, no softmax)
    y = self.unembedding(x)

    # output the activations over the vocab (not the embeddings)
    return y

# Create and inspect an instance of the class

In [None]:
# create a model instance
LM = AsimpleDLmodel()
# a bare expression at the end of a notebook cell displays its repr (here: the list of layers)
LM

In [None]:
# accessing individual matrices
# (nn.Linear stores its weights as out_features x in_features, so Linear(100,64) gives a 64x100 matrix)
LM.mlp.weight.shape

In [None]:
# check its properties (dir() lists the names of all attributes and methods, including inherited ones)
dir(LM)

In [None]:
# the base class that AsimpleDLmodel inherits from
nn.Module

# Matrix multiplication vs. nn.Linear

In [None]:
# some random input just for checking sizes and outputs
# (named `inputs`, not `input`, so the builtin input() is not shadowed for the rest of the notebook;
#  the width is read from the layer so it always matches what LM.attn expects)
inputs = torch.randn(10,LM.attn.in_features)

# via pytorch classes
output1 = LM.attn(inputs)

# via "manual" linear weighted combination
# (nn.Linear stores its weights as out_features x in_features, hence the transpose)
output2 = (inputs @ LM.attn.weight.T) + LM.attn.bias

# check their shapes
output1.shape, output2.shape

In [None]:
# visual check that the two calculations agree:
# plot one against the other (equal values fall on the diagonal)
pytorchVals = output1.flatten().detach()
manualVals  = output2.flatten().detach()

plt.plot(pytorchVals,manualVals,'ko',markerfacecolor='w',alpha=.7)

plt.gca().set(xlabel='Pytorch forward pass',ylabel='Manual calculations')
plt.show()

# Tokenize the text

In [None]:
# the lowercase ASCII letters; a letter's position in this string is used as its token index below
import string
string.ascii_lowercase

In [None]:
# the sentence to convert into tokens
text = 'I like corn'

# tokenize, one token per character:
#   whitespace -> special token 26
#   any letter -> its position in the lowercase alphabet (case is ignored)
tokens = torch.tensor(
  [ 26 if char == ' ' else string.ascii_lowercase.index(char.lower()) for char in text ],
  dtype=torch.long )

tokens

In [None]:
# number of tokens (one per character of the text)
len(tokens)

# Forward pass

In [None]:
# push the token sequence through the model (calling the module runs its forward() method)
output = LM(tokens)
# one row of output activations per input token
output.shape

In [None]:
# output activations for the first token in the sequence
firstTokenActs = output[0,:].detach()

plt.figure(figsize=(10,4))
plt.plot(firstTokenActs,'ks',markersize=10)

# index of the largest activation
maxval = torch.argmax(firstTokenActs,dim=-1)

# highlight it in red
plt.plot(maxval.item(),firstTokenActs[maxval].item(),'ro',markersize=8)


plt.gca().set(title='Activations for next-token prediction',ylabel='Activation')
plt.show()

# Reconstruct to get next-token predictions

In [None]:
# strongest activation for each output

# lookup from token index to character (index 26 is the whitespace token)
idx2char = string.ascii_lowercase + ' '

# index of the largest activation in each row (one row per token)
maxacts = torch.argmax(output,dim=1)

# convert each winning index back into its character
predicted_text = [ idx2char[idx] for idx in maxacts.tolist() ]

# print the result
print(''.join(predicted_text))