<a href="https://colab.research.google.com/github/KPAryan/Chillers/blob/main/trasformer_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy requests torch tiktoken matplotlib pandas

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [2]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn

In [3]:
# Hyperparameters

# --- Batching ---
batch_size = 4  # Number of token sequences sampled into one batch
context_length = 16  # Number of tokens in each sampled sequence

# --- Model size ---
d_model = 64  # Width of every token embedding vector
num_layers = 8  # Number of transformer blocks
num_heads = 4  # Attention heads; each head works on d_model // num_heads features
dropout = 0.1  # Dropout probability

# --- Optimisation and evaluation schedule ---
learning_rate = 1e-3  # 0.001
max_iters = 5000  # Total number of training iterations
eval_interval = 50  # Evaluate the model every this many iterations
eval_iters = 20  # Number of iterations the evaluation loss is averaged over

# --- Runtime ---
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed PyTorch's random number generator so sampled batches and freshly
# initialised weights are repeatable from run to run.
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)

<torch._C.Generator at 0x7a8d42e65ad0>

In [4]:
# Download a sample txt file from https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt
# on the first run and cache it next to the notebook; later runs read the cached copy.
if not os.path.exists('sales_textbook.txt'):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt'
    # Finish the request BEFORE opening the output file: a failed download must
    # not leave an empty file behind, because the existence check above would
    # then skip the download on every later run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail loudly instead of caching an HTTP error page
    # Write with the same encoding the file is read back with below.
    with open('sales_textbook.txt', 'w', encoding='utf-8') as f:
        f.write(response.text)

with open('sales_textbook.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
# Tokenize the source text with tiktoken's "cl100k_base" encoding
encoding = tiktoken.get_encoding("cl100k_base")
tokenized_text = encoding.encode(text)  # token ids of the whole corpus (77,919 ids)

# Number of DISTINCT token ids that occur in the corpus (3,771)
vocab_size = len(set(tokenized_text))
# Largest token id that occurs. Ids start at 0, so anything indexed directly by
# token id needs max_token_value + 1 entries to cover this id as well.
max_token_value = max(tokenized_text)

print(
    f"Tokenized text size: {len(tokenized_text)}",
    f"Vocabulary size: {vocab_size}",
    f"The maximum value in the tokenized text is: {max_token_value}",
    sep="\n",
)

Tokenized text size: 77919
Vocabulary size: 3771
The maximum value in the tokenized text is: 100069


In [7]:
# Split the token ids into a training and a validation part.
# `train_data` is not defined by any earlier cell, so it is created here to keep
# the notebook runnable from a fresh kernel.
# NOTE(review): the 90/10 split ratio is an assumption - confirm the intended value.
split_idx = int(len(tokenized_text) * 0.9)
train_data = tokenized_text[:split_idx]
val_data = tokenized_text[split_idx:]

# Prepare data for training batch
data = train_data
# Random start offsets. The exclusive upper bound keeps the target window,
# which is shifted one token to the right, inside the data.
idxs = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
starts = idxs.tolist()  # plain ints for slicing the Python list
# x: context_length tokens starting at each offset; y: the same window shifted by one token
x_batch = torch.tensor([data[start:start + context_length] for start in starts])
y_batch = torch.tensor([data[start + 1:start + context_length + 1] for start in starts])
print(x_batch.shape, y_batch.shape)

torch.Size([4, 16]) torch.Size([4, 16])


In [8]:
# Define Token Embedding look-up table
# max_token_value is the LARGEST token id in the corpus and ids start at 0, so
# the table needs max_token_value + 1 rows. With only max_token_value rows,
# looking up the largest id raises an out-of-range IndexError.
token_embedding_lookup_table = nn.Embedding(max_token_value + 1, d_model)

# Get X and Y embedding: one d_model-sized vector per token id
x = token_embedding_lookup_table(x_batch)  # [4, 16, 64] [batch_size, context_length, d_model]
y = token_embedding_lookup_table(y_batch)

In [9]:
# Look up the embedding vectors of the input and target batches.
# This repeats the two lookups performed right after the table was created.
# Each result has shape [4, 16, 64] [batch_size, context_length, d_model].
x, y = (token_embedding_lookup_table(batch.data) for batch in (x_batch, y_batch))

In [10]:
# Build the sinusoidal Position Encoding look-up table
position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)  # positions as a column: (context_length, 1)
# One scaling factor per (even, odd) pair of columns
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
angles = position * div_term  # broadcasts to one angle per position and column pair

position_encoding_lookup_table = torch.zeros(context_length, d_model)  # filled in below
position_encoding_lookup_table[:, 0::2] = torch.sin(angles)  # even columns get the sine
position_encoding_lookup_table[:, 1::2] = torch.cos(angles)  # odd columns get the cosine
# Prepend a batch dimension and repeat the table (as a view) for every sequence in the batch
position_encoding_lookup_table = position_encoding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1)

print("Position Encoding Look-up Table: ", position_encoding_lookup_table.shape)

Position Encoding Look-up Table:  torch.Size([4, 16, 64])


In [11]:
# Inject position information: element-wise sum of token embedding and position encoding.
# X is the name the attention cells below read their input from.
X = input_embedding_x = x + position_encoding_lookup_table  # [4, 16, 64] [batch_size, context_length, d_model]
input_embedding_y = y + position_encoding_lookup_table

# Show the first sequence of the batch as a table (rows: positions, columns: embedding dimensions)
x_plot = X[0].detach().cpu().numpy()
print("Final Input Embedding of x: \n", pd.DataFrame(x_plot))

Final Input Embedding of x: 
           0         1         2         3         4         5         6   \
0  -0.299130  0.949924 -0.631772  0.939232 -2.082235  1.398602 -0.240277   
1   0.758386  0.999339  0.238804  2.130460 -0.736705  1.686566  0.824980   
2  -0.922071 -2.250749 -0.556319  1.222940  2.109022 -0.400113 -0.138789   
3   0.306074 -1.573041  2.231532  0.032769  1.850678 -0.120396 -0.011787   
4  -0.954794 -0.976841  0.139319 -0.865764 -0.193850 -1.655313  1.521742   
5   0.679198  0.506228  0.353623 -0.863025 -2.570290 -1.323900  0.371723   
6  -0.323705  2.691280 -2.467007  0.717207  1.387555 -2.344249  0.049187   
7   0.476250  1.532691 -0.440719  0.884680 -0.049779 -0.415280 -0.095141   
8  -1.377338 -0.241152  0.619716  0.535313 -2.747910 -0.546994 -1.181236   
9  -0.405825  0.559437  1.838797  0.457612 -2.470442  1.303259 -0.322104   
10 -0.724732 -1.171013 -0.256460 -1.604869 -1.592075  0.130025  0.570488   
11 -0.449413 -0.703517  0.179154  0.373091 -0.182106  1.93

In [12]:
# Build the Query, Key and Value tensors for Multi-head Attention

# All three projections read the same input tensor.
query = key = value = X  # [4, 16, 64] [batch_size, context_length, d_model]

# One linear projection each for queries, keys and values
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)

# Project ([4, 16, 64]) and then split the last dimension into
# num_heads chunks of d_model // num_heads features ([4, 16, 4, 16]).
split_shape = (batch_size, -1, num_heads, d_model // num_heads)
Q = Wq(query).view(split_shape)
K = Wk(key).view(split_shape)
V = Wv(value).view(split_shape)

In [13]:
# Swap the context_length and num_heads axes of Q, K and V:
# [batch_size, context_length, num_heads, head_size] -> [batch_size, num_heads, context_length, head_size]
# so that the last two axes of every head form a (context_length, head_size) matrix.
# Each result is [4, 4, 16, 16].
Q, K, V = (tensor.transpose(1, 2) for tensor in (Q, K, V))

In [15]:
# Dot product of every query with every key, scaled by the square root of the per-head size
head_size = d_model // num_heads
attention_score = (Q @ K.transpose(-2, -1)) / math.sqrt(head_size)  # [4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]
# Show the score matrix of the first head of the first sequence
print(pd.DataFrame(attention_score[0, 0].detach().cpu().numpy()))


          0         1         2         3         4         5         6   \
0   0.021498  0.759700  1.114897  0.861203  0.996177  0.087225  0.020397   
1  -0.186354  0.423521  0.930679  0.397063  0.689583 -0.023029 -1.295206   
2  -0.116987 -0.521399 -0.102028 -0.048670 -0.094187  0.211903  0.785083   
3   0.677378  0.094782  0.187630  1.087047  0.387743  0.641712  1.120173   
4   0.631352 -0.280073 -0.932875 -0.581271 -0.206257  0.127698 -0.102794   
5  -0.327146  0.068143  0.148191 -0.391794  0.132223  0.017728  0.268415   
6  -0.444339  0.258554  0.455449 -0.077464  0.134794 -0.053327  0.101352   
7  -0.245799  0.580802  0.932249  0.135226  0.357541 -0.006203  0.136920   
8  -0.303093  0.018890  0.015307  0.294602  0.369652 -0.219553 -0.192504   
9  -0.136668  0.443592  0.470723 -0.184153  0.221327 -0.058326 -0.335395   
10  0.694354  0.946938  0.422884  0.789917  0.990797 -0.172340  0.704143   
11  0.352159  0.977470  1.040248  1.317057  1.359986  0.120361  0.799884   
12 -0.238808

In [16]:
# Mask out the scores above the main diagonal
# True strictly above the diagonal, i.e. wherever the column index exceeds the row index
causal_mask = torch.triu(torch.ones(attention_score.shape[-2:]), diagonal=1).bool()
# The 2-D mask broadcasts over the batch and head dimensions; masked entries become -inf
attention_score = attention_score.masked_fill(causal_mask, float('-inf'))  # [4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]
print(pd.DataFrame(attention_score[0, 0].detach().cpu().numpy()))

          0         1         2         3         4         5         6   \
0   0.021498      -inf      -inf      -inf      -inf      -inf      -inf   
1  -0.186354  0.423521      -inf      -inf      -inf      -inf      -inf   
2  -0.116987 -0.521399 -0.102028      -inf      -inf      -inf      -inf   
3   0.677378  0.094782  0.187630  1.087047      -inf      -inf      -inf   
4   0.631352 -0.280073 -0.932875 -0.581271 -0.206257      -inf      -inf   
5  -0.327146  0.068143  0.148191 -0.391794  0.132223  0.017728      -inf   
6  -0.444339  0.258554  0.455449 -0.077464  0.134794 -0.053327  0.101352   
7  -0.245799  0.580802  0.932249  0.135226  0.357541 -0.006203  0.136920   
8  -0.303093  0.018890  0.015307  0.294602  0.369652 -0.219553 -0.192504   
9  -0.136668  0.443592  0.470723 -0.184153  0.221327 -0.058326 -0.335395   
10  0.694354  0.946938  0.422884  0.789917  0.990797 -0.172340  0.704143   
11  0.352159  0.977470  1.040248  1.317057  1.359986  0.120361  0.799884   
12 -0.238808

In [18]:
# Normalise the masked scores into attention weights.
# Softmax over the last dimension makes every row sum to 1 and maps the -inf
# entries written by the mask to exactly 0. Without this step the -inf values
# would flow straight into the multiplication with V.
# Run this cell once per pass: it overwrites attention_score, so running it
# again would apply softmax to already-normalised weights.
attention_score = torch.softmax(attention_score, dim=-1)  # [4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]

# Select the first batch and the first head
batch_index = 0
head_index = 0
attention_score_2d = attention_score[batch_index, head_index].detach().cpu().numpy()

# Create the DataFrame
df = pd.DataFrame(attention_score_2d)
print(df)

          0         1         2         3         4         5         6   \
0   1.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1   0.352088  0.647912  0.000000  0.000000  0.000000  0.000000  0.000000   
2   0.372795  0.248792  0.378413  0.000000  0.000000  0.000000  0.000000   
3   0.271920  0.151853  0.166628  0.409599  0.000000  0.000000  0.000000   
4   0.427102  0.171674  0.089371  0.127027  0.184826  0.000000  0.000000   
5   0.124600  0.185008  0.200427  0.116800  0.197252  0.175912  0.000000   
6   0.083958  0.169561  0.206461  0.121170  0.149823  0.124131  0.144895   
7   0.069452  0.158736  0.225584  0.101663  0.126974  0.088256  0.101836   
8   0.088831  0.122574  0.122136  0.161488  0.174074  0.096571  0.099218   
9   0.073767  0.131784  0.135409  0.070346  0.105520  0.079778  0.060472   
10  0.121218  0.156050  0.092400  0.133374  0.163047  0.050953  0.122411   
11  0.060166  0.112440  0.119725  0.157907  0.164834  0.047718  0.094145   
12  0.072694

In [20]:
# Multiply the attention matrix of every head with that head's V matrix
attention_output = attention_score @ V  # [4, 4, 16, 16] [batch_size, num_heads, context_length, head_size]
print(attention_output.size())

torch.Size([4, 4, 16, 16])


In [21]:
# Merge the heads back into a single d_model-wide vector per token.
# A is derived from attention_output (not from A itself), so the cell works on a
# fresh kernel and gives the same result when it is run again.
A = attention_output.transpose(1, 2)  # [4, 16, 4, 16] [batch_size, context_length, num_heads, head_size]
# reshape (rather than view) because the transposed tensor is not contiguous
A = A.reshape(batch_size, -1, d_model)  # [4, 16, 64] [batch_size, context_length, d_model]

In [22]:
# Output projection: a linear layer applied to the merged heads
Wo = nn.Linear(d_model, d_model)
output = Wo(A)  # [4, 16, 64] [batch_size, context_length, d_model]

print(output.size())

torch.Size([4, 16, 64])


In [23]:
# Residual connection followed by Layer Normalization:
# add the attention input X back onto the projected attention output,
# then normalise over the last (d_model) dimension.
layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output + X)

In [24]:
# Feed Forward Network: widen to 4 * d_model, apply ReLU, project back to d_model
ffn_expand = nn.Linear(d_model, d_model * 4)
ffn_activation = nn.ReLU()
ffn_project = nn.Linear(d_model * 4, d_model)
output = ffn_project(ffn_activation(ffn_expand(output)))
# train=True keeps dropout active, so elements are zeroed at random with probability `dropout`
output = torch.dropout(output, p=dropout, train=True)

In [25]:
# Residual connection followed by Layer Normalization after the feed-forward step.
# NOTE(review): this adds X (the embedding fed into attention), not the tensor
# that entered the feed-forward network - confirm which skip connection is intended.
layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output + X)

In [26]:
# Final linear layer: map every d_model-sized vector to one score (logit) per token id.
# Token ids run from 0 to max_token_value inclusive, so the layer needs
# max_token_value + 1 outputs; with only max_token_value the largest id has no logit.
logits = nn.Linear(d_model, max_token_value + 1)(output)
# Show the logits of the first sequence (rows: positions, columns: token ids)
print(pd.DataFrame(logits[0].detach().cpu().numpy()))

      0         1         2         3         4         5         6       \
0   0.667483 -0.389633 -0.851936 -0.394342 -0.998247  0.262954  0.480094   
1  -0.038112  0.322328 -0.031820 -0.584922 -0.112201 -0.738884  0.440067   
2  -0.628515  0.205482 -0.180367 -0.370944  0.412005  0.792747 -0.194784   
3  -1.048590  0.793600 -0.683289 -0.619398 -0.317157  0.628633 -0.024611   
4  -0.716813 -0.497194 -0.200344 -1.548010 -0.219922  0.368706 -0.773210   
5  -0.378771  1.055804 -0.537877  0.158270 -0.003828 -0.635562  0.394238   
6  -0.310792 -0.171071  0.622006 -0.609642  0.586624  0.604495  0.867905   
7  -0.922160  0.500379 -0.034140  0.611128 -0.062309  0.604757  0.336614   
8   0.318265  0.177395  0.507006 -0.731018 -0.162960  0.023586  0.724885   
9   1.179621  0.450486 -0.439893  0.052057  0.397160 -1.076628  0.009521   
10  0.329482 -0.249196 -0.487763  0.146532  0.203778  0.126987 -0.013209   
11  0.491254  0.254098 -0.805063  0.847444 -0.868649 -0.204639  0.187366   
12  0.908632

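What we get here is a huge matrix — shape [16, 100069] in the run recorded above, i.e. [context_length, size of the output layer] — holding the logits (unnormalized scores) of every token id at each position of the sequence. Applying softmax over the last dimension turns these scores into probabilities over the vocabulary.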