In [63]:
with open("the-verdict.txt" , "r" , encoding="utf-8") as f:
    raw_text = f.read()

print("Total no. of. charachters" , len(raw_text))
print(raw_text[:99])

Total no. of. charachters 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [64]:
import re

text = "Hello world ! , This is a test"
result = re.split(r'(\s)' , text)

print(result)

['Hello', ' ', 'world', ' ', '!', ' ', ',', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [65]:
result = re.split(r'([,.;:?_!"()\']|--|\s)' , text)
print(result)

['Hello', ' ', 'world', ' ', '', '!', '', ' ', '', ',', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [66]:
## removing the white spaces
result = [item for item in result if item.strip()]
print(result)

['Hello', 'world', '!', ',', 'This', 'is', 'a', 'test']


In [67]:
preprocessed = re.split(r'([,.;:?_!"()\']|--|\s)' , raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [68]:
print(len(preprocessed))

4690


In [69]:
### sort all words in alfabetical order 
## set keyword is used to take only unique words (non-repeating)
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1130


In [70]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [71]:
## Let's print some of them to understand 

for i,item in enumerate(vocab.items()):
    print(item)
    if(i>=20):
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)


In [72]:
### THE ABOVE PROCESS IS ENCODER 
### reversal of this process is called DECODER

In [73]:
class SimpleTokenizerV1:
    def __init__(self,vocab):
        self.str_to_int = vocab  ##mentioned above list
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        preprocessed = re.split(r'([,.;:?_!"()\']|--|\s)' , text)
        preprocessed = [ item.strip() for item in preprocessed if item.strip() ]
        ids = [ self.str_to_int[s] for s in preprocessed ]
        return ids

    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        ## Replace spaces before special charachters
        text = re.sub(r'\s+([,.?!"()\'])' , r'\1' , text)
        return text        

In [74]:
### Let's test it

tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted you know,"
        Mrs. Gisburn said with pardonable pride.""" 

In [75]:
### Test for encode function whether it convert these text into ids and returning it 
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [76]:
tokenizer.decode(ids)

'" It\' s the last he painted you know," Mrs. Gisburn said with pardonable pride.'

In [77]:
### What if we add text that is not in book ?

tokenizer = SimpleTokenizerV1(vocab)
text = "Let's have tea ?"

In [78]:
ids = tokenizer.encode(text)
print(ids)

KeyError: 'Let'

In [79]:
### See the above wont't work as these keywords are not in book
### So , Let's add some more tokens like unknown , end of text .

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [80]:
len(vocab)

1132

In [81]:
## Let's print last 5 words in vocab
for i,item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [82]:
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int = vocab  ##mentioned above list
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        preprocessed = re.split(r'([,.;:?_!"()\']|--|\s)' , text)
        preprocessed = [ item.strip() for item in preprocessed if item.strip() ]
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]
        ids = [ self.str_to_int[s] for s in preprocessed ]
        return ids

    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        ## Replace spaces before special charachters
        text = re.sub(r'\s+([,.?!"()\'])' , r'\1' , text)
        return text        

In [83]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlight terraces of the palace. "

text = "<|endoftext|>".join((text1,text2))

print(text)

Hello, do you like tea?<|endoftext|>In the sunlight terraces of the palace. 


In [84]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1131, 988, 1131, 984, 722, 988, 1131, 7]

In [85]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|unk|> the <|unk|> terraces of the <|unk|>.'

In [21]:
import importlib
import tiktoken

print("tiktoken version" , importlib.metadata.version("tiktoken"))

tiktoken version 0.11.0


In [22]:
### This is the Byte pair tokenizer from GPT

tokenizer = tiktoken.get_encoding("gpt2")

In [89]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownplace."   
       )

integers = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 5372, 13]


In [90]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownplace.


In [23]:
with open("the-verdict.txt" , "r" , encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [92]:
## Take the first 50 tokens for demonstration 
enc_sample = enc_text[50:]

In [93]:
context_size = 4 ## Length of the input

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x:{x}")
print(f"y:{y}")

x:[290, 4920, 2241, 287]
y:[4920, 2241, 287, 257]


In [94]:
for i in range(1, context_size+1): ## Run from 1 to 5
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context , "----->", desired)

[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257


In [95]:
for i in range(1, context_size+1): ## Run from 1 to 5
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context) , "----->", tokenizer.decode([desired]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


In [14]:
### The above is a simple demonstration . For larger data processing we use data loaders
### We will use pytorch's built in Dataset and Dataloader classes

from torch.utils.data import Dataset , DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.input_ids=[]
        self.target_ids=[]

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        #Use the sliding window approach . 
        for i in range(0,len(token_ids) - max_length , stride):
            input_chunk = token_ids[i: i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk)) ## 1'st row i.e X value [from 0 --> 4]
            self.target_ids.append(torch.tensor(target_chunk))  ## 2'nd row i.e Y value [from 1 --> 5]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self,idx):  ## if the idx--> index is 50 (50th row)
        return self.input_ids[idx] , self.target_ids[idx]    ## it will return 50th row of input tensor and 50th row of output tensor

In [16]:
def create_dataloader_v1(txt , batch_size=4, max_length=256 , stride=128 , shuffle=True , drop_last=True , num_workers=0):
    ## txt --> verdict.txt
    ## batch_size --> run 4 process parallelly in cpu
    ## max_length --> context size (previously we used 4 , GPT uses 256)
    ## stride --> How much (words) we need to skip before the next batch
    ## drop_last --> drop the last row , if row size (last batch) is shorter than specified batch size , to prevent loss spikes during training 
    
    ## initialize the tokenizer --> we are using BytePair Tokenizer from GPT
    tokenizer = tiktoken.get_encoding("gpt2")

    ## create Dataloader
    dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)

    #create Dataloader
    ## checks for the __get_item method in GPTDatasetV1 and returns the item
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

In [None]:
### Let's test the dataloader with batch size of 1 for an LLM with context size of 4  

In [109]:
with open("the-verdict.txt" , "r" , encoding="utf-8") as f:
    raw_text = f.read()

In [25]:
import torch
print("pytorch version:" , torch.__version__)
dataloader = create_dataloader_v1(raw_text , batch_size=1 , max_length=4 , stride=1 , shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

pytorch version: 2.8.0+cpu
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [111]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [114]:
dataloader = create_dataloader_v1(raw_text , batch_size=8 , max_length=4 , stride=4 , shuffle=False)
data_iter = iter(dataloader)
inputs , targets = next(data_iter)
print("Inputs : \n :", inputs)
print("\n Targets : \n" , targets)

Inputs : 
 : tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

 Targets : 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [None]:
## converts words to 300 dim vectors
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [8]:
word_vectors = model

print(word_vectors["computer"])

NameError: name 'model' is not defined

In [None]:
print(word_vectors["cat"].shape())

In [None]:
print(word_vectors.most_similar(positive=['king','woman'] , negative=[man] , topn=10))

In [None]:
## Let's check the familarity b/w a few pair of words

print(word_vectors.similarity('woman' , 'man'))
print(word_vectors.similarity('king' , 'queen'))
print(word_vectors.similarity('uncle' , 'aunt'))
print(word_vectors.similarity('boy' , 'girl'))
print(word_vectors.similarity('paper' , 'water'))


In [None]:
print(word_vectors.most_similar("tower",topn=5))

In [33]:
## example of Absolute positional encoding

max_length = 4
data_loader = create_dataloader_v1(
    raw_text,batch_size=8,max_length=max_length,stride=max_length, shuffle=False
)
data_iter = iter(data_loader)
inputs,targets = next(data_iter)

In [34]:
print("Token ID's:\n" , inputs )
print("\n Inputs shape : \n" , inputs.shape)

Token ID's:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

 Inputs shape : 
 torch.Size([8, 4])


In [37]:
vocab_size = 50257 ## (rows)
output_dim = 256 ## (cols --> features --> dimensions )

token_embedding_layer = torch.nn.Embedding(vocab_size,output_dim)

In [38]:
## For each token id , we need to create 256 dimensional vector~(tensor shape)
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [39]:
## But for positional embedding layer , we need only 4 as the batch size is only 4
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length,output_dim)

In [42]:
## We need only 4 positions [0,1,2,3] for a token ID as it is same for each row/ID
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [None]:
## Implementing a simplified attention mechanism

import torch

inputs = torch.tensor(
    [
        [0.43, 0.15 , 0.89],  # Your
        [0.55, 0.87 , 0.66],  # journey
        [0.57 , 0.85 , 0.64],  # starts
        [0.22 , 0.58 , 0.33],  # with
        [0.77 , 0.25 , 0.10],  # one
        [0.05 , 0.80 , 0.55]   # step
    ]
)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Corresponding words
words = ["Your" , "journey" , "starts" , "with" , "one" , "step"]

#Extract X , Y , Z coordinates
x_coords = inputs[:, 0].numpy()
y_coords = inputs[:, 1].numpy()
z_coords = inputs[:, 2].numpy()

fig = plt.figure()
ax = fig.add_subplot(111 , projection='3d')

## Plot each point and annotate with corresponding word

for x , y , z , word in zip( x_coords , y_coords , z_coords , words):
    ax.scatter(x,y,z)
    ax.text(x,y,z,word,fontsize=10)

## Set labels for axes
ax.set_xlabel('X')
ax.set_xlabel('Y')
ax.set_xlabel('Z')

plt.title('3D plot of word embeddings')
plt.show()

In [None]:
# Create 3D plot with vectors from origin to each point, using different colors
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Define a list of colors for the vectors
colors = ['r', 'g', 'b', 'c', 'm', 'y']

# Plot each vector with a different color and annotate with the corresponding word
for (x, y, z, word, color) in zip(x_coords, y_coords, z_coords, words, colors):
    # Draw vector from origin to the point (x, y, z) with specified color and smaller arrow length ratio
    ax.quiver(0, 0, 0, x, y, z, color=color, arrow_length_ratio=0.05)
    ax.text(x, y, z, word, fontsize=10, color=color)

# Set labels for axes
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')

# Set plot limits to keep arrows within the plot boundaries
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_zlim([0, 1])

plt.title('3D Plot of Word Embeddings with Colored Vectors')
plt.show()


In [None]:
## To calculate the attention score of query vector , we need to calculate dot product with each input vectors

query = inputs[1]  # 2nd input token is the query

attn_scores_2 = torch.empty(inputs.shape[0]) # First initialize the attention score as a empty tensor
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query) # dot product (transpose not necessary here since they are 1-dim vectors)

print(attn_scores_2)

In [None]:
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()

print("Attention weights:", attn_weights_2_tmp)
print("Sum:", attn_weights_2_tmp.sum())

In [None]:
### In practice, it's more common and advisable to use the softmax function for normalization.
### This approach is better at managing extreme values and offers more favorable gradient properties during training. 

def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)

attn_weights_2_naive = softmax_naive(attn_scores_2)

print("Attention weights:", attn_weights_2_naive)
print("Sum:", attn_weights_2_naive.sum())

In [None]:
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)
print("Attention weights:", attn_weights_2)
print("Sum:", attn_weights_2.sum())

In [None]:
query = inputs[1] # 2nd input token is the query

context_vec_2 = torch.zeros(query.shape)
for i,x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2[i]*x_i

print(context_vec_2)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55], # step     (x^6)
   [0.4419, 0.6515, 0.5683]]
)

# Corresponding words
words = ['Your', 'journey', 'starts', 'with', 'one', 'step', 'journey-context']

# Extract x, y, z coordinates
x_coords = inputs[:, 0].numpy()
y_coords = inputs[:, 1].numpy()
z_coords = inputs[:, 2].numpy()

# Create 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot each point and annotate with corresponding word
for x, y, z, word in zip(x_coords, y_coords, z_coords, words):
    ax.scatter(x, y, z)
    ax.text(x, y, z, word, fontsize=10)

# Set labels for axes
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')

plt.title('3D Plot of Word Embeddings')
plt.show()

# Create 3D plot with vectors from origin to each point, using different colors
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Define a list of colors for the vectors
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'r']

# Plot each vector with a different color and annotate with the corresponding word
for (x, y, z, word, color) in zip(x_coords, y_coords, z_coords, words, colors):
    # Draw vector from origin to the point (x, y, z) with specified color and smaller arrow length ratio
    ax.quiver(0, 0, 0, x, y, z, color=color, arrow_length_ratio=0.05)
    ax.text(x, y, z, word, fontsize=10, color=color)

# Set labels for axes
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')

# Set plot limits to keep arrows within the plot boundaries
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_zlim([0, 1])

plt.title('3D Plot of Word Embeddings with Colored Vectors')
plt.show()

In [None]:
###  Now, we can extend this computation to calculate attention weights and context vectors for all inputs.
attn_scores = torch.empty(6, 6)

for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)

print(attn_scores)

In [None]:
### When computing the preceding attention score tensor, we used for-loops in Python.                                                          
## However, for-loops are generally slow, and we can achieve the same results using matrix multiplication:

attn_scores = inputs @ inputs.T
print(attn_scores)

In [None]:
## We now normalize each row so that the values in each row sum to 1:

attn_weights = torch.softmax(attn_scores, dim=-1)
print(attn_weights)

In [None]:
## In the context of using PyTorch, the dim parameter in functions like torch.softmax specifies the dimension of the input tensor along which the function will be computed. 
## By setting dim=-1, we are instructing the softmax function to apply the normalization along the last dimension of the attn_scores tensor. 

## If attn_scores is a 2D tensor (for example, with a shape of [rows, columns]), dim=-1 will normalize across the columns so that the values in
 ## each row (summing over the column dimension) sum up to 1.

row_2_sum = sum([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
print("Row 2 sum:", row_2_sum)
print("All row sums:", attn_weights.sum(dim=-1))

In [None]:
## In the third and last step, we now use these attention weights to compute all context vectors via matrix multiplication:

all_context_vecs = attn_weights @ inputs
print(all_context_vecs)

In [None]:
## We can double-check that the code is correct by comparing the 2nd row with the context vector z(2) calculated previously

print("Previous 2nd context vector:", context_vec_2)

In [None]:
## IMPLEMENTING SELF ATTENTION WITH TRAINABLE WEIGHTS

x_2 = inputs[1] #A
d_in = inputs.shape[1] #B
d_out = 2 #C

In [None]:
## Next, we initialize the three weight matrices Wq, Wk and Wv

torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [None]:
## Next, we compute the query, key, and value vectors as shown earlier

query_2 = x_2 @ W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value
print(query_2)

In [None]:
## As we can see based on the output for the query, this results in a 2-dimensional vector. 
## This is because: we set the number of columns of the corresponding weight matrix, via d_out, to 2:
## Even though our temporary goal is to only compute the one context vector z(2),  we still require the key and value vectors for all input elements. 
## This is because they are involved in computing the attention weights with respect to the query q(2)

keys = inputs @ W_key
values = inputs @ W_value
print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

In [None]:
## First, let's compute the attention score ω22

keys_2 = keys[1] #A
attn_score_22 = query_2.dot(keys_2)
print(attn_score_22)


In [None]:
## Again, we can generalize this computation to all attention scores via matrix multiplication:

attn_scores_2 = query_2 @ keys.T # All attention scores for given query
print(attn_scores_2)

In [None]:
## We compute the attention weights by scaling the attention scores and using the softmax function we used earlier. 
## The difference to earlier is that we now scale the attention scores by dividing them by the square root of the
 #embedding dimension of the keys. 

##Note that taking the square root is mathematically the same as exponentiating by 0.5:

d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)
print(attn_weights_2)

In [None]:
## We now compute the context vector as a weighted sum over the value vectors. 

## Here, the attention weights serve as a weighting factor that weighs the respective importance of each value vector. 

## We can use matrix multiplication to obtain the output in one step

context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

In [None]:
## IMPLEMENTING A COMPACT SELF ATTENTION PYTHON CLASS

In [None]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key   = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value
        
        attn_scores = queries @ keys.T # omega
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )

        context_vec = attn_weights @ values
        return context_vec

In [None]:
## Finally, we create a context vector by weighting the values with these normalized attention scores.

torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

In [None]:
## Since inputs contains six embedding vectors, we get a matrix storing the six context vectors, as shown in the above result. 
## As a quick check, notice how the second row ([0.3061, 0.8210]) matches the contents of context_vec_2 in the previous section.

In [None]:
## We can improve the SelfAttention_v1 implementation further by utilizing PyTorch's nn.Linear layers, which effectively perform matrix 
# multiplication when the bias units are disabled. 

## Additionally, a significant advantage of using nn.Linear instead of manually
 ## implementing nn.Parameter(torch.rand(...)) is that nn.Linear has an optimized weight
 ## initialization scheme, contributing to more stable and effective model training.

In [None]:
class SelfAttention_v2(nn.Module):

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        
        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

In [None]:
## You can use the SelfAttention_v2 similar to SelfAttention_v1:

torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))