In [2]:
# imports
import torch            # for normal utility functions like random etc..
import torch.nn as nn   # for neural network related functions, ex: nn.Layer, nn.Module etc..
import math
import numpy as np
import os
import pickle
import requests
import numpy as np
from torch.nn.utils.rnn import pad_sequence

In [61]:
config = {
    'embedding_dim' : 1000,
    'model_context_length' : 20,
    'q_k_dim' : 600,
    'num_heads' : 8,
    'ff_intermediate_dim':4000,
    'drop_out_prob' : 0.5,
    'n_encoders':2,
    'n_decoders':2,
}
config['value_dim'] = config['embedding_dim']//config['num_heads']

In [59]:
config

{'embedding_dim': 768,
 'model_context_length': 20,
 'q_k_dim': 500,
 'num_heads': 8,
 'ff_intermediate_dim': 4000,
 'drop_out_prob': 0.5,
 'n_encoders': 2,
 'n_decoders': 2,
 'value_dim': 96}

Data Prep. [ ref : #https://github.com/karpathy/nanoGPT/blob/master/data/shakespeare_char/prepare.py ]

In [4]:
!wget 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

--2024-02-15 10:19:33--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-02-15 10:19:33 (20.0 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [5]:
with open('input.txt', 'r') as f:
  data = f.read()

In [6]:
print('length of data : ', len(data))
vocab_size = len(data)

length of data :  1115394


In [7]:
print(" unique set of characters ", set(data))
chars = set(data)
print(len(chars))

 unique set of characters  {'r', 'A', "'", 'g', 'q', 'j', 'M', 'o', 'y', 'K', 'D', 'G', 'H', 'E', 'Q', 'c', 'Y', 't', 'b', 'x', 'C', 'u', 'F', 'd', ';', 'N', 'p', 'f', 'X', 'a', 's', 'h', 'l', '3', 'n', 'O', '&', '?', 'z', 'w', 'U', 'T', ',', 'i', 'Z', 'S', 'B', 'I', 'J', '$', ':', 'k', 'P', ' ', 'v', 'W', '-', 'R', '.', 'L', 'V', '!', 'm', '\n', 'e'}
65


In [62]:
config['vocab_size'] = len(chars)

In [9]:
# create a mapping from characters to integers
ctoi = { ch:i for i,ch in enumerate(chars) }
itoc = { i:ch for i,ch in enumerate(chars) }

In [10]:
print(ctoi)

{'r': 0, 'A': 1, "'": 2, 'g': 3, 'q': 4, 'j': 5, 'M': 6, 'o': 7, 'y': 8, 'K': 9, 'D': 10, 'G': 11, 'H': 12, 'E': 13, 'Q': 14, 'c': 15, 'Y': 16, 't': 17, 'b': 18, 'x': 19, 'C': 20, 'u': 21, 'F': 22, 'd': 23, ';': 24, 'N': 25, 'p': 26, 'f': 27, 'X': 28, 'a': 29, 's': 30, 'h': 31, 'l': 32, '3': 33, 'n': 34, 'O': 35, '&': 36, '?': 37, 'z': 38, 'w': 39, 'U': 40, 'T': 41, ',': 42, 'i': 43, 'Z': 44, 'S': 45, 'B': 46, 'I': 47, 'J': 48, '$': 49, ':': 50, 'k': 51, 'P': 52, ' ': 53, 'v': 54, 'W': 55, '-': 56, 'R': 57, '.': 58, 'L': 59, 'V': 60, '!': 61, 'm': 62, '\n': 63, 'e': 64}


In [11]:
# converting integers to string and string to integers
def encode(s):
    return [ctoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itoc[i] for i in l]) # decoder: take a list of integers, output a string

In [12]:
encode("hello there")

[31, 64, 32, 32, 7, 53, 17, 31, 64, 0, 64]

In [13]:
decode(encode("hello there"))     # decoding the encoded values

'hello there'

In [14]:
decode([0])

'r'

In [15]:
# train test split

n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

In [16]:
# encode both to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

train has 1,003,854 tokens
val has 111,540 tokens


In [17]:
# function for returning the batch of base data
def batch_data(batch_size, context_length, context = 'train'):
  data = val_ids if context=='test' else train_ids
  indices = torch.randint(low = 0, high=len(data) - context_length, size= (batch_size,))
  x = torch.stack([torch.tensor(data[index:index+context_length]) for index in indices])
  y = torch.stack([torch.tensor(data[index+1:index+1+context_length]) for index in indices])
  return x, y


# # function for visualizing the data how transformer processes it

# def prepare_data(batch_size, x,y):
#     train_x, train_y = [],[]

#     for batch_ind in range(batch_size):
#       train_x=[x[batch_ind,:i+1] for i in range(x.shape[1])]
#       train_y=[y[batch_ind, i] for i in range(y.shape[1])]
#       train_x = pad_sequence(train_x, batch_first=True, padding_value=-1)
#       train_y = torch.tensor(train_y)
#     return train_x, train_y

def get_data(context, batch_size=8):
  batch_size, context_length = batch_size, config['model_context_length']
  x, y = batch_data(batch_size, context_length, context=context)
  # train_x, train_y = prepare_data(batch_size,x,y)
  return x, y



x,y = get_data('train')

In [18]:
x

tensor([[64, 17, 53, 18,  0, 64, 29, 30, 17, 58],
        [64, 53, 27,  0, 64, 30, 31, 64, 30, 17],
        [34, 53, 29, 34, 53, 29, 30, 30,  2, 30],
        [ 3, 64, 53, 30, 64, 32, 23,  7, 62, 53],
        [64, 34, 42, 53, 29, 53, 31, 29, 26, 32],
        [31, 29, 17, 53, 39, 64, 53, 23, 43, 23],
        [63, 46, 64, 15, 29, 21, 30, 64, 53, 47],
        [64, 30, 53, 32,  7, 34,  3, 53, 29,  3]])

In [19]:
# text embedding

class embedding(nn.Module):
  def __init__(self):
    super(embedding, self).__init__()
    self.embedding_layer = nn.Embedding(num_embeddings = config['vocab_size'], embedding_dim = config['embedding_dim'])
    self.positional_embedding = nn.Embedding(num_embeddings = config['model_context_length'], embedding_dim = config['embedding_dim'])

  def forward(self, encodings):
    embeddings = self.embedding_layer(encodings)

    # positional embeddings
    positional_encodings = torch.arange(encodings.size(1)).unsqueeze(0)
    positionals_embed = self.positional_embedding(positional_encodings)

    return embeddings + positionals_embed                  # returns (*encodings.shape,  config.embedding_dim), sum is happening after broadcast

In [286]:
# embed = embedding()
# inp = torch.randint( high=9, size=(1,13))
# print(inp)
# embed.positional_embedding(inp)

In [20]:
config['model_context_length']

10

In [21]:
class Self_Attention(nn.Module):
  def __init__(self, mask=False):
    super(Self_Attention, self).__init__()
    self.query = nn.Linear(config['embedding_dim'], config['q_k_dim'])
    self.key = nn.Linear(config['embedding_dim'], config['q_k_dim'])
    self.value = nn.Linear(config['embedding_dim'], config['value_dim'])
    self.mask = mask
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, inputs):

    if not isinstance(inputs, list):                                             # for cross attention send input as [query, key, value]
      inputs = [inputs]*3

    queries = self.query(inputs[0])
    keys = self.key(inputs[1])
    values = self.value(inputs[2])
    keys = torch.transpose(keys,1,2)
    attentions = torch.matmul(queries, keys)*(torch.sqrt(torch.tensor(keys.size(1))))**-1

    # print("are we doing masking ? ", self.mask)

    if self.mask:
      # print('masking the attentions for decoder')
      attentions = torch.tril(attentions)
      attentions = attentions.masked_fill(attentions==0, -float('inf'))

    attentions = self.softmax(attentions)

    return attentions, torch.matmul(attentions, values)


In [176]:
# embed = embedding()
# obj = Self_Attention()
# encodings = torch.randint(0, config['vocab_size'], (2, 10))
# embeddings = embed(encodings)
# attentions, res = obj(embeddings)
# res.size()

In [22]:
class multi_head_attention(nn.Module):
  def __init__(self, mask=False):
    super(multi_head_attention, self).__init__()
    self.attentions = [Self_Attention(mask) for _ in range(config['num_heads'])]

  def forward(self, inputs):
    self_attention_heads = [self_attention(inputs)[1] for self_attention in self.attentions]      # passing input to each head separately
    return torch.cat(self_attention_heads, dim=-1)

In [179]:
# embed = embedding()
# obj = multi_head_attention()

# encodings = torch.randint(0, config['vocab_size'], (2, 10))
# embeddings = embed(encodings)
# # attentions, res = obj(embeddings)
# res = obj(embeddings)
# res.size()

In [23]:
class feed_forward(nn.Module):
  def __init__(self):
    super(feed_forward, self).__init__()
    self.FF_1 = nn.Linear(config['embedding_dim'], config['ff_intermediate_dim'])
    self.FF_2 = nn.Linear(config['ff_intermediate_dim'], config['embedding_dim'])
    self.gelu = nn.GELU()
    self.dropout = nn.Dropout(config['drop_out_prob'])


  def forward(self, inputs):
    FF1_outputs = self.FF_1(inputs)
    FF1_outputs_nonlinear = self.gelu(FF1_outputs)
    FF2_outputs = self.FF_2(FF1_outputs_nonlinear)
    forward_out = self.dropout(FF2_outputs)
    return forward_out

In [24]:
embed = embedding()
obj = multi_head_attention()
ff = feed_forward()

encodings = torch.randint(0, config['vocab_size'], (2, 10))
embeddings = embed(encodings)
# attentions, res = obj(embeddings)
res = obj(embeddings)
ff(res).size()

torch.Size([2, 10, 768])

In [25]:
class Encoder(nn.Module):
  def __init__(self):
    super(Encoder, self).__init__()
    self.MHA = multi_head_attention()
    self.ff = feed_forward()
    self.layer_norm_1 = nn.LayerNorm(config['embedding_dim'])
    self.layer_norm_2 = nn.LayerNorm(config['embedding_dim'])

  def forward(self, inputs):
    encoder_inputs = self.layer_norm_1(inputs)
    Attended_embeds = self.MHA(encoder_inputs) + inputs
    ff_res = self.ff(self.layer_norm_2(Attended_embeds))+Attended_embeds
    return ff_res

In [184]:
# embed = embedding()
# encoder = Encoder()
# # obj = multi_head_attention()
# # ff = feed_forward()

# encodings = torch.randint(0, config['vocab_size'], (2, 10))
# embeddings = embed(encodings)
# # attentions, res = obj(embeddings)
# res = encoder(embeddings)
# res.size()

torch.Size([2, 10, 768])

## Decoder

In [26]:
class Decoder(nn.Module):
  def __init__(self):
    super(Decoder, self).__init__()
    self.masked_MHA = multi_head_attention(mask=True)
    self.MHA = multi_head_attention()
    self.ff = feed_forward()
    self.layer_norm_1 = nn.LayerNorm(config['embedding_dim'])
    self.layer_norm_2 = nn.LayerNorm(config['embedding_dim'])
    self.layer_norm_3 = nn.LayerNorm(config['embedding_dim'])


  def forward(self, encoder_output, decoder_input):
    decoder_input_normalized = self.layer_norm_1(decoder_input)
    masked_decoder_output = self.masked_MHA(decoder_input_normalized) + decoder_input
    masked_decoder_output_normalized = self.layer_norm_2(masked_decoder_output)
    cross_attention_output = self.MHA([masked_decoder_output_normalized, encoder_output, encoder_output]) + masked_decoder_output
    cross_attention_output_normalized = self.layer_norm_3(cross_attention_output)
    ff_output = self.ff(cross_attention_output_normalized)  + cross_attention_output

    return ff_output


# Completer Transformer Architecture

In [27]:
class Transformer(nn.Module):
  def __init__(self, n_encoders, n_decoders):
    self.encoders = [Encoder() for _ in range(n_encoders)]
    self.decoders = [Decoder() for _ in range(n_decoders)]


  def forward(self, encoder_inputs, decoder_inputs):
    encoder_outputs = [encoder_inputs := encoder(encoder_inputs) for encoder in self.encoders]
    decoder_outputs = [decoder_inputs := decoder(encoder_outputs[-1], decoder_inputs) for decoder in self.decoders]
    return decoder_outputs


In [28]:
transformers = Transformer(config['n_encoders'], config['n_decoders'])

# GPT ( Decoder only Transformer )

In [29]:
class Decoder(nn.Module):
  def __init__(self):
    super(Decoder, self).__init__()
    self.layer_norm_1 = nn.LayerNorm(config['embedding_dim'])
    self.MHA = multi_head_attention(mask=True)
    self.layer_norm_2 = nn.LayerNorm(config['embedding_dim'])
    self.ff = feed_forward()

  def forward(self, inputs):
    inputs_normalized = self.layer_norm_1(inputs)
    inputs_attention  = self.MHA(inputs_normalized) + inputs
    inputs_attention_normalized = self.layer_norm_2(inputs_attention)
    decoder_response = self.ff(inputs_attention_normalized) + inputs_attention

    return decoder_response


In [30]:
class GPT(nn.Module):
  def __init__(self, n_decoder):
    super(GPT, self).__init__()
    self.decoders = nn.Sequential(*[Decoder() for _ in range(n_decoder)])
    self.linear = nn.Linear(in_features=config['embedding_dim'], out_features=config['vocab_size'])
    # self.decoders = nn.Sequential([decoder() for decoder in self.decoders])
    self.embed = embedding()
    self.loss = torch.nn.CrossEntropyLoss()

  # def forward(self, decoder_input, targets):
  #   embddings = self.embed(decoder_input)
  #   decoders_response = self.decoders(embddings)
  #   gpt_predictions = self.linear(decoders_response)
  #   # reshaping for Loss function
  #   B,T,C = gpt_predictions.shape
  #   gpt_predictions = gpt_predictions.view(B*T, C)
  #   B,T = targets.shape
  #   targets = targets.view(B*T)

  #   loss_value = self.loss(gpt_predictions, targets)
  #   return gpt_predictions, loss_value


  def forward(self, decoder_input, targets=None):
      embddings = self.embed(decoder_input)
      decoders_response = self.decoders(embddings)
      gpt_predictions = self.linear(decoders_response)
      # reshaping for Loss function
      B,T,C = gpt_predictions.shape
      gpt_predictions = gpt_predictions.view(B*T, C)
      if targets is not None:
        B,T = targets.shape
        targets = targets.view(B*T)

      if targets is not None:
        loss_value = self.loss(gpt_predictions, targets)
      else:
        loss_value = None
      return gpt_predictions, loss_value




In [31]:
# @torch.no_grad()
# def estimate_loss():
#     out = {}
#     model.eval()
#     for split in ['train', 'val']:
#         losses = torch.zeros(eval_iters)
#         for k in range(eval_iters):
#             X, Y = get_data(split)
#             logits, loss = model(X, Y)
#             losses[k] = loss.item()
#         out[split] = losses.mean()
#     model.train()
#     return out

In [63]:
# testing to make sure model is processing properly

model = GPT(n_decoder=1)

total_params = sum(
	param.numel() for param in model.parameters()
)
print("total model params ", total_params/1e6, "Million")


xb, yb = get_data('train', batch_size=2)
print(xb)
print(xb.shape)
print(yb.shape)
# evaluate the loss
logits, loss = model(xb, yb)

print("any logits with nan ? " ,logits.isnan().sum())
print("loss value ", loss)

total model params  8.159065 Million
tensor([[62,  7,  0, 17, 29, 32, 30, 53, 17, 31, 29, 17, 53, 27, 29, 32, 32, 53,
         18, 29],
        [32, 23, 53, 62, 29, 34, 53, 23,  7, 64, 30, 53, 39, 31, 64, 34, 53, 17,
         31, 64]])
torch.Size([2, 20])
torch.Size([2, 20])
any logits with nan ?  tensor(0)
loss value  tensor(4.4004, grad_fn=<NllLossBackward0>)


In [33]:
print(logits.shape)

torch.Size([20, 65])


In [64]:
@torch.no_grad()
def predict(model, max_length, sentence_start = None):

  if sentence_start is None:
    start_token = torch.randint(low=0, high=config['vocab_size'], size=(1,))
    start_token = start_token.tolist()
  else:
    start_token = encode(sentence_start)
    start_token = start_token

  predicted_sentence = []
  print("start token " , start_token)
  predicted_sentence.append(decode(start_token))
  model.eval()
  while len(predicted_sentence)!=max_length:
    input = torch.tensor(start_token).unsqueeze(0)
    # print("input : ", input, " input shape : ", input.shape)
    prediction, loss = model(input)

    # print("predictions shape : ", prediction.shape)

    pred_char_index = np.argmax(prediction[-1,:])
    pred_char_index = pred_char_index.tolist()
    # print("PREDICTED index " , pred_char_index)

    if not len(start_token)<config['model_context_length']:
      start_token = start_token[1:]

    start_token.append(pred_char_index)
    predicted_char = decode([pred_char_index])
    predicted_sentence.append(predicted_char)
  model.train()

  print("".join(predicted_sentence))


In [66]:
predict(model, 100, sentence_start="the")

start token  [17, 31, 64]
the,!:oK3.!yQnEq hQZr:::fw.!:fw.!::fZK3LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL


In [86]:
val = torch.randint(high=10, size=(3,))
print(val)
print(val.tolist())
print(val.unsqueeze(0))

tensor([5, 4, 8])
[5, 4, 8]
tensor([[5, 4, 8]])


In [67]:
config['max_iters'] = 1000
config['eval_interval'] = 10
config['eval_iters']=5

In [68]:
config

{'embedding_dim': 1000,
 'model_context_length': 20,
 'q_k_dim': 600,
 'num_heads': 8,
 'ff_intermediate_dim': 4000,
 'drop_out_prob': 0.5,
 'n_encoders': 2,
 'n_decoders': 2,
 'value_dim': 125,
 'vocab_size': 65,
 'max_iters': 1000,
 'eval_interval': 10,
 'eval_iters': 5}

In [70]:
max_iters = config['max_iters']
eval_interval = config['eval_interval']
eval_iters = config['eval_iters']
learning_rate = 1e-3

# initialing the model with #decoder blocks
model = GPT(n_decoder=8)

# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

total_params = sum(
	param.numel() for param in model.parameters()
)
print("total model params ", total_params/1e6, "Million")

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_data(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out



for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        predict(model, max_length=100)

    # sample a batch of data
    xb, yb = get_data('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

total model params  64.222065 Million
step 0: train loss 4.5936, val loss 4.6434
start token  [26]
pgaPSbCE;3??QQvvLa?Q,J??????????????????????????????????????????????????????????????????????????????
step 10: train loss 5.6975, val loss 5.7208
start token  [53]
 paldshaveasayiffovisatoryyyyyyyyoryyyy:
Hayy:
Hayyyyyyyyyyyyyy:
HHay:
Hayyyyyyyyyyyyyyy:
Hay:
Hayyy
step 20: train loss 4.1381, val loss 3.9956
start token  [7]
o mow w w we h th
T rerererererererererer co co co co co atrererererererererer co co co co co atrere
step 30: train loss 3.4210, val loss 3.3728
start token  [37]
? h m moror h s  mollllllllllllld howowis thous hous hous ce hous hous hous ce hous hous hous ce hou
step 40: train loss 3.4263, val loss 3.5400
start token  [8]
youse, feffreereeathase, wise, , ase, wise, ase, ase, ase, ase, ase, ase, ase, ase, ase, ase, ase, a
step 50: train loss 3.3630, val loss 3.3791
start token  [17]
ts ar ararararar, as is arararararararararararararararararararararararararararararararar

KeyboardInterrupt: 

In [129]:
numbers = [1, 2, 3, 4, 5, 6]
# squared_evens = [square := n ** 2 for n in numbers if (n % 2 == 0)]
squared_evens = [n ** 2 for n in numbers if (n % 2 == 0)]


In [136]:
x = 2
[x:=x**i for i in list(range(1,5))]

[2, 4, 64, 16777216]

In [127]:
squared_evens

[4, 16, 36]

In [130]:
squared_evens

[4, 16, 36]

In [None]:
((5*2)+5)**2

225

In [None]:
def func1(x):
    return x * 2

def func2(x):
    return x + 5

def func3(x):
    return x ** 2


functions_list = [func1, func2, func3]

input_value = 5  # The initial input value

result = [input_value := func(input_value) for func in functions_list]
print(result[-1])  # The final output after chaining all functions


225


In [None]:
result

[10, 15, 225]

In [None]:
# class masked_head_attention(nn.Module):
#     def __init__(self):
#       self.query = nn.Linear(config['embedding_dim'], config['q_k_dim'])
#       self.key= nn.Linear(config['embedding_dim'], config['q_k_dim'])
#       self.value = nn.Linear(config['embedding_dim'], config['value_dim'])

#     def forward(self, inputs):
#       self.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attention = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
        self.cross_attention = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, d_model)
        )
        self.dropout = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        tgt2 = self.self_attention(tgt, tgt, tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm1(tgt)

        tgt2 = self.cross_attention(tgt, memory, memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm2(tgt)

        tgt2 = self.feed_forward(tgt)
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm3(tgt)

        return tgt


class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dim_feedforward, dropout):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dim_feedforward, dropout) for _ in range(num_layers)])

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        for layer in self.layers:
            tgt = layer(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask)
        return tgt


In [None]:
import torch
import torch.nn as nn

class MultiheadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.0):
        super(MultiheadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.output_linear = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
        batch_size, seq_len, _ = query.size()

        query = self.query_linear(query)
        key = self.key_linear(key)
        value = self.value_linear(value)

        query = self._reshape_to_batches(query)
        key = self._reshape_to_batches(key)
        value = self._reshape_to_batches(value)

        # Compute scaled dot-product attention
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / self.head_dim**0.5

        if attn_mask is not None:
            attention_scores += attn_mask.unsqueeze(1)

        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)

        context = torch.matmul(attention_probs, value)
        context = self._reshape_from_batches(context)
        context = self.output_linear(context)

        if key_padding_mask is not None:
            context = context.masked_fill(key_padding_mask.unsqueeze(-1), 0)

        return context

    def _reshape_to_batches(self, tensor):
        batch_size, seq_len, d_model = tensor.size()
        tensor = tensor.view(batch_size, seq_len, self.num_heads, self.head_dim)
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        return tensor.view(batch_size * self.num_heads, seq_len, self.head_dim)

    def _reshape_from_batches(self, tensor):
        batch_size, seq_len, _ = tensor.size()
        tensor = tensor.view(batch_size // self.num_heads, self.num_heads, seq_len, self.head_dim)
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        return tensor.view(batch_size // self.num_heads, seq_len, self.d_model)


In [None]:
import torch

In [None]:
torch.randint(1,100,size=(5,5))

tensor([[20,  3, 97,  6, 45],
        [29, 82, 72, 18, 97],
        [80, 45, 59, 71, 33],
        [67, 95, 92,  5, 21],
        [22, 13, 43, 69, 40]])

In [None]:
torch.tril(torch.randint(1,100, size=(5,5)), diagonal=0)

tensor([[92,  0,  0,  0,  0],
        [96, 87,  0,  0,  0],
        [80, 67,  4,  0,  0],
        [63, 71, 51, 86,  0],
        [24, 17, 74, 47, 46]])

In [None]:
matrix = torch.tril(torch.randint(1,100, size=(5,5)), diagonal=0)
matrix = matrix.float()
matrix.masked_fill(matrix==0, -float('inf'))

tensor([[55., -inf, -inf, -inf, -inf],
        [56., 58., -inf, -inf, -inf],
        [ 7., 76., 96., -inf, -inf],
        [96., 59., 60., 92., -inf],
        [42., 28., 87., 36.,  9.]])

In [None]:
torch.softmax(matrix.masked_fill(matrix==0, -float('inf')), dim=-1)

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.1920e-01, 8.8080e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.2274e-39, 2.0612e-09, 1.0000e+00, 0.0000e+00, 0.0000e+00],
        [9.8201e-01, 8.3796e-17, 2.2778e-16, 1.7986e-02, 0.0000e+00],
        [2.8625e-20, 2.3803e-26, 1.0000e+00, 7.0955e-23, 1.3336e-34]])

In [None]:
torch.softmax(torch.tensor([42., 28., 87., 36.,  9.]), dim=-1)

tensor([2.8625e-20, 2.3803e-26, 1.0000e+00, 7.0955e-23, 1.3336e-34])

In [None]:
torch.softmax(torch.tensor([.42, .28, .87, .36,  .9]), dim=-1)

tensor([0.1668, 0.1450, 0.2616, 0.1571, 0.2695])

In [None]:
m = nn.GELU(approximate='tanh')
input = torch.randint(100,size=(1,), dtype=torch.float32)
m(input)

tensor([24.])

In [None]:
a = torch.tensor([[1,2,3],[4,5,6]])
b = torch.tensor([10,20,30])

In [None]:
print(a)
print(b)

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[10, 20, 30]])


In [None]:
b = b.unsqueeze(0)

In [None]:
print(a.size())
print(b.size())

torch.Size([2, 3])
torch.Size([1, 3])


In [None]:
config

{'vocab_size': 3000,
 'embedding_dim': 768,
 'model_context_length': 500,
 'q_k_dim': 560}

In [None]:
query = nn.Linear(config['embedding_dim'], config['q_k_dim'])
key = nn.Linear(config['embedding_dim'], config['q_k_dim'])
value = nn.Linear(config['embedding_dim'], config['embedding_dim'])


print(query.weight.size())
print(key.weight.size())
print(value.weight.size())

torch.Size([560, 768])
torch.Size([560, 768])
torch.Size([768, 768])


In [None]:
embedded_inputs = torch.randn(size=(3,5,768))

In [None]:
embedded_inputs.size()

torch.Size([3, 5, 768])

In [None]:
query(embedded_inputs).size()

torch.Size([3, 5, 560])

In [None]:
queries = query(embedded_inputs)
keys = key(embedded_inputs)
values = value(embedded_inputs)


In [None]:
print(queries.size())
keys.size()

torch.Size([3, 5, 560])


torch.Size([3, 5, 560])

In [None]:
keys = torch.transpose(keys,1,2)

In [None]:
keys.size()

torch.Size([3, 560, 5])

In [None]:
torch.matmul(queries, keys)

tensor([[[  2.6170,   8.8673,   6.5697,   5.3232,   8.1410],
         [ 14.5556,  -4.2279,  -2.5253,  -6.4886,  13.6497],
         [-13.8412,   0.4857,   3.5472,   1.6711,   8.3584],
         [  5.2161,   4.3817,  -0.6309,  -3.3748,   9.5136],
         [  2.6175,  -3.8543, -11.0960,   0.2316,   5.0176]],

        [[-10.2110,   1.6841,  -4.9994,   2.7654,  15.3124],
         [ -1.6696,   4.3291,  -7.8428,  -2.3466,   1.8950],
         [  3.2035,   4.5864,  -5.8894,   8.6451,  12.5560],
         [ -9.7150,   3.6997,  -7.2993,   7.6571, -12.6875],
         [ -0.3818, -24.1317,   5.5952,   1.5995,   3.9394]],

        [[ 10.9530,   5.2465,   9.9187,  -2.5286,   2.2908],
         [ 15.3642,   0.4163,  -3.3867,  14.6251,   0.6258],
         [ -6.4831,  -2.0004,   3.6250, -10.1800,   1.4576],
         [ -0.8054,  12.9416, -13.3502,  -1.9531,  -3.4387],
         [  5.9473,  -0.0591,   4.9450,   3.3467,   9.6258]]],
       grad_fn=<UnsafeViewBackward0>)

In [None]:
torch.matmul(queries, keys)*(torch.sqrt(torch.tensor(keys.size(1))))**-1

tensor([[[ 0.1106,  0.3747,  0.2776,  0.2249,  0.3440],
         [ 0.6151, -0.1787, -0.1067, -0.2742,  0.5768],
         [-0.5849,  0.0205,  0.1499,  0.0706,  0.3532],
         [ 0.2204,  0.1852, -0.0267, -0.1426,  0.4020],
         [ 0.1106, -0.1629, -0.4689,  0.0098,  0.2120]],

        [[-0.4315,  0.0712, -0.2113,  0.1169,  0.6471],
         [-0.0706,  0.1829, -0.3314, -0.0992,  0.0801],
         [ 0.1354,  0.1938, -0.2489,  0.3653,  0.5306],
         [-0.4105,  0.1563, -0.3085,  0.3236, -0.5361],
         [-0.0161, -1.0198,  0.2364,  0.0676,  0.1665]],

        [[ 0.4628,  0.2217,  0.4191, -0.1069,  0.0968],
         [ 0.6493,  0.0176, -0.1431,  0.6180,  0.0264],
         [-0.2740, -0.0845,  0.1532, -0.4302,  0.0616],
         [-0.0340,  0.5469, -0.5641, -0.0825, -0.1453],
         [ 0.2513, -0.0025,  0.2090,  0.1414,  0.4068]]],
       grad_fn=<MulBackward0>)

In [None]:
attentions = torch.matmul(queries, keys)*(torch.sqrt(torch.tensor(keys.size(1))))**-1

In [None]:
attentions.size()

torch.Size([3, 5, 5])

In [None]:
torch.matmul(attentions, values)

tensor([[[-0.3970,  0.0622,  0.4446,  ...,  0.1658, -0.1390, -0.3943],
         [-0.4759, -1.0286,  0.0432,  ..., -0.3399, -0.5756, -0.3593],
         [ 0.1316, -0.5678,  0.2755,  ..., -0.2129, -0.2254, -0.0814],
         [-0.3659, -0.4534,  0.0581,  ..., -0.0093, -0.4205, -0.5891],
         [ 0.1767, -0.5330, -0.5329,  ...,  0.2687, -0.6386, -0.4443]],

        [[-0.9429,  0.4377, -0.5721,  ...,  0.4676,  0.0925, -0.7004],
         [-0.5914,  0.1866, -0.3237,  ...,  0.1362, -0.0020, -0.3840],
         [-0.5352,  0.3863, -0.3279,  ...,  0.2841, -0.3369, -1.1716],
         [ 0.0963,  0.3325,  0.0582,  ...,  0.9236,  0.1016, -0.1899],
         [ 0.7588, -0.6284,  0.0349,  ...,  0.0626,  0.0272,  1.0723]],

        [[-0.1058, -0.7051,  0.3918,  ..., -0.5771,  0.6012, -0.5737],
         [-0.1330, -0.5764, -0.1608,  ..., -0.5232, -0.0273,  0.4008],
         [ 0.0018,  0.1371,  0.2604,  ...,  0.1715, -0.0073, -0.3464],
         [ 0.4207,  0.5730, -0.6647,  ...,  0.9251, -0.3237,  0.2907],
  

In [None]:
m = nn.Linear(768, 560)
print(m.weight.size())
input = torch.randn(3,5, 768)
output = m(input)
print(output.size())

torch.Size([560, 768])
torch.Size([3, 5, 560])


In [None]:
embed_layer  = embedding()

In [None]:
encodings = torch.randint(1,101,size=(3,10))

In [None]:
encodings.size()

torch.Size([3, 10])

In [None]:
res = embed_layer(encodings)

In [None]:
res.size()

torch.Size([3, 10, 768])

In [None]:
# rough

print(torch.randint(0,101,size=(10,), dtype=torch.float32))
# print(torch.randint(0,101,size=(1,10)))

tensor([75., 71., 77., 20., 78., 93., 26.,  8., 44., 77.])


In [None]:
nn.Linear(10, 5)(torch.randint(1,101,size=(10,), dtype=torch.float32))

tensor([-21.6303,  10.7729,  40.9039, -22.2035, -21.5263],
       grad_fn=<AddBackward0>)

In [None]:
encodings = torch.randint(1,101,size=(3,5), dtype=torch.float32)
print(encodings)
print(encodings.size())

tensor([[64., 76., 37., 27., 20.],
        [71., 84.,  4., 94., 42.],
        [79., 27., 72.,  3., 42.]])
torch.Size([3, 5])


In [None]:
res = nn.Linear(5, 10)(encodings)
print(res.shape)
print(res)

torch.Size([3, 10])
tensor([[-21.5715,  20.1209,  -0.4824,  14.3352,  18.1300,  -1.5594, -16.9697,
         -16.9604, -38.5803,  59.1881],
        [-20.9963,  -7.9470,   5.9263,  15.1599,  23.3282,  17.7045,  13.4380,
          18.2586, -58.1793,  56.8789],
        [-13.3298,  20.9227,   5.1337,   9.4617,  41.7800,  10.1199,  -9.5355,
          -3.4605, -31.0516,  62.9349]], grad_fn=<AddmmBackward0>)


In [None]:
embedding_layer = nn.Embedding(num_embeddings=100, embedding_dim=30, padding_idx=9)

In [None]:
for i in embedding_layer.parameters():
  print(i.requires_grad)

True


In [None]:
embedding_layer.weight.requires_grad

True

In [None]:
# encodings = torch.randint(1,101,size = (10,))
print(encodings)
embedded = embedding_layer(encodings)
# print(embedded)
print(embedded.size())

tensor([46, 28, 55, 48, 37,  9, 51, 57, 40,  7])
torch.Size([10, 30])


In [None]:
torch.randint(0,101,size=(3,5,100)).size(1)

5

In [None]:

torch.arange(5)  + torch.randint(1,100,size=(2,5))

tensor([[14, 22, 89, 17, 94],
        [68, 10, 85, 29, 46]])

In [None]:

torch.arange(5).unsqueeze(0)  + torch.randint(1,100,size=(2,5))

tensor([[99, 13, 57, 80, 11],
        [32, 94, 38, 33, 23]])