<a href="https://colab.research.google.com/github/Mounika-Alwar/Transformer-Architecture/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer Architecture Using NumPy

## Dataset Prep

In [36]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [37]:
# Dataset Prep
# Toy parallel corpus: row i of 'complex_english' is paired with row i of 'simple_english'.

data = dict(
    complex_english=[
        'he is running',
        'she is eating',
        'they are playing',
        'i am reading',
        'we are working',
    ],
    simple_english=[
        'he runs',
        'she eats',
        'they play',
        'i read',
        'we work',
    ],
)

df = pd.DataFrame(data)
df

Unnamed: 0,complex_english,simple_english
0,he is running,he runs
1,she is eating,she eats
2,they are playing,they play
3,i am reading,i read
4,we are working,we work


In [38]:
# Vocabulary Mapping
# A token's id is its position in the list below: ids 0-2 are the special
# tokens, the remaining ids are ordinary words.

id_to_token = dict(enumerate([
    '<pad>', '<sos>', '<eos>',
    'she', 'he', 'they', 'i', 'we',
    'is', 'are', 'am',
    'running', 'eating', 'playing', 'reading', 'working',
    'runs', 'eats', 'play', 'read', 'work',
    'now',
]))

# Reverse lookup (token -> id), used when converting sentences to ids.
token_to_id = {word: index for index, word in id_to_token.items()}

In [39]:
# id_to_embedding
# Fixed seed so the toy 3-dimensional embeddings are the same on every run.
random.seed(7)

# One vector per token id (0..21); each component is a random draw rounded to 2 decimals.
embedding_lookup = {
    token_id: [round(random.random(), 2) for _ in range(3)]
    for token_id in range(22)
}

In [40]:
def input_to_embedding(input_sentence):
  """Look up the embedding vector of every whitespace-separated token.

  Each token is mapped to its id through `token_to_id` and then to its vector
  through `embedding_lookup`. A token missing from the vocabulary raises
  KeyError.

  Returns a list with one embedding (itself a list) per token.
  """
  embedding = []
  for token in input_sentence.split():
    embedding.append(embedding_lookup[token_to_id[token]])
  return embedding


In [41]:
def positional_encoding(seq_len,d_model=3):
  """Build the sinusoidal positional-encoding table, rounded to 2 decimals.

  For every even column index i, column i holds sin(pos / 10000**(i/d_model))
  and column i+1 (when it exists) holds the cosine of the same angle.

  Returns an array of shape (seq_len, d_model).
  """
  table = np.zeros((seq_len, d_model))

  for even_col in range(0, d_model, 2):
    # The divisor only depends on the column, so compute it once per column pair.
    divisor = 10000 ** (even_col / d_model)
    odd_col = even_col + 1
    for position in range(seq_len):
      angle = position / divisor
      table[position, even_col] = round(np.sin(angle), 2)
      if odd_col < d_model:
        table[position, odd_col] = round(np.cos(angle), 2)

  return table


In [42]:
def input_preprocessing(input_sentence):
  """Turn a source sentence into position-aware token embeddings.

  The sentence is embedded token by token with `input_to_embedding`, and a
  positional encoding of matching shape is added. The sequence length and
  embedding width are taken from the embedded sentence instead of being fixed
  at 3, so sentences of any non-zero length are supported.

  Returns an array of shape (number_of_tokens, embedding_width).
  Raises KeyError for out-of-vocabulary tokens and ValueError for a sentence
  with no tokens.
  """
  input_embedding = np.asarray(input_to_embedding(input_sentence), dtype=float)
  # An empty sentence yields a 1-D array here, so this unpacking rejects it.
  seq_len, d_model = input_embedding.shape
  positional_embedding = positional_encoding(seq_len, d_model)
  final_embedding = input_embedding + positional_embedding
  return final_embedding

In [43]:
def softmax(q,k):
  """Attention weights: row-wise softmax of (q @ k.T) / sqrt(d_k).

  Parameters
  ----------
  q : array of shape (n_queries, d_k)
  k : array of shape (n_keys, d_k)

  Returns
  -------
  Array of shape (n_queries, n_keys) whose rows each sum to 1.

  The scale factor uses the actual key width `d_k = q.shape[1]` and the output
  takes its shape from the score matrix, so inputs are no longer limited to
  3x3 matrices.
  """
  d_k = q.shape[1]
  scaled_scores = (q @ k.T) / np.sqrt(d_k)
  # Subtracting each row's maximum leaves the softmax unchanged but keeps
  # exp() from overflowing on large scores.
  shifted_scores = scaled_scores - scaled_scores.max(axis=1, keepdims=True)
  exp_scores = np.exp(shifted_scores)
  return exp_scores / exp_scores.sum(axis=1, keepdims=True)

In [44]:
def attention(q,k,v):
  """Combine the value rows `v` using the attention weights `softmax(q, k)`."""
  return softmax(q, k) @ v

In [45]:
def multi_head_attention(X,W1,W2,W0,no_of_heads=2):
  """Two-head self-attention over X followed by the output projection W0.

  W1 and W2 each hold the (query, key, value) projection matrices of one
  head. The two head outputs are stacked side by side, multiplied by W0 and
  rounded to two decimals.

  `no_of_heads` is accepted but never read: exactly two heads (W1 and W2)
  are always used.
  """
  head_outputs = []
  for w_query, w_key, w_value in (W1, W2):
    head_outputs.append(attention(X @ w_query, X @ w_key, X @ w_value))

  concatenated = np.hstack(head_outputs)
  return (concatenated @ W0).round(decimals=2)


In [46]:
def layer_normalization(cur_embedding,eps=1e-5):
  """Normalise each row by its own mean and variance (taken along axis 1).

  `eps` is added to the variance before the square root so that a constant
  row does not cause a division by zero.

  Returns an array with the same shape as `cur_embedding`.
  """
  row_mean = np.mean(cur_embedding, axis=1, keepdims=True)
  row_scale = np.sqrt(np.var(cur_embedding, axis=1, keepdims=True) + eps)
  return (cur_embedding - row_mean) / row_scale

In [47]:
def feed_forward(input,W3,W4):
  """Feed-forward block: linear map W3, ReLU, then linear map W4."""
  relu_hidden = np.maximum(0, input @ W3)  # negative activations are clamped to 0
  return relu_hidden @ W4

## Complete Encoder Pipeline

In [48]:
def init_encoder_params():
  """Draw the encoder weights with np.random.rand (NumPy's global RNG).

  Returned dict:
    "W1", "W2": lists of three 3x3 matrices (query, key, value) for heads 1 and 2
    "W0":       6x3 projection applied to the concatenated head outputs
    "W3", "W4": 3x3 matrices of the feed-forward block
  """
  # Draw order (W1, W2, W3, W4, W0) is kept so a seeded run yields the same weights.
  W1 = [np.random.rand(3, 3) for _ in range(3)]
  W2 = [np.random.rand(3, 3) for _ in range(3)]
  W3, W4 = np.random.rand(3, 3), np.random.rand(3, 3)
  W0 = np.random.rand(6, 3)

  return dict(W1=W1, W2=W2, W0=W0, W3=W3, W4=W4)

In [49]:
def encoder(input_embedding,params):
  """Run one encoder layer.

  Two sub-layers are applied in order, each followed by a residual addition
  and layer normalisation: multi-head self-attention (W1, W2, W0), then the
  feed-forward block (W3, W4).
  """
  W1, W2, W0, W3, W4 = (params[key] for key in ("W1", "W2", "W0", "W3", "W4"))

  # sub-layer 1: self-attention -> add -> norm
  self_attention = multi_head_attention(input_embedding, W1, W2, W0)
  attended = layer_normalization(input_embedding + self_attention)

  # sub-layer 2: feed forward -> add -> norm
  transformed = feed_forward(attended, W3, W4)
  return layer_normalization(attended + transformed)

In [50]:
def output_preprocessing(output):
  """Turn a target sentence into position-aware decoder input embeddings.

  The sentence is prefixed with the "<sos>" token, embedded with
  `input_to_embedding`, and a positional encoding of matching shape is added.
  The sequence length and embedding width are taken from the embedded sentence
  instead of being fixed at 3, so targets of any length are supported.

  Returns an array of shape (number_of_tokens + 1, embedding_width).
  Raises KeyError for out-of-vocabulary tokens.
  """
  output_embedding = np.asarray(input_to_embedding("<sos> " + output), dtype=float)
  seq_len, d_model = output_embedding.shape
  positional_embedding = positional_encoding(seq_len, d_model)
  final_embedding = output_embedding + positional_embedding
  return final_embedding

In [51]:
def softmax_mask(q,k):
  """Causally masked attention weights: softmax of (q @ k.T) / sqrt(d_k).

  Position i may only attend to positions 0..i: every score above the main
  diagonal is set to -inf before the softmax, so its weight becomes 0.

  Parameters
  ----------
  q : array of shape (n_queries, d_k)
  k : array of shape (n_keys, d_k)

  Returns
  -------
  Array of shape (n_queries, n_keys) whose rows each sum to 1.

  The scale factor is sqrt(d_k) with d_k = q.shape[1], matching the unmasked
  `softmax`, and the mask is built from the score matrix's shape, so inputs
  are no longer limited to 3x3 matrices.
  """
  d_k = q.shape[1]
  scaled_scores = (q @ k.T) / np.sqrt(d_k)

  # True strictly above the diagonal marks the "future" positions to hide.
  future = np.triu(np.ones(scaled_scores.shape, dtype=bool), k=1)
  masked_scores = scaled_scores + np.where(future, -np.inf, 0.0)

  # Column 0 is never masked, so each row's maximum is finite; subtracting it
  # keeps exp() from overflowing without changing the result.
  shifted_scores = masked_scores - masked_scores.max(axis=1, keepdims=True)
  exp_scores = np.exp(shifted_scores)
  return exp_scores / exp_scores.sum(axis=1, keepdims=True)

In [52]:
def attention_mask(q,k,v):
  """Combine the value rows `v` using the masked weights `softmax_mask(q, k)`."""
  return softmax_mask(q, k) @ v

In [53]:
def masked_multi_head_attention(X,W1,W2,W0,no_of_heads=2):
  """Two-head masked self-attention over X followed by the output projection W0.

  W1 and W2 each hold the (query, key, value) projection matrices of one
  head. Each head uses `attention_mask`; the two head outputs are stacked
  side by side and multiplied by W0.

  `no_of_heads` is accepted but never read: exactly two heads (W1 and W2)
  are always used.
  """
  head_outputs = []
  for w_query, w_key, w_value in (W1, W2):
    head_outputs.append(attention_mask(X @ w_query, X @ w_key, X @ w_value))

  return np.hstack(head_outputs) @ W0


In [54]:
def init_decoder_params():
  """Draw the decoder weights with np.random.rand (NumPy's global RNG).

  Returned dict:
    "W1", "W2": lists of three 3x3 matrices (query, key, value) for the two
                masked self-attention heads
    "W0":       6x3 projection applied to the concatenated head outputs
    "W3":       list of three 3x3 matrices (query, key, value) for cross-attention
    "W4", "W5": 3x3 matrices of the feed-forward block
  """
  def three_square_matrices():
    return [np.random.rand(3, 3) for _ in range(3)]

  # Draw order (W1, W2, W0, W3, W4, W5) is kept so a seeded run yields the same weights.
  W1 = three_square_matrices()
  W2 = three_square_matrices()
  W0 = np.random.rand(6, 3)
  W3 = three_square_matrices()
  W4, W5 = np.random.rand(3, 3), np.random.rand(3, 3)

  return dict(W1=W1, W2=W2, W0=W0, W3=W3, W4=W4, W5=W5)

In [55]:
def cross_attention(encoder_output,cur_decoder_embedding,W3):
  """Encoder-decoder attention.

  Queries are projected from the decoder embedding, while keys and values are
  projected from the encoder output. W3 holds the (query, key, value)
  projection matrices in that order.
  """
  w_q, w_k, w_v = W3
  return attention(
      cur_decoder_embedding @ w_q,
      encoder_output @ w_k,
      encoder_output @ w_v,
  )

## Complete Decoder Pipeline

In [56]:
def decoder(encoder_output, output_embedding,params,):
  """Run one decoder layer.

  Three sub-layers are applied in order, each followed by a residual addition
  and layer normalisation:
    1. masked multi-head self-attention over the target embeddings (W1, W2, W0)
    2. cross-attention against the encoder output (W3)
    3. the feed-forward block (W4, W5)
  """
  W1, W2, W0, W3, W4, W5 = (
      params[key] for key in ("W1", "W2", "W0", "W3", "W4", "W5")
  )

  # sub-layer 1: masked self-attention -> add -> norm
  masked_attention = masked_multi_head_attention(output_embedding, W1, W2, W0)
  hidden = layer_normalization(output_embedding + masked_attention)

  # sub-layer 2: cross-attention -> add -> norm
  # (queries come from the decoder state, keys and values from the encoder output)
  cross_attn = cross_attention(encoder_output, hidden, W3)
  hidden = layer_normalization(hidden + cross_attn)

  # sub-layer 3: feed forward -> add -> norm
  transformed = feed_forward(hidden, W4, W5)
  return layer_normalization(hidden + transformed)

In [57]:
def init_linear_params():
  """Draw the 3x22 output-projection matrix with np.random.rand, stored under 'W'."""
  return dict(W=np.random.rand(3, 22))

In [58]:
def linear(decoder_output,linear_params):
  """Multiply the decoder output by the weight matrix stored under 'W'."""
  return decoder_output @ linear_params['W']


In [59]:
def output_softmax(linear_output):
  """Row-wise softmax over the vocabulary logits.

  Parameters
  ----------
  linear_output : 2-D array-like of logits, one row per position.

  Returns
  -------
  Array with the same shape as `linear_output`; every row sums to 1.

  The result takes its shape from the input instead of being fixed at
  (3, 22), and each row's maximum is subtracted before exponentiating so
  large logits do not overflow to inf/nan.
  """
  logits = np.asarray(linear_output, dtype=float)
  shifted_logits = logits - logits.max(axis=1, keepdims=True)
  exp_logits = np.exp(shifted_logits)
  return exp_logits / exp_logits.sum(axis=1, keepdims=True)

## Complete Transformer Architecture

In [60]:
def transformer(input,output,encoder_params,decoder_params,linear_params):
  """Run the full encoder-decoder pass and return the predicted words.

  `input` is the source sentence and `output` the target sentence fed to the
  decoder (`output_preprocessing` prefixes it with "<sos>"). For every decoder
  position the highest-probability vocabulary entry is picked, and the chosen
  words are returned joined by single spaces.
  """
  # encoder side
  encoder_output = encoder(input_preprocessing(input), encoder_params)

  # decoder side
  decoder_output = decoder(encoder_output, output_preprocessing(output), decoder_params)

  # vocabulary distribution per position, then pick the most likely token
  probabilities = output_softmax(linear(decoder_output, linear_params))
  predicted_words = []
  for token_id in np.argmax(probabilities, axis=1):
    predicted_words.append(id_to_token[token_id])

  return ' '.join(predicted_words)

In [61]:
# Seed NumPy's global RNG before drawing any weights: the init functions all
# use np.random.rand, so without a seed the weights (and the prediction made
# from them) change on every Restart & Run All.
np.random.seed(7)

encoder_params = init_encoder_params()
decoder_params = init_decoder_params()
linear_params = init_linear_params()

In [62]:
# Untrained forward pass on the first sentence pair; the weights are random,
# so the predicted words are not expected to match the target.
transformer(
    input="he is running",
    output="he runs",
    encoder_params=encoder_params,
    decoder_params=decoder_params,
    linear_params=linear_params,
)

'she she eating'