<a href="https://colab.research.google.com/github/Jaseelkt007/ML/blob/master/Language_Transformer_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Transformer from scratch
### Based on the paper "Attention Is All You Need" (2017)

In [None]:
# Colab-only: mount Google Drive so the dataset files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


# Define the paths to the dataset files inside the multi30k-dataset folder in your Drive
# Training split (German / English files).
train_de_path = '/content/drive/MyDrive/multi30k_data/multi30k-dataset/data/task1/raw/train.de'
train_en_path = '/content/drive/MyDrive/multi30k_data/multi30k-dataset/data/task1/raw/train.en'

# Validation split.
val_de_path = '/content/drive/MyDrive/multi30k_data/multi30k-dataset/data/task1/raw/val.de'
val_en_path = '/content/drive/MyDrive/multi30k_data/multi30k-dataset/data/task1/raw/val.en'

# Test split (the test_2016_flickr files).
test_de_path = '/content/drive/MyDrive/multi30k_data/multi30k-dataset/data/task1/raw/test_2016_flickr.de'
test_en_path = '/content/drive/MyDrive/multi30k_data/multi30k-dataset/data/task1/raw/test_2016_flickr.en'

# Function to load data from a file
def load_data(file_path):
    """Read a UTF-8 text file and return its lines as a list.

    Every element keeps its trailing newline character, exactly as the
    file object yields it.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        lines = list(handle)
    return lines

# Load the training, validation, and test datasets
# Each variable is the list returned by load_data: one entry per line of the file.
train_ger = load_data(train_de_path)
train_eng = load_data(train_en_path)

val_ger = load_data(val_de_path)
val_eng = load_data(val_en_path)

test_ger = load_data(test_de_path)
test_eng = load_data(test_en_path)

# Notebook shell magics (not Python): install the packages and download the two
# spaCy models that are loaded with spacy.load() further down.
!pip install torch torchtext spacy
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm


In [27]:
import numpy as np
import spacy
from collections import Counter


# Load spacy tokenizers for German and English
# Only the `.tokenizer` attribute of these pipelines is used (see tokenize_ger / tokenize_eng).
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_ger(text):
    """Tokenize German *text*: lowercase spaCy tokens wrapped in '<sos>' ... '<eos>'."""
    tokens = ['<sos>']
    tokens.extend(tok.text.lower() for tok in spacy_de.tokenizer(text))
    tokens.append('<eos>')
    return tokens

def tokenize_eng(text):
    """Tokenize English *text*: lowercase spaCy tokens wrapped in '<sos>' ... '<eos>'."""
    tokens = ['<sos>']
    tokens.extend(tok.text.lower() for tok in spacy_en.tokenizer(text))
    tokens.append('<eos>')
    return tokens

# Special tokens
INIT_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'
PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'

# Build vocabulary from tokenized sentences ---> assign an ID to each token
def build_vocab(sentences, tokenizer, min_freq=2 , max_size = 10000):
    """Build a token -> id mapping from *sentences*.

    Args:
        sentences: iterable of raw sentences.
        tokenizer: callable turning one sentence into a list of tokens.
        min_freq: minimum number of occurrences for a token to be kept.
        max_size: maximum number of regular (non-special) tokens considered,
            taken from the most frequent ones.

    Returns:
        dict mapping each token to a unique id. The special tokens take ids
        0-3 (<sos>, <eos>, <pad>, <unk>); regular tokens follow from id 4 in
        order of decreasing frequency (ties broken alphabetically). The ids
        are contiguous, so ``len(vocab)`` is a valid embedding-table size.
    """
    special_tokens = (INIT_TOKEN, EOS_TOKEN, PAD_TOKEN, UNK_TOKEN)

    # Tokenize and count the frequency of every token.
    counter = Counter()
    for sentence in sentences:
        counter.update(tokenizer(sentence))

    # The tokenizers wrap every sentence in <sos>/<eos>. If those were ranked
    # like ordinary words they would first receive regular ids and then be
    # overwritten, leaving holes in the id range (max id > len(vocab) - 1).
    for token in special_tokens:
        counter.pop(token, None)

    # Most frequent first; alphabetical order breaks ties deterministically.
    sorted_tokens = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    vocab = {token: idx for idx, token in enumerate(special_tokens)}
    for word, count in sorted_tokens[:max_size]:
        if count >= min_freq:
            vocab[word] = len(vocab)
    return vocab

# Build vocab for both source and target sentences
german_vocab = build_vocab(train_ger, tokenize_ger)
english_vocab = build_vocab(train_eng, tokenize_eng)

# Reverse vocab (index to string)
german_itos = {idx : word for word , idx in german_vocab.items()}
english_itos = {idx : word for word , idx in english_vocab.items()}


# NOTE(review): using len(german_vocab) as the table size assumes the ids are
# contiguous in [0, len(vocab)) - verify against build_vocab.
vocab_size = len(german_vocab)
# initialize input embedding matrix
embedding_size = 512
batch_size = 64
# NOTE(review): the matrix is sized from the German vocabulary only, but the demo
# further down indexes it with english_vocab ids - confirm this is intended.
embedding_matrix = np.random.randn(vocab_size, embedding_size) *0.01 # initialized the embedding matrix from normal distribution

# ensure the pad token has an embedding of zeros
pad_idx = german_vocab[PAD_TOKEN]
embedding_matrix[pad_idx]= np.zeros(embedding_size)

def get_embedding(sentence_id, embedding_matrix):
    """Look up the embedding row of every token id in *sentence_id*.

    Returns an array with one row per token, in the order given:
    (seq_len, embedding_dim).
    """
    rows = []
    for token_id in sentence_id:
        rows.append(embedding_matrix[token_id])
    return np.array(rows)

def pad_sentence_embedding(embedding , max_length, pad_embedding , embedding_dim ):
    """Pad a (seq_len, dim) sentence embedding to *max_length* rows.

    A block of *max_length* copies of *pad_embedding* is created and its
    first ``embedding.shape[0]`` rows are overwritten with *embedding*.
    *embedding_dim* is accepted but not used by the body.
    """
    real_rows = embedding.shape[0]
    # Start from an all-padding block, then copy the real tokens on top.
    padded = np.tile(pad_embedding, (max_length, 1))
    padded[:real_rows, :] = embedding
    return padded

def positional_embedding( batch_size ,seq_len ,embedding_dim):
    """Return sinusoidal positional encodings for a whole batch.

    Follows the formulation of "Attention Is All You Need":
        PE(pos, 2k)   = sin(pos / 10000^(2k / d))
        PE(pos, 2k+1) = cos(pos / 10000^(2k / d))
    so each sin/cos pair shares one frequency.

    Args:
        batch_size: number of copies stacked along the first axis.
        seq_len: number of positions to encode.
        embedding_dim: size ``d`` of the encoding vector.

    Returns:
        Array of shape (batch_size, seq_len, embedding_dim); every batch entry
        holds the same encoding table.
    """
    positions = np.arange(seq_len)[:, np.newaxis]    # (seq_len, 1)
    dims = np.arange(embedding_dim)[np.newaxis, :]   # (1, embedding_dim)

    # Odd dimension 2k+1 must reuse the exponent of its even partner 2k;
    # using the raw index i for odd dimensions gives them the wrong frequency.
    exponent = (2 * (dims // 2)) / embedding_dim
    angles = positions / np.power(10000.0, exponent)  # (seq_len, embedding_dim)

    pos_enc = np.where(dims % 2 == 0, np.sin(angles), np.cos(angles))

    # Repeat the table for every sentence in the batch.
    return np.tile(pos_enc, (batch_size, 1, 1))


def get_batch_embedding(batch_sentence_id, embedding_matrix):
    """Embed a batch of token-id sentences, pad them and add positional encodings.

    Every sentence is embedded, padded with the pad-token embedding up to the
    longest sentence in the batch, and the positional encoding is added to
    the result. Reads the module-level ``pad_idx`` and ``embedding_size``.

    Returns an array of shape (batch_size, max_len, embedding_size).
    """
    longest = max(len(ids) for ids in batch_sentence_id)
    pad_vector = embedding_matrix[pad_idx]

    padded_sentences = []
    for ids in batch_sentence_id:
        sentence_emb = get_embedding(ids, embedding_matrix)
        padded_sentences.append(
            pad_sentence_embedding(sentence_emb, longest, pad_vector, embedding_size)
        )

    positions = positional_embedding(len(batch_sentence_id), longest, embedding_size)
    # list + ndarray: NumPy turns the list into an array and adds element-wise.
    return np.array(padded_sentences + positions)



# NOTE(review): these ids come from english_vocab but index an embedding_matrix
# that was sized from german_vocab - confirm this is intended.
sentence_id = [english_vocab[INIT_TOKEN],english_vocab['hello'], english_vocab['world']] # decoder input: prepend <sos> and drop the trailing <eos>
# this way the decoder learns to predict each next token from the tokens before it
embedding = get_embedding(sentence_id, embedding_matrix) # (seq_len, embedding_size)
print(embedding.shape)

# Two sentences of different length to exercise padding.
# NOTE(review): the second sentence has no leading <sos>, unlike the first - confirm.
batch_of_sentence_ids = [
    [english_vocab[INIT_TOKEN],english_vocab['hello'], english_vocab['world'], english_vocab[EOS_TOKEN]],
    [english_vocab['this'], english_vocab['is'], english_vocab['a'], english_vocab['test'], english_vocab[EOS_TOKEN]]]

batch_embedding_pos = get_batch_embedding(batch_of_sentence_ids,embedding_matrix )
print(batch_embedding_pos.shape)


# MULTI HEAD SELF ATTENTION

def softmax(x):
    """Softmax over the last axis of *x*.

    The per-row maximum is subtracted before exponentiating so that large
    scores do not overflow.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=-1, keepdims=True)
    return numerators / denominators

''' The idea is to let the model focus on different parts of the sequence by using multiple heads. The heads are independent, so they can be
    processed in parallel, and each head works in a smaller space (head_size = embedding_dim // num_head), which reduces the computational cost.
'''

def multi_head_attention(embedding_pos,embedding_dim , num_head = 8):
    """Multi-head scaled dot-product self-attention with freshly drawn random weights.

    Args:
        embedding_pos: input of shape (B, seq_len, embedding_dim).
        embedding_dim: feature size of the input.
        num_head: number of heads; each works on embedding_dim // num_head features.

    Returns:
        Array of shape (B, seq_len, embedding_dim). All projection matrices
        are sampled with np.random.randn on every call.
    """
    head_size = embedding_dim // num_head
    projection_shape = (num_head, embedding_dim, head_size)

    # One (embedding_dim, head_size) projection per head for Q, K and V.
    Wq = np.random.randn(*projection_shape)
    Wk = np.random.randn(*projection_shape)
    Wv = np.random.randn(*projection_shape)
    scale = np.sqrt(head_size)

    per_head = []
    for w_query, w_key, w_value in zip(Wq, Wk, Wv):
        queries = embedding_pos @ w_query  # (B, seq_len, head_size)
        keys = embedding_pos @ w_key
        values = embedding_pos @ w_value
        # Scaled dot product -> (B, seq_len, seq_len)
        scores = (queries @ keys.transpose(0, 2, 1)) / scale
        weights = softmax(scores)
        per_head.append(weights @ values)  # (B, seq_len, head_size)

    merged = np.concatenate(per_head, axis=-1)  # (B, seq_len, embedding_dim)
    Wh = np.random.randn(embedding_dim, embedding_dim)
    return merged @ Wh  # (B, seq_len, embedding_dim)

# Smoke test: run self-attention on the demo batch and print the output shape.
output = multi_head_attention(batch_embedding_pos, embedding_size)
print(output.shape)

# ADD AND NORMALIZATION
''' Normalization is done independently for each token, not across the batch or seq_len -> this helps stabilize training.
    Batch normalization introduces a dependency across the samples in a batch, which can interfere with parallel processing; that's why layer normalization is used.
    The skip connection is added to reduce the vanishing gradient problem.
'''
def add_and_norm(input ,output , epsilon = 1e-6 ):
    """Residual addition followed by layer normalization over axis 2.

    Args:
        input: the sub-layer input (skip-connection branch).
        output: the sub-layer output, same shape as *input*.
        epsilon: small constant added to the variance before the square root.

    Returns:
        The normalized sum. gamma (ones) and beta (zeros) are re-created on
        every call, so the scale/shift step leaves the values unchanged.
    """
    residual = input + output
    feature_mean = np.mean(residual, axis=2, keepdims=True)
    feature_var = np.var(residual, axis=2, keepdims=True)

    n_features = output.shape[-1]
    gamma = np.ones((1, 1, n_features))   # scale
    beta = np.zeros((1, 1, n_features))   # shift

    standardized = (residual - feature_mean) / np.sqrt(feature_var + epsilon)
    return gamma * standardized + beta

# Residual connection: add the attention output to the attention layer's own
# input (batch_embedding_pos) rather than to itself, then layer-normalize.
output_mha = add_and_norm(batch_embedding_pos, output)
print(type(output_mha))
print(output_mha.shape)
def feed_forward_network(output_mha):
    """Position-wise feed-forward network with freshly drawn random weights.

    Expands the last dimension by a factor of four, applies ReLU and projects
    back to the input size. *output_mha* must be 3-D, (B, seq_len, dim);
    the result has the same shape. The biases are zeros.
    """
    model_dim = output_mha.shape[2]
    hidden_dim = 4 * model_dim

    W1 = np.random.randn(model_dim, hidden_dim)
    b1 = np.zeros((1, 1, hidden_dim))  # broadcastable bias
    W2 = np.random.randn(hidden_dim, model_dim)
    b2 = np.zeros((1, 1, model_dim))

    hidden = np.maximum(output_mha @ W1 + b1, 0)  # ReLU, (B, seq_len, hidden_dim)
    return hidden @ W2 + b2  # no activation on the output projection

# Smoke test: run the feed-forward network on the normalized attention output.
final_out = feed_forward_network(output_mha)
print(final_out.shape)

# Hyperparameter
num_layers = 6   # number of stacked encoder layers
num_head = 8     # attention heads per layer
batch_size = 64  # re-assigned here with the same value as the earlier assignment

class Encoder:
    """Stack of Transformer encoder layers built from this file's NumPy helpers.

    Each layer runs multi-head self-attention, add & norm, the position-wise
    feed-forward network and a second add & norm. The helper functions draw
    new random weights on every call, so no parameters are stored here.

    Args:
        num_layers: number of encoder layers applied in sequence.
        embedding_dim: feature size of the input (last axis).
        num_head: number of attention heads per layer.
        dropout: probability of zeroing an activation in training mode.
        training: initial mode; dropout is applied only when True.
    """

    def __init__(self , num_layers, embedding_dim, num_head , dropout=0.5 ,training=False) -> None:
        self.num_layers = num_layers
        self.emb_size = embedding_dim
        self.num_head = num_head
        self.dropout = dropout
        self.training = training

    def __call__(self, x):
        return self.forward(x)

    def _dropout(self, activations):
        """Apply inverted dropout in training mode; return the input unchanged otherwise."""
        if not self.training:
            return activations
        # Size the mask from the activations themselves: a mask built from a
        # fixed global batch size fails to broadcast for any other batch.
        keep_mask = (np.random.rand(*activations.shape) >= self.dropout).astype(np.float32)
        # Rescale the survivors so the expected value of the output is preserved.
        return activations * keep_mask / (1 - self.dropout)

    def forward(self, x):
        """Run *x* of shape (B, seq_len, embedding_dim) through all layers.

        Returns an array of the same shape (``x`` itself when num_layers is 0).
        """
        output_final = x

        for _ in range(self.num_layers):
            output_mh = self._dropout(
                multi_head_attention(output_final, self.emb_size, self.num_head)
            )
            output_add = add_and_norm(output_final, output_mh)

            output_fnn = self._dropout(feed_forward_network(output_add))
            output_final = add_and_norm(output_add, output_fnn)

        return output_final

    def train(self):
        """Switch to training mode (dropout enabled)."""
        self.training = True

    def eval(self):
        """Switch to evaluation mode (dropout disabled)."""
        self.training = False



# Build the encoder stack and run the demo batch through it in eval mode (no dropout).
encoder = Encoder(num_layers, embedding_dim=embedding_size,num_head= num_head )
encoder.eval()
final_out = encoder(batch_embedding_pos)

# Decoder Part
''' Masking is applied to the self-attention to prevent the decoder from attending to future tokens. This is done by setting the upper-triangular values
    (above the main diagonal) of the score matrix to -infinity before applying softmax, which ensures that future tokens are ignored.
'''
def create_mask(seq_len):
    """Return the additive causal mask of shape (seq_len, seq_len).

    Entries strictly above the main diagonal (future positions) are -inf;
    all other entries are 0.0, so adding the mask to the attention scores
    leaves the allowed positions untouched.
    """
    # True strictly above the main diagonal, i.e. where key index > query index.
    future = np.triu(np.ones((seq_len, seq_len), dtype=bool), k=1)
    # Select values instead of multiplying by -inf: 0 * -inf evaluates to nan,
    # which would turn every attention row into nan after softmax.
    return np.where(future, -np.inf, 0.0)

def masked_multi_head_attention(embedding_pos, embedding_dim, num_head=8):
    """Multi-head self-attention with the mask from create_mask added to the scores.

    Args:
        embedding_pos: input of shape (B, seq_len, embedding_dim).
        embedding_dim: feature size of the input.
        num_head: number of heads; each works on embedding_dim // num_head features.

    Returns:
        Array of shape (B, seq_len, embedding_dim). All projection matrices
        are sampled with np.random.randn on every call, and the output shape
        is printed before returning.
    """
    head_size = embedding_dim // num_head
    projection_shape = (num_head, embedding_dim, head_size)

    # One (embedding_dim, head_size) projection per head for Q, K and V.
    Wq = np.random.randn(*projection_shape)
    Wk = np.random.randn(*projection_shape)
    Wv = np.random.randn(*projection_shape)
    scale = np.sqrt(head_size)

    n_positions = embedding_pos.shape[1]
    # (1, seq_len, seq_len): the leading axis broadcasts over the batch.
    mask = create_mask(n_positions)[np.newaxis, :, :]

    per_head = []
    for w_query, w_key, w_value in zip(Wq, Wk, Wv):
        queries = embedding_pos @ w_query  # (B, seq_len, head_size)
        keys = embedding_pos @ w_key
        values = embedding_pos @ w_value
        # Scaled dot product -> (B, seq_len, seq_len), then add the mask.
        scores = (queries @ keys.transpose(0, 2, 1)) / scale
        weights = softmax(scores + mask)
        per_head.append(weights @ values)  # (B, seq_len, head_size)

    merged = np.concatenate(per_head, axis=-1)  # (B, seq_len, embedding_dim)
    Wh = np.random.randn(embedding_dim, embedding_dim)
    output = merged @ Wh  # (B, seq_len, embedding_dim)
    print(output.shape)
    return output

# to be continued

(3, 512)
(2, 5, 512)
(2, 5, 512)
<class 'numpy.ndarray'>
(2, 5, 512)
(2, 5, 512)
