In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import torch
import torch.nn as nn

## Positional Encoding

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, seq_len, d_model, max_len=5000):
        super().__init__()

        #matrix to hold encodings
        pos_encoding = torch.zeros(seq_len,d_model)

        position = torch.arange(seq_len).unsqueeze(1).float()
        dimension = torch.arange(d_model).unsqueeze(0).float()

        #calculate angle rate
        angle_rate = 1 / (10000**((2*(dimension//2)) / d_model))

        #calculate angle in radians
        angle_radians = position * angle_rate
        
        
        #sin for even pos and cos for odd pos
        pos_encoding[:,0::2] = torch.sin(angle_radians[:,0::2])
        pos_encoding[:,1::2] = torch.cos(angle_radians[:,1::2])

        #store encoding to register
        self.register_buffer("pos_encoding", pos_encoding)

    def forward(self,x):
        return x + self.pos_encoding.unsqueeze(1)

In [47]:
seq_len = 10
batch_size = 2
d_model = 8

x = torch.zeros(seq_len, batch_size, d_model)

pe = PositionalEncoding(seq_len, d_model)
out = pe(x)

print(out)
print(out.shape)

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00]],

        [[ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00]],

        [[ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00]],

        [[ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e

## Position wise Feed Forward Network

In [6]:
class PositionFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()

        self.linear1 = nn.Linear(d_model,d_ff)
        self.linear2 = nn.Linear(d_ff,d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        a = self.linear1(x)
        b = torch.relu(a)
        c = self.dropout(b)
        output = self.linear2(c)
        return output

In [18]:
d_model = 512
d_ff = 2048

x = PositionFeedForward(d_model, d_ff)
print(x)

PositionFeedForward(
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


## Encoder Block

In [15]:
class Encoder(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        self.self_attn = nn.MultiheadAttention(
            embed_dim = d_model,
            num_heads = num_heads,
            dropout = dropout
        )

        self.feed_forward = PositionFeedForward(
            d_model = d_model,
            d_ff = d_ff,
            dropout = dropout
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, src_mask=None, src_key_padding_mask=None):
        #multihead attention
        attention_output, _ = self.self_attn(
            x,x,x,
            attn_mask = src_mask,
            key_padding_mask = src_key_padding_mask
        )

        #add & norm
        x = self.norm1(x + self.dropout1(attention_output))

        #fnn
        fnn_output = self.feed_forward(x)

        #add & norm
        x = self.norm2(x + self.dropout2(fnn_output))

        return x

In [16]:
# Hyperparameters similar to attention is all you need
d_model = 512
num_heads = 8
d_ff = 2048
seq_len = 10
batch_size = 2

# Dummy input
x = torch.rand(seq_len, batch_size, d_model)

# Positional Encoding
pos_enc = PositionalEncoding(seq_len,d_model)
x = pos_enc(x)

# Encoder Block
encoder_block = Encoder(
    d_model=d_model,
    num_heads=num_heads,
    d_ff=d_ff
)

output = encoder_block(x)

print("Output shape:", output.shape)

Output shape: torch.Size([10, 2, 512])


## Encoding a real input sentence

In [46]:
input = "Hello I am Manthan"
input = input.lower()
tokens = input.split()

vocab = {word : idx for idx, word in enumerate(tokens)}
print("Vocab : ",vocab)

#convert tokens to indices
indices = torch.tensor([vocab[word] for word in tokens])
print("\nIndices : ",indices,'Shape : ',indices.shape)

Vocab :  {'hello': 0, 'i': 1, 'am': 2, 'manthan': 3}

Indices :  tensor([0, 1, 2, 3]) Shape :  torch.Size([4])


In [41]:
#embedding layer
d_model = 512
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim = d_model)

x = embedding(indices)
print(x)
print("Shape : ",x.shape)

#adding batch dimension
x = x.unsqueeze(1)
print("\n",x.shape)

tensor([[ 2.1698,  1.2670,  1.5892,  ..., -0.0500, -0.0358, -0.7772],
        [ 1.7677, -0.0845,  1.1482,  ...,  2.4763,  1.2870, -0.5611],
        [ 0.4267, -0.1623, -0.6096,  ...,  1.1848, -0.8349, -0.2594],
        [ 1.4550,  0.9078, -0.6054,  ...,  1.7620, -0.6409, -0.4972]],
       grad_fn=<EmbeddingBackward0>)
Shape :  torch.Size([4, 512])

 torch.Size([4, 1, 512])


In [39]:
#add positional encoding
pos_encoding = PositionalEncoding(seq_len = len(tokens), d_model=d_model)
x = pos_encoding(x)

print(x)
print(x.shape)

tensor([[[-0.0057,  0.8104,  1.3416,  ...,  0.3642,  0.3274,  0.0228]],

        [[ 2.6586,  1.6395,  0.4438,  ...,  0.4102,  0.3660,  0.2786]],

        [[-0.0396, -0.4565,  0.8357,  ..., -0.0144,  0.1524,  1.1747]],

        [[ 0.9491, -1.5035, -0.1848,  ...,  0.6514, -0.8078,  0.7944]]],
       grad_fn=<AddBackward0>)
torch.Size([4, 1, 512])


In [40]:
encoder = Encoder(
    d_model = d_model,
    num_heads = 8,
    d_ff = 512
)

output = encoder(x)
print(x)
print(x.shape)

tensor([[[-0.0057,  0.8104,  1.3416,  ...,  0.3642,  0.3274,  0.0228]],

        [[ 2.6586,  1.6395,  0.4438,  ...,  0.4102,  0.3660,  0.2786]],

        [[-0.0396, -0.4565,  0.8357,  ..., -0.0144,  0.1524,  1.1747]],

        [[ 0.9491, -1.5035, -0.1848,  ...,  0.6514, -0.8078,  0.7944]]],
       grad_fn=<AddBackward0>)
torch.Size([4, 1, 512])
