# Building Transformer Model - Pytorch

![Transformer Architecture](./Transformer_Understanding/img/transformerblock.png)

Based on the picture above, we will build our model following the Transformer architecture, using what we studied in the "[Transformer Understanding](./Transformer_Understanding/TransformerNeuralNetworks.ipynb)" part.

In [1]:
# import library
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import warnings

In [2]:
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

In [3]:
# Report whether PyTorch can use a GPU, with device details when it can.
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch is using CPU.")
else:
    print("CUDA is available. PyTorch is using GPU.")
    print("Number of GPUs available: ", torch.cuda.device_count())
    print("GPU name: ", torch.cuda.get_device_name(0))

CUDA is available. PyTorch is using GPU.
Number of GPUs available:  1
GPU name:  NVIDIA GeForce GTX 1650


## Start with these small blocks, functions, ...

- **Get the Device in Use** (PyTorch requires all tensors taking part in an operation to be on the same device.)

In [4]:
def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')

- **Token Embedding**

In [5]:
class TokenEmbedding(nn.Module):

    def __init__(self, vocab_size, d_model):
        """
        Token Embedding converts token indices into dense embedding vectors.

        :param vocab_size: Number of words / tokens in the vocabulary
        :param d_model: The embedding dimension

        Example: With 1000 words in vocabulary and our embedding dimension is 512, the Token Embedding layer will be 1000x512
        """
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """
        Look up the embedding vector of every token index in ``x``.

        :param x: integer tensor of token indices
        :return: the numerical representation of the input, located on the
                 same device as the embedding table

        Example:
        Input: (Batch_size, Sequence of words) - (30x100)
        Output: (Batch_size, Sequence of words, d_model) - (30x100x512)
        """
        # Run the lookup on whichever device holds the embedding table, so the
        # output always lives next to this module's parameters instead of being
        # pushed to a globally chosen device.
        x = x.to(self.embedding_layer.weight.device)
        return self.embedding_layer(x)

# Or, more simply:
# token_embedding = nn.Embedding(vocab_size, d_model)

In [6]:
# For Example
# Embed a random (30, 100) batch of token ids drawn from a 1000-token
# vocabulary and display the shape of the result.
vocab_size = 1000
d_model = 512

embedding_layer = TokenEmbedding(vocab_size, d_model)
input_data = torch.randint(0, vocab_size, (30, 100))
embedding_layer(input_data).shape

torch.Size([30, 100, 512])

- **Positional Encoding**

In [7]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_sequence_length, dropout=0.1):
        """
        Positional Encoding layer producing the sinusoidal position table that
        is added to token embeddings.

        :param d_model: The embedding dimension.
        :param max_sequence_length: The maximum length of the input sequences.
        :param dropout: Dropout rate applied to the table in ``forward``.
        """
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # The table depends only on the constructor arguments, so build it once
        # instead of on every forward call. A non-persistent buffer follows
        # ``.to(device)`` with the module but stays out of ``state_dict``.
        self.register_buffer(
            "pe",
            self._build_table(d_model, max_sequence_length),
            persistent=False,
        )

    @staticmethod
    def _build_table(d_model, max_sequence_length):
        """Return the (1, max_sequence_length, d_model) sinusoidal table."""
        even_i = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, even_i / d_model)
        position = torch.arange(max_sequence_length).reshape(max_sequence_length, 1)
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        # Interleave so even columns hold sin and odd columns hold cos.
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        table = torch.flatten(stacked, start_dim=1, end_dim=2)
        # For odd d_model the interleaving yields one extra column; drop it so
        # the table always matches the embedding width.
        return table[:, :d_model].unsqueeze(0)

    def forward(self):
        """
        :return: tensor of shape (1, max_sequence_length, d_model) with dropout
                 applied. The result may alias the cached table when dropout is
                 inactive, so treat it as read-only.
        """
        return self.dropout(self.pe)

In [8]:
# Build the encoding for d_model=512 and 100 positions, then display its shape.
PE = PositionalEncoding(512,100,0.1)
PE().shape

torch.Size([1, 100, 512])

- **Multi-Head Attention**

Two variants: plain Multi-Head Attention (used in both the encoder and the decoder) and **Multi-Head Cross Attention** (used in the decoder only)

Mask options: **None**, **Self-Attention Mask** (**Causal Mask** or **Look-Ahead Mask**), **Padding Mask**

In [9]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, num_heads=8, cross=False):
        """
        Multi-Head Attention
        :param d_model: the embedding dimension
        :param num_heads: the number of heads, default equals 8
        :param cross: True for Multi-Head Cross Attention, False for Multi-Head Attention only

        :raises ValueError: if d_model is not divisible by num_heads
        """
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.cross = cross

        # Fused projections: one wide linear layer produces all heads (and, for
        # self-attention, q/k/v together) instead of one small layer per head.
        if self.cross:  # Multi-Head Cross Attention
            self.kv_layer = nn.Linear(d_model, 2 * d_model)
            self.q_layer = nn.Linear(d_model, d_model)
        else:
            self.qkv_layer = nn.Linear(d_model, 3 * d_model)

        # Output projection applied after the heads are concatenated
        self.linear_layer = nn.Linear(d_model, d_model)

    def scaled_dot_product(self, q, k, v, mask=None):
        """
        Scaled dot-product attention.

        :param q: queries, (batch, heads, q_len, head_dim)
        :param k: keys, (batch, heads, k_len, head_dim)
        :param v: values, (batch, heads, k_len, head_dim)
        :param mask: optional additive mask; it is added after the head axis is
                     moved to the front, so it must broadcast against
                     (heads, batch, q_len, k_len)
        :return: (values, attention) where attention holds the softmax weights
        """
        d_k = q.size()[-1]
        scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
        if mask is not None:
            scaled = scaled.permute(1, 0, 2, 3) + mask
            scaled = scaled.permute(1, 0, 2, 3)
        attention = F.softmax(scaled, dim=-1)
        values = torch.matmul(attention, v)
        return values, attention

    def _split_heads(self, projected, parts):
        """Reshape (batch, seq, parts * d_model) to (batch, heads, seq, parts * head_dim)."""
        batch_size, length_seq, _ = projected.size()
        projected = projected.reshape(
            batch_size, length_seq, self.num_heads, parts * self.head_dim
        )
        return projected.permute(0, 2, 1, 3)

    def forward(self, x, mask=None):
        """
        Perform forward pass of the multi-head attention mechanism.

        :param x: input tensor of shape (batch_size, length_seq, d_model); if
                  cross is True then x is a dictionary including
                  'encoder_output' and 'w'.
        :param mask: Optional mask tensor

        :return: Output tensor of shape (batch_size, length_seq, d_model); for
                 cross attention length_seq is the length of 'w'.
        """
        if self.cross:
            encoder_output = x['encoder_output']
            w = x['w']
            # Queries come from the decoder stream ``w``; keys and values come
            # from the encoder output. Each side keeps its own sequence length,
            # so source and target lengths may differ.
            q = self._split_heads(self.q_layer(w), 1)
            kv = self._split_heads(self.kv_layer(encoder_output), 2)
            k, v = kv.chunk(2, dim=-1)
            batch_size, out_len, _ = w.size()
        else:
            qkv = self._split_heads(self.qkv_layer(x), 3)
            q, k, v = qkv.chunk(3, dim=-1)
            batch_size, out_len, _ = x.size()

        values, _ = self.scaled_dot_product(q, k, v, mask)
        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, d_model)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, out_len, self.d_model)
        return self.linear_layer(values)

In [10]:
# For Example
d_model = 512
num_heads = 8


# Attention
# Self-attention over one random sequence of 10 vectors of size d_model.
mha_layer = MultiHeadAttention(d_model, num_heads)
mha_layer(torch.randn(1,10,d_model))

tensor([[[ 0.1934, -0.1114, -0.1102,  ...,  0.2552, -0.0385,  0.0420],
         [ 0.1294, -0.0963, -0.0465,  ...,  0.2400, -0.0796,  0.0796],
         [ 0.1541, -0.1141, -0.0579,  ...,  0.2480,  0.0236, -0.0210],
         ...,
         [ 0.0971, -0.1248, -0.0860,  ...,  0.2161, -0.1267,  0.0463],
         [ 0.1686, -0.1130, -0.0474,  ...,  0.2752, -0.0954, -0.0198],
         [ 0.1757, -0.1306, -0.0977,  ...,  0.2291, -0.0526,  0.0270]]],
       grad_fn=<ViewBackward0>)

In [11]:
# Cross Attention
# With cross=True the input is a dictionary holding 'encoder_output' and 'w'.
mha_layer = MultiHeadAttention(d_model, num_heads,cross=True)
mha_layer({'encoder_output':torch.randn(1,10,d_model),'w':torch.randn(1,10,d_model)})

tensor([[[-0.1164,  0.1507, -0.1053,  ...,  0.1042, -0.2139,  0.2552],
         [-0.1215,  0.0759, -0.1088,  ...,  0.0505, -0.1319,  0.1522],
         [-0.1225,  0.1095, -0.1695,  ...,  0.0866, -0.1963,  0.2787],
         ...,
         [-0.0864,  0.1292, -0.1470,  ...,  0.1072, -0.1512,  0.2815],
         [-0.0796,  0.0907, -0.1386,  ...,  0.1461, -0.1886,  0.2560],
         [-0.0626,  0.0965, -0.1146,  ...,  0.1255, -0.1642,  0.3000]]],
       grad_fn=<ViewBackward0>)

- **Layer Normalization Block**

In [12]:
class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions described by ``parameters_shape``."""

    def __init__(self, parameters_shape, eps=1e-5):
        """
        :param parameters_shape: shape of the trailing dimensions to normalize over;
                                 also the shape of the learnable scale and shift
        :param eps: constant added to the variance before the square root
        """
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        # Learnable scale (starts at 1) and shift (starts at 0).
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """Normalize ``inputs`` to zero mean / unit variance, then scale and shift."""
        n_dims = len(self.parameters_shape)
        # Trailing axes, counted from the end: -1, -2, ..., -n_dims.
        reduce_over = list(range(-1, -n_dims - 1, -1))
        mean = inputs.mean(dim=reduce_over, keepdim=True)
        centered = inputs - mean
        variance = (centered ** 2).mean(dim=reduce_over, keepdim=True)
        normalized = centered / (variance + self.eps).sqrt()
        return self.gamma * normalized + self.beta

# Or using nn.LayerNorm(d_model)

In [13]:
# For Example
# Normalize a random (1, 2, 3) tensor over all three of its dimensions.
ln = LayerNormalization((1,2,3))
ln(torch.randn(1,2,3))

tensor([[[ 0.5837,  1.2111, -0.9032],
         [-0.5793, -1.3794,  1.0670]]], grad_fn=<AddBackward0>)

- **Positionwise Feed Forward Block**

In [14]:
class PositionwiseFeedForward(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        :param d_model: input and output feature size
        :param hidden: feature size between the two linear layers
        :param drop_prob: dropout probability applied after the ReLU
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Expand to ``hidden`` features, activate, drop out, project back to ``d_model``."""
        activated = self.relu(self.linear1(x))
        return self.linear2(self.dropout(activated))

# feed_forward = nn.Sequential(
#     nn.Linear(d_model, expansion_factor * d_model),  # e.g: 512x(4*512) -> (512, 2048)
#     nn.ReLU(),  # ReLU activation function
#     nn.Linear(d_model * expansion_factor, d_model),  # e.g: (4*512)x512 -> (2048, 512)
# )

In [15]:
# For Example
# Feed a random (1, 5, 512) tensor through a block with 300 hidden units.
ff = PositionwiseFeedForward(512, 300)
ff(torch.randn(1,5,512)).shape

torch.Size([1, 5, 512])

- **Copy Block Function**: we could use nn.Sequential, but I think we don't need to because we don't make any changes to the module parameters

In [16]:
def replicate(block, N=6) -> nn.ModuleList:
    """
    Build a stack of N independent deep copies of ``block``.

    :param block: an nn.Module instance, mainly the encoder or decoder block of the architecture
    :param N: how many copies to stack; the original paper used 6
    :return: nn.ModuleList holding the N copies
    """
    block_stack = nn.ModuleList()
    for _ in range(N):
        # Deep copy so every entry owns its own parameters.
        block_stack.append(copy.deepcopy(block))
    return block_stack

## With those small blocks and functions, let's build these important blocks!

- **Preprocessing** for Input Pre-processing and Output Pre-processing

In [17]:
class Preprocessing(nn.Module):
    """Turn a batch of sentences into embeddings with positional information added."""

    def __init__(self, max_length_seq, d_model, language_to_index, start_token, end_token, pad_token, dropout=0.1):
        """
        :param max_length_seq: length every tokenized sentence is padded to
        :param d_model: the embedding dimension
        :param language_to_index: mapping from token to integer index
        :param start_token: token placed in front of a sentence on request
        :param end_token: token placed after a sentence on request
        :param pad_token: token used to fill a sentence up to max_length_seq
        :param dropout: dropout probability
        """
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.language_to_index = language_to_index
        self.max_length_seq = max_length_seq
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token

        # Layer
        self.token_embedding = TokenEmbedding(self.vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_length_seq, dropout)
        self.dropout = nn.Dropout(dropout)

    def tokenize(self, sentence, start_token: bool, end_token: bool):
        """
        Encode one sentence, element by element, as a padded index tensor.

        :param sentence: iterable of tokens (a string is split into characters)
        :param start_token: whether to prepend the start token
        :param end_token: whether to append the end token
        :return: 1-D tensor of length max_length_seq
        :raises ValueError: if the encoded sentence exceeds max_length_seq
        """
        lookup = self.language_to_index
        encode_char = [lookup[token] for token in sentence]
        if start_token:
            encode_char.insert(0, lookup[self.start_token])
        if end_token:
            encode_char.append(lookup[self.end_token])
        # Fail here with a clear message; otherwise an over-long sentence only
        # surfaces later as a shape mismatch when stacking or adding positions.
        if len(encode_char) > self.max_length_seq:
            raise ValueError(
                f"Sentence needs {len(encode_char)} tokens but max_length_seq is {self.max_length_seq}"
            )
        missing = self.max_length_seq - len(encode_char)
        if missing:
            encode_char.extend([lookup[self.pad_token]] * missing)
        return torch.tensor(encode_char)

    def batch_tokens(self, batch, start_token: bool, end_token: bool):
        """Tokenize every sentence of ``batch`` and stack them on the embedding's device."""
        tokens = torch.stack(
            [self.tokenize(sentence, start_token, end_token) for sentence in batch]
        )
        # Follow the device of this module's own parameters rather than a
        # globally selected one, so the lookup works wherever the module lives.
        device = next(self.token_embedding.parameters()).device
        return tokens.to(device)

    def forward(self, x, start_token: bool, end_token: bool):
        """
        :param x: batch (sequence) of sentences
        :return: tensor of shape (batch, max_length_seq, d_model)
        """
        x = self.batch_tokens(x, start_token, end_token)
        x = self.token_embedding(x)
        # Positions must sit on the same device as the embeddings to be added.
        pos = self.positional_encoding().to(x.device)
        x = self.dropout(x + pos)
        return x

- **Transformer Block** includes: **Multi-Head Attention**, **Add & Norm**, **Feed Forward** and **Dropout**

Two options: 'encoder' and 'decoder'

In [18]:
class TransformerBlock(nn.Module):

    def __init__(self,
                 d_model=512,
                 num_heads=8,
                 ff_hidden=300,
                 dropout=0.1,
                 options='encoder'
                ):
        """
        The Transformer Block used in the encoder and decoder as well

        :param d_model: the embedding dimension
        :param num_heads: the number of heads
        :param ff_hidden: the hidden dimension of the feed forward layer
        :param dropout: probability dropout (between 0 and 1)
        :param options: The choice between 'encoder' and 'decoder'
        :raises ValueError: if options is neither 'encoder' nor 'decoder'
        """
        super(TransformerBlock, self).__init__()

        # Reject an unknown option before any layer is built.
        if options not in ('encoder', 'decoder'):
            raise ValueError(f"Unknown option {options}")
        self.options = options

        # For both 2 options: encoder and decoder
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm_for_attention = LayerNormalization(parameters_shape=[d_model])
        self.dropout_attention = nn.Dropout(dropout)

        # For decoder
        if self.options == 'decoder':
            self.cross_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads, cross=True)
            self.norm_for_cross_attention = LayerNormalization(parameters_shape=[d_model])
            self.dropout2 = nn.Dropout(dropout)

        # For both 2 options: encoder and decoder
        self.ff = PositionwiseFeedForward(d_model=d_model, hidden=ff_hidden, drop_prob=dropout)
        self.norm_for_ff = LayerNormalization(parameters_shape=[d_model])
        self.dropout_for_ff = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Run one encoder or decoder block.

        :param x: encoder: the input tensor. decoder: a dictionary holding
                  'encoder_output' and 'w' (the decoder stream).
        :param mask: encoder: optional self-attention mask. decoder: optional
                     dictionary with 'self_attention_mask' and
                     'cross_attention_mask'; None disables both masks.
        :return: tensor with the same shape as the block's input stream
        """
        # No sub-layer modifies its input in place, so each residual can reuse
        # the input tensor directly instead of cloning it.
        if self.options == 'decoder':
            encoder_output = x['encoder_output']
            w = x['w']
            if mask is None:
                # The default mask=None must not be subscripted.
                self_attention_mask = None
                cross_attention_mask = None
            else:
                self_attention_mask = mask['self_attention_mask']
                cross_attention_mask = mask['cross_attention_mask']

            # Self-attention -> dropout -> add & norm
            attended = self.dropout_attention(self.attention(w, self_attention_mask))
            w = self.norm_for_attention(attended + w)

            # Cross-attention -> dropout -> add & norm
            crossed = self.cross_attention({'encoder_output': encoder_output, 'w': w}, cross_attention_mask)
            crossed = self.dropout2(crossed)
            w = self.norm_for_cross_attention(crossed + w)

            # Feed forward -> dropout -> add & norm
            fed = self.dropout_for_ff(self.ff(w))
            return self.norm_for_ff(fed + w)

        # For encoder
        attended = self.dropout_attention(self.attention(x, mask))
        x = self.norm_for_attention(attended + x)

        fed = self.dropout_for_ff(self.ff(x))
        return self.norm_for_ff(fed + x)

In [19]:
# Test
# Default block (options='encoder') applied to a random (1, 10, 512) tensor.
trans_block = TransformerBlock()
trans_block(torch.randn(1,10,512))

tensor([[[ 1.6347e+00,  2.9519e-01, -5.0073e-01,  ..., -9.8680e-01,
           5.5744e-02, -1.0531e+00],
         [ 3.3081e-01,  1.2031e+00,  7.6933e-01,  ...,  3.4544e-01,
          -6.9123e-01,  1.3671e-01],
         [ 1.3679e+00, -6.7214e-01, -6.1000e-01,  ..., -1.1033e-03,
           1.6234e+00,  7.3969e-01],
         ...,
         [-1.2435e-01, -2.5058e-01, -1.4812e-01,  ...,  1.1968e+00,
           4.7031e-01, -2.1733e-01],
         [-1.4620e-01, -1.7710e+00,  1.8269e-01,  ...,  5.5685e-01,
           9.5737e-01,  3.6664e-01],
         [ 1.5424e+00,  2.7748e+00, -7.2107e-01,  ..., -1.7869e+00,
          -9.5714e-01, -6.5211e-01]]], grad_fn=<AddBackward0>)

In [20]:
# Decoder block: the input is a dictionary with 'encoder_output' and 'w',
# and the mask is a dictionary with both masks set to None.
trans_block = TransformerBlock(options='decoder')
trans_block({'encoder_output':torch.randn(1,10,d_model),'w':torch.randn(1,10,d_model)},
    {'self_attention_mask': None, 'cross_attention_mask': None}
)

tensor([[[-0.1136,  0.3166, -0.3398,  ...,  0.6556, -0.3054,  1.0212],
         [-1.0302, -0.2919, -0.5072,  ...,  0.4768, -1.1288, -0.4150],
         [ 0.4223, -0.3623,  1.4023,  ...,  0.0115,  0.3815,  0.3529],
         ...,
         [-0.0731, -0.3068, -0.6297,  ..., -0.5859,  2.2606, -2.1841],
         [ 0.2536, -2.5272,  0.3633,  ..., -1.6203, -0.5734,  1.2083],
         [ 0.4487,  0.3852, -1.1425,  ..., -0.1755, -0.5927, -0.9773]]],
       grad_fn=<AddBackward0>)

- **Encoder** includes Input Pre-processing (**Token Embedding** & **Positional Encoding**) and N **Transformer Blocks** (the encoder block in the picture at the top)

In [21]:
class Encoder(nn.Module):
    """The Encoder part of the Transformer architecture."""

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        Build the input pre-processing layer and a stack of ``num_blocks``
        encoder blocks, each a deep copy of one template block.
        """
        super().__init__()

        # Input pre-processing layer
        self.input_preprocessing = Preprocessing(
            max_length_seq,
            d_model,
            language_to_index,
            start_token,
            end_token,
            pad_token,
            dropout,
        )

        # Stack of encoder blocks copied from a single template
        template_block = TransformerBlock(d_model, num_heads, ff_hidden, dropout, options="encoder")
        self.transformer_blocks = replicate(template_block, num_blocks)

    def forward(self, x, self_attention_mask, start_token:bool, end_token:bool):
        """
        :param x: batch of sentences handed to the pre-processing layer
        :param self_attention_mask: mask passed unchanged to every block
        :param start_token: whether pre-processing prepends the start token
        :param end_token: whether pre-processing appends the end token
        :return: output of the last encoder block
        """
        hidden = self.input_preprocessing(x, start_token, end_token)

        # Feed the result of each block into the next one
        for encoder_block in self.transformer_blocks:
            hidden = encoder_block(hidden, self_attention_mask)

        return hidden

# Or just using
# nn.TransformerEncoder

In [22]:
# Test
# Character-level vocabulary: lowercase letters, space, and three special tokens.
special_tokens = ["<start>", "<end>", "<pad>"]
alphabet = list("abcdefghijklmnopqrstuvwxyz ") + special_tokens
language_to_index = {word: idx for idx, word in enumerate(alphabet)}
encoder = Encoder(d_model=100, ff_hidden=50, num_heads=2, dropout=0.1 ,num_blocks=1, max_length_seq=100, language_to_index=language_to_index, start_token="<start>", end_token="<end>", pad_token="<pad>")
# Move the encoder to the device returned by get_device() before calling it.
encoder.to(get_device())
batch = ['hello','goodbye']
# No mask, no start token, no end token.
encoder(batch, None, False, False)

tensor([[[-0.2348,  0.9120,  1.7391,  ..., -0.2278, -0.8417,  1.1081],
         [ 1.6559,  0.3212, -0.8062,  ...,  1.7960, -0.7977,  1.6432],
         [-0.0429, -0.7864,  0.7223,  ...,  0.8125, -0.2887, -0.0834],
         ...,
         [ 0.4337, -1.5136, -1.0085,  ..., -0.6353, -0.1083,  1.5586],
         [-0.2551, -0.4255, -0.3870,  ...,  0.2286, -0.1370,  1.7222],
         [-0.5061, -0.3137,  0.3685,  ..., -0.6921, -0.2590,  1.3873]],

        [[-1.2848, -0.0301,  0.7372,  ...,  1.3673, -1.1085,  0.1970],
         [ 0.6411, -1.6698,  1.1865,  ...,  0.1364,  0.3833, -0.1536],
         [ 0.7109, -1.0754,  1.3723,  ..., -0.0966,  0.5129,  0.1411],
         ...,
         [ 0.2068, -1.4160, -1.1323,  ..., -0.1001, -0.1457,  1.7570],
         [-0.2347, -0.4453, -0.4428,  ...,  0.0818, -0.0247,  1.4377],
         [-0.5970, -0.5825,  0.2446,  ..., -0.3669, -0.2918,  1.8792]]],
       device='cuda:0', grad_fn=<AddBackward0>)

- **Decoder** includes **Output Pre-processing** (**Token Embedding** & **Positional Encoding**), N **Transformer Block**

In [23]:
class Decoder(nn.Module):
    """The Decoder part of the Transformer architecture."""

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        Build the output pre-processing layer and a stack of ``num_blocks``
        decoder blocks, each a deep copy of one template block.
        """
        super().__init__()

        # Output pre-processing layer
        self.output_preprocessing = Preprocessing(
            max_length_seq,
            d_model,
            language_to_index,
            start_token,
            end_token,
            pad_token,
            dropout,
        )

        # Stack of decoder blocks copied from a single template
        template_block = TransformerBlock(d_model, num_heads, ff_hidden, dropout, options="decoder")
        self.transformer_blocks = replicate(template_block, num_blocks)

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token:bool, end_token:bool):
        """
        :param x: batch of target sentences handed to the pre-processing layer
        :param y: the output of the encoder
        :param self_attention_mask: mask for the blocks' self-attention
        :param cross_attention_mask: mask for the blocks' cross-attention
        :param start_token: whether pre-processing prepends the start token
        :param end_token: whether pre-processing appends the end token
        :return: output of the last decoder block
        """
        decoded = self.output_preprocessing(x, start_token, end_token)

        masks = {'self_attention_mask': self_attention_mask,'cross_attention_mask': cross_attention_mask}
        block_input = {'encoder_output': y,'w': decoded}
        for decoder_block in self.transformer_blocks:
            # The encoder output stays fixed; only the decoder stream is refreshed.
            block_input['w'] = decoded
            decoded = decoder_block(block_input, masks)
        return decoded

# Or just using
# nn.TransformerDecoder

## Finally, The Transformer Architecture is complete!

In [24]:
class Transformer(nn.Module):
    """Full model: encoder, decoder, and a linear projection onto the target vocabulary."""

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 target_language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        :param language_to_index: token-to-index mapping used by the encoder
        :param target_language_to_index: token-to-index mapping used by the
                                         decoder and sizing the output layer
        All remaining arguments are passed unchanged to both Encoder and Decoder.
        """
        super().__init__()

        # Device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Settings shared by the encoder and the decoder
        shared_settings = dict(
            d_model=d_model,
            ff_hidden=ff_hidden,
            num_heads=num_heads,
            dropout=dropout,
            num_blocks=num_blocks,
            max_length_seq=max_length_seq,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token,
        )

        # Encoder
        self.encoder = Encoder(language_to_index=language_to_index, **shared_settings)

        # Decoder
        self.decoder = Decoder(language_to_index=target_language_to_index, **shared_settings)

        # Linear layer producing one score per target-vocabulary entry;
        # no softmax is applied inside the model.
        self.linear = nn.Linear(d_model, len(target_language_to_index))

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                encoder_start_token=False,
                encoder_end_token=False,
                decoder_start_token=False,
                decoder_end_token=False):
        """
        :param x: batch of source sentences for the encoder
        :param y: batch of target sentences for the decoder
        :return: output of the final linear layer
        """
        memory = self.encoder(
            x,
            encoder_self_attention_mask,
            encoder_start_token,
            encoder_end_token,
        )
        decoded = self.decoder(
            y,
            memory,
            decoder_self_attention_mask,
            decoder_cross_attention_mask,
            decoder_start_token,
            decoder_end_token,
        )
        return self.linear(decoded)

# or just using
# nn.Transformer, TokenEmbedding, PositionalEncoding to make the complete model

In [25]:
# Just a smoke-test example (the hyperparameters are arbitrary and the
# correctness of the output is not checked)
special_tokens = ["<start>", "<end>", "<pad>"]
alphabet = list("abcdefghijklmnopqrstuvwxyz ") + special_tokens
language_to_index = {word: idx for idx, word in enumerate(alphabet)}

# The same character vocabulary is used for both source and target.
model = Transformer(512, 3000, 8, 0.1, 1, 100, language_to_index, language_to_index, "<start>", "<end>", "<pad>")
model

Transformer(
  (encoder): Encoder(
    (input_preprocessing): Preprocessing(
      (token_embedding): TokenEmbedding(
        (embedding_layer): Embedding(30, 512)
      )
      (positional_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer_blocks): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm_for_attention): LayerNormalization()
        (dropout_attention): Dropout(p=0.1, inplace=False)
        (ff): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=3000, bias=True)
          (linear2): Linear(in_features=3000, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (n

In [26]:
# Move the model to the device returned by get_device(), then run it on two
# source/target sentence pairs with all masks and token flags at their defaults.
model.to(get_device())
model(['hello','goodbye'],['xin chao','tam biet'])

tensor([[[-0.0911, -0.0599,  0.3554,  ..., -0.5377,  0.7098, -0.2050],
         [ 0.9216, -0.1291,  0.5588,  ..., -0.1937, -0.6276,  0.8985],
         [ 0.3588,  0.4092,  0.7912,  ..., -0.2776, -0.5603, -1.1455],
         ...,
         [-0.6896,  0.1985,  0.6271,  ...,  0.1436, -0.1053,  0.8588],
         [-1.1051,  0.2928,  0.5118,  ...,  0.7733, -0.0668,  0.5815],
         [-1.2186,  0.2824, -0.0428,  ...,  0.5773, -0.5747,  0.1023]],

        [[ 0.0676, -0.0335, -0.1566,  ..., -0.1537,  0.2937, -0.1206],
         [ 0.0718,  0.0806, -0.3775,  ..., -0.1113, -0.2573, -0.7228],
         [-0.2775, -0.4968,  0.6976,  ...,  0.1742, -0.0686, -0.7298],
         ...,
         [-0.7425,  0.2690,  0.3774,  ...,  0.1982,  0.1753,  0.3259],
         [-0.5395, -0.0758,  0.1406,  ...,  0.4591, -0.1938,  0.4627],
         [-0.8725,  0.4493,  0.5609,  ...,  0.4942, -0.2736,  0.2028]]],
       device='cuda:0', grad_fn=<ViewBackward0>)