# Building Transformer Model - Pytorch

![Transformer Architecture](./Transformer_Understanding/img/transformerblock.png)

Based on the picture above, we will build our model following the Transformer architecture, using what we studied in the "[Transformer Understanding](./Transformer_Understanding/TransformerNeuralNetworks.ipynb)" part.

In [1]:
# import library
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import warnings

In [2]:
# Install a blanket 'ignore' filter so that no Python warnings are shown by the cells below.
warnings.filterwarnings('ignore')

In [3]:
# Report which device PyTorch will be able to use in this session.
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch is using CPU.")
else:
    print("CUDA is available. PyTorch is using GPU.")
    print("Number of GPUs available: ", torch.cuda.device_count())
    print("GPU name: ", torch.cuda.get_device_name(0))

CUDA is available. PyTorch is using GPU.
Number of GPUs available:  1
GPU name:  NVIDIA GeForce GTX 1650


## Start with these small blocks, functions, ...

- **Get the Device in Use** (Torch requires all tensors in an operation to be on the same device.)

In [4]:
def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

- **Token Embedding**

In [5]:
class TokenEmbedding(nn.Module):

    def __init__(self, vocab_size, d_model):
        """
        Token Embedding converts token indices into dense embedding vectors.

        :param vocab_size: Number of words / tokens in the vocabulary
        :param d_model: The embedding dimension

        Example: With 1000 words in the vocabulary and an embedding dimension of 512,
        the embedding table is 1000x512.
        """
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """
        Look up the embedding vector of every token index in ``x``.

        :param x: integer tensor of token indices
        :return: the embeddings, on the same device as the embedding table

        Example:
        Input: (Batch_size, Sequence of words) - (30x100)
        Output: (Batch_size, Sequence of words, d_model) - (30x100x512)
        """
        # Run the lookup on the device that holds the table instead of forcing
        # the result onto a globally chosen device: the output then always
        # lives next to the module's weights, wherever the module was moved.
        x = x.to(self.embedding_layer.weight.device)
        return self.embedding_layer(x)

# Or just Simple
# token_embedding = nn.Embedding(vocab_size, d_model)

In [6]:
# For Example
vocab_size = 1000  # size of the embedding table (number of token ids)
d_model = 512  # width of each embedding vector

embedding_layer = TokenEmbedding(vocab_size, d_model)
# 30 sequences of 100 random token ids drawn from [0, vocab_size)
input_data = torch.randint(0, vocab_size, (30, 100))
# Bare expression: the notebook displays the resulting shape.
embedding_layer(input_data).shape

torch.Size([30, 100, 512])

- **Positional Encoding**

In [7]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_sequence_length, dropout=0.1):
        """
        Sinusoidal positional encoding table, followed by dropout.

        :param d_model: The embedding dimension.
        :param max_sequence_length: The maximum length of the input sequences.
        :param dropout: Dropout rate applied to the table in ``forward``.
        """
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # The table depends only on the two sizes above, so it is built once
        # here instead of on every forward call.  It is registered as a
        # non-persistent buffer: it follows the module across devices but is
        # not written to the state_dict, so existing checkpoints still load.
        self.register_buffer(
            'pe',
            self._build_table(d_model, max_sequence_length),
            persistent=False,
        )

    @staticmethod
    def _build_table(d_model, max_sequence_length):
        """Return the (1, max_sequence_length, d_model) sin/cos table."""
        even_i = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, even_i / d_model)
        position = torch.arange(max_sequence_length).reshape(max_sequence_length, 1)
        angles = position / denominator
        # Interleave sin (even columns) and cos (odd columns).
        stacked = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        table = torch.flatten(stacked, start_dim=1, end_dim=2)
        # For an odd d_model the interleaving yields one column too many;
        # drop it so the table always matches the embedding width.
        return table[:, :d_model].unsqueeze(0)

    def forward(self):
        """
        :return: tensor of shape (1, max_sequence_length, d_model) with dropout applied.

        Note: when dropout is inactive the cached table itself may be returned,
        so callers should not modify the result in place.
        """
        return self.dropout(self.pe)

In [8]:
# Arguments are (d_model, max_sequence_length, dropout); the bare expression displays the output shape.
PE = PositionalEncoding(512,100,0.1)
PE().shape

torch.Size([1, 100, 512])

- **Multi-Head Attention**

2 options for: 'encoder' and 'decoder' (**Multi-Head Cross Attention**)

options for mask: **None**, **Self-Attention Mask** (**Causal Mask** or **Look-Ahead Mask**), **Padding Mask**

In [9]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, num_heads=8, cross=False):
        """
        Multi-Head Attention

        :param d_model: the embedding dimension
        :param num_heads: the number of heads, default equals 8
        :param cross: True for Multi-Head Cross Attention, False for Multi-Head (self) Attention
        :raises ValueError: if d_model is not divisible by num_heads
        """
        super().__init__()
        # Each head gets d_model // num_heads features; a remainder would make
        # the head reshape in forward() fail, so reject it up front.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.cross = cross

        # One fused projection per input stream instead of separate q/k/v layers.
        if self.cross:  # Multi-Head Cross Attention
            self.kv_layer = nn.Linear(d_model, 2 * d_model)
            self.q_layer = nn.Linear(d_model, d_model)
        else:
            self.qkv_layer = nn.Linear(d_model, 3 * d_model)

        # Output projection applied to the concatenated heads.
        self.linear_layer = nn.Linear(d_model, d_model)

    def scaled_dot_product(self, q, k, v, mask=None):
        """
        Scaled dot-product attention.

        :param q: queries, (batch, heads, q_len, head_dim)
        :param k: keys, (batch, heads, k_len, head_dim)
        :param v: values, (batch, heads, k_len, head_dim)
        :param mask: optional additive mask (e.g. 0 / -inf).  It is added with
                     the head axis moved to the front, so a mask shaped
                     (q_len, k_len) or (batch, q_len, k_len) broadcasts over heads.
        :return: (values, attention weights)
        """
        d_k = q.size()[-1]
        scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
        if mask is not None:
            # (batch, heads, ...) -> (heads, batch, ...) so a per-batch mask lines up, then back.
            scaled = scaled.permute(1, 0, 2, 3) + mask
            scaled = scaled.permute(1, 0, 2, 3)
        attention = F.softmax(scaled, dim=-1)
        values = torch.matmul(attention, v)
        return values, attention

    def _split_heads(self, projected, chunks):
        """Reshape a fused projection to per-head tensors and split it into `chunks` parts."""
        batch_size, length_seq, _ = projected.size()
        projected = projected.reshape(
            batch_size, length_seq, self.num_heads, chunks * self.head_dim
        )
        return projected.permute(0, 2, 1, 3).chunk(chunks, dim=-1)

    def forward(self, x, mask=None):
        """
        Perform forward pass of the multi-head attention mechanism.

        :param x: a tensor (batch_size, length_seq, d_model) for self attention.
                  If cross is True, a dictionary with 'encoder_output' (the source
                  of keys and values) and 'w' (the source of queries).
        :param mask: Optional additive mask tensor (see scaled_dot_product)

        :return: Output tensor of shape (batch_size, length_seq, d_model); for
                 cross attention length_seq is the length of 'w'.
        """
        if self.cross:
            # Queries come from the stream being updated ('w'); keys and values
            # come from the encoder output.  Each stream is reshaped with its
            # own length, so the two sequences may differ in length.
            (q,) = self._split_heads(self.q_layer(x['w']), 1)
            k, v = self._split_heads(self.kv_layer(x['encoder_output']), 2)
        else:
            q, k, v = self._split_heads(self.qkv_layer(x), 3)

        values, _ = self.scaled_dot_product(q, k, v, mask)

        # (batch, heads, q_len, head_dim) -> (batch, q_len, d_model)
        batch_size, _, length_seq, _ = values.size()
        values = values.permute(0, 2, 1, 3).reshape(batch_size, length_seq, self.d_model)
        return self.linear_layer(values)

In [10]:
# For Example
d_model = 512
num_heads = 8


# Attention
# Self-attention on one random sequence of 10 vectors of width d_model.
mha_layer = MultiHeadAttention(d_model, num_heads)
mha_layer(torch.randn(1,10,d_model))

tensor([[[-0.0562, -0.0427,  0.2180,  ...,  0.0322, -0.1573, -0.0523],
         [ 0.0623, -0.0722,  0.2759,  ...,  0.0920, -0.0904, -0.0578],
         [-0.0176, -0.0694,  0.2166,  ...,  0.1226, -0.0949,  0.0099],
         ...,
         [-0.0676, -0.0433,  0.2561,  ...,  0.0992, -0.0785, -0.0028],
         [ 0.0559, -0.0197,  0.2547,  ...,  0.1085, -0.1260, -0.0105],
         [-0.0468, -0.0607,  0.2399,  ...,  0.0299, -0.1175,  0.0150]]],
       grad_fn=<ViewBackward0>)

In [11]:
# Cross Attention
# With cross=True the layer takes a dict holding the two tensors 'encoder_output' and 'w'.
mha_layer = MultiHeadAttention(d_model, num_heads,cross=True)
mha_layer({'encoder_output':torch.randn(1,10,d_model),'w':torch.randn(1,10,d_model)})

tensor([[[-0.0175,  0.1712,  0.0623,  ...,  0.0460,  0.2901, -0.1208],
         [ 0.0578,  0.0376, -0.0211,  ...,  0.0196,  0.3044, -0.0735],
         [-0.0547,  0.2268,  0.0110,  ...,  0.0335,  0.3101, -0.1269],
         ...,
         [-0.0098,  0.1542,  0.0377,  ..., -0.0568,  0.3967, -0.0505],
         [-0.0268,  0.1181,  0.0296,  ...,  0.0190,  0.3105, -0.0837],
         [-0.0478,  0.1196,  0.0164,  ...,  0.0180,  0.2942, -0.1132]]],
       grad_fn=<ViewBackward0>)

- **Layer Normalization Block**

In [12]:
class LayerNormalization(nn.Module):
    """Layer normalisation over the trailing dimensions with a learnable scale and shift."""

    def __init__(self, parameters_shape, eps=1e-5):
        """
        :param parameters_shape: shape of the trailing dimensions to normalise over;
                                 gamma and beta are created with this shape
        :param eps: small constant added to the variance before the square root
        """
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """Normalise `inputs` to zero mean / unit variance over the trailing dims, then scale and shift."""
        n_norm_dims = len(self.parameters_shape)
        # Trailing axes -1, -2, ..., one per entry of parameters_shape.
        dims = list(range(-1, -n_norm_dims - 1, -1))
        mean = inputs.mean(dim=dims, keepdim=True)
        centered = inputs - mean
        # Biased (population) variance.
        var = centered.pow(2).mean(dim=dims, keepdim=True)
        normalized = centered / (var + self.eps).sqrt()
        return self.gamma * normalized + self.beta

# Or using nn.LayerNorm(d_model)

In [13]:
# For Example
# parameters_shape has three entries, so statistics are taken over all three axes of the input.
ln = LayerNormalization((1,2,3))
ln(torch.randn(1,2,3))

tensor([[[-0.4980, -0.3382, -1.0320],
         [ 1.5102, -0.8764,  1.2344]]], grad_fn=<AddBackward0>)

- **Positionwise Feed Forward Block**

In [14]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        :param d_model: input and output feature size
        :param hidden: feature size between the two linear layers
        :param drop_prob: dropout probability applied after the activation
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Expand to `hidden` features, apply ReLU and dropout, project back to `d_model`."""
        expanded = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(expanded)

# feed_forward = nn.Sequential(
#     nn.Linear(d_model, expansion_factor * d_model),  # e.g: 512x(4*512) -> (512, 2048)
#     nn.ReLU(),  # ReLU activation function
#     nn.Linear(d_model * expansion_factor, d_model),  # e.g: 4*512)x512 -> (2048, 512)
# )

In [15]:
# For Example
# d_model=512, hidden=300; the output keeps the input's shape.
ff = PositionwiseFeedForward(512, 300)
ff(torch.randn(1,5,512)).shape

torch.Size([1, 5, 512])

- **Copy Block Function**: we could use nn.Sequential, but I don't think we need to, because the module's parameters are not changed — the block is simply copied N times.

In [16]:
def replicate(block, N=6) -> nn.ModuleList:
    """
    Build a stack of N independent deep copies of `block`.

    :param block: an nn.Module instance, mainly the encoder or decoder block of the architecture
    :param N: the number of copies in the stack, the original paper used 6
    :return: an nn.ModuleList holding the N copies
    """
    clones = nn.ModuleList()
    for _ in range(N):
        # deepcopy so that no copy shares parameters with `block` or with another copy
        clones.append(copy.deepcopy(block))
    return clones

## With those small blocks and functions, let's build these important blocks!

- **Preprocessing** for Input Pre-processing and Output Pre-processing

In [17]:
class Preprocessing(nn.Module):
    """Turn a batch of raw sentences into embedded, position-encoded tensors."""

    def __init__(self, max_length_seq, d_model, language_to_index, start_token, end_token, pad_token, dropout=0.1):
        """
        :param max_length_seq: fixed length every tokenized sentence is padded to
        :param d_model: the embedding dimension
        :param language_to_index: mapping from token to integer index
        :param start_token: the start-of-sentence token (a key of language_to_index)
        :param end_token: the end-of-sentence token (a key of language_to_index)
        :param pad_token: the padding token (a key of language_to_index)
        :param dropout: dropout probability
        """
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.language_to_index = language_to_index
        self.max_length_seq = max_length_seq
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token

        # Layer
        self.token_embedding = TokenEmbedding(self.vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_length_seq, dropout)
        self.dropout = nn.Dropout(dropout)

    def tokenize(self, sentence, start_token:bool, end_token:bool):
        """
        Map one sentence to a 1-D tensor of exactly max_length_seq token indices.

        :param sentence: iterable of tokens; iterating a str yields its characters
        :param start_token: prepend the start token when True
        :param end_token: append the end token when True
        :return: tensor of token indices, padded with the pad token
        :raises KeyError: if a token is missing from language_to_index
        :raises ValueError: if the sentence (plus special tokens) exceeds max_length_seq
        """
        encode_char = [self.language_to_index[token] for token in sentence]
        if start_token:
            encode_char.insert(0, self.language_to_index[self.start_token])
        if end_token:
            encode_char.append(self.language_to_index[self.end_token])

        # An over-long sentence cannot be stacked with the others nor combined
        # with the fixed-size positional table, so fail here with a clear message.
        missing = self.max_length_seq - len(encode_char)
        if missing < 0:
            raise ValueError(
                f"Tokenized sentence has {len(encode_char)} tokens, "
                f"which exceeds max_length_seq={self.max_length_seq}"
            )
        if missing > 0:
            encode_char.extend([self.language_to_index[self.pad_token]] * missing)
        return torch.tensor(encode_char)

    def batch_tokens(self, batch, start_token:bool, end_token:bool):
        """
        Tokenize every sentence of `batch` and stack the results.

        :return: tensor of shape (len(batch), max_length_seq), placed on the
                 device that holds this module's parameters
        """
        tokens = torch.stack(
            [self.tokenize(sentence, start_token, end_token) for sentence in batch]
        )
        # Follow the module's own device rather than a global choice, so the
        # indices always sit next to the embedding table.
        return tokens.to(next(self.parameters()).device)

    def forward(self, x, start_token:bool, end_token:bool):
        """
        :param x: batch (sequence) of sentences
        :param start_token: prepend the start token to every sentence when True
        :param end_token: append the end token to every sentence when True
        :return: dropout(token embedding + positional encoding)
        """
        x = self.batch_tokens(x, start_token, end_token)
        x = self.token_embedding(x)
        # Bring the positional table to wherever the embeddings ended up.
        pos = self.positional_encoding().to(x.device)
        return self.dropout(x + pos)

- **Transformer Block** includes: **Multi-Head Attention**, **Add & Norm**, **Feed & Forward** and **Dropout**

2 options: 'Encoder' and 'Decoder'

In [29]:
class TransformerBlock(nn.Module):

    def __init__(self,
                 d_model=512,
                 num_heads=8,
                 ff_hidden=300,
                 dropout=0.1,
                 options='encoder'
                ):
        """
        The Transformer Block used in the encoder and decoder as well

        :param d_model: the embedding dimension
        :param num_heads: the number of heads
        :param ff_hidden: the hidden dimension of the feed forward layer
        :param dropout: probability dropout (between 0 and 1)
        :param options: The choice between 'encoder' and 'decoder'
        :raises ValueError: if options is neither 'encoder' nor 'decoder'
        """
        super().__init__()

        # Validate before building any layer so a typo fails immediately.
        if options not in ('encoder', 'decoder'):
            raise ValueError(f"Unknown option {options}")
        self.options = options

        # For both 2 options: encoder and decoder
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm_for_attention = LayerNormalization(parameters_shape=[d_model])
        self.dropout_attention = nn.Dropout(dropout)

        # For decoder
        if self.options == 'decoder':
            self.cross_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads, cross=True)
            self.norm_for_cross_attention = LayerNormalization(parameters_shape=[d_model])
            self.dropout2 = nn.Dropout(dropout)

        # For both 2 options: encoder and decoder
        self.ff = PositionwiseFeedForward(d_model=d_model, hidden=ff_hidden, drop_prob=dropout)
        self.norm_for_ff = LayerNormalization(parameters_shape=[d_model])
        self.dropout_for_ff = nn.Dropout(dropout)

    def _feed_forward_sublayer(self, x):
        """Feed-forward sub-layer: dropout, residual add, then layer norm."""
        return self.norm_for_ff(self.dropout_for_ff(self.ff(x)) + x)

    def forward(self, x, mask=None):
        """
        :param x: encoder mode: the input tensor.
                  decoder mode: a dict with 'encoder_output' and 'w'
                  (the tensor this block updates).
        :param mask: encoder mode: optional self-attention mask.
                     decoder mode: optional dict with 'self_attention_mask' and
                     'cross_attention_mask'; None (or a missing key) means no mask.
        :return: the updated tensor, same shape as the input / 'w'
        """
        # Every sub-layer is post-norm: norm(dropout(sublayer(x)) + x).  The
        # sub-layers never write into their input, so the residual can reuse
        # the input tensor directly (no copy needed).
        if self.options == 'decoder':
            encoder_output = x['encoder_output']
            w = x['w']
            masks = {} if mask is None else mask
            self_mask = masks.get('self_attention_mask')
            cross_mask = masks.get('cross_attention_mask')

            attended = self.dropout_attention(self.attention(w, self_mask))
            w = self.norm_for_attention(attended + w)

            crossed = self.cross_attention({'encoder_output': encoder_output, 'w': w}, cross_mask)
            w = self.norm_for_cross_attention(self.dropout2(crossed) + w)

            return self._feed_forward_sublayer(w)

        # For encoder
        attended = self.dropout_attention(self.attention(x, mask))
        x = self.norm_for_attention(attended + x)
        return self._feed_forward_sublayer(x)

In [30]:
# Test
# Default arguments build an encoder-style block with d_model=512; no mask is passed.
trans_block = TransformerBlock()
trans_block(torch.randn(1,10,512))

tensor([[[ 0.7325, -0.7324, -0.8844,  ..., -0.1414, -1.8609,  1.2694],
         [-0.0189,  1.1276,  1.0198,  ...,  0.4492, -0.8654,  0.8697],
         [-0.9913,  2.0883,  0.5105,  ...,  0.0388,  1.1496,  0.2762],
         ...,
         [-0.2725, -0.1853, -0.0847,  ...,  0.4693, -1.0658,  1.9107],
         [ 0.2683,  1.4403,  1.5199,  ...,  1.1040,  1.2863,  1.2746],
         [-1.2582,  2.7587,  2.7908,  ...,  0.2152, -0.0603, -0.3052]]],
       grad_fn=<AddBackward0>)

In [31]:
# Decoder-style block: the input is a dict of two tensors and the mask is a dict of two (here empty) masks.
# NOTE(review): relies on the notebook-level d_model matching the block's default of 512.
trans_block = TransformerBlock(options='decoder')
trans_block({'encoder_output':torch.randn(1,10,d_model),'w':torch.randn(1,10,d_model)},
    {'self_attention_mask': None, 'cross_attention_mask': None}
)

tensor([[[ 1.0348,  0.1849,  0.0468,  ...,  1.0645, -0.3162, -0.3297],
         [-0.7798,  0.5172, -0.7827,  ...,  0.2869,  0.8152,  1.2620],
         [-1.7776,  0.9825, -0.1279,  ...,  0.9662,  0.9253,  0.5077],
         ...,
         [-1.1933,  0.9722, -0.7518,  ..., -0.3489, -0.6589, -0.6248],
         [-0.2129,  0.3359,  0.3757,  ...,  0.1093, -0.9151,  0.4977],
         [ 0.4353,  0.2861, -0.8510,  ..., -0.5683,  0.0124,  1.2205]]],
       grad_fn=<AddBackward0>)

- **Encoder** includes Input Pre-processing (**Token Embedding** & **Positional Encoding**) and N **Transformer Blocks** (the encoder block in the picture at the top)

In [32]:
class Encoder(nn.Module):

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        The Encoder part of the Transformer architecture:
        input pre-processing followed by `num_blocks` encoder-style Transformer blocks.
        """
        super().__init__()

        # Token embedding + positional encoding of the raw sentences
        self.input_preprocessing = Preprocessing(
            max_length_seq,
            d_model,
            language_to_index,
            start_token,
            end_token,
            pad_token,
            dropout,
        )

        # One prototype block, copied num_blocks times
        prototype = TransformerBlock(d_model, num_heads, ff_hidden, dropout, options="encoder")
        self.transformer_blocks = replicate(prototype, num_blocks)

    def forward(self, x, self_attention_mask, start_token:bool, end_token:bool):
        """
        :param x: batch of raw sentences
        :param self_attention_mask: mask handed unchanged to every block
        :param start_token: whether pre-processing prepends the start token
        :param end_token: whether pre-processing appends the end token
        :return: the output of the last Transformer block
        """
        hidden = self.input_preprocessing(x, start_token, end_token)
        for layer in self.transformer_blocks:
            hidden = layer(hidden, self_attention_mask)
        return hidden

In [33]:
# Test
# Character-level vocabulary: lowercase letters, space, then the three special tokens.
special_tokens = ["<start>", "<end>", "<pad>"]
alphabet = list("abcdefghijklmnopqrstuvwxyz ") + special_tokens
language_to_index = {word: idx for idx, word in enumerate(alphabet)}
encoder = Encoder(d_model=100, ff_hidden=50, num_heads=2, dropout=0.1 ,num_blocks=1, max_length_seq=100, language_to_index=language_to_index, start_token="<start>", end_token="<end>", pad_token="<pad>")
# Pre-processing places the token indices on get_device(), so the module is moved there too.
encoder.to(get_device())
batch = ['hello','goodbye']
# No mask, no start token, no end token.
encoder(batch, None, False, False)

tensor([[[-0.1834,  0.4173, -1.4288,  ..., -0.8192,  0.0781,  1.8494],
         [ 0.0085, -0.0522,  0.3505,  ..., -0.8484, -0.7208, -1.0828],
         [ 0.2975, -2.2400, -0.9833,  ...,  0.8163, -0.2242,  0.1052],
         ...,
         [-0.1892, -0.5154, -0.9726,  ...,  1.6462, -1.0744,  1.0129],
         [-0.3732, -0.3893,  0.0549,  ...,  1.5311, -0.8773,  0.8742],
         [-1.2761, -0.1116,  0.7824,  ...,  1.4351, -0.8063,  0.7314]],

        [[ 0.8681,  1.5104, -0.2532,  ...,  0.2230, -1.6173,  0.2736],
         [ 1.9741,  1.4769,  0.1803,  ...,  0.6149, -2.3257,  0.1587],
         [ 1.8738,  0.7153,  0.3597,  ...,  0.5758, -2.6589,  0.0357],
         ...,
         [ 0.1281, -0.6617, -0.9936,  ...,  1.5914, -0.9194,  0.9107],
         [-0.2175, -0.4650,  0.0308,  ...,  1.3927, -0.7941,  0.9496],
         [-1.3464,  0.2845,  0.4976,  ...,  1.4611, -0.7491,  0.7999]]],
       device='cuda:0', grad_fn=<AddBackward0>)

- **Decoder** includes **Output Pre-processing** (**Token Embedding** & **Positional Encoding**), N **Transformer Block**

In [34]:
class Decoder(nn.Module):

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        The Decoder part of the Transformer architecture:
        output pre-processing followed by `num_blocks` decoder-style Transformer blocks.
        """
        super().__init__()

        # Token embedding + positional encoding of the raw target sentences
        self.output_preprocessing = Preprocessing(
            max_length_seq,
            d_model,
            language_to_index,
            start_token,
            end_token,
            pad_token,
            dropout,
        )

        # One prototype block, copied num_blocks times
        prototype = TransformerBlock(d_model, num_heads, ff_hidden, dropout, options="decoder")
        self.transformer_blocks = replicate(prototype, num_blocks)

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token:bool, end_token:bool):
        """
        :param x: batch of raw sentences for the decoder side
        :param y: the output of the encoder
        :param self_attention_mask: mask for the blocks' self attention
        :param cross_attention_mask: mask for the blocks' cross attention
        :param start_token: whether pre-processing prepends the start token
        :param end_token: whether pre-processing appends the end token
        :return: the output of the last Transformer block
        """
        hidden = self.output_preprocessing(x, start_token, end_token)

        masks = {
            'self_attention_mask': self_attention_mask,
            'cross_attention_mask': cross_attention_mask,
        }
        # Every block sees the same encoder output and the previous block's result.
        for layer in self.transformer_blocks:
            hidden = layer({'encoder_output': y, 'w': hidden}, masks)
        return hidden

## Finally, The Transformer Architecture is complete!

In [35]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer ending in a linear projection onto the target vocabulary."""

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 target_language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        :param d_model: the embedding dimension
        :param ff_hidden: hidden size of the feed forward layers
        :param num_heads: the number of attention heads
        :param dropout: dropout probability
        :param num_blocks: number of Transformer blocks in the encoder and in the decoder
        :param max_length_seq: fixed sequence length used by pre-processing
        :param language_to_index: token-to-index mapping used by the encoder
        :param target_language_to_index: token-to-index mapping used by the decoder
        :param start_token: the start-of-sentence token
        :param end_token: the end-of-sentence token
        :param pad_token: the padding token
        """
        super().__init__()

        # Device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Settings shared by the encoder and the decoder; only the vocabulary differs.
        shared = dict(
            d_model=d_model,
            ff_hidden=ff_hidden,
            num_heads=num_heads,
            dropout=dropout,
            num_blocks=num_blocks,
            max_length_seq=max_length_seq,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token,
        )

        # Encoder
        self.encoder = Encoder(language_to_index=language_to_index, **shared)

        # Decoder
        self.decoder = Decoder(language_to_index=target_language_to_index, **shared)

        # Linear Layer: d_model -> size of the target vocabulary (no softmax is applied here)
        self.linear = nn.Linear(d_model, len(target_language_to_index))

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                encoder_start_token=False,
                encoder_end_token=False,
                decoder_start_token=False,
                decoder_end_token=False):
        """
        :param x: batch of sentences for the encoder
        :param y: batch of sentences for the decoder
        :param encoder_self_attention_mask: mask for the encoder's self attention
        :param decoder_self_attention_mask: mask for the decoder's self attention
        :param decoder_cross_attention_mask: mask for the decoder's cross attention
        :param encoder_start_token: add the start token to encoder sentences
        :param encoder_end_token: add the end token to encoder sentences
        :param decoder_start_token: add the start token to decoder sentences
        :param decoder_end_token: add the end token to decoder sentences
        :return: the decoder output projected by the final linear layer
        """
        memory = self.encoder(
            x,
            encoder_self_attention_mask,
            encoder_start_token,
            encoder_end_token,
        )
        decoded = self.decoder(
            y,
            memory,
            decoder_self_attention_mask,
            decoder_cross_attention_mask,
            decoder_start_token,
            decoder_end_token,
        )
        return self.linear(decoded)

In [36]:
# Just an example to work with (Don't care about right or wrong)
# The same character-level vocabulary is used for both the source and the target side.
special_tokens = ["<start>", "<end>", "<pad>"]
alphabet = list("abcdefghijklmnopqrstuvwxyz ") + special_tokens
language_to_index = {word: idx for idx, word in enumerate(alphabet)}

# Positional arguments: d_model, ff_hidden, num_heads, dropout, num_blocks, max_length_seq, ...
model = Transformer(512, 3000, 8, 0.1, 1, 100, language_to_index, language_to_index, "<start>", "<end>", "<pad>")
# Bare expression: the notebook displays the module tree.
model

Transformer(
  (encoder): Encoder(
    (input_preprocessing): Preprocessing(
      (token_embedding): TokenEmbedding(
        (embedding_layer): Embedding(30, 512)
      )
      (positional_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer_blocks): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm_for_attention): LayerNormalization()
        (dropout_attention): Dropout(p=0.1, inplace=False)
        (ff): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=3000, bias=True)
          (linear2): Linear(in_features=3000, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (n

In [38]:
# Pre-processing places the token indices on get_device(), so the model is moved there first.
model.to(get_device())
# All masks and start/end-token flags keep their defaults (no masks, no special tokens).
model(['hello','goodbye'],['xin chao','tam biet'])

tensor([[[ 0.2282,  1.1035,  0.2848,  ..., -0.1102, -0.4442, -0.3117],
         [-0.8254, -0.1476,  0.6141,  ..., -0.5090,  0.1984,  0.2809],
         [-1.0713,  0.7109,  0.3258,  ..., -0.9360,  0.2301, -0.0523],
         ...,
         [ 0.4032, -0.3972, -0.4304,  ..., -0.3845,  0.3783,  0.7064],
         [ 0.7196, -0.2915, -0.3925,  ..., -0.2429,  0.0862,  0.3884],
         [ 0.1167, -0.0694, -0.0647,  ..., -0.4290,  0.0512,  0.2166]],

        [[-0.1074, -0.9415,  0.0861,  ...,  0.2044,  0.6242, -0.1079],
         [-0.0246,  0.2943, -0.1001,  ...,  0.0617,  0.8480, -0.8726],
         [-1.1477, -0.4672,  1.1439,  ...,  0.5774,  1.7076,  0.1687],
         ...,
         [ 0.1420, -0.1864, -0.7107,  ..., -0.3897, -0.1633,  0.4926],
         [ 0.0744, -0.0528, -0.4178,  ..., -0.2942, -0.0717,  0.2624],
         [ 0.2281, -0.0058, -0.2660,  ..., -0.3544, -0.3754,  0.2085]]],
       device='cuda:0', grad_fn=<ViewBackward0>)