In [1]:
import torch
from torch import nn
import numpy as np

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List




In [5]:
from torchtext.transforms import BERTTokenizer

# Vocabulary of the uncased BERT base model, fetched from the Hugging Face hub.
VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"

# return_tokens=True makes the transform yield token strings rather than ids.
tokenizer = BERTTokenizer(
    vocab_path=VOCAB_FILE,
    do_lower_case=True,
    return_tokens=True,
)

# A single string is tokenized as one sentence (result not displayed: only the
# last expression of a notebook cell is rendered).
tokenizer("Hello World, How are you!")
# A list of strings is tokenized as a batch, one token list per sentence.
tokenizer(["Hello World","How are you!"])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 18.3MB/s]


[['hello', 'world'], ['how', 'are', 'you', '!']]

In [3]:
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [4]:
class TransformerRecreation(nn.Module):
    """Encoder-decoder Transformer built from scaled dot-product attention.

    All inputs are expected to be floating-point embeddings whose last
    dimension is ``d_model`` and whose second-to-last dimension is the
    sequence position, e.g. ``(batch, seq_len, d_model)``.

    Design notes:
      * One set of attention / feed-forward weights is created and reused by
        every stacked layer and by both encoder and decoder.
        NOTE(review): the original paper gives each layer and sub-layer its
        own parameters; confirm whether sharing is intended.
      * Projection weights keep a leading singleton dimension so they
        broadcast over the batch dimension in ``torch.matmul``.

    Args:
        h: number of attention heads.
        d_k: per-head query/key width.
        d_v: per-head value width.
        d_model: embedding width of the model.
        n_encoder_layers: how many times the encoder layer is applied.
        n_decoder_layers: how many times the decoder layer is applied.
        d_ff: hidden width of the position-wise feed-forward network.
            NOTE(review): the default 2096 is kept for backward
            compatibility; the paper uses 2048, so this may be a typo.
    """

    def __init__(self, h = 8, d_k = 64, d_v = 64, d_model = 512, n_encoder_layers = 6, n_decoder_layers = 6, d_ff=2096):
        super().__init__()
        self.h = h
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.n_encoder_layers = n_encoder_layers
        self.n_decoder_layers = n_decoder_layers
        self.d_ff = d_ff

        def make_weight(n_in, n_out):
            # Initialise on the 2-D view so Xavier sees the real fan-in/out,
            # then wrap as a Parameter so it is registered, trained and moved
            # by ``.to(device)``.
            weight = torch.empty(1, n_in, n_out)
            nn.init.xavier_uniform_(weight[0])
            return nn.Parameter(weight)

        # Per-head projections from d_model down to d_k / d_v.
        self.W_q_list = nn.ParameterList([make_weight(self.d_model, self.d_k) for _ in range(self.h)])
        self.W_k_list = nn.ParameterList([make_weight(self.d_model, self.d_k) for _ in range(self.h)])
        self.W_v_list = nn.ParameterList([make_weight(self.d_model, self.d_v) for _ in range(self.h)])

        # Output projection applied to the concatenated heads.
        self.W_out = make_weight(self.h * self.d_v, self.d_model)

        self.ff_model = nn.Sequential(
            nn.Linear(self.d_model, self.d_ff),
            nn.ReLU(),
            nn.Linear(self.d_ff, self.d_model)
        )

        # One LayerNorm per residual connection: two in the encoder layer
        # (attention, feed-forward) and three in the decoder layer
        # (masked attention, cross attention, feed-forward).
        self.encoder_norms = nn.ModuleList([nn.LayerNorm(self.d_model) for _ in range(2)])
        self.decoder_norms = nn.ModuleList([nn.LayerNorm(self.d_model) for _ in range(3)])

    def encoder_layer(self, x):
        """Apply one encoder layer: self-attention then feed-forward.

        Each sub-layer is wrapped as ``LayerNorm(x + sublayer(x))``.
        Positional encoding is NOT added here (it must be added once, before
        the first layer; ``forward`` does that).

        Args:
            x: tensor of shape ``(..., seq_len, d_model)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = self.encoder_norms[0](x + self.multiHeadAttention(x))
        x = self.encoder_norms[1](x + self.ff_model(x))
        return x

    def stack_encoder_layers(self, x):
        """Apply ``encoder_layer`` ``n_encoder_layers`` times in sequence."""
        for _ in range(self.n_encoder_layers):
            x = self.encoder_layer(x)
        return x

    def decoder_layer(self, x, encoder_output):
        """Apply one decoder layer.

        Sub-layers, each wrapped as ``LayerNorm(x + sublayer(x))``:
          1. masked self-attention (a position attends only to itself and
             earlier positions),
          2. cross-attention: queries come from the decoder state, keys and
             values come from ``encoder_output``,
          3. position-wise feed-forward.

        Args:
            x: decoder input, shape ``(..., tgt_len, d_model)``.
            encoder_output: encoder result, shape ``(..., src_len, d_model)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        ### Sublayer 1: Masked Multi-head Attention
        positions = torch.arange(x.size(-2), device=x.device)
        # causal_mask[q, k] is True when key position k <= query position q.
        causal_mask = positions.unsqueeze(0) <= positions.unsqueeze(1)
        x = self.decoder_norms[0](x + self.multiHeadAttention(x, mask=causal_mask))

        ### Sublayer 2: multi-head attention over the encoder output
        cross = self.multiHeadAttention(x, encoder_output, encoder_output)
        x = self.decoder_norms[1](x + cross)

        ### Sublayer 3: Feed Forward
        x = self.decoder_norms[2](x + self.ff_model(x))
        return x

    def stack_decoder_layers(self, x, encoder_output):
        """Apply ``decoder_layer`` ``n_decoder_layers`` times in sequence."""
        for _ in range(self.n_decoder_layers):
            x = self.decoder_layer(x, encoder_output)
        return x

    def attention(self, Q, K, V, mask=None):
        """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            Q: queries, shape ``(..., n_q, d_k)``.
            K: keys, shape ``(..., n_k, d_k)``.
            V: values, shape ``(..., n_k, d_v)``.
            mask: optional boolean tensor broadcastable to
                ``(..., n_q, n_k)``; ``True`` marks pairs that may attend.
                Every query row must keep at least one ``True`` entry,
                otherwise its softmax is undefined (NaN).

        Returns:
            Tensor of shape ``(..., n_q, d_v)``.
        """
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            # -inf scores receive zero weight after the softmax.
            scores = scores.masked_fill(~mask, float("-inf"))
        weights = torch.softmax(scores, dim=-1)
        return torch.matmul(weights, V)

    def multiHeadAttention(self, Q, K=None, V=None, mask=None):
        """Multi-head attention over ``h`` independently projected heads.

        Args:
            Q: query source, shape ``(..., n_q, d_model)``.
            K: key source, shape ``(..., n_k, d_model)``; defaults to ``Q``
                (self-attention).
            V: value source, shape ``(..., n_k, d_model)``; defaults to ``K``.
            mask: optional boolean attention mask, see ``attention``.

        Returns:
            Tensor of shape ``(..., n_q, d_model)``.
        """
        if K is None:
            K = Q
        if V is None:
            V = K

        heads = [
            # project q,k,v from d_model to d_k, d_v, then attend
            self.attention(
                torch.matmul(Q, w_q),
                torch.matmul(K, w_k),
                torch.matmul(V, w_v),
                mask,
            )
            for w_q, w_k, w_v in zip(self.W_q_list, self.W_k_list, self.W_v_list)
        ]

        # concat heads along the feature axis: (..., n_q, h * d_v)
        concatenated = torch.cat(heads, dim=-1)
        return torch.matmul(concatenated, self.W_out)

    def positional_encoding(self, x):
        """Add sinusoidal positional encodings to ``x``.

        ``pos`` is the index of a token in the sequence and ``i`` indexes
        pairs of embedding dimensions (not the sequence length):

            PE[pos, 2i]     = sin(pos / 10000^(2i / d_model))
            PE[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))

        Args:
            x: floating-point tensor of shape ``(..., seq_len, d_model)``.

        Returns:
            ``x + PE``, same shape, dtype and device as ``x``.
        """
        seq_len = x.size(-2)
        positions = torch.arange(seq_len, dtype=torch.float32, device=x.device).unsqueeze(1)
        # even_dims holds the values 2i for every dimension pair.
        even_dims = torch.arange(0, self.d_model, 2, dtype=torch.float32, device=x.device)
        angles = positions / (10_000.0 ** (even_dims / self.d_model))

        encoding = torch.zeros(seq_len, self.d_model, dtype=torch.float32, device=x.device)
        encoding[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd slot than even slots.
        encoding[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])
        return x + encoding.to(dtype=x.dtype)

    def forward(self, x, tgt=None):
        """Run the full encoder stack followed by the full decoder stack.

        Args:
            x: source embeddings, shape ``(..., src_len, d_model)``.
            tgt: optional target embeddings, shape ``(..., tgt_len, d_model)``.
                When omitted, the encoder output itself is fed to the decoder,
                which keeps the single-argument call ``model(x)`` working.

        Returns:
            Decoder output, shape ``(..., tgt_len, d_model)`` (``src_len``
            when ``tgt`` is omitted).
        """
        encoder_output = self.stack_encoder_layers(self.positional_encoding(x))
        if tgt is None:
            decoder_input = encoder_output
        else:
            decoder_input = self.positional_encoding(tgt)
        decoder_output = self.stack_decoder_layers(decoder_input, encoder_output)
        return decoder_output
        

In [16]:
# model = TransformerRecreation().to(device)
# print(model)
# inputs = torch.rand(1, 28, 512, device = device)  # last dim must equal d_model (512 by default)
# pred = model(inputs)
# print(pred)

# pred_prob = nn.Softmax(dim=1)(pred)
# pred_prob

# y_pred = pred_prob.argmax(1)
# y_pred

tensor([[-0.0256, -0.0691,  0.0705, -0.0238, -0.0211,  0.0156, -0.0638,  0.0071,
          0.0155,  0.0243]], device='mps:0', grad_fn=<LinearBackward0>)
