<a href="https://colab.research.google.com/github/JonathanSum/Happy-Sugar-Life-Weekly-Training/blob/master/15_transformer_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
import torch.nn.functional as f
import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nn_Softargmax = nn.Softmax   #fox wrong name

In [3]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, p, d_input=None):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        #to-do: add more explanation on this part.
        if d_input is None:
            d_xq = d_xk = d_xv = d_model
        else:
            d_xq, d_xk, d_xv = d_input
        
        # Make sure that the embedding dimension of model is a multiple of number of heads
        assert d_model % self.num_heads == 0

        self.d_k = d_model // self.num_heads
        
        # There are tstill of dimension d_model. They will be split into number of heads
        self.W_q = nn.Linear(d_xq, d_model, bias=False)
        self.W_k = nn.Linear(d_xk, d_model, bias=False)
        self.W_v = nn.Linear(d_xv, d_model, bias=False)

        # Outputs of all sub-layers need to be of dimension d_model
        self.W_h = nn.Linear(d_model, d_model)
                             
    def scaled_dot_product_attention(self, Q, K, V):
      batch_size = Q.size(0)
      k_length = K.size(-2)
      
      # Scaling by d_k so that the soft(arg)max doesn't saturate

      # (bs, n_heads, q_length, dim_per_head)
      # dim_per_head, I guess it is the self.d_k
      Q = Q / np.sqrt(self.d_k)     


      #K's size, I guess it is (bs, n_heads, dim_per_head, k_length)
      # (bs, n_heads, q_length, k_length)
      scores = torch.matmul(Q, K.transpose(2,3))
      
      A = nn_Softargmax(dim=-1)(scores)     #(bs, n_heads, q_length, k_length)

      # Get the weighted average of the values
      H = torch.matmul(A, V)                # (bs, n_heads, q_length, dim_per_head)

      return H, A

    def split_heads(self, x, batch_size):
      """
      Split the last dimension into (heads X depth)
      Return after transpose to put in shape (batch_size X num_heads X seq+length X d_k)
      """
      #I guess the reason why it is (batch_size, -1, self.num_heads, self.d_k).
      #That is because it wants to seprate about number of heads in 
      #each sentence. And each sentense it has d_k for embedding.
      return  x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def group_heads(self, x, batch_size):
      """
      Combine the heads again to get (batch_size X seq_length X num_heads X d_l)
      """
      return x.transpose(1,2).contiguous().view(batch_size, -1,
                                                self.num_heads * self.d_k
                                                )
    def forward(self, X_q, X_k, X_v):
        batch_size, seq_length, dim = X_q.size()

        # After transforming, split into num_heads
        # Q: (bs, n_heads, q_length, dim_per_head)
        # K: (bs, n_heads, k_length, dim_per_head)
        # V: (bs, n_heads, v_length, dim_per_head)
        Q = self.split_heads(self.W_q(X_q), batch_size)
        K = self.split_heads(self.W_k(X_k), batch_size)
        V = self.split_heads(self.W_v(X_v), batch_size)

        # Calculate the attention weights for each of the heads
        # to know how related they are
        H_cat, A = self.scaled_dot_product_attention(Q, K, V)
        
        #Put all the heads back together by concat
        # (bs, q_length, dim)
        H_cat = self.group_heads(H_cat, batch_size)

        # Final linear layer
        H = self.W_h(H_cat)           # (bs, q_length, dim)

        return H, A   

In [4]:
temp_mha = MultiHeadAttention(d_model=512, num_heads=8, p=0)
def print_out(Q, K, V):
    temp_out, temp_attn = temp_mha.scaled_dot_product_attention(Q, K, V)
    print('Attention weights are:', temp_attn.squeeze())
    print('Output is:', temp_out.squeeze())

In [5]:
test_K = torch.tensor(
    [[10, 0, 0],
     [0, 10, 0],
     [0, 0, 10],
     [0, 0, 10]]
).float()[None, None]

In [6]:
test_K.shape

torch.Size([1, 1, 4, 3])

In [7]:
test_V = torch.tensor(
    [[    1,0,0],
     [    10,0,0],
     [    100,5,0],
     [    1000,6,0]]
).float()[None, None]
test_Q = torch.tensor(
    [[0, 10, 0]]
).float()[None,None]
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06])
Output is: tensor([1.0004e+01, 4.0993e-05, 0.0000e+00])


In [8]:
test_Q = torch.tensor(
    [0, 0, 10]
).float()[None,None]
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01])
Output is: tensor([549.9979,   5.5000,   0.0000])


In [9]:
test_Q = torch.tensor(
    [[0, 0, 10], [0, 10, 0],[10, 10, 0]]
).float()[None,None]
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([[1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01],
        [3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06],
        [5.0000e-01, 5.0000e-01, 1.8633e-06, 1.8633e-06]])
Output is: tensor([[5.5000e+02, 5.5000e+00, 0.0000e+00],
        [1.0004e+01, 4.0993e-05, 0.0000e+00],
        [5.5020e+00, 2.0497e-05, 0.0000e+00]])


In [10]:
test_Q = torch.tensor(
    [[0, 0, 10], [0, 10, 0], [10, 10, 0]]
).float()[None,None]
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([[1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01],
        [3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06],
        [5.0000e-01, 5.0000e-01, 1.8633e-06, 1.8633e-06]])
Output is: tensor([[5.5000e+02, 5.5000e+00, 0.0000e+00],
        [1.0004e+01, 4.0993e-05, 0.0000e+00],
        [5.5020e+00, 2.0497e-05, 0.0000e+00]])


We can see it will return two things.
First is the index that it focus that is very similar from Q and K, index is given from the Attention argmax.
In addition, the output is the average of the element in the value of that index or index(s). 

Average here is sum(element)/(number of index)

In [11]:
test_K = torch.tensor(
    [[10, 0, 0],
     [ 0,0, 10],
     [ 0, 0,10],
     [ 0, 0,10]]
).float()[None,None]

test_V = torch.tensor(
    [[   1,0,0],
     [  10,0,0],
     [ 100,5,0],
     [1000,6,0]]
).float()[None,None]

test_Q = torch.tensor(
    [[0, 0, 10]]
).float()[None,None]
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([1.2422e-06, 3.3333e-01, 3.3333e-01, 3.3333e-01])
Output is: tensor([369.9995,   3.6667,   0.0000])


In [12]:
class CNN(nn.Module):
    def __init__(self, d_model, hidden_dim, p):
        super().__init__()
        self.k1convL1 = nn.Linear(d_model, hidden_dim)
        self.k1convL2 = nn.Linear(hidden_dim, d_model)
        self.activation = nn.ReLU()

    def forward(self, x):
      x = self.k1convL1(x)
      x = self.activation(x)
      x = self.k1convL2(x)
      return x

In [50]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, conv_hidden_dim, p = 0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads, p)
        self.cnn = CNN(d_model, conv_hidden_dim, p)

        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
    def forward(self, x):

      # Multi-head attention
      attn_output, _ = self.mha(x, x, x)  #(batch_size, input_seq_len, d_model)

      # Layer norm after adding the residual connection
      out1 = self.layernorm1(x + attn_output)   #(batch_size, input seq_len, d_model)

      #Feed forward
      cnn_output = self.cnn(out1)   # (batch_size, input_seq_len, d_model)

      #Second layer norm after adding residual connection
      out2 = self.layernorm2(out1 + cnn_output)   # (batch_size, input seq_len, d_model)
      return out2
    

##Encoder

In [124]:
def create_sinusoidal_embeddings(nb_p, dim, E):
    theta = np.array([
        [p / np.power(10000, 2* (j//2) / dim) for j in range(dim)]                      
        for p in range(nb_p)
    ])
    E[:, 0::2] = torch.FloatTensor(np.sin(theta[:, 0::2]))
    E[:, 1::2] = torch.FloatTensor(np.sin(theta[:, 1::2]))
    E.detach_()
    E.requires_grad = False
    E = E.to(device)

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab_size, max_position_embeddings, p):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.position_embeddings = nn.Embedding(max_position_embeddings, d_model)
        create_sinusoidal_embeddings(
            nb_p = max_position_embeddings,
            dim = d_model,
            E = self.position_embeddings.weight
        )

        self.LayerNorm = nn.LayerNorm(d_model, eps=1e-12)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)

        # (max_seq_length)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)

        # (bs, max, seq_length)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        # Get word embeddings for each input id
        #(bs, max_seq_length, dim)
        word_embeddings = self.word_embeddings(input_ids)

        # Get position embeddings for each position id
        #(bs, max_seq_length, dim)
        position_embeddings = self.position_embeddings(position_ids)
        
         # Add them both 
        embeddings = word_embeddings + position_embeddings  # (bs, max_seq_length, dim)

        # Layer norm
        embeddings = self.LayerNorm(embeddings)
        return embeddings


In [125]:
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, input_vocab_size, ff_hidden_dim,
                 maximum_position_encoding, p = 0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embeddings(d_model, input_vocab_size, maximum_position_encoding, p)

        self.enc_layers = nn.ModuleList()

        for _ in range(num_layers):
            self.enc_layers.append(EncoderLayer(d_model, num_heads, ff_hidden_dim, p))
    def forward(self, x):
        x = self.embedding(x) # Transform to (batch_size, input_seq_length, d_model)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x # (batch_size, input_seq_len, d_model)

In [126]:
import torchtext.data as data
import torchtext.datasets as datasets

In [127]:
max_len = 200
text = data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
label = data.LabelField(sequential=False, dtype=torch.long)
datasets.IMDB.download('./')
ds_train, ds_test = datasets.IMDB.splits(text, label, path='./imdb/aclImdb/')
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)

train :  25000
test :  25000
train.fields : {'text': <torchtext.data.field.Field object at 0x7f845f01ef28>, 'label': <torchtext.data.field.LabelField object at 0x7f845f01eeb8>}


In [128]:
ds_train, ds_valid = ds_train.split(0.9)
print('train : ',len(ds_train))
print('valid : ',len(ds_valid))
print('test : ',len(ds_test))

train :  22500
valid :  2500
test :  25000


In [129]:
print(torch.__version__)

1.6.0+cu101


In [130]:
text.build_vocab

<bound method Field.build_vocab of <torchtext.data.field.Field object at 0x7f845f01ef28>>

In [131]:
num_words = 50_000
# text.build_vocab(ds_train, max_size=num_words, specials=['<pad>','<unk>'])
text.build_vocab(ds_train, max_size=num_words)
label.build_vocab(ds_train)
vocab = text.vocab

In [132]:
batch_size = 164
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test), batch_size = batch_size, sort_key = lambda x: len(x.text), repeat=False)

In [133]:
class TransformerClassifier(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, conv_hidden_dim, input_vocab_size, num_answers):
      super().__init__()

      self.encoder = Encoder(num_layers, d_model, num_heads, conv_hidden_dim, input_vocab_size,
                       maximum_position_encoding=10000)
      self.dense = nn.Linear(d_model, num_answers)       #this one must be the output layer

    def forward(self, x):
        x = self.encoder(x)

        x, _ = torch.max(x, dim=1)
        x    =     self.dense(x)
        return x

In [134]:
model = TransformerClassifier(num_layers=1, d_model=32, num_heads=2, 
                         conv_hidden_dim=128, input_vocab_size=50002, num_answers=2)
model.to(device)

RuntimeError: ignored

In [136]:
!nvidia-smi

Tue Sep  8 20:41:06 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    39W / 300W |   1267MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
len(train_loader)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
epochs = 10
t_total = len(train_loader) * epochs

In [None]:
t_total

In [None]:
def train(train_loader, valid_loader):
    for epoch in range(epochs):
        train_iterator, valid_iterator = iter(train_loader), iter(valid_loader)
        nb_batches_train = len(train_loader)
        train_acc = 0
        model.train()
        losses = 0.0

        for batch in train_iterator:
            x = batch.text.to(device)
            y = batch.label.to(device)

            out = model(x)  # step 1

            loss = f.cross_entropy(out, y)  # step 2

            model.zero_grad()  # step3

            loss.backward()  # step 4
            loss += loss.item()

            optimizer.step  # step 5

            train_ac += (out.argmax(1) == y).cpu().numpy().mean()

        print(f"Train loss at epoch {epoch} is {losses / nb_batches_train}")
        print(f"Train accuracy: {train_acc / nb_batches_train}")
        print(f"Evaluating on validation:")
        evaluate(valid_loader)

In [None]:
def evaluate(data_loader):
    data_iterator = iter(data_loader)
    nb_batches = len(data_loader)
    model.eval
    acc = 0
    for batch in data_iterator:
        x = batch.text.to(device)
        y = batch.text.to(device)

        out = model(x)
        acc += (out.argmax(1) == y).cpu().numpy().mean()
    print(f"Eval accuracy: {acc / nb_batches}")

In [None]:
train(train_loader, valid_loader)

In [None]:
evaluate(test_loader)