<a href="https://colab.research.google.com/github/MilenaOehlers/diffusion_models_for_radar_object_detection/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


The following notebook contains a transformer built from scratch, following the
article https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb

Finally, a well-written explanation of transformers:

1. Part 1: https://towardsdatascience.com/all-you-need-to-know-about-attention-and-transformers-in-depth-understanding-part-1-552f0b41d021
2. Part 2: https://towardsdatascience.com/all-you-need-to-know-about-attention-and-transformers-in-depth-understanding-part-2-bf2403804ada

From the same author, an article specifically about vision transformers:
https://towardsdatascience.com/are-transformers-better-than-cnns-at-image-recognition-ced60ccc7c8

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math, copy

In [6]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values each pass through their own dense
    ``d_model -> d_model`` projection.  The projected vectors are then cut
    into ``num_heads`` slices of width ``d_k = d_model // num_heads``,
    attention is computed separately inside every slice, and the slices are
    concatenated again before a final ``d_model -> d_model`` output
    projection.

    Because the input projections are fully connected over all ``d_model``
    features, every head is computed from the complete token vector; only the
    attention step itself (scores, softmax, weighted sum) is per head.

    All four projections are created with ``nn.Linear`` defaults, so they
    include a bias term.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        # Width of the slice of the model dimension that each head works on.
        self.d_k = d_model // num_heads

        # nn.Linear acts on the last axis only, so applying these to a
        # (batch_size, seq_length, d_model) tensor keeps its shape.
        self.W_q = nn.Linear(d_model, d_model)  # query projection
        self.W_k = nn.Linear(d_model, d_model)  # key projection
        self.W_v = nn.Linear(d_model, d_model)  # value projection
        self.W_o = nn.Linear(d_model, d_model)  # output projection

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend from ``Q`` over ``K``/``V`` independently for every head.

        ``Q``, ``K`` and ``V`` are expected as
        ``(batch_size, num_heads, seq_length, d_k)``.  The matrix products
        only contract the last two axes, so heads (and batch entries) never
        interact here.

        ``mask`` (optional) must broadcast against the
        ``(batch_size, num_heads, query_len, key_len)`` score tensor; wherever
        it equals 0 the score is replaced by ``-1e9`` so the softmax assigns
        that key (numerically) zero weight.
        """
        scale = math.sqrt(self.d_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        # Normalise over the key axis: one distribution per query position.
        weights = torch.softmax(scores, dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, d_k)``."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to ``(batch, seq, d_model)``."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() yields a non-contiguous tensor; view() needs contiguity.
        token_major = x.transpose(1, 2).contiguous()
        return token_major.view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, merge heads, project the result.

        ``Q``, ``K`` and ``V`` are ``(batch_size, seq_length, d_model)``
        tensors (the query length may differ from the key/value length).
        Returns a tensor shaped like ``Q``.
        """
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        per_head_output = self.scaled_dot_product_attention(queries, keys, values, mask)
        merged = self.combine_heads(per_head_output)
        return self.W_o(merged)

In [7]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to every position separately.

    The input is expanded from ``d_model`` to ``d_ff`` features, passed
    through a ReLU, and projected back to ``d_model``.  ``d_ff`` is therefore
    simply the width of the hidden layer.  Since ``nn.Linear`` only acts on
    the last axis, each position of the sequence is transformed with the same
    weights and independently of all other positions.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expansion
        self.fc2 = nn.Linear(d_ff, d_model)  # projection back to model width
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; the output has the shape of ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [8]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table ``pe`` of shape ``(1, max_seq_length, d_model)`` is precomputed
    once.  Row ``p`` encodes sequence position ``p``: even feature columns
    hold ``sin(p * w_i)`` and odd feature columns hold ``cos(p * w_i)``, where
    the frequencies ``w_i = 10000 ** (-2i / d_model)`` decrease geometrically
    along the feature axis.

    The table is registered as a buffer, so it follows the module across
    ``.to(device)`` calls and is saved in the ``state_dict``, but it is not a
    trainable parameter.

    Args:
        d_model: Feature width of the embeddings (odd values are supported).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        # Column vector of positions 0 .. max_seq_length - 1.
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        # One frequency per sin/cos column pair, spread along the feature axis.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        # (max_seq_length, ceil(d_model / 2)) table of angles position * frequency.
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so the angle table is trimmed to the number of odd columns; for an
        # even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is ``(batch_size, seq_length, d_model)``.  The encoding is added
        to the embeddings (not concatenated), broadcasting over the batch axis.
        """
        return x + self.pe[:, :x.size(1)]

In [9]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is wrapped the same way: its output goes
    through dropout, is added to the sub-layer's input (residual connection)
    and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # A single dropout module is shared by both sub-layers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the block on ``x``; ``mask`` is handed to the self-attention.

        Returns a tensor with the same shape as ``x``.
        """
        # Sub-layer 1: every position attends over the whole input sequence.
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)

In [10]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Every sub-layer output goes through dropout, is added to the sub-layer's
    input (residual connection) and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        # A single dropout module is shared by all three sub-layers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the block on decoder state ``x`` given the encoder output.

        ``tgt_mask`` is passed to the self-attention over ``x``; ``src_mask``
        is passed to the cross-attention over ``enc_output``.  Returns a
        tensor with the same shape as ``x``.
        """
        # Sub-layer 1: target positions attend to each other.
        self_attended = self.dropout(self.self_attn(x, x, x, tgt_mask))
        x = self.norm1(x + self_attended)

        # Sub-layer 2: the decoder state supplies the queries, the encoder
        # output supplies keys and values.  The result has one row per target
        # position (the query length), each being a weighted mix of encoder
        # values -- i.e. every target position looks up the source content
        # that is relevant to it.
        cross_attended = self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask))
        x = self.norm2(x + cross_attended)

        # Sub-layer 3: position-wise feed-forward network.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm3(x + transformed)

In [11]:
class Transformer(nn.Module):
    """Encoder-decoder transformer mapping source token ids to target logits.

    Source and target token ids are embedded, combined with a shared
    sinusoidal positional encoding and passed through ``num_layers`` encoder
    and decoder layers respectively.  A final linear layer maps each decoder
    output vector to ``tgt_vocab_size`` logits.

    Args:
        src_vocab_size: Number of distinct source token ids.
        tgt_vocab_size: Number of distinct target token ids.
        d_model: Embedding / model width.
        num_heads: Attention heads per attention module.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the position-wise feed-forward networks.
        max_seq_length: Number of positions the positional encoding covers.
        dropout: Dropout probability used throughout the model.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the boolean attention masks for a batch of token ids.

        ``src`` and ``tgt`` are integer tensors of shape
        ``(batch_size, seq_length)``.  Token id 0 is treated as padding.

        Returns:
            A tuple ``(src_mask, tgt_mask)`` where ``True`` means "may attend":

            * ``src_mask`` has shape ``(batch, 1, 1, src_len)`` and is
              ``False`` for source positions holding id 0, so those keys are
              hidden from every query.
            * ``tgt_mask`` has shape ``(batch, 1, tgt_len, tgt_len)``.  It
              combines a lower-triangular (causal) mask -- position ``i`` may
              only attend to positions ``<= i``, so a prediction never sees
              later target tokens -- with a per-row mask that is ``False``
              for target positions holding id 0.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Create the causal mask on the same device as the inputs; building it
        # on the default device breaks the `&` below when the batch lives on
        # another device (e.g. a GPU).
        causal_mask = torch.tril(torch.ones(1, seq_length, seq_length, device=tgt.device)).bool()
        tgt_mask = tgt_mask & causal_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)``.

        ``src`` and ``tgt`` are integer token-id tensors of shape
        ``(batch_size, seq_length)``.
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Every decoder layer attends to the output of the LAST encoder layer.
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

## Training

In [12]:
# Hyperparameters of the demo model.
src_vocab_size = 5000  # number of distinct source token ids
tgt_vocab_size = 5000  # number of distinct target token ids (= width of the output logits)
d_model = 512  # embedding / model width
num_heads = 8  # attention heads per attention module (d_model must be divisible by it)
num_layers = 6  # number of encoder layers and of decoder layers
d_ff = 2048  # hidden width of the position-wise feed-forward networks
max_seq_length = 100  # positions covered by the positional encoding; also the length of the toy sequences
dropout = 0.1  # dropout probability

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data
# Token ids are drawn from [1, vocab_size), so id 0 -- which the model masks
# out as padding -- never occurs in this synthetic batch of 64 sequences.
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

In [None]:
# Targets equal to 0 (the padding id) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Training mode: dropout is active.
transformer.train()

# NOTE: each "epoch" here is a single optimizer step on the same fixed random
# batch, so the falling loss only shows the model memorising that batch.
for epoch in range(100):
    optimizer.zero_grad()
    # Teacher forcing: the decoder is fed the target without its last token ...
    output = transformer(src_data, tgt_data[:, :-1])
    # ... and is trained to predict the target shifted left by one position.
    # Logits and targets are flattened to (batch * seq, vocab) and (batch * seq,).
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 8.695809364318848
Epoch: 2, Loss: 8.557910919189453
Epoch: 3, Loss: 8.482783317565918
Epoch: 4, Loss: 8.431674003601074
Epoch: 5, Loss: 8.3761625289917
Epoch: 6, Loss: 8.306273460388184
Epoch: 7, Loss: 8.235282897949219
Epoch: 8, Loss: 8.154200553894043
Epoch: 9, Loss: 8.074752807617188
Epoch: 10, Loss: 7.987854480743408
Epoch: 11, Loss: 7.916656017303467
Epoch: 12, Loss: 7.829555988311768
Epoch: 13, Loss: 7.751969337463379
Epoch: 14, Loss: 7.666973114013672
Epoch: 15, Loss: 7.584961414337158
Epoch: 16, Loss: 7.495419979095459
Epoch: 17, Loss: 7.4172186851501465
Epoch: 18, Loss: 7.332920074462891
Epoch: 19, Loss: 7.257452964782715
Epoch: 20, Loss: 7.1807861328125
Epoch: 21, Loss: 7.097526550292969
Epoch: 22, Loss: 7.026431083679199
Epoch: 23, Loss: 6.944304466247559
Epoch: 24, Loss: 6.862218856811523
Epoch: 25, Loss: 6.784948348999023
Epoch: 26, Loss: 6.7099080085754395
Epoch: 27, Loss: 6.6337714195251465
Epoch: 28, Loss: 6.560471534729004
Epoch: 29, Loss: 6.48755645751