# Chapter 15: Self-Attention and Transformers
The Transformer Encoder and its implementation in PyTorch

Programs from the book: [_Python for Natural Language Processing_](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

## Modules

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import copy

In [2]:
random.seed(1234)
torch.manual_seed(1234)

<torch._C.Generator at 0x1112df170>

## Attention

In [3]:
def read_embeddings(file):
    embeddings = {}
    with open(file, encoding='utf8') as glove:
        for line in glove:
            values = line.strip().split()
            word = values[0]
            embeddings[word] = torch.FloatTensor(
                [float(value) for value in values[1:]])
    return embeddings

In [4]:
PATH = '../datasets/embeddings/'

In [5]:
embedding_file = PATH + 'glove.6B.50d.txt'
embeddings_dict = read_embeddings(embedding_file)

In [6]:
# embeddings_dict['<unk>'] = torch.randn((1, d_model))

In [7]:
sentence_odyssey = 'I must go back to my ship and to my crew'
sentence_amazon = 'We process and ship your order'
# in the most cost-efficient way possible'
words_o = sentence_odyssey.lower().split()
words_a = sentence_amazon.lower().split()

In [8]:
def embedding_matrix(words, embeddings_dict):
    X = [embeddings_dict[word] for word in words]
    X = torch.stack(X)
    return X

In [9]:
X = embedding_matrix(words_o, embeddings_dict)

In [10]:
X.size()

torch.Size([11, 50])

In [11]:
X_batch = torch.unsqueeze(X, dim=0)
X_batch

tensor([[[ 1.1891e-01,  1.5255e-01, -8.2073e-02, -7.4144e-01,  7.5917e-01,
          -4.8328e-01, -3.1009e-01,  5.1476e-01, -9.8708e-01,  6.1757e-04,
          -1.5043e-01,  8.3770e-01, -1.0797e+00, -5.1460e-01,  1.3188e+00,
           6.2007e-01,  1.3779e-01,  4.7108e-01, -7.2874e-02, -7.2675e-01,
          -7.4116e-01,  7.5263e-01,  8.8180e-01,  2.9561e-01,  1.3548e+00,
          -2.5701e+00, -1.3523e+00,  4.5880e-01,  1.0068e+00, -1.1856e+00,
           3.4737e+00,  7.7898e-01, -7.2929e-01,  2.5102e-01, -2.6156e-01,
          -3.4684e-01,  5.5841e-01,  7.5098e-01,  4.9830e-01, -2.6823e-01,
          -2.7443e-03, -1.8298e-02, -2.8096e-01,  5.5318e-01,  3.7706e-02,
           1.8555e-01, -1.5025e-01, -5.7512e-01, -2.6671e-01,  9.2121e-01],
         [ 4.7769e-01, -1.2242e-01,  2.9476e-01, -5.2751e-01,  6.9156e-01,
          -1.0851e-01,  2.0465e-01,  2.9798e-01, -2.5924e-02, -2.3163e-01,
           4.9905e-01,  7.5481e-01, -1.5678e-01,  2.6700e-02,  6.6682e-01,
           9.9944e-01,  

In [12]:
X_batch.size()

torch.Size([1, 11, 50])

$$
\text{Attention}({Q}, {K}, {V}) = \text{softmax}\left (\frac{{Q}  {K}^\intercal}{\sqrt{d_k}} \right)  {V},
$$

In [13]:
def attention(Q, K, V):
    d_k = K.size(dim=-1)
    attn_weights = F.softmax(Q @ K.T/math.sqrt(d_k), dim=-1)
    attn_output = attn_weights @ V
    return attn_output, attn_weights

In [14]:

# attn_output, attn_weights = attention(X_batch, X_batch, X_batch)
attn_output, attn_weights = attention(X, X, X)

In [15]:
attn_output.size()

torch.Size([11, 50])

In [16]:
attn_output[6]

tensor([ 1.0387,  0.1033,  0.3426, -0.4320,  0.2237, -0.0958, -0.9926,  0.6662,
         0.4424, -0.7942,  0.5638,  0.9921,  0.0205,  0.5082,  0.0743, -0.1773,
        -0.3408,  0.5675, -1.1545, -0.5718,  0.4288,  0.4191, -0.0658, -0.3339,
         0.6682, -1.7473, -0.0485,  0.1531,  0.8642, -0.1447,  2.6571, -0.0545,
        -0.5343,  0.3160,  0.4041,  0.2277,  0.3958, -0.2916, -0.1126, -0.1385,
         0.1744, -0.5375,  0.9499, -0.4145, -0.1039, -0.1755, -0.2213, -0.3995,
         0.2119, -0.3610])

In [17]:
attn_weights[6]

tensor([0.0303, 0.0302, 0.0276, 0.0407, 0.0459, 0.0343, 0.5530, 0.0297, 0.0459,
        0.0343, 0.1281])

## Attention with Projections

In [18]:
class Attention(nn.Module):
    def __init__(self, d_model, d_k):
        super().__init__()
        self.WQ = nn.Linear(d_model, d_k)
        self.WK = nn.Linear(d_model, d_k)
        self.WV = nn.Linear(d_model, d_k)

    def forward(self, X):
        attn_output, attn_weights = attention(self.WQ(X),
                                              self.WK(X),
                                              self.WV(X))
        return attn_output, attn_weights

In [19]:
d_model = embeddings_dict[
    next(iter(embeddings_dict))].size(dim=-1)
d_model

50

In [20]:
attn = Attention(d_model, d_model)

In [21]:
X@X.T/math.sqrt(50)

tensor([[5.5182, 3.4520, 3.8273, 3.5139, 3.2042, 4.8717, 1.7582, 2.6782, 3.2042,
         4.8717, 1.8698],
        [3.4520, 3.7893, 3.1256, 2.5801, 3.1651, 3.1035, 1.7562, 2.3624, 3.1651,
         3.1035, 1.6636],
        [3.8273, 3.1256, 3.5989, 3.1061, 2.9752, 3.5271, 1.6664, 2.2745, 2.9752,
         3.5271, 1.7613],
        [3.5139, 2.5801, 3.1061, 3.8350, 3.0346, 3.4017, 2.0521, 2.6433, 3.0346,
         3.4017, 1.9008],
        [3.2042, 3.1651, 2.9752, 3.0346, 3.4748, 2.9322, 2.1740, 2.8109, 3.4748,
         2.9322, 1.8026],
        [4.8717, 3.1035, 3.5271, 3.4017, 2.9322, 5.2803, 1.8810, 2.5586, 2.9322,
         5.2803, 1.8957],
        [1.7582, 1.7562, 1.6664, 2.0521, 2.1740, 1.8810, 4.6621, 1.7375, 2.1740,
         1.8810, 3.1994],
        [2.6782, 2.3624, 2.2745, 2.6433, 2.8109, 2.5586, 1.7375, 3.0785, 2.8109,
         2.5586, 1.6104],
        [3.2042, 3.1651, 2.9752, 3.0346, 3.4748, 2.9322, 2.1740, 2.8109, 3.4748,
         2.9322, 1.8026],
        [4.8717, 3.1035, 3.5271, 3.40

In [22]:
att_out, att_weights = attn(X)

In [23]:
att_out

tensor([[-0.3306, -0.0888, -0.3374,  0.4868,  0.6488, -0.3561, -0.0859,  0.5052,
         -0.0991, -0.6442, -0.5368,  0.2956,  0.9112,  0.0546,  0.0191,  0.2036,
         -0.5131,  0.3797,  0.2747,  0.0123, -0.1460,  0.2397,  0.3909, -0.1393,
          0.1060,  0.0350, -0.3627, -0.0949,  0.0245,  0.4166,  0.1087,  0.0238,
          0.4243, -0.3764,  0.2428, -0.5267,  0.2206,  0.0780,  0.3299, -0.6120,
          0.2306, -0.1049,  0.5755,  0.2655,  0.4114, -0.2689,  0.2682, -0.4302,
          0.4807,  0.0388],
        [-0.3334, -0.0812, -0.3397,  0.4612,  0.6475, -0.3204, -0.0774,  0.4791,
         -0.1121, -0.6310, -0.5556,  0.2988,  0.8794,  0.0519,  0.0175,  0.1988,
         -0.5001,  0.3794,  0.2609,  0.0229, -0.1477,  0.2269,  0.3815, -0.1764,
          0.1138,  0.0248, -0.3662, -0.0695,  0.0180,  0.4271,  0.0552,  0.0331,
          0.4242, -0.3878,  0.2463, -0.4775,  0.2158,  0.0882,  0.3340, -0.6200,
          0.2451, -0.1033,  0.5640,  0.2509,  0.4107, -0.2838,  0.2623, -0.4267,


In [24]:
att_weights

tensor([[0.1313, 0.0840, 0.0856, 0.0813, 0.0715, 0.1210, 0.0629, 0.0854, 0.0715,
         0.1210, 0.0847],
        [0.1266, 0.0839, 0.0888, 0.0776, 0.0764, 0.1025, 0.0817, 0.0819, 0.0764,
         0.1025, 0.1017],
        [0.1206, 0.0877, 0.0910, 0.0830, 0.0796, 0.1063, 0.0706, 0.0880, 0.0796,
         0.1063, 0.0874],
        [0.1081, 0.0931, 0.0993, 0.0815, 0.0823, 0.0941, 0.0752, 0.0901, 0.0823,
         0.0941, 0.1000],
        [0.1138, 0.0870, 0.0929, 0.0750, 0.0808, 0.0963, 0.0855, 0.0853, 0.0808,
         0.0963, 0.1063],
        [0.1250, 0.0911, 0.0912, 0.0828, 0.0779, 0.1029, 0.0673, 0.0913, 0.0779,
         0.1029, 0.0897],
        [0.0893, 0.0892, 0.0975, 0.0705, 0.0805, 0.0972, 0.0975, 0.0855, 0.0805,
         0.0972, 0.1153],
        [0.1072, 0.0928, 0.0950, 0.0783, 0.0855, 0.0905, 0.0860, 0.0886, 0.0855,
         0.0905, 0.1002],
        [0.1138, 0.0870, 0.0929, 0.0750, 0.0808, 0.0963, 0.0855, 0.0853, 0.0808,
         0.0963, 0.1063],
        [0.1250, 0.0911, 0.0912, 0.08

## Multihead attention

In [25]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_model, nhead):
        super().__init__()
        self.nhead = nhead
        d_k = d_model // nhead
        self.attn_modules = nn.ModuleList(
            [Attention(d_model, d_k)
             for i in range(nhead)])
        self.WO = nn.Linear(d_model, d_model)

    def forward(self, X):
        attn_heads, attn_weights = zip(
            *[attn_module(X)
              for attn_module in self.attn_modules])
        attn_output = self.WO(torch.cat(attn_heads, dim=-1))
        attn_weights = torch.sum(torch.stack(attn_weights),
                                 dim=0)/self.nhead
        return attn_output, attn_weights

In [26]:
nhead = 5
multihead_attn = MultiheadAttention(d_model, nhead)

In [27]:
attn_output, attn_weights = multihead_attn(X)

In [28]:
attn_output.size()

torch.Size([11, 50])

In [29]:
attn_weights.size()

torch.Size([11, 11])

In [30]:
m_attn = MultiheadAttention(50, 5)

In [31]:
Y, W = m_attn(X)

In [32]:
Y

tensor([[ 2.4549e-01, -1.9047e-01, -2.8144e-02,  2.0581e-01, -4.3330e-02,
         -5.0632e-02, -3.2023e-01,  2.8404e-01, -2.6049e-01, -2.8724e-01,
         -1.1744e-01,  6.7870e-03, -2.3821e-01, -2.2360e-01, -9.2082e-02,
          3.1603e-02,  2.6027e-01,  4.6545e-02, -3.0289e-01,  3.7075e-01,
         -2.5228e-01,  1.2999e-01, -1.8872e-01, -3.7208e-01, -1.0846e-01,
         -1.3418e-01, -2.0951e-01,  1.0096e-02, -1.4147e-01,  2.6363e-01,
          1.2185e-01, -6.3973e-02,  1.1484e-01, -1.5159e-01, -3.7984e-01,
         -1.5144e-01,  4.8414e-01,  2.1841e-02, -1.3820e-02,  1.1447e-01,
          1.7203e-01,  4.4724e-01,  2.4810e-01, -9.6410e-02,  2.3623e-03,
          9.7365e-02, -3.2170e-01,  2.7338e-01,  5.1565e-02, -1.6691e-01],
        [ 2.4926e-01, -1.9225e-01, -2.6418e-02,  2.1094e-01, -4.4298e-02,
         -4.8967e-02, -3.2119e-01,  2.7404e-01, -2.6114e-01, -2.8801e-01,
         -1.1324e-01,  8.7281e-03, -2.3400e-01, -2.2796e-01, -9.6002e-02,
          3.2466e-02,  2.6129e-01,  4

## Residual networks

In [33]:
class LayerNormAttention(nn.Module):
    def __init__(self, d_model, nhead):
        super().__init__()
        self.multihead_attn = MultiheadAttention(d_model,
                                                 nhead)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, X):
        Y, _ = self.multihead_attn(X)
        return self.layer_norm(X + Y)

In [34]:
ln_m_attn = LayerNormAttention(d_model, nhead)

In [35]:
ln_m_attn(X).size()

torch.Size([11, 50])

## Encoder Layer

In [36]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, d_ff=2048):
        super().__init__()
        self.multihead_attn = MultiheadAttention(d_model,
                                                 nhead)
        self.layer_norm_1 = nn.LayerNorm(d_model)
        self.W1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.W2 = nn.Linear(d_ff, d_model)
        self.layer_norm_2 = nn.LayerNorm(d_model)

    def forward(self, X):
        Xprime, _ = self.multihead_attn(X)
        Xprime = self.layer_norm_1(X + Xprime)

        Y = self.W2(self.relu(self.W1(Xprime)))
        Y = self.layer_norm_2(Xprime + Y)
        return Y

In [37]:
encoder_layer = TransformerEncoderLayer(d_model, nhead)

In [38]:
encoder_layer(X)

tensor([[-7.3089e-02,  2.5520e-01,  6.4954e-03,  1.2981e-01,  9.9077e-01,
         -7.5645e-01, -6.3673e-01,  6.3995e-01, -1.5479e+00,  4.8526e-02,
          2.5185e-02,  1.2044e+00, -8.8348e-01, -6.6131e-01,  1.7443e+00,
          4.3147e-01,  6.8408e-02,  2.0372e-01, -1.5763e-01, -1.3056e+00,
         -1.2972e+00,  9.8497e-01,  3.5865e-01,  4.9210e-02,  1.3875e+00,
         -2.9637e+00, -1.4131e+00,  5.4301e-01,  9.1319e-01, -1.6412e+00,
          3.1332e+00,  6.8485e-01, -1.3305e+00, -2.3616e-01, -4.1715e-01,
         -1.6993e-01,  8.8412e-01,  6.5939e-01,  1.1769e+00, -5.2602e-01,
         -2.1721e-02,  4.1812e-01, -2.7573e-01,  8.2442e-01, -5.3325e-01,
         -2.3177e-01, -3.9330e-01, -8.3148e-01, -3.2501e-01,  8.6363e-01],
        [ 4.1153e-01,  5.2424e-04,  4.2679e-01,  4.1211e-01,  8.7183e-01,
         -2.1111e-01,  7.8107e-02,  5.5837e-01, -3.9419e-01, -2.4135e-01,
          7.8772e-01,  1.6126e+00, -2.1682e-02, -2.0127e-01,  1.1164e+00,
          1.1989e+00,  7.3802e-01, -4

## Encoder

In [39]:
encoder_layer = TransformerEncoderLayer(d_model, nhead)
num_layers = 6

In [40]:
class TransformerEncoder(nn.Module):
    def __init__(self,
                 encoder_layer,
                 num_layers):
        super().__init__()
        self.encoder_stack = nn.ModuleList(
            [copy.deepcopy(encoder_layer)
             for _ in range(num_layers)])

    def forward(self, x):
        for encoder_layer in self.encoder_stack:
            x = encoder_layer(x)
        return x

In [41]:
encoder = TransformerEncoder(encoder_layer, num_layers)

In [42]:
encoder(X)

tensor([[ 0.1512, -0.4909, -0.9940, -0.0142, -0.2966, -0.0546, -1.5457, -1.7515,
         -1.9970,  0.4956,  0.2440, -0.6338, -0.5659, -0.8266,  3.2173,  1.1496,
         -0.4854,  1.8441,  0.2029,  0.5534,  0.2027, -0.6110, -0.9370,  1.2790,
          0.7442, -0.2579, -1.5807,  0.7472,  0.4526, -0.0546,  0.2644,  0.8616,
         -0.0689, -0.7981,  0.4569, -1.0752, -1.3287,  0.3633, -1.1101, -0.4723,
         -0.3269, -0.6482,  0.1295,  1.2319,  0.5770, -0.5724,  0.9430,  1.7819,
          1.4293,  0.1754],
        [ 0.6247, -0.4164, -0.7768, -0.1048, -0.1677,  0.1786, -1.3297, -1.5761,
         -2.1742,  0.6637,  0.7286, -0.3268, -0.0210, -0.3153,  3.1936,  0.9595,
         -0.7934,  1.9464,  0.9727, -0.1118,  0.7027, -1.2556, -0.9468,  1.3324,
          0.4256, -0.0734, -1.2600,  0.9008, -0.2109, -0.1199,  0.3844,  0.2095,
         -0.8743, -0.8353,  0.5295, -0.7031, -0.6954,  0.2768, -1.2305, -0.7709,
         -0.6747, -1.1718,  0.3155,  0.9847,  0.7110, -0.5306,  0.6240,  2.0425,


## Input Embeddings

In [43]:
embeddings_words = embeddings_dict.keys()
print('Words in GloVe:',  len(embeddings_dict.keys()))

Words in GloVe: 400000


In [44]:
embeddings_dict

{'the': tensor([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
         -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
          2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
          1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
         -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
         -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
          4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
          7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
         -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
          1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01]),
 ',': tensor([ 0.0134,  0.2368, -0.1690,  0.4095,  0.6381,  0.4771, -0.4285, -0.5564,
         -0.3640, -0.2394,  0.1300, -0.0637, -0.3957, -0.4816,  0.2329,  0.0902,
         -0.1332,  0.0786, -0.4163, -0.1543,  0.1007,  0.4889,  0.3123, -0.1252,
   

The vocabulary consists of the GloVe words. We could add them the words in the corpus

In [45]:
vocabulary = embeddings_dict.keys()

In [46]:
idx2word = dict(enumerate(vocabulary, start=2))
word2idx = {word: idx for idx, word in idx2word.items()}

In [47]:
input_embeddings = torch.rand(
    (len(word2idx) + 2, d_model))/10 - 0.05  # range: -0.05, 0.05

In [48]:
for word in embeddings_dict:
    # If the words are in the embeddings, we fill them with a value
    input_embeddings[word2idx[word]] = embeddings_dict[word]

## Positional Encodings

In [49]:
def pos_encoding(idx, d_model):
    angles = [idx / (10000 ** (2 * j / d_model)) for j in range(d_model)]
    pe = [math.sin(angle) if i % 2 == 0
          else math.cos(angle) for i, angle in enumerate(angles)]
    return len(angles), pe

In [50]:
pos_encoding(4, 50)

(50,
 [-0.7568024953079283,
  -0.9307752026113622,
  0.9415062110105988,
  0.24378998857303116,
  0.7933832927126037,
  0.8056897785422777,
  0.42466449969116443,
  0.9543169293275685,
  0.20838457529672513,
  0.989472469606138,
  0.10030648729934574,
  0.9975850112785689,
  0.048072042985881594,
  0.9994465862752576,
  0.023015565055894787,
  0.999873211223926,
  0.011016691956451144,
  0.9999709538962321,
  0.005273002518337972,
  0.9999933458972107,
  0.0025238266985760983,
  0.9999984756318129,
  0.001207980394376433,
  0.9999996507873545,
  0.0005781758760855534,
  0.999999920000001,
  0.00027673238483550924,
  0.9999999816730588,
  0.00013245244820575305,
  0.9999999958015403,
  6.33957276559798e-05,
  0.9999999990381885,
  3.0343102996511175e-05,
  0.9999999997796617,
  1.45231221902935e-05,
  0.9999999999495234,
  6.951203314941526e-06,
  0.9999999999884365,
  3.327055084404545e-06,
  0.9999999999973509,
  1.5924286822133147e-06,
  0.9999999999993932,
  7.621842871852255e-07,
 

In [51]:
"""def pos_encoding_1(seq_len, d_model):
    angles = torch.zeros((seq_len, d_model))
    for i in range(seq_len):
        for j in range(0, d_model, 2):
            angle = torch.tensor(i / (10000.0 ** (j / d_model)))
            angles[i, j] = torch.sin(angle)
            angles[i, j + 1] = torch.cos(angle)
    return angles"""

'def pos_encoding_1(seq_len, d_model):\n    angles = torch.zeros((seq_len, d_model))\n    for i in range(seq_len):\n        for j in range(0, d_model, 2):\n            angle = torch.tensor(i / (10000.0 ** (j / d_model)))\n            angles[i, j] = torch.sin(angle)\n            angles[i, j + 1] = torch.cos(angle)\n    return angles'

In [52]:
def pos_encoding(max_len, d_model):
    dividend = torch.arange(max_len).unsqueeze(0).T
    divisor = torch.pow(10000.0,
                        torch.arange(0, d_model, 2)/d_model)
    angles = dividend / divisor
    pe = torch.zeros((max_len, d_model))
    pe[:, 0::2] = torch.sin(angles)
    pe[:, 1::2] = torch.cos(angles)
    return pe

In [53]:
pos_embeddings = pos_encoding(100, d_model)

In [54]:
pos_embeddings

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  6.3795e-01,  ...,  1.0000e+00,
          1.4454e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.8254e-01,  ...,  1.0000e+00,
          2.8909e-04,  1.0000e+00],
        ...,
        [ 3.7961e-01, -9.2515e-01, -9.0618e-01,  ...,  9.9979e-01,
          1.4020e-02,  9.9990e-01],
        [-5.7338e-01, -8.1929e-01, -9.6762e-01,  ...,  9.9979e-01,
          1.4165e-02,  9.9990e-01],
        [-9.9921e-01,  3.9821e-02, -5.8410e-01,  ...,  9.9979e-01,
          1.4309e-02,  9.9990e-01]])

In [55]:
class Embedding(nn.Module):
    def __init__(self,
                 vocab_size,
                 d_model,
                 dropout=0.1,
                 max_len=500):
        super().__init__()
        self.d_model = d_model
        self.input_embedding = nn.Embedding(vocab_size, d_model)
        pe = self.pos_encoding(max_len, d_model)
        self.pos_embedding = nn.Embedding.from_pretrained(
            pe, freeze=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        pos_mat = torch.arange(X.size(dim=-1))
        X = self.input_embedding(X) * math.sqrt(self.d_model)
        X += self.pos_embedding(pos_mat)
        return self.dropout(X)

    def pos_encoding(self, max_len, d_model):
        dividend = torch.arange(max_len).unsqueeze(0).T
        divisor = torch.pow(10000.0,
                            torch.arange(0, d_model, 2)/d_model)
        angles = dividend / divisor
        pe = torch.zeros((max_len, d_model))
        pe[:, 0::2] = torch.sin(angles)
        pe[:, 1::2] = torch.cos(angles)
        return pe

In [56]:
vocab_size = len(word2idx) + 2
vocab_size

400002

In [57]:
embedding = Embedding(vocab_size, d_model)

In [58]:
embedding.input_embedding.weight

Parameter containing:
tensor([[-1.6290, -1.2876,  0.1961,  ...,  0.1933, -0.6194,  0.3377],
        [-0.4177, -0.2188,  0.0452,  ..., -0.4398,  0.4522,  0.8032],
        [-0.4531, -0.6725,  0.0741,  ..., -0.8321,  0.0377, -1.6589],
        ...,
        [-1.4726, -0.2718, -0.3032,  ..., -0.0696, -0.4534,  0.1071],
        [-1.0955, -1.5847,  2.4183,  ...,  1.0804,  0.2781,  0.8761],
        [ 0.0591,  0.9700, -0.0890,  ..., -1.8721, -0.3102, -0.6556]],
       requires_grad=True)

In [59]:
embedding.input_embedding.weight = nn.Parameter(input_embeddings)

In [60]:
input_embeddings

tensor([[ 3.5530e-02, -3.1094e-03,  4.3497e-02,  ..., -1.3143e-02,
         -4.4641e-03,  1.2548e-02],
        [ 4.8429e-02,  3.2579e-02, -3.0573e-05,  ...,  4.6322e-02,
          3.7171e-02,  3.0670e-02],
        [ 4.1800e-01,  2.4968e-01, -4.1242e-01,  ..., -1.8411e-01,
         -1.1514e-01, -7.8581e-01],
        ...,
        [-5.1181e-01,  5.8706e-02,  1.0913e+00,  ..., -2.5003e-01,
         -1.1250e+00,  1.5863e+00],
        [-7.5898e-01, -4.7426e-01,  4.7370e-01,  ...,  7.8954e-01,
         -1.4116e-02,  6.4480e-01],
        [ 7.2617e-02, -5.1393e-01,  4.7280e-01,  ..., -1.8907e-01,
         -5.9021e-01,  5.5559e-01]])

In [61]:
embedding.input_embedding.weight

Parameter containing:
tensor([[ 3.5530e-02, -3.1094e-03,  4.3497e-02,  ..., -1.3143e-02,
         -4.4641e-03,  1.2548e-02],
        [ 4.8429e-02,  3.2579e-02, -3.0573e-05,  ...,  4.6322e-02,
          3.7171e-02,  3.0670e-02],
        [ 4.1800e-01,  2.4968e-01, -4.1242e-01,  ..., -1.8411e-01,
         -1.1514e-01, -7.8581e-01],
        ...,
        [-5.1181e-01,  5.8706e-02,  1.0913e+00,  ..., -2.5003e-01,
         -1.1250e+00,  1.5863e+00],
        [-7.5898e-01, -4.7426e-01,  4.7370e-01,  ...,  7.8954e-01,
         -1.4116e-02,  6.4480e-01],
        [ 7.2617e-02, -5.1393e-01,  4.7280e-01,  ..., -1.8907e-01,
         -5.9021e-01,  5.5559e-01]], requires_grad=True)

In [62]:
words_o

['i', 'must', 'go', 'back', 'to', 'my', 'ship', 'and', 'to', 'my', 'crew']

In [63]:
x = torch.LongTensor(
    list(map(lambda x: word2idx.get(x, 1), words_o)))

In [64]:
x

tensor([  43,  392,  244,  139,    6,  194, 1372,    7,    6,  194, 1696])

In [65]:
x.size()

torch.Size([11])

In [66]:
embeddings_dict['must']

tensor([ 0.4777, -0.1224,  0.2948, -0.5275,  0.6916, -0.1085,  0.2046,  0.2980,
        -0.0259, -0.2316,  0.4990,  0.7548, -0.1568,  0.0267,  0.6668,  0.9994,
         0.7260, -0.2042,  0.7568, -0.9753, -0.1006, -0.1912,  0.3810,  0.2040,
         0.1715, -1.8234, -0.1860, -0.4276,  0.3640, -0.5013,  3.5515,  0.6970,
        -1.3702, -0.6317, -0.0866, -0.1688,  0.4567, -0.0492, -0.1802, -0.4342,
        -0.2246, -0.4825,  0.5571,  0.4735, -0.2184,  0.1794, -0.2613,  0.3251,
        -0.1078,  0.1224])

In [67]:
pos_embeddings[1]

tensor([8.4147e-01, 5.4030e-01, 6.3795e-01, 7.7008e-01, 4.6056e-01, 8.8763e-01,
        3.2511e-01, 9.4568e-01, 2.2709e-01, 9.7387e-01, 1.5783e-01, 9.8747e-01,
        1.0943e-01, 9.9399e-01, 7.5785e-02, 9.9712e-01, 5.2457e-02, 9.9862e-01,
        3.6300e-02, 9.9934e-01, 2.5116e-02, 9.9968e-01, 1.7377e-02, 9.9985e-01,
        1.2022e-02, 9.9993e-01, 8.3175e-03, 9.9997e-01, 5.7544e-03, 9.9998e-01,
        3.9811e-03, 9.9999e-01, 2.7542e-03, 1.0000e+00, 1.9055e-03, 1.0000e+00,
        1.3183e-03, 1.0000e+00, 9.1201e-04, 1.0000e+00, 6.3096e-04, 1.0000e+00,
        4.3652e-04, 1.0000e+00, 3.0200e-04, 1.0000e+00, 2.0893e-04, 1.0000e+00,
        1.4454e-04, 1.0000e+00])

In [68]:
embedding.eval()

Embedding(
  (input_embedding): Embedding(400002, 50)
  (pos_embedding): Embedding(500, 50)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [69]:
X = embedding(x)

In [70]:
X.size()

torch.Size([11, 50])

In [71]:
X

tensor([[ 8.4082e-01,  2.0787e+00, -5.8034e-01, -4.2428e+00,  5.3681e+00,
         -2.4173e+00, -2.1927e+00,  4.6399e+00, -6.9797e+00,  1.0044e+00,
         -1.0637e+00,  6.9234e+00, -7.6346e+00, -2.6388e+00,  9.3253e+00,
          5.3846e+00,  9.7432e-01,  4.3310e+00, -5.1530e-01, -4.1389e+00,
         -5.2408e+00,  6.3219e+00,  6.2353e+00,  3.0903e+00,  9.5799e+00,
         -1.7173e+01, -9.5622e+00,  4.2442e+00,  7.1192e+00, -7.3835e+00,
          2.4563e+01,  6.5082e+00, -5.1569e+00,  2.7750e+00, -1.8495e+00,
         -1.4525e+00,  3.9486e+00,  6.3102e+00,  3.5235e+00, -8.9667e-01,
         -1.9405e-02,  8.7061e-01, -1.9867e+00,  4.9116e+00,  2.6662e-01,
          2.3120e+00, -1.0624e+00, -3.0667e+00, -1.8859e+00,  7.5139e+00],
        [ 4.2192e+00, -3.2534e-01,  2.7222e+00, -2.9600e+00,  5.3506e+00,
          1.2035e-01,  1.7722e+00,  3.0527e+00,  4.3778e-02, -6.6400e-01,
          3.6866e+00,  6.3248e+00, -9.9917e-01,  1.1828e+00,  4.7909e+00,
          8.0642e+00,  5.1858e+00, -4

In [72]:
Y = encoder(X)

In [73]:
Y.size()

torch.Size([11, 50])

## Extensions

In [74]:
sentences = [words_o] + [words_a]
sentences

[['i', 'must', 'go', 'back', 'to', 'my', 'ship', 'and', 'to', 'my', 'crew'],
 ['we', 'process', 'and', 'ship', 'your', 'order']]

In [75]:
sent_idx = []
for sent in sentences:
    sent_idx += [torch.LongTensor(
        list(map(lambda x: word2idx.get(x, 1),
                 sent)))]
sent_idx

[tensor([  43,  392,  244,  139,    6,  194, 1372,    7,    6,  194, 1696]),
 tensor([  55,  548,    7, 1372,  394,  462])]

In [76]:
from torch.nn.utils.rnn import pad_sequence
X_idx = pad_sequence(sent_idx, batch_first=True)

In [77]:
X_idx

tensor([[  43,  392,  244,  139,    6,  194, 1372,    7,    6,  194, 1696],
        [  55,  548,    7, 1372,  394,  462,    0,    0,    0,    0,    0]])

In [78]:
X_batch = embedding(X_idx)

In [79]:
X_batch.size()

torch.Size([2, 11, 50])

In [80]:
torch.bmm(X_batch,
          torch.transpose(X_batch, 1, 2))

tensor([[[ 1.9776e+03,  1.2268e+03,  1.3743e+03,  1.2414e+03,  1.1272e+03,
           1.7483e+03,  6.1404e+02,  9.4549e+02,  1.1061e+03,  1.7224e+03,
           6.5901e+02],
         [ 1.2268e+03,  1.3440e+03,  1.1234e+03,  9.0696e+02,  1.1117e+03,
           1.1106e+03,  6.1380e+02,  8.2709e+02,  1.0992e+03,  1.0989e+03,
           5.9510e+02],
         [ 1.3743e+03,  1.1234e+03,  1.2943e+03,  1.0968e+03,  1.0492e+03,
           1.2631e+03,  5.8847e+02,  7.9584e+02,  1.0380e+03,  1.2535e+03,
           6.3917e+02],
         [ 1.2414e+03,  9.0696e+02,  1.0968e+03,  1.3372e+03,  1.0484e+03,
           1.1991e+03,  6.9111e+02,  8.9329e+02,  1.0291e+03,  1.1896e+03,
           6.6301e+02],
         [ 1.1272e+03,  1.1117e+03,  1.0492e+03,  1.0484e+03,  1.1913e+03,
           1.0327e+03,  7.2530e+02,  9.5604e+02,  1.1856e+03,  1.0304e+03,
           6.1298e+02],
         [ 1.7483e+03,  1.1106e+03,  1.2631e+03,  1.1991e+03,  1.0327e+03,
           1.9106e+03,  6.6007e+02,  8.9511e+02,  1.001

This will crash as the encoder only accepts one sample

In [81]:
encoder(X_batch)

  attn_weights = F.softmax(Q @ K.T/math.sqrt(d_k), dim=-1)


RuntimeError: The size of tensor a (2) must match the size of tensor b (10) at non-singleton dimension 0

In [None]:
X_batch.size()

In [82]:
torch.tensordot(X_batch, torch.transpose(X_batch, 0, 1), dims=([1, 0], [0, 1]))

tensor([[428.5537, -38.3869,  93.5249,  ...,  18.9594,  39.1096, -55.7387],
        [-38.3869, 115.1343,  -8.8314,  ..., -90.8249,  25.3763,  62.6030],
        [ 93.5249,  -8.8314, 130.6980,  ..., -14.1816,  15.9267,   5.8828],
        ...,
        [ 18.9594, -90.8249, -14.1816,  ..., 190.1146, -14.5547,  29.0616],
        [ 39.1096,  25.3763,  15.9267,  ..., -14.5547,  51.2870, -20.9593],
        [-55.7387,  62.6030,   5.8828,  ...,  29.0616, -20.9593, 204.9189]],
       grad_fn=<ViewBackward0>)

In [83]:
X_batch @ X_batch.T

RuntimeError: The size of tensor a (2) must match the size of tensor b (50) at non-singleton dimension 0

## PyTorch Modules

### Multihead Attention

In [84]:
X_batch.size()

torch.Size([2, 11, 50])

In [85]:
padding_mask = (X_idx == 0)

In [86]:
padding_mask

tensor([[False, False, False, False, False, False, False, False, False, False,
         False],
        [False, False, False, False, False, False,  True,  True,  True,  True,
          True]])

In [87]:
multihead_attn = nn.MultiheadAttention(d_model,
                                       nhead,
                                       batch_first=True)

In [88]:
attn_output, attn_weights = multihead_attn(
    X_batch, X_batch, X_batch, key_padding_mask=padding_mask)

In [89]:
attn_output.size()

torch.Size([2, 11, 50])

In [90]:
attn_weights.size()

torch.Size([2, 11, 11])

In [91]:
multihead_attn.state_dict()

OrderedDict([('in_proj_weight',
              tensor([[ 0.1450, -0.1468,  0.0806,  ..., -0.1390,  0.1294,  0.1032],
                      [-0.1559,  0.0665,  0.1200,  ...,  0.1385,  0.0065, -0.0456],
                      [-0.0267, -0.0884, -0.0490,  ...,  0.0826,  0.0405, -0.0747],
                      ...,
                      [ 0.1246, -0.1507,  0.0068,  ...,  0.0201,  0.0908,  0.0814],
                      [ 0.0216,  0.1189, -0.1191,  ...,  0.1158,  0.0361,  0.0989],
                      [-0.1304, -0.0246,  0.0171,  ..., -0.0597,  0.1691, -0.0168]])),
             ('in_proj_bias',
              tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 

### Encoders

In [92]:
encoder_layer = nn.TransformerEncoderLayer(d_model,
                                           nhead,
                                           batch_first=True)

In [93]:
encoder = nn.TransformerEncoder(encoder_layer, num_layers)



In [94]:
enc_layer_output = encoder_layer(
    X_batch, src_key_padding_mask=padding_mask)

In [95]:
enc_layer_output.size()

torch.Size([2, 11, 50])

In [96]:
enc_output = encoder(X_batch,
                     src_key_padding_mask=padding_mask)

In [97]:
enc_output

tensor([[[-1.8875, -0.3571, -0.1857,  ..., -0.8490, -1.6112,  0.2971],
         [-1.7004, -0.6486,  0.3490,  ..., -0.6071, -1.6858,  0.2812],
         [-2.0071, -0.3815,  0.4349,  ..., -0.8521, -2.1722,  0.0641],
         ...,
         [-1.1857, -0.7961,  0.4638,  ..., -0.9840, -1.7067,  0.4173],
         [-1.4988, -0.7805,  0.0284,  ..., -1.2267, -1.5860,  0.0933],
         [-1.5802, -0.5559,  1.0616,  ..., -0.6798, -1.5032,  0.5519]],

        [[-2.1530, -0.3022, -0.0246,  ..., -0.3549, -2.4595, -0.0732],
         [-1.8853, -1.1744, -0.5502,  ..., -0.5737, -1.9781,  0.2031],
         [-1.6822, -0.9418,  0.1530,  ..., -0.3555, -2.1787, -0.5293],
         ...,
         [-2.4170, -0.7436,  0.5290,  ..., -1.1788, -1.5404,  0.9340],
         [-1.3408, -1.0960,  0.7618,  ..., -0.7587, -2.7265,  0.9177],
         [-1.8978, -0.7731,  0.7942,  ..., -1.2923, -2.2662,  0.8332]]],
       grad_fn=<NativeLayerNormBackward0>)