<a href="https://colab.research.google.com/github/JackWittmayer/Transformer-Implementation/blob/main/EDTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tokenizers



In [2]:
import re
import string
import os
import pickle
from unicodedata import normalize
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
from torch.nn.functional import log_softmax, pad

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

import random
import time

import numpy as np
import math
import matplotlib.pyplot as plt

import sys
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

In [3]:
torch.manual_seed(25)
random.seed(25)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [4]:
SAMPLE_X = torch.tensor([[3, 2, 0, 1], [1, 2, 3, 0]], dtype=torch.int32).to(device)
SAMPLE_Z = torch.tensor([4, 1, 7, 6], dtype=torch.int32).to(device)

In [5]:
def printIfVerbose(verbose, tag, value):
    if verbose:
        print(tag, value)

In [6]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.table = nn.Embedding(vocab_size, embedding_size).to(device)

    def forward(self, sequence):
        embeddings = self.table(sequence)
        return embeddings

In [7]:
def test_embedding():
    torch.manual_seed(25)
    vocab_size = 4
    embedding = Embedding(vocab_size, 4)
    print("weight:", embedding.table.weight)
    print("SAMPLE_X: ", SAMPLE_X)
    output = embedding(SAMPLE_X)
    print("output:", output)
    for j in range(len(output)):
        #print("sample:", sample)
        for i in range(vocab_size):
            assert output[j, i, :].eq(embedding.table.weight[SAMPLE_X[j, i]]).all()
test_embedding()

weight: Parameter containing:
tensor([[ 0.0877, -0.6113,  0.3441, -1.2916],
        [-0.5874,  0.8060,  1.3200,  0.4826],
        [ 1.6671, -0.2342,  0.1074,  1.7852],
        [ 0.7874, -0.2466,  0.2384, -0.6746]], device='cuda:0',
       requires_grad=True)
SAMPLE_X:  tensor([[3, 2, 0, 1],
        [1, 2, 3, 0]], device='cuda:0', dtype=torch.int32)
output: tensor([[[ 0.7874, -0.2466,  0.2384, -0.6746],
         [ 1.6671, -0.2342,  0.1074,  1.7852],
         [ 0.0877, -0.6113,  0.3441, -1.2916],
         [-0.5874,  0.8060,  1.3200,  0.4826]],

        [[-0.5874,  0.8060,  1.3200,  0.4826],
         [ 1.6671, -0.2342,  0.1074,  1.7852],
         [ 0.7874, -0.2466,  0.2384, -0.6746],
         [ 0.0877, -0.6113,  0.3441, -1.2916]]], device='cuda:0',
       grad_fn=<EmbeddingBackward0>)


In [8]:
class Unembedding(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.weight = nn.Parameter(torch.rand(embedding_size, vocab_size)).to(device)

    def forward(self, x):
        return torch.matmul(x, self.weight)

In [9]:
def test_unembedding():
    torch.manual_seed(25)
    vocab_size = 10
    embedding_size = 4
    sequence_length = 4
    batch_size = 2
    input = torch.rand(batch_size, sequence_length, embedding_size).to(device)
    unembedding = Unembedding(vocab_size, embedding_size)

    print("weight:", unembedding.weight)
    print("input: ", input)
    output = unembedding(input)
    print("output:", output)
    assert output.shape == (batch_size, sequence_length, vocab_size)
test_unembedding()

weight: tensor([[0.4691, 0.6875, 0.9917, 0.2772, 0.7970, 0.2249, 0.1119, 0.6863, 0.2238,
         0.2678],
        [0.2246, 0.4711, 0.0603, 0.2517, 0.3705, 0.7340, 0.6466, 0.5172, 0.1176,
         0.7000],
        [0.8191, 0.0488, 0.3021, 0.2490, 0.7769, 0.7847, 0.8554, 0.8310, 0.1154,
         0.2578],
        [0.4702, 0.0530, 0.4207, 0.7639, 0.7536, 0.6063, 0.1899, 0.2837, 0.6097,
         0.5808]], device='cuda:0', grad_fn=<ToCopyBackward0>)
input:  tensor([[[0.7518, 0.1929, 0.0629, 0.9118],
         [0.3828, 0.2990, 0.5933, 0.2911],
         [0.2416, 0.5582, 0.0481, 0.3497],
         [0.3520, 0.9528, 0.0284, 0.8488]],

        [[0.3947, 0.5181, 0.9726, 0.8813],
         [0.0056, 0.3056, 0.9384, 0.7949],
         [0.4399, 0.1766, 0.8739, 0.1425],
         [0.4682, 0.6254, 0.3040, 0.7923]]], device='cuda:0')
output: tensor([[[0.8762, 0.6591, 1.1598, 0.9691, 1.4066, 0.9128, 0.4358, 0.9266,
          0.7540, 0.8821],
         [0.8696, 0.4484, 0.6993, 0.5514, 1.0961, 0.9476, 0.7989, 0.9

In [10]:
class PositionalEmbedding(nn.Module):
    def __init__(self, embedding_size, max_sequence_length):
        super().__init__()
        self.table = nn.Embedding(max_sequence_length, embedding_size).to(device)

    def forward(self, sequence):
        positions = torch.zeros(sequence.shape, dtype=torch.int32)
        positions[:, ::] = torch.arange(0, sequence.shape[-1])
        #print("positions", positions)
        positional_embeddings = self.table(positions.to(device))
        return positional_embeddings

In [11]:
def test_positional_embedding():
    embedding_size = 8
    max_sequence_length = 10
    batch_size = 2
    positional_embedding = PositionalEmbedding(embedding_size, max_sequence_length)
    output = positional_embedding(SAMPLE_X)
    print("output:", output)
    assert output.shape == (batch_size, SAMPLE_X.shape[-1], embedding_size)
test_positional_embedding()

output: tensor([[[-0.0079, -0.6091,  1.5286,  1.9735,  0.1646,  0.5387,  0.5112,
           0.8526],
         [-0.6024, -1.1570,  0.9000,  0.5598,  0.2992, -2.0385,  1.9378,
          -0.1953],
         [-0.2086,  0.0196, -0.0843, -1.2005,  1.1399,  1.2420,  0.1124,
          -0.0296],
         [-0.7684,  0.3472,  0.4499, -0.3574, -0.8319,  0.6517,  0.5965,
          -1.3327]],

        [[-0.0079, -0.6091,  1.5286,  1.9735,  0.1646,  0.5387,  0.5112,
           0.8526],
         [-0.6024, -1.1570,  0.9000,  0.5598,  0.2992, -2.0385,  1.9378,
          -0.1953],
         [-0.2086,  0.0196, -0.0843, -1.2005,  1.1399,  1.2420,  0.1124,
          -0.0296],
         [-0.7684,  0.3472,  0.4499, -0.3574, -0.8319,  0.6517,  0.5965,
          -1.3327]]], device='cuda:0', grad_fn=<EmbeddingBackward0>)


In [12]:
def attention(queries, keys, values, mask, verbose):
    printIfVerbose(verbose, "queries:", queries)
    printIfVerbose(verbose, "keys:", keys)
    printIfVerbose(verbose, "values:", values)
    keys_transposed = torch.transpose(keys, -2, -1)
    printIfVerbose(verbose, "keys_transposed:", keys_transposed)
    scores = torch.matmul(queries, keys_transposed)
    #assert scores.shape == (keys.shape[0], keys.shape[-1], queries.shape[-1])
    printIfVerbose(verbose, "scores:", scores)
    printIfVerbose(verbose, "scores:", scores.shape)
    printIfVerbose(verbose, "masks:", mask.shape)
    scores = scores.masked_fill(mask == 0, -1e9)
    printIfVerbose(verbose, "masked scores:", scores)
    d_attn = keys.shape[-1]
    scaled_scores = scores / math.sqrt(d_attn)
    printIfVerbose(verbose, "scaled_scores:", scaled_scores)
    softmax_scores = torch.softmax(scaled_scores, -1)
    printIfVerbose(verbose, "softmax_scores:", softmax_scores)
    printIfVerbose(verbose, "softmax_socres shape:", softmax_scores.shape)
    printIfVerbose(verbose, "values:", values)
    v_out = torch.matmul(softmax_scores, values)
    return v_out

In [13]:
def test_attention():
    d_attn = 4
    length_x = 4
    length_z = 3
    batch_size = 2
    d_out = 2

    queries = torch.rand(batch_size, length_x, d_attn)
    keys = torch.rand(batch_size, length_z, d_attn)
    values = torch.rand(batch_size, length_z, d_out)
    mask = torch.tril(torch.ones(length_x, length_z) == 1)
    padding_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]], dtype=torch.int32)

    v_out = attention(queries, keys, values, mask, True)
    #print("output:", v_out)
    assert v_out.shape == (batch_size, length_x, d_out)
test_attention()

queries: tensor([[[0.2746, 0.1760, 0.3505, 0.9246],
         [0.8537, 0.5464, 0.9339, 0.0768],
         [0.0565, 0.3594, 0.4961, 0.6278],
         [0.3572, 0.5220, 0.1997, 0.5286]],

        [[0.4723, 0.0238, 0.1838, 0.2010],
         [0.1765, 0.8587, 0.7776, 0.1199],
         [0.8638, 0.1066, 0.1084, 0.8448],
         [0.7043, 0.9275, 0.3953, 0.2704]]])
keys: tensor([[[0.6228, 0.6078, 0.7686, 0.3296],
         [0.4959, 0.0065, 0.9125, 0.8358],
         [0.6698, 0.4129, 0.0129, 0.5052]],

        [[0.5967, 0.3134, 0.1648, 0.4834],
         [0.2368, 0.7654, 0.9255, 0.3393],
         [0.5612, 0.0953, 0.5582, 0.5739]]])
values: tensor([[[0.5244, 0.6292],
         [0.7426, 0.3134],
         [0.7793, 0.9385]],

        [[0.1588, 0.3427],
         [0.3863, 0.2306],
         [0.1533, 0.0876]]])
keys_transposed: tensor([[[0.6228, 0.4959, 0.6698],
         [0.6078, 0.0065, 0.4129],
         [0.7686, 0.9125, 0.0129],
         [0.3296, 0.8358, 0.5052]],

        [[0.5967, 0.2368, 0.5612],
       

In [14]:
from enum import Enum
class MaskStrategy(Enum):
    UNMASKED = 1
    MASKED = 2

In [73]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, maskStrategy, verbose):
        super().__init__()
        self.verbose = verbose
        self.num_heads = num_heads
        self.d_attn = d_attn
        self.d_x = d_x
        self.d_z = d_z
        self.d_out = d_out
        self.d_mid = d_mid
        self.maskStrategy = maskStrategy
        self.weight_query = nn.Parameter(torch.rand(num_heads, d_x, d_attn))
        self.weight_key = nn.Parameter(torch.rand(num_heads, d_z, d_attn))
        self.weight_value = nn.Parameter(torch.rand(num_heads, d_z, d_mid))
        self.weight_out = nn.Parameter(torch.rand(d_mid * num_heads, d_out))
        self.bias_query = nn.Parameter(torch.zeros(num_heads, d_attn))
        self.bias_key = nn.Parameter(torch.zeros(num_heads, d_attn))
        self.bias_value = nn.Parameter(torch.zeros(num_heads, d_mid))
        self.bias_out = nn.Parameter(torch.zeros(d_out))

    def forward(self, z, x, padding_mask):
        length_z = z.shape[-2]
        length_x = x.shape[-2]
        batch_size = x.shape[0]

        queries = torch.matmul(x.unsqueeze(1), self.weight_query) + self.bias_query[None, :, None, :]
        keys = torch.matmul(z.unsqueeze(1), self.weight_key) + self.bias_key[None, :, None, :]
        values = torch.matmul(z.unsqueeze(1), self.weight_value) + self.bias_value[None, :, None, :]

        assert queries.shape == (batch_size, self.num_heads, length_x, self.d_attn)
        assert keys.shape == (batch_size, self.num_heads, length_z, self.d_attn)
        assert values.shape == (batch_size, self.num_heads, length_z, self.d_mid)

        if self.maskStrategy == MaskStrategy['UNMASKED']:
            mask = padding_mask.unsqueeze(-2)
        elif self.maskStrategy == MaskStrategy['MASKED']:
            padding_mask = padding_mask.unsqueeze(-2)
            mask = torch.tril(torch.ones(length_x, length_z) == 1).to(device)
            printIfVerbose(self.verbose, "padding mask:", padding_mask.shape)
            printIfVerbose(self.verbose, "mask tril", mask)
            mask = mask & padding_mask
            printIfVerbose(self.verbose, "merged mask:", mask)
        mask = mask.unsqueeze(1)
        printIfVerbose(self.verbose, "mask", mask)
        printIfVerbose(self.verbose, "mask", mask.shape)
        v_out = attention(queries, keys, values, mask, self.verbose)
        printIfVerbose(self.verbose, "v_out shape", v_out.shape)
        assert v_out.shape == (batch_size, self.num_heads, length_x, self.d_mid)
        printIfVerbose(self.verbose, "v_out:", v_out)
        printIfVerbose(self.verbose, "v_out shape before:", v_out.shape)
        v_out = v_out.reshape(batch_size, v_out.shape[-2], -1)
        printIfVerbose(self.verbose, "v_out shape:", v_out.shape)
        printIfVerbose(self.verbose, "weight_out shape:", self.weight_out.shape)
        printIfVerbose(self.verbose, "v_out reshaped:", v_out)
        output = torch.matmul(v_out, self.weight_out) + self.bias_out
        printIfVerbose(self.verbose, "output shape", output.shape)
        assert output.shape == (batch_size, length_x, self.d_out)
        return output




In [53]:
def test_multi_headed_attention_encoder_fixed():
    num_heads = 1
    d_attn = 3
    d_x = 4
    d_z = 4
    d_out = 1
    d_mid = 3
    length_z = 3
    batch_size = 1
    padding_mask = torch.tensor([[1, 1, 0]], dtype=torch.int32).to(device)

    multi_headed_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'], True).to(device)
    multi_headed_attention.weight_query = nn.Parameter(torch.tensor([[[1, 0, 1], [1, 0, 0], [0, 0, 1], [0, 1, 1]]], dtype = torch.float32).to(device))
    multi_headed_attention.weight_key = nn.Parameter(torch.tensor([[[0, 0, 1], [1, 1, 0], [0, 1, 0], [1, 1, 0]]], dtype = torch.float32).to(device))
    multi_headed_attention.weight_value = nn.Parameter(torch.tensor([[[0, 2, 0], [0, 3, 0], [1, 0, 3], [1, 1, 0]]], dtype = torch.float32).to(device))
    multi_headed_attention.weight_out = nn.Parameter(torch.tensor([[1], [0], [1]], dtype = torch.float32).to(device))
    z = torch.tensor([[[1, 0, 1, 0], [0, 2, 0, 2], [1, 1, 1, 1]]], dtype=torch.float32).to(device)
    #print("z:", z
    output = multi_headed_attention(z, z, padding_mask)
    #print("output:", output)
    assert output.shape == (batch_size, length_z, d_out)
test_multi_headed_attention_encoder_fixed()

mask tensor([[[[1, 1, 0]]]], device='cuda:0', dtype=torch.int32)
mask torch.Size([1, 1, 1, 3])
queries: tensor([[[[1.0334, 0.4025, 2.4895],
          [2.0334, 2.4025, 2.4895],
          [2.0334, 1.4025, 3.4895]]]], device='cuda:0', grad_fn=<AddBackward0>)
keys: tensor([[[[0.7353, 1.7822, 1.5923],
          [4.7353, 4.7822, 0.5923],
          [2.7353, 3.7822, 1.5923]]]], device='cuda:0', grad_fn=<AddBackward0>)
values: tensor([[[[1.8523, 2.2521, 3.5369],
          [2.8523, 8.2521, 0.5369],
          [2.8523, 6.2521, 3.5369]]]], device='cuda:0', grad_fn=<AddBackward0>)
keys_transposed: tensor([[[[0.7353, 4.7353, 2.7353],
          [1.7822, 4.7822, 3.7822],
          [1.5923, 0.5923, 1.5923]]]], device='cuda:0',
       grad_fn=<TransposeBackward0>)
scores: tensor([[[[ 5.4412,  8.2928,  8.3130],
          [ 9.7409, 22.5925, 18.6127],
          [ 9.5510, 18.4026, 16.4228]]]], device='cuda:0',
       grad_fn=<UnsafeViewBackward0>)
scores: torch.Size([1, 1, 3, 3])
masks: torch.Size([1, 1, 1, 

In [54]:
def test_multi_headed_attention_encoder():
    num_heads = 3
    d_attn = 3
    d_x = 4
    d_z = 4
    d_out = 1
    d_mid = 3
    length_z = 3
    batch_size = 3
    padding_mask = torch.tensor([[1, 1, 0], [1, 1, 0], [1, 1, 1]], dtype=torch.int32).to(device)

    multi_headed_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'], True).to(device)
    z = torch.tensor([[[1, 0, 1, 0], [0, 2, 0, 2], [1, 1, 1, 1]],
                      [[1, 0, 1, 0], [0, 2, 0, 2], [1, 1, 1, 1]],
                      [[1, 0, 1, 0], [0, 2, 0, 2], [1, 1, 1, 1]]], dtype=torch.float32).to(device)
    #print("z:", z
    output = multi_headed_attention(z, z, padding_mask)
    #print("output:", output)
    assert output.shape == (batch_size, length_z, d_out)
test_multi_headed_attention_encoder()

mask tensor([[[[1, 1, 0]]],


        [[[1, 1, 0]]],


        [[[1, 1, 1]]]], device='cuda:0', dtype=torch.int32)
mask torch.Size([3, 1, 1, 3])
queries: tensor([[[[1.6190, 0.6863, 2.1165],
          [3.4282, 1.5750, 3.2955],
          [3.1582, 1.4306, 3.3010]],

         [[2.1277, 1.7037, 1.2801],
          [2.8481, 2.1833, 2.1802],
          [3.1013, 2.4035, 2.2014]],

         [[0.4334, 1.3646, 1.3209],
          [2.4264, 1.4926, 3.5806],
          [1.5392, 1.9644, 3.0962]]],


        [[[1.6190, 0.6863, 2.1165],
          [3.4282, 1.5750, 3.2955],
          [3.1582, 1.4306, 3.3010]],

         [[2.1277, 1.7037, 1.2801],
          [2.8481, 2.1833, 2.1802],
          [3.1013, 2.4035, 2.2014]],

         [[0.4334, 1.3646, 1.3209],
          [2.4264, 1.4926, 3.5806],
          [1.5392, 1.9644, 3.0962]]],


        [[[1.6190, 0.6863, 2.1165],
          [3.4282, 1.5750, 3.2955],
          [3.1582, 1.4306, 3.3010]],

         [[2.1277, 1.7037, 1.2801],
          [2.8481, 2.1833, 2.1802],


In [55]:
def test_multi_headed_attention_encoder_decoder():
    num_heads = 1
    d_attn = 4
    d_x = 1
    d_z = 1
    d_out = 1
    d_mid = 1
    length_x = 3
    length_z = 3
    batch_size = 1
    padding_mask = torch.tensor([[1, 1, 0]], dtype=torch.int32).to(device)

    multi_headed_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'], True).to(device)
    x = torch.rand(batch_size, length_x, d_x).to(device)
    z = torch.rand(batch_size, length_z, d_z).to(device)
    output = multi_headed_attention(z, x, padding_mask)
    print("output:", output)
    assert output.shape == (batch_size, length_x, d_out)
test_multi_headed_attention_encoder_decoder()

mask tensor([[[[1, 1, 0]]]], device='cuda:0', dtype=torch.int32)
mask torch.Size([1, 1, 1, 3])
queries: tensor([[[[0.2580, 0.3788, 0.9121, 0.6015],
          [0.4090, 0.4261, 1.1700, 1.0154],
          [0.3933, 0.4212, 1.1432, 0.9725]]]], device='cuda:0',
       grad_fn=<AddBackward0>)
keys: tensor([[[[0.4685, 0.4606, 1.0917, 0.8183],
          [0.5242, 0.6823, 1.2893, 0.8993],
          [0.5205, 0.6675, 1.2761, 0.8939]]]], device='cuda:0',
       grad_fn=<AddBackward0>)
values: tensor([[[[0.5083],
          [0.7635],
          [0.7465]]]], device='cuda:0', grad_fn=<AddBackward0>)
keys_transposed: tensor([[[[0.4685, 0.5242, 0.5205],
          [0.4606, 0.6823, 0.6675],
          [1.0917, 1.2893, 1.2761],
          [0.8183, 0.8993, 0.8939]]]], device='cuda:0',
       grad_fn=<TransposeBackward0>)
scores: tensor([[[[1.7833, 2.1106, 2.0888],
          [2.4961, 2.9268, 2.8981],
          [2.4221, 2.8421, 2.8140]]]], device='cuda:0',
       grad_fn=<UnsafeViewBackward0>)
scores: torch.Size([

In [56]:
def test_multi_headed_attention_decoder_self():
    num_heads = 8
    d_attn = 4
    d_x = 4
    d_z = 4
    d_out = 4
    d_mid = 2
    length_x = 3
    batch_size = 4
    padding_mask = torch.tensor([[1, 1, 0], [1, 1, 0], [1, 0, 0], [1, 1, 1]], dtype=torch.int32).to(device)

    multi_headed_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['MASKED'], True).to(device)
    x = torch.rand(batch_size, length_x, d_x).to(device)
    output = multi_headed_attention(x, x, padding_mask)
    print("output:", output)
    assert output.shape == (batch_size, length_x, d_out)
test_multi_headed_attention_decoder_self()

padding mask: torch.Size([4, 1, 3])
mask tril tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]], device='cuda:0')
merged mask: tensor([[[1, 0, 0],
         [1, 1, 0],
         [1, 1, 0]],

        [[1, 0, 0],
         [1, 1, 0],
         [1, 1, 0]],

        [[1, 0, 0],
         [1, 0, 0],
         [1, 0, 0]],

        [[1, 0, 0],
         [1, 1, 0],
         [1, 1, 1]]], device='cuda:0', dtype=torch.int32)
mask tensor([[[[1, 0, 0],
          [1, 1, 0],
          [1, 1, 0]]],


        [[[1, 0, 0],
          [1, 1, 0],
          [1, 1, 0]]],


        [[[1, 0, 0],
          [1, 0, 0],
          [1, 0, 0]]],


        [[[1, 0, 0],
          [1, 1, 0],
          [1, 1, 1]]]], device='cuda:0', dtype=torch.int32)
mask torch.Size([4, 1, 3, 3])
queries: tensor([[[[1.1067, 1.6913, 2.1271, 1.9234],
          [1.2436, 1.4748, 1.8042, 1.9838],
          [0.9214, 1.8090, 2.0074, 1.7083]],

         [[1.2254, 1.6446, 1.4996, 1.7931],
          [1.2596, 1.

In [69]:
class LayerNorm(nn.Module):
    def __init__(self, feature_length):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(feature_length))
        self.offset = nn.Parameter(torch.zeros(feature_length))

    def forward(self, activations):
        mean = torch.mean(activations, -1, keepdim=True)
        #print("mean:", mean)
        #print("activations - mean", activations - mean)
        variance = torch.var(activations, -1, keepdim=True, unbiased=False)
        normalized_activations = (activations - mean) / torch.sqrt(variance + 1e-6)
        return (normalized_activations * self.scale) + self.offset

In [70]:
def test_layer_norm():
    feature_length = 4
    length_x = 3
    batch_size = 5
    layer_norm = LayerNorm(feature_length)

    activations = torch.rand(batch_size, length_x, feature_length)

    print("activations:", activations)
    print("layer_normed:", layer_norm(activations))
    assert layer_norm(activations).shape == activations.shape

test_layer_norm()

activations: tensor([[[0.0740, 0.5761, 0.1583, 0.9082],
         [0.0254, 0.4980, 0.5432, 0.7955],
         [0.3154, 0.4343, 0.3936, 0.2391]],

        [[0.6966, 0.8427, 0.1361, 0.3746],
         [0.9258, 0.1272, 0.2205, 0.4163],
         [0.5488, 0.6108, 0.0512, 0.1113]],

        [[0.8538, 0.0780, 0.4456, 0.9851],
         [0.2553, 0.5005, 0.7952, 0.3014],
         [0.7036, 0.1147, 0.7308, 0.0979]],

        [[0.7745, 0.3784, 0.5438, 0.5078],
         [0.3485, 0.9173, 0.7833, 0.2066],
         [0.0177, 0.0686, 0.7524, 0.0972]],

        [[0.1624, 0.9529, 0.2026, 0.7624],
         [0.3717, 0.1057, 0.9665, 0.7025],
         [0.5528, 0.3656, 0.8212, 0.2777]]])
layer_normed: tensor([[[-1.0581,  0.4377, -0.8070,  1.4274],
         [-1.5818,  0.1169,  0.2790,  1.1859],
         [-0.4032,  1.1844,  0.6411, -1.4223]],

        [[ 0.6683,  1.1984, -1.3662, -0.5005],
         [ 1.6301, -0.9563, -0.6540, -0.0198],
         [ 0.8691,  1.1161, -1.1122, -0.8729]],

        [[ 0.7380, -1.4375, -0.4

In [74]:
class FeedForward(nn.Module):
    def __init__(self, hiddenLayerWidth, d_e):
        super().__init__()
        self.mlp1 = nn.Parameter(torch.rand(d_e, hiddenLayerWidth))
        self.mlp2 = nn.Parameter(torch.rand(hiddenLayerWidth, d_e))
        self.mlp1_bias = nn.Parameter(torch.zeros(hiddenLayerWidth))
        self.mlp2_bias = nn.Parameter(torch.zeros(d_e))

    def forward(self, activations):
        activations = torch.matmul(activations, self.mlp1) + self.mlp1_bias
        activations = activations.relu()
        activations = torch.matmul(activations, self.mlp2) + self.mlp2_bias
        return activations


In [47]:
def test_feed_forward():
    hiddenLayerWidth = 3
    d_e = 4
    feed_forward = FeedForward(hiddenLayerWidth, d_e)
    activations = torch.rand(10, 5, d_e)

    print("activations:", activations)
    output = feed_forward(activations)
    print("feed forward:", output)
    assert output.shape == activations.shape

test_feed_forward()

activations: tensor([[[8.2009e-01, 8.3609e-01, 8.5392e-01, 6.0773e-01],
         [5.9798e-01, 3.1981e-02, 9.2482e-01, 9.4239e-01],
         [8.4224e-01, 2.7856e-01, 3.1861e-01, 9.4557e-01],
         [8.7366e-01, 3.9853e-01, 3.5887e-01, 2.8971e-02],
         [3.3410e-01, 8.8679e-01, 4.9636e-01, 2.5706e-01]],

        [[7.6555e-01, 1.7312e-01, 5.7957e-01, 6.1636e-01],
         [5.9314e-01, 3.7463e-03, 2.3211e-01, 4.5253e-01],
         [3.1880e-01, 1.5249e-01, 5.2706e-01, 9.7245e-01],
         [1.6125e-01, 2.1925e-01, 5.3639e-01, 1.0331e-01],
         [7.0899e-01, 7.4258e-01, 5.8480e-01, 5.2108e-01]],

        [[4.0680e-01, 4.1767e-01, 1.4233e-01, 2.0912e-01],
         [8.4549e-01, 3.9151e-01, 4.5374e-01, 6.2639e-01],
         [7.4198e-01, 7.4134e-01, 6.2906e-01, 6.4801e-01],
         [3.0707e-01, 7.6654e-01, 3.1045e-01, 4.3200e-01],
         [7.6641e-01, 9.1288e-01, 3.7193e-01, 3.4477e-01]],

        [[3.3231e-01, 3.7801e-01, 7.1181e-01, 7.2664e-01],
         [7.2847e-01, 6.6413e-01, 4.9

In [24]:
class EncoderLayer(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, verbose):
        super().__init__()
        self.verbose = verbose
        self.multi_head_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'], verbose)
        self.layer_norm1 = LayerNorm(d_z)
        self.feed_forward = FeedForward(d_mlp, d_z)
        self.layer_norm2 = LayerNorm(d_z)

    def forward(self, z, padding_mask):
        z = z + self.multi_head_attention(z, z, padding_mask)
        z = self.layer_norm1(z)
        z = z + self.feed_forward(z)
        z = self.layer_norm2(z)
        return z

In [25]:
class Encoder(nn.Module):
    def __init__(self, num_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, verbose):
        super().__init__()
        self.layers = []
        for i in range(num_layers):
            encoder_layer = EncoderLayer(num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, verbose)
            self.layers.append(encoder_layer)
        self.layers = nn.ModuleList(self.layers)

    def forward(self, z, padding_mask):
        for layer in self.layers:
            z = layer(z, padding_mask)
        return z

In [26]:
class DecoderLayer(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, verbose):
        super().__init__()
        self.verbose = verbose
        self.multi_head_self_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['MASKED'], verbose)
        self.layer_norm1 = LayerNorm(d_x)
        self.multi_head_global_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'], verbose)
        self.layer_norm2 = LayerNorm(d_x)
        self.feed_forward = FeedForward(d_mlp, d_x)
        self.layer_norm3 = LayerNorm(d_x)

    def forward(self, z, x, src_mask, tgt_mask):
        x = x + self.multi_head_self_attention(x, x, tgt_mask)
        x = self.layer_norm1(x)
        x = x + self.multi_head_global_attention(z, x, src_mask)
        x = self.layer_norm2(x)
        x = x + self.feed_forward(x)
        x = self.layer_norm3(x)
        return x

In [27]:
class Decoder(nn.Module):
    def __init__(self, num_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, verbose):
        super().__init__()
        self.layers = []
        for i in range(num_layers):
            decoder_layer = DecoderLayer(num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, verbose)
            self.layers.append(decoder_layer)
        self.layers = nn.ModuleList(self.layers)

    def forward(self, z, x, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(z, x, src_mask, tgt_mask)
        return x

In [28]:
class EncoderDecoderTransformer(nn.Module):
    def __init__(self, num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length, verbose):
        super().__init__()
        self.verbose = verbose
        self.embedding = Embedding(vocab_size, d_e)
        self.positionalEmbedding = PositionalEmbedding(d_e, max_sequence_length)
        self.encoder = Encoder(num_encoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, verbose)
        self.decoder = Decoder(num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, verbose)
        self.unembedding = Unembedding(vocab_size, d_e)

    def forward(self, z, x, src_mask, tgt_mask):
        z = self.embedding(z) + self.positionalEmbedding(z)
        z = self.encoder(z, src_mask)
        x = self.embedding(x) + self.positionalEmbedding(x)
        x = self.decoder(z, x, src_mask, tgt_mask)
        #print("x after decoder:", x.shape)
        x = self.unembedding(x)
        #print("x after unembedding:", x.shape)
        return x




In [29]:
enRawName = "drive/MyDrive/colab data/multi30kEnTrain.txt"
deRawName = "drive/MyDrive/colab data/multi30kDeTrain.txt"
en30kVal = "drive/MyDrive/colab data/multi30kEnVal.txt"
de30kVal = "drive/MyDrive/colab data/multi30kDeVal.txt"
englishCleanName = "data/english_tokens.pkl"
germanCleanName = "data/german_tokens.pkl"
englishSortedName = "data/englishSorted.pkl"
germanSortedName = "data/germanSorted.pkl"

truncEn = "drive/MyDrive/colab data/truncEn.pkl"
truncDe = "drive/MyDrive/colab data/truncDe.pkl"

enTokenizerName = "drive/MyDrive/colab data/enTokenizer.pkl"
deTokenizerName = "drive/MyDrive/colab data/deTokenizer.pkl"
pairsName = "drive/MyDrive/colab data/pairs.pkl"
folder = "drive/MyDrive/colab data/"

enTrainingFileName = folder + "enTraining"
deTrainingFileName = folder + "deTraining"
enTestFileName = folder + "enTest"
deTestFileName = folder + "deTest"
enValFileName = folder + "enValidation"
deValFileName = folder + "deValidation"

enCombinedFileName = folder + "enCombined"
deCombinedFileName = folder + "deCombined"

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
class SentenceDataset(Dataset):

    TOKENIZER_SUFFIX = "_tokenizer"
    BOS_TOKEN = "[SOS]"
    EOS_TOKEN = "[EOS]"
    PAD_TOKEN = "[PAD]"
    UNK_TOKEN = "[UNK]"

    def __init__(self, src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, max_sequences):
        src_sequences = self.to_sequences(self.load_doc(src_filename), max_sequences)
        print(len(src_sequences))
        tgt_sequences = self.to_sequences(self.load_doc(tgt_filename), max_sequences)
        src_sequences = [self.add_special_tokens(sequence) for sequence in src_sequences]
        tgt_sequences = [self.add_special_tokens(sequence) for sequence in tgt_sequences]
        self.src_tokenizer, self.tgt_tokenizer = self.setup_tokenizers(src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, src_filename + SentenceDataset.TOKENIZER_SUFFIX, tgt_filename + SentenceDataset.TOKENIZER_SUFFIX)
        # src_tokenized = self.src_tokenizer.encode_batch(src_sequences)
        # tgt_tokenized = self.tgt_tokenizer.encode_batch(tgt_sequences)
        # src_tensors = [torch.IntTensor(sequence.ids) for sequence in src_tokenized]
        # tgt_tensor = [torch.IntTensor(sequence.ids) for sequence in tgt_tokenized]
        self.pairs = self.pair_sequences(src_sequences, tgt_sequences)
        print("pairs", self.pairs)

    # load doc into memory
    def load_doc(self, filename):
        # open the file as read only
        file = open(filename, mode='rt')
        # read all text
        text = file.read()
        # close the file
        file.close()
        return text

    def add_special_tokens(self, sequence):
        #sequence = self.BOS_TOKEN + " " + sequence + " " + self.EOS_TOKEN
        return sequence

    def pair_sequences(self, src_sequences, tgt_sequences):
        paired_sequences = list(zip(src_sequences, tgt_sequences))
        sorted_pairs = sorted(paired_sequences, key=lambda x: len(x[0]))
        return sorted_pairs

    # split a loaded document into sequences
    def to_sequences(self, doc, max_sequences):
        sequences = doc.strip().split('\n')
        return sequences[:max_sequences]

    def setup_tokenizers(self, src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, src_tokenizer_name, tgt_tokenizer_name):
        print("creating tokenizer for " + src_filename)
        src_tokenizer = Tokenizer(BPE(unk_token=SentenceDataset.UNK_TOKEN))
        src_tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size = src_vocab_size, special_tokens=[SentenceDataset.BOS_TOKEN, SentenceDataset.EOS_TOKEN, SentenceDataset.PAD_TOKEN, SentenceDataset.UNK_TOKEN])
        src_tokenizer.train([src_filename], trainer=trainer)
        pickle.dump(src_tokenizer, open(src_tokenizer_name, "wb"))

        print("creating tokenizer for " + tgt_filename)
        tgt_tokenizer = Tokenizer(BPE(unk_token=SentenceDataset.UNK_TOKEN))
        tgt_tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size = tgt_vocab_size, special_tokens=[SentenceDataset.BOS_TOKEN, SentenceDataset.EOS_TOKEN, SentenceDataset.PAD_TOKEN, SentenceDataset.UNK_TOKEN])
        tgt_tokenizer.train([tgt_filename], trainer=trainer)
        pickle.dump(tgt_tokenizer, open(tgt_tokenizer_name, "wb"))
        return src_tokenizer, tgt_tokenizer

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index):
        src_seq, tgt_seq = self.pairs[index]
        return src_seq, tgt_seq


In [32]:
class PadCollate:
    PAD_TOKEN = "[PAD]"
    PAD_ID = 2
    def __init__(self, src_tokenizer, tgt_tokenizer):
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __call__(self, batch):
        # max_len_src = max([len(pair[0].split()) for pair in batch])
        # max_len_tgt = max([len(pair[1].split()) for pair in batch])

        max_len_src = len(self.src_tokenizer.encode(batch[-1][0]))
        max_len_tgt = len(self.tgt_tokenizer.encode(batch[-1][1]))

        self.src_tokenizer.enable_padding(pad_id = self.PAD_ID, pad_token = self.PAD_TOKEN, length=max_len_src)
        self.src_tokenizer.enable_truncation(max_length=max_len_src)
        self.tgt_tokenizer.enable_padding(pad_id = self.PAD_ID, pad_token = self.PAD_TOKEN, length=max_len_tgt)
        self.tgt_tokenizer.enable_truncation(max_length=max_len_tgt)

        # print("src batch:", [pair[0] for pair in batch])
        # print("tgt batch:", [pair[1] for pair in batch])

        src_tokenized = self.src_tokenizer.encode_batch([pair[0] for pair in batch])
        tgt_tokenized = self.tgt_tokenizer.encode_batch([pair[1] for pair in batch])
        # src_tokenized = [sequence.ids for sequence in src_tokenized]
        # tgt_tokenized = [sequence.ids for sequence in tgt_tokenized]
        # src_tensors = torch.IntTensor(src_tokenized)
        # tgt_tensor = torch.IntTensor(tgt_tokenized)

        return src_tokenized, tgt_tokenized

In [83]:
sequenceDataset = SentenceDataset(enRawName, deRawName, 10000, 10000, 1000)

1000
creating tokenizer for drive/MyDrive/colab data/multi30kEnTrain.txt
creating tokenizer for drive/MyDrive/colab data/multi30kDeTrain.txt
pairs [('A man sits on a rock.', 'Ein Mann sitzt auf einem Stein.'), ('A man is putting up a wall.', 'Ein Mann stellt eine Wand auf.'), ('A dog is running in the snow', 'Ein Hund rennt im Schnee.'), ('A dog walks through a field.', 'Ein Hund läuft durch ein Feld.'), ('A girl standing in the ocean', 'Ein Mädchen, das im Meer steht'), ('A black dog leaps over a log.', 'Ein schwarzer Hund springt über einen Baumstamm.'), ('A dog is playing with a hose.', 'Ein Hund spielt mit einem Schlauch.'), ('A guy wearing blue in a hole.', 'Ein Typ, der blau trägt, in einem Loch.'), ('A young boy plays on a swing.', 'Ein Junge spielt auf einer Schaukel.'), ('A old man having a beer alone.', 'Ein alter Mann, der allein ein Bier trinkt.'), ('A man with two dogs on a beach', 'Ein Mann mit zwei Hunden an einem Strand.'), ('Two men barbecuing at a beach.', 'Zwei Männe

In [34]:
print(sequenceDataset.__getitem__(0))

('A man sits on a rock.', 'Ein Mann sitzt auf einem Stein.')


In [84]:
train_dataloader = DataLoader(sequenceDataset, batch_size=16, collate_fn = PadCollate(sequenceDataset.src_tokenizer, sequenceDataset.tgt_tokenizer))

In [36]:
for src, tgt in train_dataloader:
    print("batch:", src[0].ids)
    print("decoded", sequenceDataset.src_tokenizer.decode(src[0].ids))
    print("tgt", tgt)

    print("mask:", src[0].attention_mask)
    break

batch: [30, 93, 398, 89, 57, 423, 15]
decoded A man sits on a rock .
tgt [Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_to

In [37]:
print(sequenceDataset.__getitem__(0))

('A man sits on a rock.', 'Ein Mann sitzt auf einem Stein.')


In [38]:
pair = sequenceDataset.__getitem__(0)

num_encoder_layers = 5
num_decoder_layers = 5
num_heads = 8
d_attn = 64
d_x = 512
d_z = 512
d_out = 512
d_mid = 512
d_mlp = 2048
d_e = 512
vocab_size = 10000
max_sequence_length = 100



In [39]:
def decode(x, tokenizer):
    x = torch.softmax(x, -1)
    #print("x softmax:", x)
    x = torch.argmax(x, dim=-1)
    x = x.tolist()
    print("argmax x:", x)
    return tokenizer.decode(x)

In [40]:
def test_decode(tokenizer):
    x = torch.tensor([[0, 5], [10, 20]], dtype=torch.float32)
    words = decode(x, tokenizer)
    print(words)

test_decode(sequenceDataset.tgt_tokenizer)

argmax x: [1, 1]



In [82]:
torch.manual_seed(25)
encoder_decoder_transformer = EncoderDecoderTransformer(num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length, False).to(device)
opt = optim.Adam(encoder_decoder_transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
loss_function = nn.CrossEntropyLoss(label_smoothing=0.1)
epochs = 1000
# Large models need this to actually train
for p in encoder_decoder_transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
#labelSmoothing = LabelSmoothing(2000, PADDING_IDX, 0.1)
step = 0
for i in range(epochs):
    dataloader_iter = iter(train_dataloader)
    losses = []
    for src_sequence, tgt_sequence in dataloader_iter:
        # print("x:", sequence_x)
        # print("z:", sequence_z)
        # sequence_x, sequence_z = sequenceDataset.__getitem__(i)
        src_tokens = torch.IntTensor([sequence.ids for sequence in src_sequence]).to(device)
        tgt_tokens = torch.IntTensor([sequence.ids for sequence in tgt_sequence]).to(device)
        src_masks = torch.IntTensor([sequence.attention_mask for sequence in src_sequence]).to(device)
        tgt_masks = torch.IntTensor([sequence.attention_mask for sequence in tgt_sequence]).to(device)
        # print("src masks", src_masks)
        # print("tgt masks", tgt_masks)
        output = encoder_decoder_transformer(src_tokens, tgt_tokens, src_masks, tgt_masks)
        #print(output.shape)
        # print("output", output.shape)
        output_transpose = output.transpose(-1, -2) # output needs to be N, C, other dimension for torch cross entropy
        loss = loss_function(output_transpose, tgt_tokens.long())
        opt.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.item())
        print("Step", step)
        step += 1
    print("finished epoch", i)
    print("avg loss:", sum(losses) / len(losses))
    expected_output = sequenceDataset.tgt_tokenizer.decode(tgt_tokens[0].tolist())
    print("expected output", expected_output)
    decoded_output = decode(output[0], sequenceDataset.tgt_tokenizer)
    print("decoded output:", decoded_output)
    print()
    print()

Step 0
Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
finished epoch 0
avg loss: 20.817846843174525
expected output Der Mann mit dem Rucksack sitzt
argmax x: [6543, 235, 12, 125, 7322, 114]
decoded output: Du Junge , einer museum und


Step 7
Step 8
Step 9
Step 10
Step 11
Step 12
Step 13
finished epoch 1
avg loss: 13.279285703386579
expected output Der Mann mit dem Rucksack sitzt
argmax x: [109, 235, 3426, 100, 294, 103]
decoded output: Ein Junge Nähmaschine in roten ein


Step 14
Step 15
Step 16
Step 17
Step 18
Step 19
Step 20
finished epoch 2
avg loss: 10.375390461512975
expected output Der Mann mit dem Rucksack sitzt
argmax x: [131, 268, 235, 125, 273, 111]
decoded output: Eine Personen Junge einer das einem


Step 21
Step 22
Step 23
Step 24
Step 25
Step 26
Step 27
finished epoch 3
avg loss: 8.602435861315046
expected output Der Mann mit dem Rucksack sitzt
argmax x: [109, 235, 100, 111, 294, 111]
decoded output: Ein Junge in einem roten einem


Step 28
Step 29
Step 30
Step 31
Step 32
Ste

KeyboardInterrupt: 