<a href="https://colab.research.google.com/github/JackWittmayer/Transformer-Implementation/blob/main/EDTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tokenizers



In [2]:
import re
import string
import os
import pickle
from unicodedata import normalize
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
from torch.nn.functional import log_softmax, pad

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

import random
import time

import numpy as np
import math
import matplotlib.pyplot as plt

import sys
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

In [3]:
torch.manual_seed(25)
random.seed(25)

In [4]:
SAMPLE_X = torch.tensor([[3, 2, 0, 1], [1, 2, 3, 0]], dtype=torch.int32)
SAMPLE_Z = torch.tensor([4, 1, 7, 6], dtype=torch.int32)

In [5]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.table = nn.Embedding(vocab_size, embedding_size)

    def forward(self, sequence):
        embeddings = self.table(sequence)
        return embeddings.transpose(-2, -1)

In [6]:
def test_embedding():
    torch.manual_seed(25)
    vocab_size = 4
    embedding = Embedding(vocab_size, 4)
    print("weight:", embedding.table.weight)
    print("SAMPLE_X: ", SAMPLE_X)
    output = embedding(SAMPLE_X)
    print("output:", output)
    for j in range(len(output)):
        #print("sample:", sample)
        for i in range(vocab_size):
            assert output[j, :, i].eq(embedding.table.weight[SAMPLE_X[j, i]]).all()
test_embedding()

weight: Parameter containing:
tensor([[ 0.0877, -0.6113,  0.3441, -1.2916],
        [-0.5874,  0.8060,  1.3200,  0.4826],
        [ 1.6671, -0.2342,  0.1074,  1.7852],
        [ 0.7874, -0.2466,  0.2384, -0.6746]], requires_grad=True)
SAMPLE_X:  tensor([[3, 2, 0, 1],
        [1, 2, 3, 0]], dtype=torch.int32)
output: tensor([[[ 0.7874,  1.6671,  0.0877, -0.5874],
         [-0.2466, -0.2342, -0.6113,  0.8060],
         [ 0.2384,  0.1074,  0.3441,  1.3200],
         [-0.6746,  1.7852, -1.2916,  0.4826]],

        [[-0.5874,  1.6671,  0.7874,  0.0877],
         [ 0.8060, -0.2342, -0.2466, -0.6113],
         [ 1.3200,  0.1074,  0.2384,  0.3441],
         [ 0.4826,  1.7852, -0.6746, -1.2916]]], grad_fn=<TransposeBackward0>)


In [7]:
embedding = nn.Embedding(10, 4)
print(embedding.weight)

Parameter containing:
tensor([[-0.9314,  0.5380,  1.8837,  1.2911],
        [-0.1041, -0.6025, -0.7860,  0.4670],
        [ 0.3695,  1.0820, -1.9087,  1.6108],
        [ 0.0211, -0.6054,  2.2265, -1.7176],
        [ 0.1845, -0.1699,  0.4921, -0.7925],
        [ 1.6591, -0.0074, -0.3345, -0.1528],
        [-1.5218,  0.1531,  0.0445, -1.4806],
        [ 0.1826, -0.1623, -0.8701, -0.2885],
        [ 0.8274, -1.7458, -1.9661, -1.1676],
        [ 0.4603,  0.7549, -0.7166, -0.1605]], requires_grad=True)


In [8]:
class Unembedding(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.weight = nn.Parameter(torch.rand(vocab_size, embedding_size))

    def forward(self, x):
        return torch.matmul(self.weight, x)

In [9]:
def test_unembedding():
    torch.manual_seed(25)
    vocab_size = 10
    embedding_size = 4
    sequence_length = 4
    batch_size = 2
    input = torch.rand(batch_size, embedding_size, sequence_length)
    unembedding = Unembedding(vocab_size, embedding_size)

    print("weight:", unembedding.weight)
    print("input: ", input)
    output = unembedding(input)
    print("output:", output)
    assert output.shape == (batch_size, vocab_size, sequence_length)
test_unembedding()

weight: Parameter containing:
tensor([[0.4691, 0.6875, 0.9917, 0.2772],
        [0.7970, 0.2249, 0.1119, 0.6863],
        [0.2238, 0.2678, 0.2246, 0.4711],
        [0.0603, 0.2517, 0.3705, 0.7340],
        [0.6466, 0.5172, 0.1176, 0.7000],
        [0.8191, 0.0488, 0.3021, 0.2490],
        [0.7769, 0.7847, 0.8554, 0.8310],
        [0.1154, 0.2578, 0.4702, 0.0530],
        [0.4207, 0.7639, 0.7536, 0.6063],
        [0.1899, 0.2837, 0.6097, 0.5808]], requires_grad=True)
input:  tensor([[[0.7518, 0.1929, 0.0629, 0.9118],
         [0.3828, 0.2990, 0.5933, 0.2911],
         [0.2416, 0.5582, 0.0481, 0.3497],
         [0.3520, 0.9528, 0.0284, 0.8488]],

        [[0.3947, 0.5181, 0.9726, 0.8813],
         [0.0056, 0.3056, 0.9384, 0.7949],
         [0.4399, 0.1766, 0.8739, 0.1425],
         [0.4682, 0.6254, 0.3040, 0.7923]]])
output: tensor([[[0.9530, 1.1137, 0.4930, 1.2099],
         [0.9539, 0.9373, 0.2084, 1.4138],
         [0.4908, 0.6975, 0.1971, 0.7604],
         [0.4896, 0.9930, 0.1918, 0.

In [10]:
class PositionalEmbedding(nn.Module):
    def __init__(self, embedding_size, max_sequence_length):
        super().__init__()
        self.table = nn.Embedding(max_sequence_length, embedding_size)

    def forward(self, sequence):
        positions = torch.zeros(sequence.shape, dtype=torch.int32)
        positions[:, ::] = torch.arange(0, sequence.shape[-1])
        #print("positions", positions)
        positional_embeddings = self.table(positions)
        return positional_embeddings.transpose(-2, -1)

In [11]:
def test_positional_embedding():
    embedding_size = 8
    max_sequence_length = 10
    batch_size = 2
    positional_embedding = PositionalEmbedding(embedding_size, max_sequence_length)
    output = positional_embedding(SAMPLE_X)
    print("output:", output)
    assert output.shape == (batch_size, embedding_size, SAMPLE_X.shape[-1])
test_positional_embedding()

output: tensor([[[-0.0079, -0.6024, -0.2086, -0.7684],
         [-0.6091, -1.1570,  0.0196,  0.3472],
         [ 1.5286,  0.9000, -0.0843,  0.4499],
         [ 1.9735,  0.5598, -1.2005, -0.3574],
         [ 0.1646,  0.2992,  1.1399, -0.8319],
         [ 0.5387, -2.0385,  1.2420,  0.6517],
         [ 0.5112,  1.9378,  0.1124,  0.5965],
         [ 0.8526, -0.1953, -0.0296, -1.3327]],

        [[-0.0079, -0.6024, -0.2086, -0.7684],
         [-0.6091, -1.1570,  0.0196,  0.3472],
         [ 1.5286,  0.9000, -0.0843,  0.4499],
         [ 1.9735,  0.5598, -1.2005, -0.3574],
         [ 0.1646,  0.2992,  1.1399, -0.8319],
         [ 0.5387, -2.0385,  1.2420,  0.6517],
         [ 0.5112,  1.9378,  0.1124,  0.5965],
         [ 0.8526, -0.1953, -0.0296, -1.3327]]], grad_fn=<TransposeBackward0>)


In [121]:
def attention(queries, keys, values, mask):
    keys_transposed = torch.transpose(keys, -2, -1)
    #print("keys_transposed:", keys_transposed)
    scores = torch.matmul(keys_transposed, queries)
    #assert scores.shape == (keys.shape[0], keys.shape[-1], queries.shape[-1])
    #print("scores:", scores)
    #print("scores:", scores.shape)
    #print("masks:", mask.shape)
    scores = scores.masked_fill(mask == 0, -1e9)
    #print("masked scores:", scores)
    d_attn = keys.shape[-1]
    scaled_scores = scores / math.sqrt(d_attn)
    #print("scaled_scores:", scaled_scores)
    softmax_scores = torch.softmax(scaled_scores, -1)
    #print("softmax_scores:", softmax_scores)
    #print("softmax_socres shape:", softmax_scores.shape)
    v_out = torch.matmul(values, softmax_scores)
    return v_out

In [13]:
def test_attention():
    d_attn = 4
    length_x = 4
    length_z = 3
    batch_size = 2
    d_out = 2

    queries = torch.rand(batch_size, d_attn, length_x)
    keys = torch.rand(batch_size, d_attn, length_z)
    values = torch.rand(batch_size, d_out, length_z)
    # print("queries:", queries)
    # print("keys:", keys)
    # print("values:", values)
    mask = torch.tril(torch.ones(length_z, length_x) == 1)
    padding_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]], dtype=torch.int32)

    v_out = attention(queries, keys, values, mask)
    #print("output:", v_out)
    assert v_out.shape == (batch_size, d_out, length_x)
test_attention()

In [14]:
from enum import Enum
class MaskStrategy(Enum):
    UNMASKED = 1
    MASKED = 2

In [114]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, maskStrategy):
        super().__init__()
        self.num_heads = num_heads
        self.d_attn = d_attn
        self.d_x = d_x
        self.d_z = d_z
        self.d_out = d_out
        self.d_mid = d_mid
        self.maskStrategy = maskStrategy
        self.weight_query = nn.Parameter(torch.rand(num_heads, d_attn, d_x))
        #print("weight query:", self.weight_query)
        self.weight_key = nn.Parameter(torch.rand(num_heads, d_attn, d_z))
        self.weight_value = nn.Parameter(torch.rand(num_heads, d_mid, d_z))
        self.weight_out = nn.Parameter(torch.rand(d_out, d_mid * num_heads))
        self.maskStrategy = maskStrategy

    def forward(self, x, z, padding_mask):
        length_x = x.shape[-1]
        length_z = z.shape[-1]
        batch_size = x.shape[0]

        queries = torch.matmul(self.weight_query, x.unsqueeze(1))
        keys = torch.matmul(self.weight_key, z.unsqueeze(1))
        values = torch.matmul(self.weight_value, z.unsqueeze(1))

        assert queries.shape == (batch_size, self.num_heads, self.d_attn, length_x)
        assert keys.shape == (batch_size, self.num_heads, self.d_attn, length_z)
        assert values.shape == (batch_size, self.num_heads, self.d_mid, length_z)

        if self.maskStrategy == MaskStrategy['UNMASKED']:
            mask = padding_mask.unsqueeze(1)
        elif self.maskStrategy == MaskStrategy['MASKED']:
            mask = torch.tril(torch.ones(length_z, length_x) == 1).unsqueeze(0)
            # print("padding mask:", padding_mask.shape)
            # print("mask tril", mask)
            mask = mask & padding_mask.unsqueeze(1)
            # print("merged mask:", mask)
        mask = mask.unsqueeze(1)
        #print("mask", mask)
        v_out = attention(queries, keys, values, mask)
        #print(v_out.shape)
        assert v_out.shape == (batch_size, self.num_heads, self.d_mid, length_x)
        #print("v_out:", v_out)
        #print("v_out shape before:", v_out.shape)
        v_out = v_out.reshape(batch_size, -1, v_out.shape[-1])
        #print("v_out shape:", v_out.shape)
        #print("weight_out shape:", self.weight_out.shape)
        #print("v_out reshaped:", v_out)
        output = torch.matmul(self.weight_out, v_out)
        assert output.shape == (batch_size, self.d_out, length_x)
        return output




In [112]:
def test_multi_headed_attention_encoder():
    num_heads = 8
    d_attn = 4
    d_x = 4
    d_z = 4
    d_out = 4
    d_mid = 2
    length_x = 3
    length_z = 4
    batch_size = 3
    padding_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0], [1, 1, 1, 0]], dtype=torch.int32)

    multi_headed_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'])
    x = torch.rand(batch_size, d_x, length_x)
    z = torch.rand(batch_size, d_z, length_z)
    output = multi_headed_attention(z, z, padding_mask)
    #print("output:", output)
    assert output.shape == (batch_size, d_out, length_z)
test_multi_headed_attention_encoder()

scores: torch.Size([3, 8, 4, 4])
masks: torch.Size([3, 1, 1, 4])
softmax_scores: tensor([[[[0.4672, 0.2944, 0.2384, 0.0000],
          [0.4360, 0.3065, 0.2575, 0.0000],
          [0.4473, 0.2952, 0.2575, 0.0000],
          [0.4779, 0.2881, 0.2339, 0.0000]],

         [[0.4361, 0.1993, 0.3645, 0.0000],
          [0.4010, 0.2419, 0.3571, 0.0000],
          [0.4302, 0.2068, 0.3630, 0.0000],
          [0.4481, 0.1843, 0.3676, 0.0000]],

         [[0.4345, 0.1955, 0.3700, 0.0000],
          [0.4081, 0.2414, 0.3505, 0.0000],
          [0.4272, 0.1999, 0.3730, 0.0000],
          [0.4464, 0.1804, 0.3732, 0.0000]],

         [[0.4920, 0.1958, 0.3122, 0.0000],
          [0.4542, 0.2247, 0.3210, 0.0000],
          [0.4787, 0.2065, 0.3148, 0.0000],
          [0.5000, 0.1839, 0.3161, 0.0000]],

         [[0.4537, 0.2195, 0.3268, 0.0000],
          [0.4282, 0.2453, 0.3265, 0.0000],
          [0.4411, 0.2278, 0.3311, 0.0000],
          [0.4663, 0.2084, 0.3253, 0.0000]],

         [[0.4361, 0.2556, 0.

In [120]:
def test_multi_headed_attention_decoder():
    num_heads = 8
    d_attn = 4
    d_x = 4
    d_z = 4
    d_out = 4
    d_mid = 2
    length_x = 3
    length_z = 4
    batch_size = 3
    padding_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0], [1, 1, 0, 0]], dtype=torch.int32)

    multi_headed_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['MASKED'])
    x = torch.rand(batch_size, d_x, length_x)
    z = torch.rand(batch_size, d_z, length_z)
    output = multi_headed_attention(z, x, padding_mask)
    print("output:", output)
    assert output.shape == (batch_size, d_out, length_z)
test_multi_headed_attention_decoder()

scores: torch.Size([3, 8, 3, 4])
masks: torch.Size([3, 1, 3, 4])
softmax_scores: tensor([[[[1.0000, 0.0000, 0.0000, 0.0000],
          [0.8532, 0.1468, 0.0000, 0.0000],
          [0.6541, 0.1693, 0.1767, 0.0000]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.8500, 0.1500, 0.0000, 0.0000],
          [0.6315, 0.1663, 0.2023, 0.0000]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.8395, 0.1605, 0.0000, 0.0000],
          [0.7050, 0.1420, 0.1530, 0.0000]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.8972, 0.1028, 0.0000, 0.0000],
          [0.7602, 0.1040, 0.1358, 0.0000]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.8159, 0.1841, 0.0000, 0.0000],
          [0.7503, 0.1286, 0.1211, 0.0000]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.8119, 0.1881, 0.0000, 0.0000],
          [0.7308, 0.1323, 0.1368, 0.0000]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.8184, 0.1816, 0.0000, 0.0000],
          [0.6941, 0.1459, 

In [17]:
class LayerNorm(nn.Module):
    def __init__(self, feature_length):
        super().__init__()
        self.scale = nn.Parameter(torch.rand(feature_length, 1))
        self.offset = nn.Parameter(torch.rand(feature_length, 1))

    def forward(self, activations):
        mean = torch.mean(activations, -2, keepdim=True)
        #print("mean:", mean)
        #print("activations - mean", activations - mean)
        variance = torch.std(activations, -2, keepdim=True)
        return (((activations - mean) / variance) * self.scale) + self.offset

In [18]:
def test_layer_norm():
    feature_length = 4
    length_x = 3
    batch_size = 5
    layer_norm = LayerNorm(feature_length)

    activations = torch.rand(batch_size, feature_length, length_x)

    print("activations:", activations)
    print("layer_normed:", layer_norm(activations))
    assert layer_norm(activations).shape == activations.shape

test_layer_norm()

activations: tensor([[[0.0704, 0.1198, 0.5434],
         [0.9352, 0.0150, 0.9013],
         [0.0820, 0.7545, 0.7015],
         [0.1260, 0.7163, 0.1843]],

        [[0.6639, 0.2327, 0.9781],
         [0.6807, 0.8303, 0.9309],
         [0.8669, 0.0960, 0.9688],
         [0.3365, 0.5425, 0.9925]],

        [[0.1154, 0.3937, 0.8867],
         [0.0920, 0.5244, 0.9967],
         [0.4847, 0.2442, 0.0764],
         [0.4376, 0.2007, 0.7481]],

        [[0.7234, 0.6003, 0.1531],
         [0.4154, 0.3634, 0.0116],
         [0.6750, 0.0157, 0.8838],
         [0.6063, 0.7218, 0.2934]],

        [[0.3979, 0.5172, 0.3437],
         [0.2271, 0.0377, 0.9050],
         [0.8330, 0.8701, 0.3124],
         [0.8493, 0.6858, 0.9586]]])
layer_normed: tensor([[[-0.0857, -0.1555,  0.0849],
         [ 1.9535,  0.2657,  1.6510],
         [ 0.2382,  0.6312,  0.4894],
         [-0.0636,  1.0282, -0.8555]],

        [[ 0.1864, -0.0997,  0.2984],
         [ 1.0736,  1.7746, -0.0040],
         [ 0.6679,  0.1071,  0.39

In [19]:
class FeedForward(nn.Module):
    def __init__(self, hiddenLayerWidth, d_e):
        super().__init__()
        self.mlp1 = nn.Parameter(torch.rand(hiddenLayerWidth, d_e))
        self.mlp2 = nn.Parameter(torch.rand(d_e, hiddenLayerWidth))

    def forward(self, activations):
        activations = torch.matmul(self.mlp1, activations)
        activations = activations.relu()
        activations = torch.matmul(self.mlp2, activations)
        return activations


In [20]:
def test_feed_forward():
    hiddenLayerWidth = 4
    d_e = 4
    feed_forward = FeedForward(hiddenLayerWidth, d_e)
    feed_forward = FeedForward(8, 4)
    activations = torch.rand(10, 4, 4)

    print("activations:", activations)
    output = feed_forward(activations)
    print("feed forward:", output)
    assert output.shape == activations.shape

test_feed_forward()

activations: tensor([[[0.7625, 0.9868, 0.4461, 0.6112],
         [0.9535, 0.9772, 0.1404, 0.1646],
         [0.1711, 0.3559, 0.4114, 0.1469],
         [0.9462, 0.8611, 0.9841, 0.8701]],

        [[0.3741, 0.4518, 0.3968, 0.1586],
         [0.7565, 0.4712, 0.1372, 0.4064],
         [0.4532, 0.6836, 0.8032, 0.2358],
         [0.9014, 0.3753, 0.3579, 0.4402]],

        [[0.7379, 0.9645, 0.3429, 0.4758],
         [0.1447, 0.6538, 0.2173, 0.5897],
         [0.0629, 0.1146, 0.8246, 0.3298],
         [0.9730, 0.1295, 0.0286, 0.6541]],

        [[0.2985, 0.6923, 0.8146, 0.2188],
         [0.5439, 0.8871, 0.1613, 0.1575],
         [0.9211, 0.2334, 0.5237, 0.8322],
         [0.2061, 0.1870, 0.0273, 0.9490]],

        [[0.7635, 0.8103, 0.6008, 0.6052],
         [0.8353, 0.0822, 0.6737, 0.4559],
         [0.3786, 0.8732, 0.2472, 0.9585],
         [0.8650, 0.0683, 0.0543, 0.5648]],

        [[0.4272, 0.2048, 0.5931, 0.2845],
         [0.4615, 0.7539, 0.5990, 0.1542],
         [0.8416, 0.1384, 0.849

In [21]:
class EncoderLayer(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.multi_head_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'])
        self.layer_norm1 = LayerNorm(d_z)
        self.feed_forward = FeedForward(d_mlp, d_z)
        self.layer_norm2 = LayerNorm(d_z)

    def forward(self, z, padding_mask):
        z = z + self.multi_head_attention(z, z, padding_mask)
        z = self.layer_norm1(z)
        z = z + self.feed_forward(z)
        z = self.layer_norm2(z)
        return z

In [22]:
class Encoder(nn.Module):
    def __init__(self, num_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.layers = []
        for i in range(num_layers):
            encoder_layer = EncoderLayer(num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
            self.layers.append(encoder_layer)
        self.layers = nn.ModuleList(self.layers)

    def forward(self, z, padding_mask):
        for layer in self.layers:
            z = layer(z, padding_mask)
        return z

In [23]:
class DecoderLayer(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.multi_head_self_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['MASKED'])
        self.layer_norm1 = LayerNorm(d_x)
        self.multi_head_global_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'])
        self.layer_norm2 = LayerNorm(d_x)
        self.feed_forward = FeedForward(d_mlp, d_x)
        self.layer_norm3 = LayerNorm(d_x)

    def forward(self, x, z, padding_mask):
        x = x + self.multi_head_self_attention(x, x, padding_mask)
        x = self.layer_norm1(x)
        x = x + self.multi_head_global_attention(x, z, padding_mask)
        x = self.layer_norm2(x)
        x = x + self.feed_forward(x)
        x = self.layer_norm3(x)
        return x

In [24]:
class Decoder(nn.Module):
    def __init__(self, num_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.layers = []
        for i in range(num_layers):
            decoder_layer = DecoderLayer(num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
            self.layers.append(decoder_layer)
        self.layers = nn.ModuleList(self.layers)

    def forward(self, x, z, padding_mask):
        for layer in self.layers:
            x = layer(x, z, padding_mask)
        return x

In [25]:
class EncoderDecoderTransformer(nn.Module):
    def __init__(self, num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length):
        super().__init__()
        self.embedding = Embedding(vocab_size, d_e)
        self.positionalEmbedding = PositionalEmbedding(d_e, max_sequence_length)
        self.encoder = Encoder(num_encoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
        self.decoder = Decoder(num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
        self.unembedding = Unembedding(vocab_size, d_e)

    def forward(self, x, z, src_mask, tgt_mask):
        z = self.embedding(z) + self.positionalEmbedding(z)
        z = self.encoder(z, src_mask)
        x = self.embedding(x)
        x = self.decoder(x, z, tgt_mask)
        #print("x after decoder:", x.shape)
        x = self.unembedding(x)
        #print("x after unembedding:", x.shape)
        return x




In [26]:
enRawName = "drive/MyDrive/colab data/multi30kEnTrain.txt"
deRawName = "drive/MyDrive/colab data/multi30kDeTrain.txt"
en30kVal = "drive/MyDrive/colab data/multi30kEnVal.txt"
de30kVal = "drive/MyDrive/colab data/multi30kDeVal.txt"
englishCleanName = "data/english_tokens.pkl"
germanCleanName = "data/german_tokens.pkl"
englishSortedName = "data/englishSorted.pkl"
germanSortedName = "data/germanSorted.pkl"

truncEn = "drive/MyDrive/colab data/truncEn.pkl"
truncDe = "drive/MyDrive/colab data/truncDe.pkl"

enTokenizerName = "drive/MyDrive/colab data/enTokenizer.pkl"
deTokenizerName = "drive/MyDrive/colab data/deTokenizer.pkl"
pairsName = "drive/MyDrive/colab data/pairs.pkl"
folder = "drive/MyDrive/colab data/"

enTrainingFileName = folder + "enTraining"
deTrainingFileName = folder + "deTraining"
enTestFileName = folder + "enTest"
deTestFileName = folder + "deTest"
enValFileName = folder + "enValidation"
deValFileName = folder + "deValidation"

enCombinedFileName = folder + "enCombined"
deCombinedFileName = folder + "deCombined"

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
class SentenceDataset(Dataset):

    TOKENIZER_SUFFIX = "_tokenizer"
    BOS_TOKEN = "[SOS]"
    EOS_TOKEN = "[EOS]"
    PAD_TOKEN = "[PAD]"
    UNK_TOKEN = "[UNK]"

    def __init__(self, src_filename, tgt_filename, src_vocab_size, tgt_vocab_size):
        src_sequences = self.to_sequences(self.load_doc(src_filename))
        tgt_sequences = self.to_sequences(self.load_doc(tgt_filename))
        src_sequences = [self.add_special_tokens(sequence) for sequence in src_sequences]
        tgt_sequences = [self.add_special_tokens(sequence) for sequence in tgt_sequences]
        self.src_tokenizer, self.tgt_tokenizer = self.setup_tokenizers(src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, src_filename + SentenceDataset.TOKENIZER_SUFFIX, tgt_filename + SentenceDataset.TOKENIZER_SUFFIX)
        # src_tokenized = self.src_tokenizer.encode_batch(src_sequences)
        # tgt_tokenized = self.tgt_tokenizer.encode_batch(tgt_sequences)
        # src_tensors = [torch.IntTensor(sequence.ids) for sequence in src_tokenized]
        # tgt_tensor = [torch.IntTensor(sequence.ids) for sequence in tgt_tokenized]
        self.pairs = self.pair_sequences(src_sequences, tgt_sequences)
        print("pairs", self.pairs)

    # load doc into memory
    def load_doc(self, filename):
        # open the file as read only
        file = open(filename, mode='rt')
        # read all text
        text = file.read()
        # close the file
        file.close()
        return text

    def add_special_tokens(self, sequence):
        sequence = self.BOS_TOKEN + " " + sequence + " " + self.EOS_TOKEN
        return sequence

    def pair_sequences(self, src_sequences, tgt_sequences):
        paired_sequences = list(zip(src_sequences, tgt_sequences))
        sorted_pairs = sorted(paired_sequences, key=lambda x: len(x[0]))
        return sorted_pairs

    # split a loaded document into sequences
    def to_sequences(self, doc):
        sequences = doc.strip().split('\n')
        return sequences

    def setup_tokenizers(self, src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, src_tokenizer_name, tgt_tokenizer_name):
        print("creating tokenizer for " + src_filename)
        src_tokenizer = Tokenizer(BPE(unk_token=SentenceDataset.UNK_TOKEN))
        src_tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size = src_vocab_size, special_tokens=[SentenceDataset.BOS_TOKEN, SentenceDataset.EOS_TOKEN, SentenceDataset.PAD_TOKEN, SentenceDataset.UNK_TOKEN])
        src_tokenizer.train([src_filename], trainer=trainer)
        pickle.dump(src_tokenizer, open(src_tokenizer_name, "wb"))

        print("creating tokenizer for " + tgt_filename)
        tgt_tokenizer = Tokenizer(BPE(unk_token=SentenceDataset.UNK_TOKEN))
        tgt_tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size = tgt_vocab_size, special_tokens=[SentenceDataset.BOS_TOKEN, SentenceDataset.EOS_TOKEN, SentenceDataset.PAD_TOKEN, SentenceDataset.UNK_TOKEN])
        tgt_tokenizer.train([tgt_filename], trainer=trainer)
        pickle.dump(tgt_tokenizer, open(tgt_tokenizer_name, "wb"))
        return src_tokenizer, tgt_tokenizer

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index):
        src_seq, tgt_seq = self.pairs[index]
        return src_seq, tgt_seq


In [125]:
class PadCollate:
    PAD_TOKEN = "[PAD]"
    PAD_ID = 2
    def __init__(self, src_tokenizer, tgt_tokenizer):
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __call__(self, batch):
        max_len_src = max([len(pair[0].split()) for pair in batch])
        max_len_tgt = max([len(pair[1].split()) for pair in batch])

        self.src_tokenizer.enable_padding(pad_id = self.PAD_ID, pad_token = self.PAD_TOKEN, length=max_len_src)
        self.src_tokenizer.enable_truncation(max_length=max_len_src)
        self.tgt_tokenizer.enable_padding(pad_id = self.PAD_ID, pad_token = self.PAD_TOKEN, length=max_len_tgt)
        self.tgt_tokenizer.enable_truncation(max_length=max_len_tgt)

        print("src batch:", [pair[0] for pair in batch])
        print("tgt batch:", [pair[1] for pair in batch])

        src_tokenized = self.src_tokenizer.encode_batch([pair[0] for pair in batch])
        tgt_tokenized = self.tgt_tokenizer.encode_batch([pair[1] for pair in batch])
        # src_tokenized = [sequence.ids for sequence in src_tokenized]
        # tgt_tokenized = [sequence.ids for sequence in tgt_tokenized]
        # src_tensors = torch.IntTensor(src_tokenized)
        # tgt_tensor = torch.IntTensor(tgt_tokenized)

        return src_tokenized, tgt_tokenized

In [126]:
sequenceDataset = SentenceDataset(enRawName, deRawName, 2000, 2000)

creating tokenizer for drive/MyDrive/colab data/multi30kEnTrain.txt
creating tokenizer for drive/MyDrive/colab data/multi30kDeTrain.txt


In [127]:
print(sequenceDataset.__getitem__(0))

('[SOS] A dog in a car. [EOS]', '[SOS] Ein Hund in einem Auto. [EOS]')


In [128]:
train_dataloader = DataLoader(sequenceDataset, batch_size=16, collate_fn = PadCollate(sequenceDataset.src_tokenizer, sequenceDataset.tgt_tokenizer))

In [129]:
for src, tgt in train_dataloader:
    print("batch:", src)
    print("tgt", tgt)

    print("mask:", src[0].attention_mask)
    break

src batch: ['[SOS] A dog in a car. [EOS]', '[SOS] A man on the sea. [EOS]', '[SOS] Men play baseball. [EOS]', '[SOS] A child in a swing. [EOS]', '[SOS] A brown dog nursing [EOS]', '[SOS] Two houses and fire [EOS]', '[SOS] A man climbs a rock. [EOS]', '[SOS] A man sits on a rock. [EOS]', '[SOS] Two dogs and a puppy. [EOS]', '[SOS] A man playing cricket [EOS]', '[SOS] People are skydiving. [EOS]', '[SOS] A car is in the water [EOS]', '[SOS] A young male surfing. [EOS]', '[SOS] A dog licks his nose. [EOS]', '[SOS] Two dogs in the snow. [EOS]', '[SOS] A mountain landscape. [EOS]']
tgt batch: ['[SOS] Ein Hund in einem Auto. [EOS]', '[SOS] Ein Mann auf dem Meer. [EOS]', '[SOS] Männer spielen Baseball. [EOS]', '[SOS] Ein Kind auf einer Schaukel. [EOS]', '[SOS] Ein brauner Hund wird gestillt. [EOS]', '[SOS] Zwei Häuser und Feuer. [EOS]', '[SOS] Ein Mann klettert an einem Felsen. [EOS]', '[SOS] Ein Mann sitzt auf einem Stein. [EOS]', '[SOS] Zwei Hunde und ein Welpe. [EOS]', '[SOS] Ein Mann spie

In [34]:
print(sequenceDataset.__getitem__(1000))

('[SOS] Two football teams playing a game [EOS]', '[SOS] Zwei Football-Mannschaften spielen ein Match. [EOS]')


In [122]:
pair = sequenceDataset.__getitem__(0)

num_encoder_layers = 3
num_decoder_layers = 3
num_heads = 8
d_attn = 64
d_x = 512
d_z = 512
d_out = 512
d_mid = 512
d_mlp = 2048
d_e = 512
vocab_size = 2000
max_sequence_length = 100

encoder_decoder_transformer = EncoderDecoderTransformer(num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length)

In [36]:
def decode(x, tokenizer):
    #print("x:", x)
    x = x.transpose(0, 1)
    x = torch.softmax(x, -1)
    #print("x softmax:", x)
    x = torch.argmax(x, dim=-1)
    x = x.tolist()
    print("argmax x:", x)
    return tokenizer.decode(x)

In [130]:
opt = optim.Adam(encoder_decoder_transformer.parameters(), lr=0.05, betas=(0.9, 0.98), eps=1e-9)
loss_function = nn.CrossEntropyLoss()
for i in range(1000):
    for sequence_x, sequence_z in train_dataloader:
        # print("x:", sequence_x)
        # print("z:", sequence_z)
        # sequence_x, sequence_z = sequenceDataset.__getitem__(i)
        sequence_x_tokens = torch.IntTensor([sequence.ids for sequence in sequence_x])
        sequence_z_tokens = torch.IntTensor([sequence.ids for sequence in sequence_z])
        src_masks = torch.IntTensor([sequence.attention_mask for sequence in sequence_z])
        tgt_masks = torch.IntTensor([sequence.attention_mask for sequence in sequence_x])

        output = encoder_decoder_transformer(sequence_x_tokens, sequence_z_tokens, src_masks, tgt_masks)
        decoded_output = decode(output[0], sequenceDataset.tgt_tokenizer)
        print("decoded output:", decoded_output)
        loss = loss_function(output, sequence_x_tokens.long())
        print("loss:", loss)
        print()
        print()
        opt.zero_grad()
        loss.backward()
        opt.step()

src batch: ['[SOS] A dog in a car. [EOS]', '[SOS] A man on the sea. [EOS]', '[SOS] Men play baseball. [EOS]', '[SOS] A child in a swing. [EOS]', '[SOS] A brown dog nursing [EOS]', '[SOS] Two houses and fire [EOS]', '[SOS] A man climbs a rock. [EOS]', '[SOS] A man sits on a rock. [EOS]', '[SOS] Two dogs and a puppy. [EOS]', '[SOS] A man playing cricket [EOS]', '[SOS] People are skydiving. [EOS]', '[SOS] A car is in the water [EOS]', '[SOS] A young male surfing. [EOS]', '[SOS] A dog licks his nose. [EOS]', '[SOS] Two dogs in the snow. [EOS]', '[SOS] A mountain landscape. [EOS]']
tgt batch: ['[SOS] Ein Hund in einem Auto. [EOS]', '[SOS] Ein Mann auf dem Meer. [EOS]', '[SOS] Männer spielen Baseball. [EOS]', '[SOS] Ein Kind auf einer Schaukel. [EOS]', '[SOS] Ein brauner Hund wird gestillt. [EOS]', '[SOS] Zwei Häuser und Feuer. [EOS]', '[SOS] Ein Mann klettert an einem Felsen. [EOS]', '[SOS] Ein Mann sitzt auf einem Stein. [EOS]', '[SOS] Zwei Hunde und ein Welpe. [EOS]', '[SOS] Ein Mann spie

KeyboardInterrupt: 