<a href="https://colab.research.google.com/github/JackWittmayer/Transformer-Implementation/blob/main/EDTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tokenizers



In [2]:
import re
import string
import os
import pickle
from unicodedata import normalize
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
from torch.nn.functional import log_softmax, pad

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

import random
import time

import numpy as np
import math
import matplotlib.pyplot as plt

import sys
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

In [3]:
torch.manual_seed(25)
random.seed(25)

In [4]:
SAMPLE_X = torch.tensor([3, 2, 0, 1], dtype=torch.int32)
SAMPLE_Z = torch.tensor([4, 1, 7, 6], dtype=torch.int32)

In [5]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.table = nn.Embedding(vocab_size, embedding_size)

    def forward(self, sequence):
        embeddings = self.table(sequence)
        return embeddings.transpose(0, 1)

In [6]:
def test_embedding():
    torch.manual_seed(25)
    vocab_size = 4
    embedding = Embedding(vocab_size, 4)
    print("weight:", embedding.table.weight)
    print("SAMPLE_X: ", SAMPLE_X)
    output = embedding(SAMPLE_X)
    print("output:", output)
    for i in range(vocab_size):
        assert output[:, i].eq(embedding.table.weight[SAMPLE_X[i]]).all()
test_embedding()

weight: Parameter containing:
tensor([[ 0.0877, -0.6113,  0.3441, -1.2916],
        [-0.5874,  0.8060,  1.3200,  0.4826],
        [ 1.6671, -0.2342,  0.1074,  1.7852],
        [ 0.7874, -0.2466,  0.2384, -0.6746]], requires_grad=True)
SAMPLE_X:  tensor([3, 2, 0, 1], dtype=torch.int32)
output: tensor([[ 0.7874,  1.6671,  0.0877, -0.5874],
        [-0.2466, -0.2342, -0.6113,  0.8060],
        [ 0.2384,  0.1074,  0.3441,  1.3200],
        [-0.6746,  1.7852, -1.2916,  0.4826]], grad_fn=<TransposeBackward0>)


In [7]:
embedding = nn.Embedding(10, 4)
print(embedding.weight)

Parameter containing:
tensor([[-0.9314,  0.5380,  1.8837,  1.2911],
        [-0.1041, -0.6025, -0.7860,  0.4670],
        [ 0.3695,  1.0820, -1.9087,  1.6108],
        [ 0.0211, -0.6054,  2.2265, -1.7176],
        [ 0.1845, -0.1699,  0.4921, -0.7925],
        [ 1.6591, -0.0074, -0.3345, -0.1528],
        [-1.5218,  0.1531,  0.0445, -1.4806],
        [ 0.1826, -0.1623, -0.8701, -0.2885],
        [ 0.8274, -1.7458, -1.9661, -1.1676],
        [ 0.4603,  0.7549, -0.7166, -0.1605]], requires_grad=True)


In [8]:
class Unembedding(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.weight = nn.Parameter(torch.rand(vocab_size, embedding_size))

    def forward(self, x):
        return torch.matmul(self.weight, x)

In [9]:
def test_unembedding():
    torch.manual_seed(25)
    vocab_size = 10
    embedding_size = 4
    sequence_length = 4
    input = torch.rand(embedding_size, sequence_length)
    unembedding = Unembedding(vocab_size, embedding_size)

    print("weight:", unembedding.weight)
    print("input: ", input)
    output = unembedding(input)
    print("output:", output)
    assert output.shape == (vocab_size, sequence_length)
test_unembedding()

weight: Parameter containing:
tensor([[0.3947, 0.5181, 0.9726, 0.8813],
        [0.0056, 0.3056, 0.9384, 0.7949],
        [0.4399, 0.1766, 0.8739, 0.1425],
        [0.4682, 0.6254, 0.3040, 0.7923],
        [0.4691, 0.6875, 0.9917, 0.2772],
        [0.7970, 0.2249, 0.1119, 0.6863],
        [0.2238, 0.2678, 0.2246, 0.4711],
        [0.0603, 0.2517, 0.3705, 0.7340],
        [0.6466, 0.5172, 0.1176, 0.7000],
        [0.8191, 0.0488, 0.3021, 0.2490]], requires_grad=True)
input:  tensor([[0.7518, 0.1929, 0.0629, 0.9118],
        [0.3828, 0.2990, 0.5933, 0.2911],
        [0.2416, 0.5582, 0.0481, 0.3497],
        [0.3520, 0.9528, 0.0284, 0.8488]])
output: tensor([[1.0403, 1.6136, 0.4041, 1.5988],
        [0.6278, 1.3736, 0.2495, 1.0969],
        [0.6596, 0.7612, 0.1785, 0.8790],
        [0.9437, 1.2018, 0.4376, 1.3877],
        [0.9530, 1.1137, 0.4930, 1.2099],
        [0.9539, 0.9373, 0.2084, 1.4138],
        [0.4908, 0.6975, 0.1971, 0.7604],
        [0.4896, 0.9930, 0.1918, 0.8808],
        

In [10]:
class PositionalEmbedding(nn.Module):
    def __init__(self, embedding_size, max_sequence_length):
        super().__init__()
        self.table = nn.Embedding(max_sequence_length, embedding_size)

    def forward(self, sequence):
        positional_embeddings = self.table(torch.arange(0, sequence.shape[-1]))
        return positional_embeddings.transpose(0, 1)

In [11]:
def test_positional_embedding():
    embedding_size = 8
    max_sequence_length = 10
    positional_embedding = PositionalEmbedding(embedding_size, max_sequence_length)
    output = positional_embedding(SAMPLE_X)
    print("output:", output)
    assert output.shape == (embedding_size, SAMPLE_X.shape[0])
test_positional_embedding()

output: tensor([[-1.5218,  0.8274, -0.0079, -0.6024],
        [ 0.1531, -1.7458, -0.6091, -1.1570],
        [ 0.0445, -1.9661,  1.5286,  0.9000],
        [-1.4806, -1.1676,  1.9735,  0.5598],
        [ 0.1826,  0.4603,  0.1646,  0.2992],
        [-0.1623,  0.7549,  0.5387, -2.0385],
        [-0.8701, -0.7166,  0.5112,  1.9378],
        [-0.2885, -0.1605,  0.8526, -0.1953]], grad_fn=<TransposeBackward0>)


In [12]:
def attention(queries, keys, values, mask):
    keys_transposed = torch.transpose(keys, -2, -1)
    #print("keys_transposed:", keys_transposed)
    scores = torch.matmul(keys_transposed, queries)
    #print("scores:", scores)
    scores = scores.masked_fill(mask == 0, -1e9)
    #print("masked scores:", scores)
    d_attn = keys.shape[-1]
    scaled_scores = scores / math.sqrt(d_attn)
    #print("scaled_scores:", scaled_scores)
    softmax_scores = torch.softmax(scaled_scores, -1)
    #print("softmax_scores:", softmax_scores)
    return torch.matmul(values, softmax_scores)

In [13]:
def test_attention():
    d_attn = 4
    length_x = 4
    length_z = 3

    queries = torch.rand(d_attn, length_x)
    keys = torch.rand(d_attn, length_z)
    values = torch.rand(d_attn, length_z)
    print("queries:", queries)
    print("keys:", keys)
    print("values:", values)
    mask = mask = torch.tril(torch.ones(length_z, length_x) == 1)


    v_out = attention(queries, keys, values, mask)
    print("output:", v_out)
    assert v_out.shape == (d_attn, length_x)
test_attention()

queries: tensor([[0.2312, 0.5850, 0.4959, 0.0404],
        [0.0333, 0.5615, 0.8019, 0.2183],
        [0.2667, 0.8491, 0.8948, 0.0137],
        [0.2536, 0.1351, 0.4520, 0.1235]])
keys: tensor([[0.2746, 0.1760, 0.3505],
        [0.9246, 0.8537, 0.5464],
        [0.9339, 0.0768, 0.0565],
        [0.3594, 0.4961, 0.6278]])
values: tensor([[0.3572, 0.5220, 0.1997],
        [0.5286, 0.4723, 0.0238],
        [0.1838, 0.2010, 0.1765],
        [0.8587, 0.7776, 0.1199]])
output: tensor([[0.6346, 0.3650, 0.0793, 0.0000],
        [0.7374, 0.2779, 0.0095, 0.0000],
        [0.3175, 0.1738, 0.0701, 0.0000],
        [1.2242, 0.4844, 0.0476, 0.0000]])


In [14]:
from enum import Enum
class MaskStrategy(Enum):
    UNMASKED = 1
    MASKED = 2

In [15]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, maskStrategy):
        super().__init__()
        self.num_heads = num_heads
        self.weight_query = nn.Parameter(torch.rand(num_heads, d_attn, d_x))
        #print("weight query:", self.weight_query)
        self.weight_key = nn.Parameter(torch.rand(num_heads, d_attn, d_z))
        self.weight_value = nn.Parameter(torch.rand(num_heads, d_mid, d_z))
        self.weight_out = nn.Parameter(torch.rand(d_out, d_mid * num_heads))
        self.maskStrategy = maskStrategy

    def forward(self, x, z):
        queries = torch.matmul(self.weight_query, x)
        keys = torch.matmul(self.weight_key, z)
        values = torch.matmul(self.weight_value, z)
        #print("queries:", queries)
        #print("keys:", keys)
        #print("values:", values)

        # queries_with_heads = queries.reshape(self.num_heads, -1, x.shape[-1])
        # keys_with_heads = keys.reshape(self.num_heads, -1, z.shape[-1])
        # values_with_heads = values.reshape(self.num_heads, -1, z.shape[-1])

        # print("queries_with_heads", queries_with_heads)
        # print("keys_with_heads", keys_with_heads)
        # print("values_with_heads", values_with_heads)

        length_x = x.shape[-1]
        length_z = z.shape[-1]
        if self.maskStrategy == MaskStrategy['UNMASKED']:
            mask = torch.ones(length_z, length_x)
        elif self.maskStrategy == MaskStrategy['MASKED']:
            mask = torch.tril(torch.ones(length_z, length_x) == 1)
        v_out = attention(queries, keys, values, mask)
        #print("v_out:", v_out)
        v_out = v_out.reshape(-1, v_out.shape[-1])
        #print("v_out reshaped:", v_out)
        return torch.matmul(self.weight_out, v_out)




In [16]:
def test_multi_headed_attention():
    num_heads = 2
    d_attn = 4
    d_x = 4
    d_z = 4
    d_out = 4
    d_mid = 2
    length_x = 4
    length_z = 3

    multi_headed_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['MASKED'])
    x = torch.rand(d_x, length_x)
    z = torch.rand(d_z, length_z)
    output = multi_headed_attention(x, z)
    print("output:", output)
    assert output.shape == (d_out, length_x)
test_multi_headed_attention()

output: tensor([[ 2.2665,  0.2006,  0.1284,  0.0000],
        [ 7.0058,  0.6321,  0.3442,  0.0000],
        [ 8.1959,  0.7536,  0.4499,  0.0000],
        [11.8376,  1.0757,  0.6037,  0.0000]], grad_fn=<MmBackward0>)


In [17]:
class LayerNorm(nn.Module):
    def __init__(self, feature_length):
        super().__init__()
        self.scale = nn.Parameter(torch.rand(feature_length, 1))
        self.offset = nn.Parameter(torch.rand(feature_length, 1))

    def forward(self, activations):
        mean = torch.mean(activations, -2, keepdim=True)
        #print("mean:", mean)
        #print("activations - mean", activations - mean)
        variance = torch.std(activations, -2, keepdim=True)
        return (((activations - mean) / variance) * self.scale) + self.offset

In [18]:
def test_layer_norm():
    feature_length = 4
    length_x = 3
    layer_norm = LayerNorm(feature_length)

    activations = torch.rand(feature_length, length_x)

    print("activations:", activations)
    print("layer_normed:", layer_norm(activations))
    assert layer_norm(activations).shape == activations.shape

test_layer_norm()

activations: tensor([[7.1962e-01, 3.8297e-01, 9.1271e-01],
        [6.3607e-01, 7.0707e-01, 1.4284e-01],
        [8.6427e-06, 7.6875e-01, 2.2828e-01],
        [7.0166e-01, 9.2101e-01, 5.2097e-01]])
layer_normed: tensor([[ 0.3874, -0.3564,  0.6635],
        [ 0.5135,  0.4567,  0.2782],
        [ 0.0149,  0.0349,  0.0243],
        [ 0.7457,  0.9744,  0.5732]], grad_fn=<AddBackward0>)


In [19]:
class FeedForward(nn.Module):
    def __init__(self, hiddenLayerWidth, d_e):
        super().__init__()
        self.mlp1 = nn.Parameter(torch.rand(hiddenLayerWidth, d_e))
        self.mlp2 = nn.Parameter(torch.rand(d_e, hiddenLayerWidth))

    def forward(self, activations):
        activations = torch.matmul(self.mlp1, activations)
        activations = activations.relu()
        activations = torch.matmul(self.mlp2, activations)
        return activations


In [20]:
def test_feed_forward():
    hiddenLayerWidth = 4
    d_e = 4
    feed_forward = FeedForward(hiddenLayerWidth, d_e)

In [21]:
feed_forward = FeedForward(8, 4)
activations = torch.rand(4, 4)

print("activations:", activations)
print("feed forward:", feed_forward(activations))

activations: tensor([[0.2384, 0.5974, 0.8991, 0.1890],
        [0.1692, 0.6369, 0.6885, 0.4129],
        [0.5404, 0.5601, 0.5734, 0.0695],
        [0.9766, 0.5577, 0.6605, 0.9500]])
feed forward: tensor([[3.4618, 4.0866, 4.9860, 2.7549],
        [4.1394, 4.7213, 5.7609, 3.2795],
        [4.7369, 5.3860, 6.5191, 3.8934],
        [4.3997, 4.8025, 5.7954, 3.5539]], grad_fn=<MmBackward0>)


In [22]:
class EncoderLayer(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.multi_head_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'])
        self.layer_norm1 = LayerNorm(d_z)
        self.feed_forward = FeedForward(d_mlp, d_z)
        self.layer_norm2 = LayerNorm(d_z)



    def forward(self, z):
        z = z + self.multi_head_attention(z, z)
        z = self.layer_norm1(z)
        z = z + self.feed_forward(z)
        z = self.layer_norm2(z)
        return z

In [23]:
class Encoder(nn.Module):
    def __init__(self, num_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.layers = []
        for i in range(num_layers):
            encoder_layer = EncoderLayer(num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
            self.layers.append(encoder_layer)
        self.layers = nn.ModuleList(self.layers)

    def forward(self, z):
        for layer in self.layers:
            z = layer(z)
        return z

In [24]:
class DecoderLayer(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.multi_head_self_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['MASKED'])
        self.layer_norm1 = LayerNorm(d_x)
        self.multi_head_global_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'])
        self.layer_norm2 = LayerNorm(d_x)
        self.feed_forward = FeedForward(d_mlp, d_x)
        self.layer_norm3 = LayerNorm(d_x)

    def forward(self, x, z):
        x = x + self.multi_head_self_attention(x, x)
        x = self.layer_norm1(x)
        x = x + self.multi_head_global_attention(x, z)
        x = self.layer_norm2(x)
        x = x + self.feed_forward(x)
        x = self.layer_norm3(x)
        return x

In [25]:
class Decoder(nn.Module):
    def __init__(self, num_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.layers = []
        for i in range(num_layers):
            decoder_layer = DecoderLayer(num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
            self.layers.append(decoder_layer)
        self.layers = nn.ModuleList(self.layers)

    def forward(self, x, z):
        for layer in self.layers:
            x = layer(x, z)
        return x

In [26]:
class EncoderDecoderTransformer(nn.Module):
    def __init__(self, num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length):
        super().__init__()
        self.embedding = Embedding(vocab_size, d_e)
        self.positionalEmbedding = PositionalEmbedding(d_e, max_sequence_length)
        self.encoder = Encoder(num_encoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
        self.decoder = Decoder(num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
        self.unembedding = Unembedding(vocab_size, d_e)

    def forward(self, x, z):
        z = self.embedding(z) + self.positionalEmbedding(z)
        z = self.encoder(z)
        x = self.embedding(x)
        x = self.decoder(x, z)
        #print("x after decoder:", x.shape)
        x = self.unembedding(x)
        #print("x after unembedding:", x.shape)
        return x




In [27]:
enRawName = "drive/MyDrive/colab data/multi30kEnTrain.txt"
deRawName = "drive/MyDrive/colab data/multi30kDeTrain.txt"
en30kVal = "drive/MyDrive/colab data/multi30kEnVal.txt"
de30kVal = "drive/MyDrive/colab data/multi30kDeVal.txt"
englishCleanName = "data/english_tokens.pkl"
germanCleanName = "data/german_tokens.pkl"
englishSortedName = "data/englishSorted.pkl"
germanSortedName = "data/germanSorted.pkl"

truncEn = "drive/MyDrive/colab data/truncEn.pkl"
truncDe = "drive/MyDrive/colab data/truncDe.pkl"

enTokenizerName = "drive/MyDrive/colab data/enTokenizer.pkl"
deTokenizerName = "drive/MyDrive/colab data/deTokenizer.pkl"
pairsName = "drive/MyDrive/colab data/pairs.pkl"
folder = "drive/MyDrive/colab data/"

enTrainingFileName = folder + "enTraining"
deTrainingFileName = folder + "deTraining"
enTestFileName = folder + "enTest"
deTestFileName = folder + "deTest"
enValFileName = folder + "enValidation"
deValFileName = folder + "deValidation"

enCombinedFileName = folder + "enCombined"
deCombinedFileName = folder + "deCombined"

In [28]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [54]:
class SentenceDataset(Dataset):

    TOKENIZER_SUFFIX = "_tokenizer"
    BOS_TOKEN = "[SOS]"
    EOS_TOKEN = "[EOS]"
    PAD_TOKEN = "[PAD]"
    UNK_TOKEN = "[UNK]"

    def __init__(self, src_filename, tgt_filename, src_vocab_size, tgt_vocab_size):
        src_sentences = self.to_sentences(self.load_doc(src_filename))
        tgt_sentences = self.to_sentences(self.load_doc(tgt_filename))
        src_sentences = [self.addSpecialTokens(sentence) for sentence in src_sentences]
        tgt_sentences = [self.addSpecialTokens(sentence) for sentence in tgt_sentences]
        self.src_tokenizer, self.tgt_tokenizer = self.setup_tokenizers(src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, src_filename + SentenceDataset.TOKENIZER_SUFFIX, tgt_filename + SentenceDataset.TOKENIZER_SUFFIX)
        src_tokenized = self.src_tokenizer.encode_batch(src_sentences)
        tgt_tokenized = self.tgt_tokenizer.encode_batch(tgt_sentences)
        src_tensors = [torch.IntTensor(sequence.ids) for sequence in src_tokenized]
        tgt_tensor = [torch.IntTensor(sequence.ids) for sequence in tgt_tokenized]
        self.pairs = self.pair_sequences(src_tensors, tgt_tensor)

    # load doc into memory
    def load_doc(self, filename):
        # open the file as read only
        file = open(filename, mode='rt')
        # read all text
        text = file.read()
        # close the file
        file.close()
        return text

    def addSpecialTokens(self, sequence):
        sequence = SentenceDataset.BOS_TOKEN + sequence + SentenceDataset.EOS_TOKEN
        return sequence

    def pair_sequences(self, src_sequences, tgt_sequences):
        pairs = []
        for i in range(len(src_sequences)):
            pairs.append((src_sequences[i], tgt_sequences[i]))
        return pairs

    # split a loaded document into sentences
    def to_sentences(self, doc):
        return doc.strip().split('\n')

    def setup_tokenizers(self, src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, src_tokenizer_name, tgt_tokenizer_name):
        print("creating tokenizer for " + src_filename)
        src_tokenizer = Tokenizer(BPE(unk_token=SentenceDataset.UNK_TOKEN))
        src_tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size = src_vocab_size, special_tokens=[SentenceDataset.BOS_TOKEN, SentenceDataset.EOS_TOKEN, SentenceDataset.PAD_TOKEN, SentenceDataset.UNK_TOKEN])
        src_tokenizer.train([src_filename], trainer=trainer)
        pickle.dump(src_tokenizer, open(src_tokenizer_name, "wb"))

        print("creating tokenizer for " + tgt_filename)
        tgt_tokenizer = Tokenizer(BPE(unk_token=SentenceDataset.UNK_TOKEN))
        tgt_tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size = tgt_vocab_size, special_tokens=[SentenceDataset.BOS_TOKEN, SentenceDataset.EOS_TOKEN, SentenceDataset.PAD_TOKEN, SentenceDataset.UNK_TOKEN])
        tgt_tokenizer.train([tgt_filename], trainer=trainer)
        pickle.dump(tgt_tokenizer, open(tgt_tokenizer_name, "wb"))
        return src_tokenizer, tgt_tokenizer

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index):
        return self.pairs[index]

In [55]:
sequenceDataset = SentenceDataset(enRawName, deRawName, 2000, 2000)

creating tokenizer for drive/MyDrive/colab data/multi30kEnTrain.txt
creating tokenizer for drive/MyDrive/colab data/multi30kDeTrain.txt
[0, 138, 171, 13, 1808, 132, 1793, 120, 280, 338, 938, 210, 1221, 15, 1]


In [56]:
print(sequenceDataset.__getitem__(0))

(tensor([   0,  138,  171,   13, 1808,  132, 1793,  120,  280,  338,  938,  210,
        1221,   15,    1], dtype=torch.int32), tensor([   0,  160,  351,  649,  217,  444,  148,  457,  100,  121,  513,  788,
         102, 1207,  500,   14,    1], dtype=torch.int32))


In [57]:
pair = sequenceDataset.__getitem__(0)
sequence_z = pair[0]
sequence_x = pair[1]

num_encoder_layers = 3
num_decoder_layers = 3
num_heads = 8
d_attn = 64
d_x = 512
d_z = 512
d_out = 512
d_mid = 512
d_mlp = 2048
d_e = 512
vocab_size = 2000
max_sequence_length = 100

encoder_decoder_transformer = EncoderDecoderTransformer(num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length)
output = encoder_decoder_transformer(sequence_x, sequence_z)
print("output:", output)

output: tensor([[128.7010, 128.8575, 128.8577,  ..., 128.8579, 128.8577, 128.8581],
        [113.6734, 113.9075, 113.9076,  ..., 113.9080, 113.9079, 113.9081],
        [125.9031, 125.8873, 125.8874,  ..., 125.8878, 125.8876, 125.8878],
        ...,
        [122.2063, 122.4274, 122.4275,  ..., 122.4280, 122.4278, 122.4279],
        [119.7615, 119.6788, 119.6788,  ..., 119.6792, 119.6791, 119.6793],
        [115.0268, 115.0118, 115.0118,  ..., 115.0123, 115.0121, 115.0123]],
       grad_fn=<MmBackward0>)


In [39]:
print(output.shape)
x = output.transpose(0, 1)
print("x:", x)
x = torch.softmax(x, -1)
print("x softmax:", x)
x = torch.argmax(x, dim=-1)
print("argmax x:", x)

torch.Size([2000, 15])
x: tensor([[112.4095, 115.1396, 116.1140,  ..., 124.1159, 117.6144, 113.8082],
        [112.4091, 115.1391, 116.1135,  ..., 124.1155, 117.6139, 113.8076],
        [112.4036, 115.1666, 116.1268,  ..., 124.1245, 117.6131, 113.8099],
        ...,
        [112.4395, 115.1805, 116.1912,  ..., 124.2022, 117.6688, 113.8203],
        [112.4084, 115.1385, 116.1132,  ..., 124.1148, 117.6133, 113.8070],
        [112.4445, 115.3520, 115.8624,  ..., 123.8243, 117.4075, 113.7522]],
       grad_fn=<TransposeBackward0>)
x softmax: tensor([[9.4449e-11, 1.4483e-09, 3.8374e-09,  ..., 1.1461e-05, 1.7205e-08,
         3.8249e-10],
        [9.4457e-11, 1.4483e-09, 3.8376e-09,  ..., 1.1462e-05, 1.7205e-08,
         3.8247e-10],
        [9.3355e-11, 1.4794e-09, 3.8644e-09,  ..., 1.1494e-05, 1.7084e-08,
         3.8096e-10],
        ...,
        [9.2743e-11, 1.4376e-09, 3.9501e-09,  ..., 1.1905e-05, 1.7311e-08,
         3.6891e-10],
        [9.4434e-11, 1.4482e-09, 3.8379e-09,  ..., 1.14

In [45]:
def decode(x, tokenizer):
    #print("x:", x)
    x = x.transpose(0, 1)
    x = torch.softmax(x, -1)
    #print("x softmax:", x)
    x = torch.argmax(x, dim=-1)
    x = x.tolist()
    print("argmax x:", x)
    return tokenizer.decode(x)

In [58]:
decoded_output = decode(output, sequenceDataset.tgt_tokenizer)
print("decoded output:", decoded_output)

argmax x: [1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415]
decoded output: bens bens bens bens bens bens bens bens bens bens bens bens bens bens bens bens bens


In [59]:
opt = optim.Adam(encoder_decoder_transformer.parameters(), lr=0.05, betas=(0.9, 0.98), eps=1e-9)
loss_function = nn.CrossEntropyLoss()
print(sequence_z)
print(sequence_x)
for i in range(1000):
    # sequence_x, sequence_z = sequenceDataset.__getitem__(i)
    output = encoder_decoder_transformer(sequence_x, sequence_z)
    decoded_output = decode(output, sequenceDataset.tgt_tokenizer)
    print("decoded output:", decoded_output)
    output = output.transpose(0, 1)
    loss = loss_function(output, sequence_x.long())
    print("loss:", loss)
    print()
    print()
    opt.zero_grad()
    loss.backward()
    opt.step()

tensor([   0,  138,  171,   13, 1808,  132, 1793,  120,  280,  338,  938,  210,
        1221,   15,    1], dtype=torch.int32)
tensor([   0,  160,  351,  649,  217,  444,  148,  457,  100,  121,  513,  788,
         102, 1207,  500,   14,    1], dtype=torch.int32)
argmax x: [1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415]
decoded output: bens bens bens bens bens bens bens bens bens bens bens bens bens bens bens bens bens
loss: tensor(18.6713, grad_fn=<NllLossBackward0>)


argmax x: [14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]
decoded output: . . . . . . . . . . . . . . . . .
loss: tensor(11.7611, grad_fn=<NllLossBackward0>)


argmax x: [217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217]
decoded output: Männer Männer Männer Männer Männer Männer Männer Männer Männer Männer Männer Männer Männer Männer Männer Männer Männer
loss: tensor(10.4530, grad_fn=<NllLossBackward0>)


argmax

KeyboardInterrupt: 