<a href="https://colab.research.google.com/github/JackWittmayer/Transformer-Implementation/blob/main/EDTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
!pip install tokenizers



In [62]:
import re
import string
import os
import pickle
from unicodedata import normalize
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
from torch.nn.functional import log_softmax, pad

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

import random
import time

import numpy as np
import math
import matplotlib.pyplot as plt

import sys
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

In [63]:
torch.manual_seed(25)
random.seed(25)

In [64]:
class Embedding(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super().__init__()
        self.table = nn.Embedding(vocab_size, embedding_size)

    def forward(self, sequence):
        embeddings = self.table(sequence)
        return embeddings.transpose(-1, -2)

In [65]:
sequence = torch.randint(0, 9, [5])
print("sequence:", sequence)

embedding = Embedding(4, 10)
print("embedding:", embedding(sequence))

sequence: tensor([2, 8, 3, 5, 5])
embedding: tensor([[-0.2466,  0.2298, -0.1450, -0.2387, -0.2387],
        [ 0.2384,  0.9110, -0.0539,  2.0324,  2.0324],
        [-0.6746,  1.6274, -0.6296, -0.0943, -0.0943],
        [ 0.4569, -1.4217,  0.0329,  0.3271,  0.3271]],
       grad_fn=<TransposeBackward0>)


In [66]:
class Unembedding(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super().__init__()
        self.unembedding = nn.Parameter(torch.rand(vocab_size, embedding_size))

    def forward(self, x):
        return torch.matmul(self.unembedding, x)

In [67]:
embedding_size = 4
vocab_size = 10
length_x = 4
x = torch.rand([embedding_size, length_x])
print("x:", x)

unembedding = Unembedding(embedding_size, vocab_size)
unembedding_x = unembedding(x)
print("unembedding:", unembedding_x)
print("softmax unembedding:", torch.softmax(unembedding_x, -2))

x: tensor([[0.2578, 0.4702, 0.0530, 0.4207],
        [0.7639, 0.7536, 0.6063, 0.1899],
        [0.2837, 0.6097, 0.5808, 0.1660],
        [0.5746, 0.7927, 0.8780, 0.0566]])
unembedding: tensor([[1.4112, 1.8584, 1.4135, 0.6347],
        [0.3483, 0.5418, 0.2618, 0.3161],
        [0.7932, 1.3326, 0.9671, 0.5479],
        [0.6077, 0.9338, 0.9328, 0.1695],
        [0.8636, 1.3411, 1.0986, 0.4331],
        [0.9475, 1.3700, 1.2893, 0.2961],
        [0.7712, 1.1694, 1.1384, 0.2347],
        [0.6879, 0.7872, 0.6302, 0.2270],
        [1.0989, 1.6686, 1.3678, 0.5347],
        [0.4655, 0.7824, 0.4713, 0.3968]], grad_fn=<MmBackward0>)
softmax unembedding: tensor([[0.1763, 0.1825, 0.1482, 0.1277],
        [0.0609, 0.0489, 0.0468, 0.0929],
        [0.0950, 0.1079, 0.0948, 0.1171],
        [0.0789, 0.0724, 0.0916, 0.0802],
        [0.1020, 0.1088, 0.1081, 0.1044],
        [0.1109, 0.1120, 0.1309, 0.0910],
        [0.0930, 0.0916, 0.1125, 0.0856],
        [0.0855, 0.0625, 0.0677, 0.0849],
        [0.129

In [68]:
class PositionalEmbedding(nn.Module):
    def __init__(self, embedding_size, max_sequence_length):
        super().__init__()
        self.table = nn.Embedding(max_sequence_length, embedding_size)

    def forward(self, sequence):
        positional_embeddings = self.table(torch.arange(0, sequence.shape[-1]))
        return positional_embeddings

In [69]:
sequence = torch.rand(5)
print("sequence:", sequence)

positionalEmbedding = PositionalEmbedding(4, 10)
print("positional embedding:", positionalEmbedding(sequence))

sequence: tensor([0.6902, 0.2961, 0.3343, 0.5147, 0.2864])
positional embedding: tensor([[ 0.4316,  0.7833, -0.0568, -0.9700],
        [ 0.3140, -0.7241,  0.1579, -2.6927],
        [-0.5925,  0.6631, -1.0311,  0.0513],
        [-0.2251, -0.3241,  1.3287, -1.5935],
        [ 0.9240,  0.2862, -0.0059,  0.8485]], grad_fn=<EmbeddingBackward0>)


In [70]:
def attention(queries, keys, values, mask):
    keys_transposed = torch.transpose(keys, -2, -1)
    #print("keys_transposed:", keys_transposed)
    scores = torch.matmul(keys_transposed, queries)
    #print("scores:", scores)
    scores = scores.masked_fill(mask == 0, -1e9)
    #print("masked scores:", scores)
    d_attn = keys.shape[-1]
    scaled_scores = scores / math.sqrt(d_attn)
    #print("scaled_scores:", scaled_scores)
    softmax_scores = torch.softmax(scaled_scores, -1)
    #print("softmax_scores:", softmax_scores)
    return torch.matmul(values, softmax_scores)

In [71]:
torch.manual_seed(25)
d_attn = 4
length_x = 4
length_z = 3

queries = torch.rand(d_attn, length_x)
keys = torch.rand(d_attn, length_z)
values = torch.rand(d_attn, length_z)
print("queries:", queries)
print("keys:", keys)
print("values:", values)
mask = mask = torch.tril(torch.ones(length_z, length_x) == 1)


v_out = attention(queries, keys, values, mask)
print("output:", v_out)

queries: tensor([[0.7518, 0.1929, 0.0629, 0.9118],
        [0.3828, 0.2990, 0.5933, 0.2911],
        [0.2416, 0.5582, 0.0481, 0.3497],
        [0.3520, 0.9528, 0.0284, 0.8488]])
keys: tensor([[0.3947, 0.5181, 0.9726],
        [0.8813, 0.0056, 0.3056],
        [0.9384, 0.7949, 0.4399],
        [0.1766, 0.8739, 0.1425]])
values: tensor([[0.4682, 0.6254, 0.3040],
        [0.7923, 0.4691, 0.6875],
        [0.9917, 0.2772, 0.7970],
        [0.2249, 0.1119, 0.6863]])
output: tensor([[0.8602, 0.4571, 0.0803, 0.0000],
        [1.2720, 0.4951, 0.1816, 0.0000],
        [1.4332, 0.4221, 0.2106, 0.0000],
        [0.5505, 0.2912, 0.1813, 0.0000]])


In [72]:
from enum import Enum
class MaskStrategy(Enum):
    UNMASKED = 1
    MASKED = 2

In [73]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, maskStrategy):
        super().__init__()
        self.num_heads = num_heads
        self.weight_query = nn.Parameter(torch.rand(num_heads, d_attn, d_x))
        #print("weight query:", self.weight_query)
        self.weight_key = nn.Parameter(torch.rand(num_heads, d_attn, d_z))
        self.weight_value = nn.Parameter(torch.rand(num_heads, d_mid, d_z))
        self.weight_out = nn.Parameter(torch.rand(d_out, d_mid * num_heads))
        self.maskStrategy = maskStrategy

    def forward(self, x, z):
        queries = torch.matmul(self.weight_query, x)
        keys = torch.matmul(self.weight_key, z)
        values = torch.matmul(self.weight_value, z)
        #print("queries:", queries)
        #print("keys:", keys)
        #print("values:", values)

        # queries_with_heads = queries.reshape(self.num_heads, -1, x.shape[-1])
        # keys_with_heads = keys.reshape(self.num_heads, -1, z.shape[-1])
        # values_with_heads = values.reshape(self.num_heads, -1, z.shape[-1])

        # print("queries_with_heads", queries_with_heads)
        # print("keys_with_heads", keys_with_heads)
        # print("values_with_heads", values_with_heads)

        length_x = x.shape[-1]
        length_z = z.shape[-1]
        if self.maskStrategy == MaskStrategy['UNMASKED']:
            mask = torch.ones(length_z, length_x)
        elif self.maskStrategy == MaskStrategy['MASKED']:
            mask = torch.tril(torch.ones(length_z, length_x) == 1)
        v_out = attention(queries, keys, values, mask)
        #print("v_out:", v_out)
        v_out = v_out.reshape(-1, v_out.shape[-1])
        #print("v_out reshaped:", v_out)
        return torch.matmul(self.weight_out, v_out)




In [74]:
num_heads = 2
d_attn = 4
d_x = 4
d_z = 4
d_out = 4
d_mid = 2
mask_strategy = MaskStrategy['MASKED']
multi_headed_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, mask_strategy)

x = torch.rand(d_x, 4)
z = torch.rand(d_z, 3)

print("output:", multi_headed_attention(x, z))

output: tensor([[2.0343, 1.0845, 0.5154, 0.0000],
        [1.6330, 0.8570, 0.4020, 0.0000],
        [3.8696, 2.2448, 1.0726, 0.0000],
        [3.8732, 2.1996, 1.0571, 0.0000]], grad_fn=<MmBackward0>)


In [75]:
class LayerNorm(nn.Module):
    def __init__(self, feature_length):
        super().__init__()
        self.scale = nn.Parameter(torch.rand(feature_length, 1))
        self.offset = nn.Parameter(torch.rand(feature_length, 1))

    def forward(self, activations):
        mean = torch.mean(activations, -2, keepdim=True)
        #print("mean:", mean)
        #print("activations - mean", activations - mean)
        variance = torch.std(activations, -2, keepdim=True)
        return (((activations - mean) / variance) * self.scale) + self.offset

In [76]:
layer_norm = LayerNorm(3)

activations = torch.rand(3, 4)

print("activations:", activations)
print("layer_normed:", layer_norm(activations))

activations: tensor([[0.1838, 0.2010, 0.1765, 0.8587],
        [0.7776, 0.1199, 0.8638, 0.1066],
        [0.1084, 0.8448, 0.7043, 0.9275]])
layer_normed: tensor([[ 0.3603,  0.3599,  0.1265,  0.7073],
        [ 1.0719,  0.1192,  0.8819, -0.1287],
        [-0.1114,  0.2532,  0.0920,  0.1539]], grad_fn=<AddBackward0>)


In [77]:
class FeedForward(nn.Module):
    def __init__(self, hiddenLayerWidth, d_e):
        super().__init__()
        self.mlp1 = nn.Parameter(torch.rand(hiddenLayerWidth, d_e))
        self.mlp2 = nn.Parameter(torch.rand(d_e, hiddenLayerWidth))

    def forward(self, activations):
        activations = torch.matmul(self.mlp1, activations)
        activations = activations.relu()
        activations = torch.matmul(self.mlp2, activations)
        return activations


In [78]:
feed_forward = FeedForward(8, 4)
activations = torch.rand(4, 4)

print("activations:", activations)
print("feed forward:", feed_forward(activations))

activations: tensor([[0.3844, 0.2967, 0.8505, 0.7321],
        [0.8624, 0.1573, 0.9203, 0.3480],
        [0.0408, 0.8737, 0.2148, 0.3923],
        [0.0306, 0.0444, 0.5565, 0.1516]])
feed forward: tensor([[2.1880, 2.5828, 4.5060, 2.9975],
        [2.3387, 2.6883, 4.5283, 2.9903],
        [3.4061, 3.9161, 6.9070, 4.6564],
        [3.2265, 3.6029, 6.1579, 4.3486]], grad_fn=<MmBackward0>)


In [79]:
class EncoderLayer(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.multi_head_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'])
        self.layer_norm1 = LayerNorm(d_z)
        self.feed_forward = FeedForward(d_mlp, d_z)
        self.layer_norm2 = LayerNorm(d_z)



    def forward(self, z):
        z = z + self.multi_head_attention(z, z)
        z = self.layer_norm1(z)
        z = z + self.feed_forward(z)
        z = self.layer_norm2(z)
        return z

In [80]:
class Encoder(nn.Module):
    def __init__(self, num_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.layers = []
        for i in range(num_layers):
            encoder_layer = EncoderLayer(num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
            self.layers.append(encoder_layer)
        self.layers = nn.ModuleList(self.layers)

    def forward(self, z):
        for layer in self.layers:
            z = layer(z)
        return z

In [81]:
class DecoderLayer(nn.Module):
    def __init__(self, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.multi_head_self_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['MASKED'])
        self.layer_norm1 = LayerNorm(d_x)
        self.multi_head_global_attention = MultiHeadedAttention(num_heads, d_attn, d_x, d_z, d_out, d_mid, MaskStrategy['UNMASKED'])
        self.layer_norm2 = LayerNorm(d_x)
        self.feed_forward = FeedForward(d_mlp, d_x)
        self.layer_norm3 = LayerNorm(d_x)

    def forward(self, x, z):
        x = x + self.multi_head_self_attention(x, x)
        x = self.layer_norm1(x)
        x = x + self.multi_head_global_attention(x, z)
        x = self.layer_norm2(x)
        x = x + self.feed_forward(x)
        x = self.layer_norm3(x)
        return x

In [82]:
class Decoder(nn.Module):
    def __init__(self, num_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp):
        super().__init__()
        self.layers = []
        for i in range(num_layers):
            decoder_layer = DecoderLayer(num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
            self.layers.append(decoder_layer)
        self.layers = nn.ModuleList(self.layers)

    def forward(self, x, z):
        for layer in self.layers:
            x = layer(x, z)
        return x

In [113]:
class EncoderDecoderTransformer(nn.Module):
    def __init__(self, num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length):
        super().__init__()
        self.embedding = Embedding(d_e, vocab_size)
        self.positionalEmbedding = PositionalEmbedding(d_e, max_sequence_length)
        self.encoder = Encoder(num_encoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
        self.decoder = Decoder(num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp)
        self.unembedding = Unembedding(d_e, vocab_size)

    def forward(self, x, z):
        z = self.embedding(z)
        z = self.encoder(z)
        x = self.embedding(x)
        x = self.decoder(x, z)
        #print("x after decoder:", x.shape)
        x = self.unembedding(x)
        #print("x after unembedding:", x.shape)
        return x




In [84]:
enRawName = "drive/MyDrive/colab data/multi30kEnTrain.txt"
deRawName = "drive/MyDrive/colab data/multi30kDeTrain.txt"
en30kVal = "drive/MyDrive/colab data/multi30kEnVal.txt"
de30kVal = "drive/MyDrive/colab data/multi30kDeVal.txt"
englishCleanName = "data/english_tokens.pkl"
germanCleanName = "data/german_tokens.pkl"
englishSortedName = "data/englishSorted.pkl"
germanSortedName = "data/germanSorted.pkl"

truncEn = "drive/MyDrive/colab data/truncEn.pkl"
truncDe = "drive/MyDrive/colab data/truncDe.pkl"

enTokenizerName = "drive/MyDrive/colab data/enTokenizer.pkl"
deTokenizerName = "drive/MyDrive/colab data/deTokenizer.pkl"
pairsName = "drive/MyDrive/colab data/pairs.pkl"
folder = "drive/MyDrive/colab data/"

enTrainingFileName = folder + "enTraining"
deTrainingFileName = folder + "deTraining"
enTestFileName = folder + "enTest"
deTestFileName = folder + "deTest"
enValFileName = folder + "enValidation"
deValFileName = folder + "deValidation"

enCombinedFileName = folder + "enCombined"
deCombinedFileName = folder + "deCombined"

In [85]:
# clean a list of lines
def clean_lines(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    #table = str.maketrans('', '', string.punctuation)
    for line in lines:
        # normalize unicode characters
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        # tokenize on white space
        line = line.split()
        # convert to lower case
        #line = [word.lower() for word in line]
        # remove punctuation from each token
        #line = [word.translate(table) for word in line]
        # remove non-printable chars form each token
        line = [re_print.sub('', w) for w in line]
        # remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]
        # store as string
        cleaned.append(' '.join(line))
    return cleaned

In [86]:
def cleanLine(line, addSOS=False):
    # prepare regex for char filtering
    #re_print = re.compile('[^%s]' % re.escape(string.printable))
    # normalize unicode characters
    #line = normalize('NFD', line).encode('ascii', 'ignore')
    #line = line.decode('UTF-8')
    # tokenize on white space
    line = line.split()
    # convert to lower case
    #line = [word.lower() for word in line]
    # remove punctuation from each token
    #line = [word.translate(table) for word in line]
    # remove non-printable chars form each token
    #line = [re_print.sub('', w) for w in line]
    if addSOS:
        line.insert(0, "[SOS]")
    line.append("[EOS]")
    line = (' '.join(line))
    return line

In [87]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [89]:
def createDatasets(src_filename, target_filename, size=10000):
  enLines = to_sentences(load_doc(src_filename))
  deLines = to_sentences(load_doc(target_filename))

  temp = list(zip(enLines, deLines))
  random.shuffle(temp)
  res1, res2 = zip(*temp)
  # res1 and res2 come out as tuples, and so must be converted to lists.
  enLines, deLines = list(res1)[:size], list(res2)[:size]

  trainEnLines = enLines[:(int)(0.8*len(enLines))]
  testEnLines = enLines[(int)(0.8*len(enLines)):(int)(0.9*len(enLines))]
  valEnLines = enLines[(int)(0.9*len(enLines)):]

  trainDeLines = deLines[:(int)(0.8*len(deLines))]
  testDeLines = deLines[(int)(0.8*len(deLines)):(int)(0.9*len(deLines))]
  valDeLines = deLines[(int)(0.9*len(deLines)):]

  enFileNames = [enTrainingFileName, enTestFileName, enValFileName, enCombinedFileName]
  deFileNames = [deTrainingFileName, deTestFileName, deValFileName, deCombinedFileName]

  enFileNames = [name + suffix for name in enFileNames]
  deFileNames = [name + suffix for name in deFileNames]
  enDatasets = [trainEnLines, testEnLines, valEnLines, enLines]
  deDatasets = [trainDeLines, testDeLines, valDeLines, deLines]

  enPair = (enDatasets, enFileNames)
  dePair = (deDatasets, deFileNames)

  for pair in [enPair, dePair]:
    for i in range(len(pair[0])):
      writeLinesWithNewLines(pair[0][i], pair[1][i])


In [90]:
class SentenceDataset(Dataset):

    TOKENIZER_SUFFIX = "_tokenizer"

    def __init__(self, src_filename, tgt_filename, src_vocab_size, tgt_vocab_size):
        src_sentences = self.to_sentences(self.load_doc(src_filename))
        tgt_sentences = self.to_sentences(self.load_doc(tgt_filename))
        self.src_tokenizer, self.tgt_tokenizer = self.setup_tokenizers(src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, src_filename + SentenceDataset.TOKENIZER_SUFFIX, tgt_filename + SentenceDataset.TOKENIZER_SUFFIX)
        src_tokenized = self.src_tokenizer.encode_batch(src_sentences)
        tgt_tokenized = self.tgt_tokenizer.encode_batch(tgt_sentences)
        print(src_tokenized[0].ids)
        src_tensors = [torch.IntTensor(sequence.ids) for sequence in src_tokenized]
        tgt_tensor = [torch.IntTensor(sequence.ids) for sequence in tgt_tokenized]
        self.pairs = self.pair_sequences(src_tensors, tgt_tensor)

    # load doc into memory
    def load_doc(self, filename):
        # open the file as read only
        file = open(filename, mode='rt')
        # read all text
        text = file.read()
        # close the file
        file.close()
        return text


    def pair_sequences(self, src_sequences, tgt_sequences):
        pairs = []
        for i in range(len(src_sequences)):
            pairs.append((src_sequences[i], tgt_sequences[i]))
        return pairs

    # split a loaded document into sentences
    def to_sentences(self, doc):
        return doc.strip().split('\n')

    def setup_tokenizers(self, src_filename, tgt_filename, src_vocab_size, tgt_vocab_size, src_tokenizer_name, tgt_tokenizer_name):
        print("creating tokenizer for " + src_filename)
        src_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        src_tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size = src_vocab_size, special_tokens=["[SOS]", "[EOS]", "[PAD]", "[UNK]"])
        src_tokenizer.train([src_filename], trainer=trainer)
        pickle.dump(src_tokenizer, open(src_tokenizer_name, "wb"))

        print("creating tokenizer for " + tgt_filename)
        tgt_tokenizer = Tokenizer(BPE())
        tgt_tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size = tgt_vocab_size, special_tokens=["[SOS]", "[EOS]", "[PAD]", "[UNK]"])
        tgt_tokenizer.train([tgt_filename], trainer=trainer)
        pickle.dump(tgt_tokenizer, open(tgt_tokenizer_name, "wb"))
        return src_tokenizer, tgt_tokenizer

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index):
        return self.pairs[index]

In [91]:
sequenceDataset = SentenceDataset(enRawName, deRawName, 2000, 2000)

creating tokenizer for drive/MyDrive/colab data/multi30kEnTrain.txt
creating tokenizer for drive/MyDrive/colab data/multi30kDeTrain.txt
[138, 171, 13, 1808, 132, 1793, 120, 280, 338, 938, 210, 1221, 15]


In [92]:
print(sequenceDataset.__getitem__(0))

(tensor([ 138,  171,   13, 1808,  132, 1793,  120,  280,  338,  938,  210, 1221,
          15], dtype=torch.int32), tensor([ 160,  351,  649,  217,  444,  148,  457,  100,  121,  513,  788,  102,
        1207,  500,   14], dtype=torch.int32))


In [127]:
pair = sequenceDataset.__getitem__(0)
sequence_z = pair[0]
sequence_x = pair[1]

num_encoder_layers = 5
num_decoder_layers = 5
num_heads = 5
d_attn = 256
d_x = 256
d_z = 256
d_out = 256
d_mid = 256
d_mlp = 256
d_e = 256
vocab_size = 2000
max_sequence_length = 100

encoder_decoder_transformer = EncoderDecoderTransformer(num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length)
output = encoder_decoder_transformer(sequence_x, sequence_z)
print("output:", output)

output: tensor([[68.4374, 68.3614, 68.3729,  ..., 68.3615, 68.3619, 68.3617],
        [58.0023, 58.1410, 58.1555,  ..., 58.1412, 58.1416, 58.1413],
        [69.6137, 69.4154, 69.4182,  ..., 69.4154, 69.4157, 69.4155],
        ...,
        [70.6456, 70.6527, 70.6549,  ..., 70.6526, 70.6531, 70.6527],
        [70.6414, 70.5186, 70.5124,  ..., 70.5184, 70.5186, 70.5185],
        [61.1637, 61.2862, 61.2932,  ..., 61.2863, 61.2865, 61.2864]],
       grad_fn=<MmBackward0>)


In [124]:
def decode(x, tokenizer):
    x = torch.softmax(x, -1)
    x = torch.argmax(x, dim=-1)
    x = x.tolist()
    print("argmax x:", x)
    return tokenizer.decode(x)

In [129]:
opt = optim.Adam(encoder_decoder_transformer.parameters(), lr=0.01, betas=(0.9, 0.98), eps=1e-9)
loss_function = nn.CrossEntropyLoss()
print(sequence_z)
print(sequence_x)
for i in range(1000):
    output = encoder_decoder_transformer(sequence_x, sequence_z)
    decoded_output = decode(output, sequenceDataset.tgt_tokenizer)
    print("decoded output:", decoded_output)
    output = output.transpose(0, 1)
    loss = loss_function(output, sequence_x.long())
    print("loss:", loss)
    print()
    print()
    opt.zero_grad()
    loss.backward()
    opt.step()

tensor([ 138,  171,   13, 1808,  132, 1793,  120,  280,  338,  938,  210, 1221,
          15], dtype=torch.int32)
tensor([ 160,  351,  649,  217,  444,  148,  457,  100,  121,  513,  788,  102,
        1207,  500,   14], dtype=torch.int32)
argmax x: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 14, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 14, 0, 0, 0, 13, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 14, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 13, 0, 0, 0, 14, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 

KeyboardInterrupt: 