<a href="https://colab.research.google.com/github/JackWittmayer/Transformer-Implementation/blob/main/EDTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
folder = "drive/MyDrive/colab data/Transformer-Implementation"

In [3]:
import sys
sys.path.insert(0, folder)

In [5]:
import torch
from torch.utils.data import DataLoader
from torch import nn

from src.model.encoder_decoder_transformer import EncoderDecoderTransformer
from src.training.trainer import train_model
from src.dataset.train_and_validation_sequence_datasets import (
    TrainAndValidationSequenceDatasets,
)
from src.dataset.pad_collate import PadCollate
from datetime import datetime

In [6]:
num_encoder_layers = 4
num_decoder_layers = 4
num_heads = 8
d_attn = 256
d_x = 256
d_z = 256
d_out = 256
d_mid = 256
d_mlp = 512
d_e = 256
max_sequence_length = 100
p_dropout = 0.1
vocab_size = 10000

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

enRawName = folder + "/multi30kEnAll.txt"
deRawName = folder + "/multi30kDeAll.txt"

In [9]:
saveDirectory = "./"
nameSuffix = ""
state_dict_filename = (
    saveDirectory
    + "encoder_decoder_transformer_state_dict_"
    + datetime.today().strftime("%Y-%m-%d %H")
    + nameSuffix
)

train_and_validation_sequence_datasets = TrainAndValidationSequenceDatasets(
    enRawName, deRawName, 0, 29000, 29000, 30014
)
custom_encoder_decoder_transformer = EncoderDecoderTransformer(
    num_encoder_layers,
    num_decoder_layers,
    num_heads,
    d_attn,
    d_x,
    d_z,
    d_out,
    d_mid,
    d_mlp,
    d_e,
    vocab_size,
    max_sequence_length,
    p_dropout,
    device
).to(device)
custom_encoder_decoder_transformer.src_embedding.table = custom_encoder_decoder_transformer.src_embedding.table.to(device)
custom_encoder_decoder_transformer.tgt_embedding.table = custom_encoder_decoder_transformer.tgt_embedding.table.to(device)
custom_encoder_decoder_transformer.positionalEmbedding.table = custom_encoder_decoder_transformer.positionalEmbedding.table.to(device)
train_dataset = train_and_validation_sequence_datasets.train_dataset
val_dataset = train_and_validation_sequence_datasets.val_dataset
pad_collate = PadCollate(enRawName, deRawName, vocab_size, vocab_size)
train_dataloader = DataLoader(train_dataset, batch_size=256, collate_fn=pad_collate)
val_dataloader = DataLoader(val_dataset, batch_size=256, collate_fn=pad_collate)
train_model(
    custom_encoder_decoder_transformer,
    train_dataloader,
    val_dataloader,
    pad_collate.tgt_tokenizer,
    device,
    state_dict_filename
)

creating tokenizer for drive/MyDrive/colab data/Transformer-Implementation/multi30kEnAll.txt
creating tokenizer for drive/MyDrive/colab data/Transformer-Implementation/multi30kDeAll.txt
<generator object Module.parameters at 0x77fd3c454dd0>
Completed training step 0
Completed training step 20
Completed training step 40
Completed training step 60
Completed training step 80
Completed training step 100
Completed validation step 0
epoch 0 took 21.499788284301758
avg training loss: 7.658419458489669
avg validation loss: 6.705053806304932
expected train output Zwei Frauen stehen im Freien und blicken mit einem kom ischen Gesichtsausdruck etwas an , das den anderen Personen um sie herum nicht aufzu fallen scheint .
argmax x: [109, 124, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1, 14, 1, 14, 1, 14, 14, 14, 1, 1, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]
decoded train output: Ein Mann und und und und und und und und u

In [10]:
model = EncoderDecoderTransformer(num_encoder_layers, num_decoder_layers, num_heads, d_attn, d_x, d_z, d_out, d_mid, d_mlp, d_e, vocab_size, max_sequence_length, p_dropout, device).to(device)
state_dict_filename = "/content/encoder_decoder_transformer_state_dict_2024-08-14 14"
state_dict = torch.load(state_dict_filename, map_location = device)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [12]:
def predict_from_tokens(model, input, src_tokenizer, tgt_tokenizer):
    model.disable_subsequent_mask()
    src_tokenizer.no_padding()
    tgt_tokenizer.no_padding()

    src_tokenizer.no_truncation()
    tgt_tokenizer.no_truncation()
    src_sequence = input
    print(src_sequence)
    src_sequence = src_tokenizer.encode(src_sequence)
    print(src_sequence)
    print(src_tokenizer.decode(src_sequence.ids))
    src_sequence = torch.IntTensor(src_sequence.ids).unsqueeze(0).to(device)
    print("src tokens", src_sequence)
    tgt_sequence = torch.IntTensor([0]).unsqueeze(0).to(device)
    src_mask = torch.ones(src_sequence.shape, dtype=torch.int32).to(device)
    print("decoder input", tgt_sequence)
    predictions = []
    with torch.no_grad():
        model.eval()
        length_gen = 100
        for i in range(length_gen):
            tgt_mask = torch.ones(tgt_sequence.shape, dtype=torch.int32).to(device)
            prediction = model(src_sequence, tgt_sequence, src_mask, tgt_mask)
            prediction = torch.softmax(prediction, -1)
            prediction = torch.argmax(prediction, dim=-1)
            print("argmax prediction:", prediction)
            print("actual prediction:", tgt_tokenizer.decode(prediction[0].tolist()))
            last_token = prediction[0][-1]
            tgt_sequence = torch.cat((tgt_sequence, last_token.unsqueeze(0).unsqueeze(0)), dim=-1)
            if last_token == 1:
                break
    return tgt_sequence

pad_collate = PadCollate(enRawName, deRawName, vocab_size, vocab_size)
tgt_sequence = predict_from_tokens(model, "A man eats an apple in the park.", pad_collate.src_tokenizer, pad_collate.tgt_tokenizer)
print(tgt_sequence)
print(pad_collate.tgt_tokenizer.decode(tgt_sequence[0].tolist()))

creating tokenizer for drive/MyDrive/colab data/Transformer-Implementation/multi30kEnAll.txt
creating tokenizer for drive/MyDrive/colab data/Transformer-Implementation/multi30kDeAll.txt
A man eats an apple in the park.
Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
A man eats an apple in the park .
src tokens tensor([[  30,   93, 2354,   84, 2933,   83,   94,  411,   15]],
       device='cuda:0', dtype=torch.int32)
decoder input tensor([[0]], device='cuda:0', dtype=torch.int32)
argmax prediction: tensor([[109]], device='cuda:0')
actual prediction: Ein
argmax prediction: tensor([[124, 124]], device='cuda:0')
actual prediction: Mann Mann
argmax prediction: tensor([[898, 124, 898]], device='cuda:0')
actual prediction: isst Mann isst
argmax prediction: tensor([[898, 124, 148, 100]], device='cuda:0')
actual prediction: isst Mann im in
argmax prediction: tensor([[898, 124, 148, 144, 111]], device='cuda:0')
actual predicti