In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [2]:
from datasets import load_dataset # load dataset from huggingface
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

## Dataset

In [3]:
dataset = load_dataset("ted_talks_iwslt", language_pair=("de", "en"), year="2016", trust_remote_code=True)

## Tokenizer

In [4]:
tokenizer = get_tokenizer("basic_english")

In [5]:
def yield_tokens_from_hf(split):
    for item in split:
        de_text = item["translation"]["de"]
        en_text = item["translation"]["en"]
        yield tokenizer(de_text) + tokenizer(en_text)

## Vocab

In [6]:
train_split = dataset["train"]
vocab = build_vocab_from_iterator(yield_tokens_from_hf(train_split), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [7]:
def collate_batch(batch):
    for item in batch:
        en_vector = vocab(tokenizer(item["translation"]["en"]))
        de_vector = vocab(tokenizer(item["translation"]["de"]))
    en_vector = torch.tensor(en_vector, dtype=torch.int64)
    de_vector = torch.tensor(de_vector, dtype=torch.int64)
    return en_vector, de_vector

In [8]:
dataloader = DataLoader(dataset["train"], batch_size=4, shuffle=False, collate_fn=collate_batch)

## Model

In [9]:
class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6):
        super(TransformerModel, self).__init__()
        self.embedding_src = nn.Embedding(src_vocab_size, d_model)
        self.embedding_trg = nn.Embedding(trg_vocab_size, d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers
        )
        self.fc = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        src = self.embedding_src(src)
        trg = self.embedding_trg(trg)
        output = self.transformer(src, trg)
        output = self.fc(output)
        return output

In [10]:
model = TransformerModel(
    src_vocab_size=len(vocab),
    trg_vocab_size=len(vocab),
    d_model=64,
    nhead=1,
    num_encoder_layers=1,
    num_decoder_layers=1
)
model

TransformerModel(
  (embedding_src): Embedding(28984, 64)
  (embedding_trg): Embedding(28984, 64)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=64, bias=True)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0): Tr

## Loss and optimizer

In [11]:
criterion = nn.CrossEntropyLoss()

In [12]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Training

In [13]:
num_epochs = 10
for epoch in range(num_epochs):
    print("Epoch", epoch)
    for en_vector, de_vector in dataloader:
        optimizer.zero_grad()
        output = model(en_vector, de_vector)
        output = output[1:].view(-1, len(vocab))
        loss = criterion(output, de_vector[1:].view(-1))
        loss.backward()
        optimizer.step()

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
