# Installations

In [4]:
!pip install torch==2.0.1 torchtext==0.15.2 spacy[cuda-autodetect]



In [9]:
# 1. Clean existing installs
!pip uninstall -y spacy cython numpy

# 2. Install latest compatible versions
!pip install --upgrade --no-cache-dir numpy cython
!pip install --no-binary spacy spacy==3.7.4  # Latest stable as of 2024

# 3. Verify
!python -c "import spacy; print(spacy.__version__)"

Found existing installation: spacy 3.8.4
Uninstalling spacy-3.8.4:
  Successfully uninstalled spacy-3.8.4
Found existing installation: Cython 3.0.12
Uninstalling Cython-3.0.12:
  Successfully uninstalled Cython-3.0.12
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cython
  Downloading Cython-3.0.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m314.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Cython-3.0.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

Collecting spacy==3.7.4
  Downloading spacy-3.7.4.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m[31m
[0m^C
^C


In [5]:
!pip install -U 'spacy[cuda-autodetect]' -q

In [3]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencie

In [6]:
# Verify installation
import torch
import torchtext
import spacy

print("PyTorch version:", torch.__version__)
print("TorchText version:", torchtext.__version__)

nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")  # For German

PyTorch version: 2.0.1+cu117
TorchText version: 0.15.2+cpu


# Imports

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
from functools import partial

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [9]:
random_seed = 42

# MultiHeadAttention

In [10]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Ensure d_model is divisible by num_heads for splitting into multiple heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model  # Dimension of the model
        self.num_heads = num_heads  # Number of attention heads
        self.d_k = d_model // num_heads  # Dimension of each head

        # Linear transformations for query, key, value, and output
        self.W_q = nn.Linear(d_model, d_model)  # Query weights
        self.W_k = nn.Linear(d_model, d_model)  # Key weights
        self.W_v = nn.Linear(d_model, d_model)  # Value weights
        self.W_o = nn.Linear(d_model, d_model)  # Output weights

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Compute attention scores (dot product of Q and K, scaled by sqrt(d_k))
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        # Apply mask (if provided) to prevent attention to certain positions
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        # Compute attention probabilities using softmax
        attn_probs = torch.softmax(attn_scores, dim=-1)
        # Compute the weighted sum of values (V) using attention probabilities
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        # Reshape input tensor to split into multiple heads
        batch_size, seq_len, d_model = x.size()
        return x.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        # Reshape tensor to combine multiple heads back into a single tensor
        batch_size, _, seq_len, d_k = x.size()
        return x.transpose(1, 2).reshape(batch_size, seq_len, self.d_model)

    def forward(self, Q, K, V, mask=None):
        # Apply linear transformations and split into multiple heads
        Q = self.split_heads(self.W_q(Q))  # Query
        K = self.split_heads(self.W_k(K))  # Key
        V = self.split_heads(self.W_v(V))  # Value

        # Compute attention output using scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        # Combine heads and apply output linear transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output

# Position wise Feed Forward Network

In [11]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        # First fully connected layer: expands dimension from d_model to d_ff
        self.fc1 = nn.Linear(d_model, d_ff)
        # Second fully connected layer: reduces dimension back to d_model
        self.fc2 = nn.Linear(d_ff, d_model)
        # ReLU activation function
        self.relu = nn.ReLU()

    def forward(self, x):
        # Apply the first fully connected layer followed by ReLU activation
        # Then apply the second fully connected layer
        return self.fc2(self.relu(self.fc1(x)))

# Positional Encoding

In [12]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        # Initialize a zero matrix to store positional encodings
        pe = torch.zeros(max_seq_length, d_model, device=device)

        # Create a tensor for positions (0 to max_seq_length - 1)
        position = torch.arange(0, max_seq_length, dtype=torch.float, device=device).unsqueeze(1)

        # Compute the divisor term for sinusoidal functions
        div_term = torch.pow(10_000, (-torch.arange(0, d_model, 2, device=device).float() / d_model))

        # Apply sine to even indices in the positional encoding matrix
        pe[:, 0::2] = torch.sin(position * div_term)
        # Apply cosine to odd indices in the positional encoding matrix
        pe[:, 1::2] = torch.cos(position * div_term)

        # Register the positional encoding matrix as a buffer (not a trainable parameter)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # Add positional encodings to the input tensor x
        # The encodings are truncated to match the sequence length of x
        return x + self.pe[:, :x.size(1)]

# Encoder Layer

In [13]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        # Multi-head self-attention mechanism
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # Position-wise feed-forward network
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # Layer normalization for the attention output
        self.norm1 = nn.LayerNorm(d_model)
        # Layer normalization for the feed-forward output
        self.norm2 = nn.LayerNorm(d_model)
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Compute self-attention output (Q, K, V are all x)
        attn_output = self.self_attn(x, x, x, mask)
        # Apply residual connection, dropout, and layer normalization
        x = self.norm1(x + self.dropout(attn_output))
        # Compute feed-forward network output
        ff_output = self.feed_forward(x)
        # Apply residual connection, dropout, and layer normalization
        x = self.norm2(x + self.dropout(ff_output))
        return x

# Decoder Layer

In [14]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        # Multi-head self-attention mechanism
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # Multi-head cross-attention mechanism
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        # Position-wise feed-forward network
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # Layer normalization for self-attention output
        self.norm1 = nn.LayerNorm(d_model)
        # Layer normalization for cross-attention output
        self.norm2 = nn.LayerNorm(d_model)
        # Layer normalization for feed-forward output
        self.norm3 = nn.LayerNorm(d_model)
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        # Compute self-attention output (Q, K, V are all x, with target mask)
        attn_output = self.self_attn(x, x, x, tgt_mask)
        # Apply residual connection, dropout, and layer normalization
        x = self.norm1(x + self.dropout(attn_output))
        # Compute cross-attention output (Q is x, K and V are encoder output, with source mask)
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        # Apply residual connection, dropout, and layer normalization
        x = self.norm2(x + self.dropout(attn_output))
        # Compute feed-forward network output
        ff_output = self.feed_forward(x)
        # Apply residual connection, dropout, and layer normalization
        x = self.norm3(x + self.dropout(ff_output))
        return x

# Transformer Model

In [15]:


class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        # Embedding layers for source and target sequences
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        # Positional encoding layer
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Stack of encoder and decoder layers
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        # Final linear layer to project decoder output to target vocabulary size
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        # Create source mask to ignore padding tokens (where src == 0)
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        # Create target mask to ignore padding tokens (where tgt == 0)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        # Create a "no-peak" mask to prevent attending to future tokens in the target sequence
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=device), diagonal=1)).bool()
        # Combine padding mask and no-peak mask for the target sequence
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        # Generate source and target masks
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        # Embed source and target sequences and add positional encoding
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        # Pass the embedded source sequence through the encoder layers
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Pass the embedded target sequence and encoder output through the decoder layers
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        # Project the decoder output to the target vocabulary size
        output = self.fc(dec_output)
        return output


# Load Data

In [16]:
# Download German-English dataset
!wget https://www.manythings.org/anki/deu-eng.zip

--2025-03-21 08:08:32--  https://www.manythings.org/anki/deu-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10364105 (9.9M) [application/zip]
Saving to: ‘deu-eng.zip’


2025-03-21 08:08:33 (27.2 MB/s) - ‘deu-eng.zip’ saved [10364105/10364105]



In [17]:

!unzip deu-eng.zip

Archive:  deu-eng.zip
  inflating: deu.txt                 
  inflating: _about.txt              


In [18]:
# file reading
with open('deu.txt', 'r') as f:
    lines = f.readlines()

In [19]:
len(lines)

277891

In [20]:
lines[10000]

"I'm doing fine.\tMir geht es gut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #239432 (CK) & #659407 (Esperantostern)\n"

In [21]:
# Remove everything after the 2nd tab character.
# As we can see above, we only need the first two columns of the data
lines = [line.split('\t') for line in lines]
lines = ['\t'.join(line[:2]) for line in lines]

In [22]:
lines[10000]

"I'm doing fine.\tMir geht es gut."

In [23]:
# Create train, val, test split
train_lines, val_test_lines = train_test_split(lines, test_size=0.2, random_state=random_seed, shuffle=True)
val_lines, test_lines = train_test_split(val_test_lines, test_size=0.5, random_state=random_seed, shuffle=True)

In [24]:
print(len(train_lines))
print(len(val_lines))
print(len(test_lines))

222312
27789
27790


In [25]:
train_lines[0]

"I've never seen Tom eat meat.\tIch habe Tom noch nie Fleisch essen sehen."

In [26]:
val_lines[0]

"There's probably a better solution.\tEs gibt vielleicht eine noch bessere Lösung."

In [27]:
test_lines[0]

"People who have children are happier than people who don't have children.\tLeute, die Kinder haben, sind glücklicher als solche ohne."

# Preprocess Data

In [28]:
SRC_LANGUAGE = "en"
TGT_LANGUAGE = "de"

In [29]:
tokenizer = {}
tokenizer[SRC_LANGUAGE] = get_tokenizer("spacy", "en_core_web_sm")
tokenizer[TGT_LANGUAGE] = get_tokenizer("spacy", "de_core_news_sm")

## Create Dataset

In [30]:
class SentencePairDataset(Dataset):
    def __init__(self, lines, src_tokenizer, tgt_tokenizer):
        super(SentencePairDataset, self).__init__()

        # Store the list of sentence pairs
        self.lines = lines
        # Tokenizers for source and target languages
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        # Return the total number of sentence pairs
        return len(self.lines)

    def __getitem__(self, idx):
        # Retrieve a sentence pair at the given index
        line = self.lines[idx]

        # Split the line into source and target sentences
        src, tgt = line.split('\t')
        # Tokenize the source and target sentences
        src_tokens = self.src_tokenizer(src)
        tgt_tokens = self.tgt_tokenizer(tgt)

        # Return the tokenized source and target sequences
        return src_tokens, tgt_tokens

In [31]:
train_ds = SentencePairDataset(train_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])
val_ds = SentencePairDataset(val_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])
test_ds = SentencePairDataset(test_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])

In [32]:
# Length of longest src sequence
print(max(len(x[0]) for x in train_ds))
print(max(len(x[0]) for x in val_ds))
print(max(len(x[0]) for x in test_ds))

111
53
35


In [33]:
# Length of longest tgt sequence
print(max(len(x[1]) for x in train_ds))
print(max(len(x[1]) for x in val_ds))
print(max(len(x[1]) for x in test_ds))

88
63
33


In [34]:
next(iter(train_ds))

(['I', "'ve", 'never', 'seen', 'Tom', 'eat', 'meat', '.'],
 ['Ich', 'habe', 'Tom', 'noch', 'nie', 'Fleisch', 'essen', 'sehen', '.'])

## Create Vocabulary

In [35]:
vocab = {}

In [36]:
src_vocab_size = 10_000
tgt_vocab_size = 10_000
max_seq_len = 100

PAD_IDX = 0
UNK_IDX = 1
BOS_IDX = 2
EOS_IDX = 3

special_symbols = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']

In [37]:
def yield_tokens(dataset, lang_idx=0):
    # Get the total number of items in the dataset
    n = len(dataset)
    # Initialize the index counter
    i = 0

    # Iterate through the dataset
    while i < n:
        # Yield the token sequence at the specified language index (0 for source, 1 for target)
        yield dataset[i][lang_idx]
        # Move to the next item in the dataset
        i += 1

In [38]:
src_iterator = yield_tokens(train_ds, lang_idx=0)
tgt_iterator = yield_tokens(train_ds, lang_idx=1)

In [39]:
vocab[SRC_LANGUAGE] = build_vocab_from_iterator(
    src_iterator,
    min_freq=1,
    specials=special_symbols,
    special_first=True,
    max_tokens=src_vocab_size,
)

In [40]:
vocab[TGT_LANGUAGE] = build_vocab_from_iterator(
    tgt_iterator,
    min_freq=1,
    specials=special_symbols,
    special_first=True,
    max_tokens=tgt_vocab_size,
)

In [41]:
vocab[SRC_LANGUAGE].set_default_index(UNK_IDX)
vocab[TGT_LANGUAGE].set_default_index(UNK_IDX)

In [42]:
vocab[SRC_LANGUAGE]['hello']

2203

In [43]:
vocab[TGT_LANGUAGE]['Hallo']

2345

In [44]:


def collate_fn(batch, vocab):
    # Get the batch size
    batch_size = len(batch)
    # Unzip the batch into source and target sequences
    srcs, tgts = zip(*batch)
    # Initialize tensors for source and target sequences with padding
    src_vectors = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)
    tgt_vectors = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)

    # Process each sequence in the batch
    for i in range(batch_size):
        # Convert source sequence to tensor: add BOS, EOS, and pad to max_seq_len
        src_vectors[i] = torch.tensor(
            ([BOS_IDX] + vocab[SRC_LANGUAGE](srcs[i]) + [EOS_IDX] + [0] * (max_seq_len - len(srcs[i])))[:max_seq_len],
            dtype=torch.long, device=device
        )
        # Convert target sequence to tensor: add BOS, EOS, and pad to max_seq_len
        tgt_vectors[i] = torch.tensor(
            ([BOS_IDX] + vocab[TGT_LANGUAGE](tgts[i]) + [EOS_IDX] + [0] * (max_seq_len - len(tgts[i])))[:max_seq_len],
            dtype=torch.long, device=device
        )

    # Return the processed source and target tensors
    return src_vectors, tgt_vectors


In [45]:
train_dataloader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn, vocab=vocab))
val_dataloader = DataLoader(val_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn, vocab=vocab))
test_dataloader = DataLoader(test_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn, vocab=vocab))

In [46]:
src_vocab_size = 10_000
tgt_vocab_size = 10_000
d_model = 512
num_heads = 4
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1
num_epochs = 1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1}\n------------------------------")
    transformer.train()
    for data in train_dataloader:
        src_data, tgt_data = data
        optimizer.zero_grad()
        output = transformer(src_data, tgt_data[:, :-1])
        loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch+1}, Training Loss: {loss.item()}")

    transformer.eval()
    with torch.no_grad():
        for data in val_dataloader:
            src_data, tgt_data = data
            output = transformer(src_data, tgt_data[:, :-1])
            loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
            print(f"Epoch: {epoch+1}, Validation Loss: {loss.item()}")

    torch.save(transformer.state_dict(), f'./transformer_state_dict_epoch_{epoch+1}')

Epoch: 1
------------------------------
Epoch: 1, Training Loss: 9.277532577514648
Epoch: 1, Training Loss: 8.17051887512207
Epoch: 1, Training Loss: 7.805712699890137
Epoch: 1, Training Loss: 7.453547954559326
Epoch: 1, Training Loss: 7.307225704193115
Epoch: 1, Training Loss: 7.313108444213867
Epoch: 1, Training Loss: 7.0290117263793945
Epoch: 1, Training Loss: 7.357789516448975
Epoch: 1, Training Loss: 6.998960971832275
Epoch: 1, Training Loss: 6.995582580566406
Epoch: 1, Training Loss: 6.985655784606934
Epoch: 1, Training Loss: 6.706568717956543
Epoch: 1, Training Loss: 6.706528663635254
Epoch: 1, Training Loss: 6.592241287231445
Epoch: 1, Training Loss: 6.605266571044922
Epoch: 1, Training Loss: 6.608693599700928
Epoch: 1, Training Loss: 6.416293621063232
Epoch: 1, Training Loss: 6.387775421142578
Epoch: 1, Training Loss: 6.289871692657471
Epoch: 1, Training Loss: 6.16506814956665
Epoch: 1, Training Loss: 6.160472393035889
Epoch: 1, Training Loss: 6.128838062286377
Epoch: 1, Train

In [47]:
transformer.eval()
with torch.no_grad():
    for data in test_dataloader:
        src_data, tgt_data = data
        output = transformer(src_data, tgt_data[:, :-1])
        loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
        print(f"Test Loss: {loss.item()}")

Test Loss: 1.6052852869033813
Test Loss: 1.7744276523590088
Test Loss: 1.7148017883300781
Test Loss: 1.682302713394165
Test Loss: 1.7974210977554321
Test Loss: 1.8694230318069458
Test Loss: 1.9955693483352661
Test Loss: 1.478837251663208
Test Loss: 1.612669825553894
Test Loss: 1.7183172702789307
Test Loss: 1.962624192237854
Test Loss: 1.717918872833252
Test Loss: 1.9392225742340088
Test Loss: 1.8702709674835205
Test Loss: 1.7806113958358765
Test Loss: 1.8002043962478638
Test Loss: 1.7125005722045898
Test Loss: 1.5445551872253418
Test Loss: 1.5599184036254883
Test Loss: 1.8407514095306396
Test Loss: 1.8727189302444458
Test Loss: 1.6432327032089233
Test Loss: 1.8154791593551636
Test Loss: 1.8881146907806396
Test Loss: 1.492788314819336
Test Loss: 1.6067529916763306
Test Loss: 1.7294621467590332
Test Loss: 2.1103644371032715
Test Loss: 1.979024887084961
Test Loss: 1.7265684604644775
Test Loss: 1.655185580253601
Test Loss: 1.3915770053863525
Test Loss: 1.550269365310669
Test Loss: 1.571853

# Inference

In [51]:
model_path = "./transformer_state_dict_epoch_1"
state_dict = torch.load(model_path)

src_vocab_size = 10_000
tgt_vocab_size = 10_000
d_model = 512
num_heads = 4
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1
num_epochs = 3

transformer_loaded = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)
transformer_loaded.load_state_dict(state_dict)

<All keys matched successfully>

In [52]:


def translate(src):
    # Tokenize the source sentence
    src_tokens = tokenizer[SRC_LANGUAGE](src)
    # Initialize the target tokens with <BOS> (beginning of sequence)
    tgt_tokens = ["<BOS>"]

    # Convert source tokens to tensor: add BOS, EOS, and pad to max_seq_len
    src_vectors = torch.tensor(
        ([BOS_IDX] + vocab[SRC_LANGUAGE](src_tokens) + [EOS_IDX] + [0] * (max_seq_len - len(src_tokens)))[:max_seq_len],
        dtype=torch.long, device=device
    ).unsqueeze(0)  # Add batch dimension

    # Generate target tokens step by step
    for i in range(max_seq_len):
        # Convert target tokens to tensor: pad to max_seq_len
        tgt_vectors = torch.tensor(
            (vocab[TGT_LANGUAGE](tgt_tokens) + [0] * (max_seq_len - len(tgt_tokens)))[:max_seq_len],
            dtype=torch.long, device=device
        ).unsqueeze(0)  # Add batch dimension

        # Pass source and target vectors through the transformer model
        output = transformer(src_vectors, tgt_vectors)
        # Get the predicted token index (argmax of softmax output)
        idx = torch.argmax(nn.functional.softmax(output, dim=2)[0][i]).item()
        # Append the predicted token to the target tokens
        tgt_tokens.append(vocab[TGT_LANGUAGE].lookup_token(idx))

        # Stop if <EOS> (end of sequence) is predicted
        if idx == EOS_IDX:
            break

    # Join the target tokens into a sentence, remove special tokens, and strip whitespace
    return " ".join(tgt_tokens).replace("<BOS>", "").replace("<EOS>", "").replace("<PAD>", "").strip()


In [53]:
translate("Hello, I am a student.")

'Ich bin Student .'

In [54]:
translate("My name is John.")

'Mein Name ist Johannes .'

In [55]:
translate("I am learning German.")

'Ich lerne .'

In [56]:
translate("I eat bananas.")

'Ich esse <UNK> .'

In [57]:
translate("I have three books and two pens.")

'Ich habe drei Bücher und zwei .'

In [58]:
translate("Do you work in an office?")

'<UNK> du in einem Büro ?'

In [59]:
translate("How are you?")

'Wie geht es dir ?'

In [60]:
eng, ger = test_lines[0].split('\t')
print(eng)
print(ger)
translate(eng)

People who have children are happier than people who don't have children.
Leute, die Kinder haben, sind glücklicher als solche ohne.


'Die Leute haben Kinder , die Leute haben , die Kinder nicht haben .'

In [61]:
eng, ger = test_lines[500].split('\t')
print(eng)
print(ger)
translate(eng)

You don't need to know everything.
Du musst nicht alles wissen.


'Du brauchst nicht alles wissen .'

In [62]:
eng, ger = train_lines[1000].split('\t')
print(eng)
print(ger)
translate(eng)

It could get complicated.
Es könnte schwierig werden.


'Es könnte kompliziert werden .'

In [63]:
eng, ger = train_lines[10000].split('\t')
print(eng)
print(ger)
translate(eng)

Do you think it's a good idea to feed your dog table scraps?
Meinst du, dass es eine gute Idee ist, deinen Hund mit Tischabfällen zu füttern?


'Denkst du , es ist ein guter Hund , dein Hund zu füttern ?'

# Export model and vocabulary

In [64]:
torch.save(vocab[SRC_LANGUAGE], "./vocab-english")
torch.save(vocab[TGT_LANGUAGE], "./vocab-german")

In [65]:
torch.save(tokenizer[SRC_LANGUAGE], "./tokenizer-english")
torch.save(tokenizer[TGT_LANGUAGE], "./tokenizer-german")

In [66]:
torch.save(transformer, "./transformer_model")