In [1]:
# Import PyTorch
import torch
from torch import nn

# Import torchvision 
import torchtext

# Import matplotlib for visualization
import matplotlib.pyplot as plt

import random

# Check versions
# Note: your PyTorch version shouldn't be lower than 1.10.0 and torchtext version shouldn't be lower than 0.11
print(f"PyTorch version: {torch.__version__}\ntorchtext version: {torchtext.__version__}")

PyTorch version: 2.1.2
torchtext version: 0.16.2


In [2]:
device = "mps" if torch.backends.mps.is_available() \
    else "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# Data

In [3]:
import pandas as pd

raw_df = pd.read_csv("/kaggle/input/machine-translation-dataset-de-en/translation_train.csv")
raw_df.sample(5)

Unnamed: 0,english,german
2975,A black dog plays with a bit of ice by the fro...,Ein schwarzer Hund spielt neben dem gefrorenen...
13767,A child in a white and green soccer uniform ki...,Ein Kind in einer weißen und grünen Fußballuni...
6668,An old man sits on a bench overlooking the water.,Ein alter Mann sitzt auf einer Bank mit Blick ...
24171,A group of people are riding down a roller coa...,Eine Gruppe von Menschen fährt eine Achterbahn...
7432,People stand on boat dock waiting for their bo...,Menschen stehen an einer Bootsanlegestelle und...


In [4]:
raw_df['english'][8550], raw_df['german'][8550]

('Man with helmet performing a trick while rollerblading.',
 'Mann mit Helm vollführt ein Kunststück auf Rollerblades.')

## Tokenization

In [5]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0mInstalling collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
from torchtext.data.utils import get_tokenizer

en_tokenizer = get_tokenizer(tokenizer = 'spacy', language = "en_core_web_sm")
de_tokenizer = get_tokenizer(tokenizer = 'spacy', language = "de_core_news_sm")

# apply tokenizer to our dataset
tokenized_en_df = raw_df['english'].map(en_tokenizer)
tokenized_de_df = raw_df['german'].map(de_tokenizer)

# Create vocab

In [7]:
from torchtext.vocab import build_vocab_from_iterator

en_vocab = build_vocab_from_iterator(
    tokenized_en_df,
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
)
en_vocab.set_default_index(en_vocab['<unk>'])

de_vocab = build_vocab_from_iterator(
    tokenized_de_df,
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
)
de_vocab.set_default_index(de_vocab['<unk>'])

In [8]:
print(f"length of english vocabulary: {len(en_vocab)}")
print(f"length of german vocabulary: {len(de_vocab)}")

length of english vocabulary: 6191
length of german vocabulary: 8014


In [9]:
from torchtext import transforms

en_transform = transforms.Sequential(
    ## converts the sentences to indices based on given vocabulary
    transforms.VocabTransform(vocab=en_vocab),

    ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is 1 as seen in previous section
    transforms.AddToken(en_vocab['<sos>'], begin=True),

    ## Add <eos> at end of each sentence. 2 because the index for <eos> in vocabulary is 2 as seen in previous section
    transforms.AddToken(en_vocab['<eos>'], begin=False),
    
    ## converts data into tensor
    transforms.ToTensor(),
    
#     ## padding
#     transforms.PadTransform(
#         max_length = tokenized_en_df.map(len).max() + 2, #2 for <sos> and <eos>
#         pad_value = 0)
    )

In [10]:
from torchtext import transforms

de_transform = transforms.Sequential(
    ## converts the sentences to indices based on given vocabulary
    transforms.VocabTransform(vocab=de_vocab),

    ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is 1 as seen in previous section
    transforms.AddToken(de_vocab['<sos>'], begin=True),
    
    ## converts data into tensor
    transforms.ToTensor(),
    
#     ## padding
#     transforms.PadTransform(
#         max_length = tokenized_de_df.map(len).max() + 1, #1 for <sos> 
#         pad_value = 0)
    )

In [11]:
from torchtext import transforms

label_transform = transforms.Sequential(
    ## converts the sentences to indices based on given vocabulary
    transforms.VocabTransform(vocab=de_vocab),

    ## Add <eos> at end of each sentence. 2 because the index for <eos> in vocabulary is 2 as seen in previous section
    transforms.AddToken(de_vocab['<eos>'], begin=False),
    
    ## converts data into tensor
    transforms.ToTensor(),
    
#     ## padding
#     transforms.PadTransform(
#         max_length = tokenized_de_df.map(len).max() + 1, #1 for <sos> 
#         pad_value = 0)
    )

## Make a dataset

In [12]:
# Write a custom dataset class (inherits from torch.utils.data.Dataset)
from torch.utils.data import Dataset


# 1. Subclass torch.utils.data.Dataset
class En_De_DatasetCustom(Dataset):
    
    # 2. Initialize with a target_dir and transform (optional) parameter
    def __init__(self, df, transform, is_test = False):
        
    # 3. Create class attributes
        # Get all image paths
        self.df = df
        # Setup transforms
        self.transform = transform
        # Check if df is used for test
        self.is_test = is_test
    
    # 5. Overwrite the __len__() method (optional but recommended for subclasses of torch.utils.data.Dataset)
    def __len__(self) -> int:
        "Returns the total number of samples."
        return len(self.df)
    
    # 6. Overwrite the __getitem__() method (required for subclasses of torch.utils.data.Dataset)
    def __getitem__(self, index: int):
        "Returns one sample of data, data and label (X, y)."
        if self.is_test == False:
            tokenized_en_text = en_tokenizer(self.df['english'].values[index])
            tokenized_de_text = de_tokenizer(self.df['german'].values[index])
            transformed_en_text = self.transform['en'](tokenized_en_text)
            transformed_de_text = self.transform['de'](tokenized_de_text)
            transformed_label = self.transform['label'](tokenized_de_text)
            return transformed_en_text, transformed_de_text, transformed_label # return data, label (X, y)
        else:
            tokenized_en_text = en_tokenizer(self.df['english'].values[index])
            transformed_en_text = self.transform['en'](tokenized_en_text)
            return transformed_en_text

In [13]:
En_De_Dataset = En_De_DatasetCustom(
    df = raw_df,
    transform = {'en': en_transform, 
                 'de': de_transform,
                 'label': label_transform}
)

In [14]:
En_De_Dataset[1]

(tensor([   1,  165,   36,    7,  335,  287,   17, 1224,    4,  758, 4496, 2957,
            5,    2]),
 tensor([   1,   84,   31,   10,  847, 2208,   15,    3,    4]),
 tensor([  84,   31,   10,  847, 2208,   15,    3,    4,    2]))

## Split dataset into train and validate

In [15]:
from torch.utils.data import Dataset, DataLoader, random_split

# Define the sizes of the splits
train_size = int(0.8 * len(En_De_Dataset))
val_size = len(En_De_Dataset) - train_size

# Use random_split to split the dataset
train_dataset, val_dataset = random_split(En_De_Dataset, [train_size, val_size])

## Prepare dataloader

In [16]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Separate data and labels
    en_sequences, de_sequences, labels = zip(*batch)
    # Pad the sequences
    padded_en_sequences = pad_sequence(en_sequences, batch_first=True, padding_value=0)
    padded_de_sequences = pad_sequence(de_sequences, batch_first=True, padding_value=0)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=0)

    
    return padded_en_sequences, padded_de_sequences, padded_labels

In [17]:
from torch.utils.data import DataLoader
import os

# Setup the batch size hyperparameter
BATCH_SIZE = 512
NUM_CORES = os.cpu_count()

# Turn datasets into iterables (batches)
train_dataloader = DataLoader(
    train_dataset, # dataset to turn into iterable
    batch_size=BATCH_SIZE, # how many samples per batch? 
    shuffle=True, # shuffle data every epoch?
    num_workers = NUM_CORES,
    pin_memory =True,
    collate_fn=collate_fn
    
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=1,
    shuffle=False, # don't necessarily have to shuffle the testing data
    num_workers = NUM_CORES,
    pin_memory =True,
    collate_fn=collate_fn
                            
)

# Let's check out what we've created
print(f"Dataloaders: {train_dataloader, val_dataloader}") 
print(f"Length of training dataset: {len(train_dataloader.dataset)}")
print(f"Length of train dataloader: {len(train_dataloader)} batches of {BATCH_SIZE}")
print(f"Length of val dataset: {len(val_dataloader.dataset)}")
print(f"Length of val dataloader: {len(val_dataloader)} batches of {1}")

Dataloaders: (<torch.utils.data.dataloader.DataLoader object at 0x7fcd7670cdf0>, <torch.utils.data.dataloader.DataLoader object at 0x7fcd7670dfc0>)
Length of training dataset: 23200
Length of train dataloader: 46 batches of 512
Length of val dataset: 5800
Length of val dataloader: 5800 batches of 1


In [18]:
input_sequences_batch, output_sequences_batch, labels_batch = next(iter(train_dataloader))
input_sequences_batch.shape, output_sequences_batch.shape, labels_batch.shape

(torch.Size([512, 30]), torch.Size([512, 32]), torch.Size([512, 32]))

In [19]:
input_sequences_batch, output_sequences_batch, labels_batch = next(iter(val_dataloader))
input_sequences_batch.shape, output_sequences_batch.shape, labels_batch.shape

(torch.Size([1, 13]), torch.Size([1, 11]), torch.Size([1, 11]))

# Model

In [20]:
import math
class PositionalEncoding(torch.nn.Module):
    def __init__(self, 
                 d_model = 512,
                 max_length = 5000):
        
        super().__init__()
        
        # Initialize Positional Encoding with zeros
        pe = torch.zeros(max_length, d_model)
        ### pe_shape = [max_length,d_model]
        
        # Get the position of tokens in a sequence
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(dim = 1)
        ### position_shape = [max_length,1]
        
        # Compute positional embedding
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(dim = 0)
        ### pe_shape = [1,max_length,d_model]
        
        self.register_buffer('pe', pe) # we don't train the pe
        
    def forward(self,x):
        ### x_shape = [N,seq,d_model]
        
        out = x + self.pe[:,:x.shape[1]]
        ### output_shape = [N,seq,d_model]
        
        return out

In [21]:
class InputEmbedding(torch.nn.Module):
    def __init__(self, vocab, d_model = 512, max_length = 5000):
        super().__init__()
        
        self.embedding = torch.nn.Embedding(
            num_embeddings = len(vocab), 
            embedding_dim = d_model, 
            padding_idx = vocab['<pad>']
        )
        self.pe = PositionalEncoding(
            d_model = d_model,
            max_length = max_length
        )
        
    def forward(self,x):
        ### x_shape = [N,seq]
        
        out = self.embedding(x)
        ### output_embedding_shape = [N,seq,d_model]
        
        out = self.pe(out)
        ### output_pe_shape = [N,seq,d_model]
        
        return out

In [22]:
class Transformer_model(torch.nn.Module):
    def __init__(self, 
                 d_model,
                 nhead,
                 dim_feedforward,
                 output_size,
                 src_vocab, tgt_vocab,
                 max_length = 5000):
        
        super().__init__()
        
        self.max_length = max_length
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        
        # The input Embedding
        self.src_embedding = InputEmbedding(
            vocab = src_vocab,
            d_model = d_model,
            max_length = max_length
        )
        
        self.tgt_embedding = InputEmbedding(
            vocab = tgt_vocab,
            d_model = d_model,
            max_length = max_length
        )
        
        # The core encoder-decoder transformer
        self.encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model = d_model,
            nhead = nhead,
            dim_feedforward = dim_feedforward,
            batch_first = True,
            bias = True
        )

        self.decoder_layer = torch.nn.TransformerDecoderLayer(
            d_model = d_model,
            nhead = nhead,
            dim_feedforward = dim_feedforward,
            batch_first = True,
            bias = True
        )

        self.encoder_transformer = torch.nn.TransformerEncoder(
            encoder_layer = self.encoder_layer,
            num_layers = 1
        )

        self.decoder_transformer = torch.nn.TransformerDecoder(
            decoder_layer = self.decoder_layer,
            num_layers = 1
        )

        # The classifier
        self.classifier = torch.nn.Sequential(
            torch.nn.LayerNorm(d_model),
            torch.nn.Linear(d_model, d_model),
            torch.nn.ReLU(),
            torch.nn.LayerNorm(d_model),
            torch.nn.Linear(d_model, output_size),
            torch.nn.LayerNorm(output_size)
        )
        
        
    def do_training(self, encoder_input, decoder_input, epoch):
        # encoder_input_shape = [N,seq_en], decoder_input_shape = [N,seq_de]

        encoder_padding_mask = generate_key_padding_mask(encoder_input)
        # encoder_padding_mask_shape = [N,seq_en]
        
        encoder_embedding = self.src_embedding(encoder_input)
        # encoder_embedding_shape = [N,seq_en,d_model]
        
        encoder_output = self.encoder_transformer(
            src = encoder_embedding,
            src_key_padding_mask = encoder_padding_mask
        )
        # encoder_output_shape = [N,seq_en,d_model]
        
        decoder_mask = generate_square_mask(decoder_input.shape[1])
        # decoder_mask_shape = [seq_de,seq_de]

        decoder_padding_mask = generate_key_padding_mask(decoder_input)
        # decoder_padding_mask_shape = [N,seq_de]

        decoder_embedding = self.tgt_embedding(decoder_input)
        # encoder_embedding_shape = [N,seq_de,d_model]

        decoder_output = self.decoder_transformer(
            tgt = decoder_embedding,
            tgt_mask = decoder_mask,
            tgt_key_padding_mask = decoder_padding_mask,
            memory = encoder_output,
            memory_key_padding_mask = encoder_padding_mask
        )
        # decoder_output_shape = [N,seq_de,d_model]

        decoder_output = self.classifier(decoder_output)
        # decoder_output_shape = [N,seq_de,tgt_vocab_size]
        
        return decoder_output
    
    def make_generation(self, encoder_input, search_strategy = 'beam_search', beam_size = 7):
        """
            Batch_size must be one 
        """
        assert encoder_input.shape[0] == 1, "batch_size != 1"
        
        if search_strategy == 'beam_search':
            generated_sequence = self.Beam_Search(beam_size, encoder_input)
        else:
            generated_sequence = self.Greedy_Search(encoder_input)
        
        return generated_sequence

In [23]:
def generate_square_mask(seq):
    mask = torch.ones(size = (seq, seq), 
                      device=device if device == 'cuda' else 'cpu')
    mask = (torch.triu(mask)).transpose(0, 1)
    mask = mask.type(torch.float32).masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [24]:
def generate_key_padding_mask(X):
    # X_shape = [N,seq]
    padding_mask = (X == 0)
    return padding_mask.type(torch.float32)

In [25]:
def Greedy_step(self, output_at_t):
    # output_at_t = [N,1,vocab_size]
    
    top_prob, top_idx = output_at_t.topk(1, dim = 2) 
    # top_idx_shape = [N,1,1]

    top_idx = top_idx.squeeze(1).detach()
    # top_idx_shape = [N,1]

    return top_idx

def Greedy_Search(self, encoder_input):
    # encoder_input_shape = [N,seq_en]

    encoder_padding_mask = generate_key_padding_mask( encoder_input)
    # encoder_padding_mask_shape = [N,seq_en]

    encoder_embedding = self.src_embedding(encoder_input)
    # encoder_embedding_shape = [N,seq_en,d_model]

    encoder_output = self.encoder_transformer(
        src = encoder_embedding,
        src_key_padding_mask = encoder_padding_mask
    )
    # encoder_output_shape = [N,seq_en,d_model]
            
    
    batch_size = encoder_input.shape[0]
    inputs = torch.ones(
        size = (batch_size, 1),
        dtype = torch.long,
        device = device if device == 'cuda' else 'cpu'
    ) * self.tgt_vocab['<sos>']
    # input_shape = [N,1]

    for i in range(self.max_length):
        inputs_mask = generate_square_mask(inputs.shape[1])
        # decoder_mask_shape = [seq_intput,seq_input]

        inputs_padding_mask = generate_key_padding_mask(inputs)
        # decoder_padding_mask_shape = [N,seq_input]

        inputs_embedding = self.tgt_embedding(inputs)
        # inputs_embedding_shape = [N,seq_input,d_model]

        outputs = self.decoder_transformer(
            tgt = inputs_embedding,
            memory = encoder_output,
            tgt_mask = inputs_mask,
            tgt_key_padding_mask = inputs_padding_mask,
            memory_key_padding_mask = encoder_padding_mask
        )
        # output_shape = [N,seq_input,d_model]
        
        outputs = outputs[:,-1].unsqueeze(dim=1)
        # output[:,-1]_shape = [N,d_model], output_shape = [N,1,d_model]

        outputs = self.classifier(outputs)
        # output_shape = [N,1,tgt_vocab_size]

        next_words = self.Greedy_step(torch.softmax(outputs,dim=2))
        # inputs_shape = [N,1]
        
        inputs = torch.cat([inputs, next_words], dim = 1)
        # inputs_shape = [N,seq_input]
        
        if next_words.item() == self.tgt_vocab['<eos>']:
            break
        
    generated_sequence = inputs
    # generated_sequence_shape = [N,seq_input]
        
    return generated_sequence.squeeze(0)

Transformer_model.Greedy_step = Greedy_step
Transformer_model.Greedy_Search = Greedy_Search

In [26]:
def Beam_step(self, beam_size, output_at_t):
    # output_at_t = [N,1,vocab_size]
    
    top_prob, top_idx = output_at_t.topk(beam_size, dim = 2) 
    # top_idx_shape = [N,1,beam_size]
    # top_prob_shape = [N,1,beam_size]

    top_idx = top_idx.squeeze(1).detach()
    # top_idx_shape = [N,beam_size]

    top_prob = top_prob.squeeze(1).detach()
    # top_prob_shape = [N,beam_size]
    
    return top_idx, top_prob

def Beam_Search(self, beam_size, encoder_input):
    # encoder_input_shape = [N,seq_en]

    encoder_padding_mask = generate_key_padding_mask(encoder_input)
    # encoder_padding_mask_shape = [N,seq_en]

    encoder_embedding = self.src_embedding(encoder_input)
    # encoder_embedding_shape = [N,seq_en,d_model]

    encoder_output = self.encoder_transformer(
        src = encoder_embedding,
        src_key_padding_mask = encoder_padding_mask
    )
    # encoder_output_shape = [N,seq_en,d_model]
            
    
    batch_size = encoder_input.shape[0]
    inputs = torch.ones(
        size = (batch_size, 1),
        dtype = torch.long,
        device = device if device == 'cuda' else 'cpu'
    ) * self.tgt_vocab['<sos>']
    # input_shape = [N,1]
    
    candidate_sequence = inputs
    # candidate_sequence_shape = [N,1]
    
    score = torch.log(torch.ones(size = [batch_size, 1], 
                                device = device if device == 'cuda' else 'cpu'))
    # score_shape = [N,1]
    
    candidates = [(candidate_sequence, score)]
    
    finished_candidates = []
    for i in range(self.max_length):
        
        new_candidates = []
        for candidate_sequence, score in candidates:
            # candidates_shape = beam_size * ()
            # candidate_sequence_shape = [N,seq]
            # score_shape = [N,1]

            last_token = candidate_sequence[:,-1].unsqueeze(-1)
            # last_token_shape = [N,1]
            
            if last_token.item() == self.tgt_vocab['<eos>']:
                finished_candidates.append((candidate_sequence, score))
                continue
            
            inputs_mask = generate_square_mask(candidate_sequence.shape[1])
            # decoder_mask_shape = [seq_intput,seq_input]

            inputs_padding_mask = generate_key_padding_mask(candidate_sequence)
            # decoder_padding_mask_shape = [N,seq_input]

            inputs_embedding = self.tgt_embedding(candidate_sequence)
            # inputs_embedding_shape = [N,seq_input,d_model]

            outputs = self.decoder_transformer(
                tgt = inputs_embedding,
                memory = encoder_output,
                tgt_mask = inputs_mask,
                tgt_key_padding_mask = inputs_padding_mask,
                memory_key_padding_mask = encoder_padding_mask
            )
            # output_shape = [N,seq_input,d_model]
            
            outputs = outputs[:,-1].unsqueeze(dim=1)
            # output[:,-1]_shape = [N,d_model], output_shape = [N,1,d_model]

            outputs = self.classifier(outputs)
            # output_shape = [N,1,tgt_vocab_size]
            
            top_tokens, top_probs = self.Beam_step(beam_size = beam_size, 
                                                   output_at_t = torch.softmax(outputs,dim=2))
            # top_tokens = [N,beam_size]
            # top_probs = [N,beam_size]
            
            for j in range(beam_size):
                new_candidate_sequence = torch.cat([candidate_sequence, top_tokens[:,j].unsqueeze(-1)], dim = 1) # top_tokens[:,j].unsqueeze(-1) -> shape: [N,1]
                # new_candidate_sequence_shape = [N,seq+1]
                new_score = score + torch.log(top_probs[:,j].unsqueeze(-1)) # top_probs[:,j].unsqueeze(-1) -> shape: [N,1]
                # new_score_shape = [N,1]
                new_candidates.append((new_candidate_sequence, new_score))
            
        
            new_candiadates = sorted(new_candidates, key=lambda x: x[1]/len(x[0]), reverse=True)
            candidates = new_candidates[:beam_size]
        
        
        if all([candidate_sequence[:,-1].unsqueeze(-1) == self.tgt_vocab['<eos>'] \
                for candidate_sequence, _, in candidates]):
            break
        
        if len(finished_candidates) == beam_size/2:
            break
            
    candidates.extend(finished_candidates)
    candidates = sorted(candidates, key=lambda x: x[1]/len(x[0]), reverse=True)
        
    return candidates[0][0].squeeze(0)

Transformer_model.Beam_step = Beam_step
Transformer_model.Beam_Search = Beam_Search

In [27]:
def convert_to_sentence(sequence, vocab):

    try:
        sos_idx = list(sequence).index(vocab['<sos>'])
    except ValueError as Error:
        sos_idx = -1
    
    try:
        eos_idx = list(sequence).index(vocab['<eos>'])
    except ValueError as Error:
        eos_idx = -1
        
    if eos_idx != -1:
        sequence = vocab.lookup_tokens(list(sequence)[sos_idx+1: eos_idx])
    else:
        sequence = vocab.lookup_tokens(list(sequence)[sos_idx+1:])
        
    sentences = ' '.join(sequence)
    
    return sentences

In [28]:
model_2 = Transformer_model(
    d_model = 512,          
    nhead = 8,
    dim_feedforward = 2048,
    output_size = len(de_vocab),
    src_vocab = en_vocab, 
    tgt_vocab = de_vocab,
    max_length = 50
)
model_2.to(device)

Transformer_model(
  (src_vocab): Vocab()
  (tgt_vocab): Vocab()
  (src_embedding): InputEmbedding(
    (embedding): Embedding(6191, 512, padding_idx=0)
    (pe): PositionalEncoding()
  )
  (tgt_embedding): InputEmbedding(
    (embedding): Embedding(8014, 512, padding_idx=0)
    (pe): PositionalEncoding()
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (decoder_layer): TransformerDecoderLayer(
    (self_attn): MultiheadAttention(
      (out_p

In [29]:
sample = input_sequences_batch[0].unsqueeze(0)

In [30]:
sequence = model_2.make_generation(sample.to(device))
sequence

tensor([   1,  120,  593, 3554, 4250, 2968, 3206, 7643, 6060, 4378, 3853, 6198,
        4892, 5744, 7689, 1334, 3724, 4425,   49, 7238, 5649, 5663, 1085, 3488,
        3702, 4599, 5740, 3991, 3553,  768, 4533, 2678, 7825, 2826, 6112, 7360,
        4212, 6746, 6725, 3272,  264, 7504, 6273, 5568, 3268, 1159, 1508, 3822,
        4385, 7351, 3337], device='cuda:0')

In [31]:
sentence = convert_to_sentence(sequence, de_vocab)
sentence

'Gras leeren poliert riesiger Struktur Bulldozer mexikanische Hebebühne Baseballplatz Meeresufer Kiosk Steaks Campingstühlen pastellfarbenen steinigen Fallschirmspringen Bürgersteigs neben bewegungslos Betonblock Birthday Mänteln fertigen Dirtbike Interessantes Campen Steinwand plantscht davor Geschirrspüler Ziege streitet Eiswagen Holztreppe ethnischer letzte Sommerkleider Sitzender Kamin Trikot hellrotes Latzhosen Augen-Makeup Inneren Schäferhund Waldweg Kälte Basketbälle erobern Radio'

# Train

In [32]:
# Set seeds
def set_seeds(seed: int=42):
    """Sets random sets for torch operations.

    Args:
        seed (int, optional): Random seed to set. Defaults to 42.
    """
    # Set the seed for general torch operations
    torch.manual_seed(seed)
    # Set the seed for CUDA torch operations (ones that happen on the GPU)
    torch.cuda.manual_seed(seed)

In [33]:
from timeit import default_timer as timer 

def print_train_time(start: float, end: float, device: torch.device = None):
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [34]:
def train_step(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               epoch,
               device: torch.device = 'cpu'):
    
    if device != 'cpu':
        model.to(device)
    
    train_loss = 0
    model.train()
    for batch, (X_encoder,X_decoder, y) in enumerate(data_loader):
        # Send data to GPU
        if device != 'cpu':
            X_encoder, X_decoder, y = X_encoder.to(device), X_decoder.to(device), y.to(device)

        # 1. Forward pass
        y_pred = model.do_training(X_encoder, X_decoder, epoch)
            # y_pred_shape = [N,seq_de,vocab_size]
        
        # 2. Calculate loss
        loss = loss_fn(y_pred.view(-1,y_pred.shape[2]), y.view(-1))
        train_loss += loss
				
        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss backward
        loss.backward()

        # 5. Optimizer step
        optimizer.step()

    # Calculate loss and accuracy per epoch and print out what's happening
    train_loss /= len(data_loader)

    return train_loss

def test_step(model: torch.nn.Module,
              data_loader: torch.utils.data.DataLoader,
              device: torch.device = 'cpu'):
    
    if device != 'cpu':
        model.to(device)

    model.eval() 
    with torch.inference_mode(): 
        
        X_encoder,X_decoder, y = list(data_loader)[7]
        
        # Send data to GPU
        if device != 'cpu':
            X_encoder, X_decoder, y = X_encoder.to(device), X_decoder.to(device), y.to(device)

        sequence = model.make_generation(X_encoder)

        # print translation results
        Generated_sentence = convert_to_sentence(sequence.cpu(), model.tgt_vocab)
        Input_sentence = convert_to_sentence(X_encoder.squeeze(0).cpu(), model.src_vocab)
        Label_sentence = convert_to_sentence(y.squeeze(0).cpu(), model.tgt_vocab)

            
    return Input_sentence, Label_sentence, Generated_sentence 

In [35]:
from typing import Dict, List, Tuple

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device = 'cpu') -> Dict[str, List]:
    
    # Create empty results dictionary
    results = {
        "Training_Loss": [],
    }

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
            Training_Loss = train_step(
                data_loader=train_dataloader, 
                model=model, 
                loss_fn=loss_fn,
                optimizer=optimizer,
                epoch=epoch,
                device=device
            )
            Translation_results = test_step(
                data_loader=test_dataloader,
                model=model,
                device=device
            )
            # Print out what's happening
            print(
                f"Epoch: {epoch} | Training_Loss: {Training_Loss:.4f} \n"
                f"Input_sentence: {Translation_results[0]}\n"
                f"Output_sentence: {Translation_results[1]}\n"
                f"Generated_sentence: {Translation_results[2]}\n"
            )

            # Update results dictionary
            results["Training_Loss"].append(Training_Loss.item())

  # Return the filled results at the end of the epochs
    return results

In [36]:
def plot_results(Training_Loss):
    plt.figure(figsize=(10,5))
    
    plt.plot(Training_Loss, color="blue", label="Training Loss")
    plt.title("Training and Test Loss curves")
    plt.xlabel("Epoch")
    plt.ylabel("Loss value")
    plt.legend()

    plt.tight_layout()
    plt.show()

In [37]:
# Setup loss function and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=model_2.tgt_vocab['<pad>']) 
optimizer = torch.optim.SGD(params=model_2.parameters(), lr=0.03)

In [None]:
from tqdm.auto import tqdm

set_seeds()

# Measure time
from timeit import default_timer as timer
train_time_start = timer()

# Setup the num_epochs hyperparameter
NUM_EPOCHS = 700

results = train(model=model_2,
                train_dataloader=train_dataloader,
                test_dataloader=val_dataloader,
                optimizer=optimizer,
                loss_fn=loss_fn,
                epochs=NUM_EPOCHS,
                device=device)

train_time_end = timer()
total_train_time_model_2 = print_train_time(start=train_time_start,
                                            end=train_time_end,
                                            device=device)

  0%|          | 0/500 [00:00<?, ?it/s]

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch: 0 | Training_Loss: 7.6023 
Input_sentence: A man is performing a <unk> <unk> as people walk by and watch his performance .
Output_sentence: Ein Mann <unk> ein <unk> Kunststück , während Menschen vorbeilaufen und seiner Darbietung zuschauen .
Generated_sentence: 

Epoch: 1 | Training_Loss: 6.3284 
Input_sentence: A man is performing a <unk> <unk> as people walk by and watch his performance .
Output_sentence: Ein Mann <unk> ein <unk> Kunststück , während Menschen vorbeilaufen und seiner Darbietung zuschauen .
Generated_sentence: Ein Mann in einem <unk> .

Epoch: 2 | Training_Loss: 5.9648 
Input_sentence: A man is performing a <unk> <unk> as people walk by and watch his performance .
Output_sentence: Ein Mann <unk> ein <unk> Kunststück , während Menschen vorbeilaufen und seiner Darbietung zuschauen .
Generated_sentence: Ein Mann in einem <unk> .

Epoch: 3 | Training_Loss: 5.7664 
Input_sentence: A man is performing a <unk> <unk> as people walk by and watch his performance .
Output_

In [None]:
plot_results(results["Training_Loss"])

In [None]:
from pathlib import Path
target_dir_path = Path("/kaggle/working/experiment")
target_dir_path.mkdir(parents=True,exist_ok=True)
model_name = 'simple_seq2seq.pt'
model_save_path = target_dir_path / model_name
torch.save(obj=model_2.state_dict(), f=model_save_path)

# Make Prediction

In [None]:
import pandas as pd

test_df = pd.read_csv("/kaggle/input/machine-translation-dataset-de-en/translation_test.csv")
test_df.head(5)

In [None]:
from torchtext.data.utils import get_tokenizer

en_tokenizer = get_tokenizer(tokenizer = 'spacy', language = "en_core_web_sm")

# apply tokenizer to our dataset
tokenized_en_test_df = test_df['english'].map(en_tokenizer)
tokenized_en_test_df

In [None]:
from torchtext import transforms

test_transform = transforms.Sequential(
    ## converts the sentences to indices based on given vocabulary
    transforms.VocabTransform(vocab=en_vocab),

    ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is 1 as seen in previous section
    transforms.AddToken(en_vocab['<sos>'], begin=True),

    ## Add <eos> at end of each sentence. 2 because the index for <eos> in vocabulary is 2 as seen in previous section
    transforms.AddToken(en_vocab['<sos>'], begin=False),
    
    ## converts data into tensor
    transforms.ToTensor()
)

In [None]:
Test_Dataset = En_De_DatasetCustom(
    df = test_df,
    transform = {'en': en_transform},
    is_test = True
)

In [None]:
from torch.utils.data import DataLoader
import os

# Setup the batch size hyperparameter
BATCH_SIZE = 1
NUM_CORES = os.cpu_count()

# Turn datasets into iterables (batches)
test_dataloader = DataLoader(
    Test_Dataset, # dataset to turn into iterable
    batch_size=BATCH_SIZE, # how many samples per batch? 
    shuffle = False,
    num_workers = NUM_CORES,
    pin_memory = True
    
)
# Let's check out what we've created
print(f"Dataloaders: {test_dataloader}") 
print(f"Length of training dataset: {len(test_dataloader.dataset)}")
print(f"Length of train dataloader: {len(test_dataloader)} batches of {BATCH_SIZE}")

In [None]:
def make_predictions(model: torch.nn.Module, 
                     data_loader: torch.utils.data.DataLoader, 
                     device: torch.device = 'cpu'):
    
    if device != 'cpu':
        model.to(device)
   
    model.eval()
    with torch.inference_mode():
        y = []
        pbar = tqdm(enumerate(data_loader), total=len(data_loader), desc="Predicting")
        for batch, X_encoder in pbar:
            # Send data to GPU
            if device != 'cpu':
                X_encoder= X_encoder.to(device)
            
            sequence = model.make_generation(X_encoder)
            Generated_sentence = convert_to_sentence(sequence.cpu(), model.tgt_vocab)
            y.append(Generated_sentence)
        
    return y

In [None]:
predictions = make_predictions(
    model = model_2, 
    data_loader = test_dataloader, 
    device = device)
    
test_df['generated'] = predictions

print(test_df['english'][7], test_df['german'][7], test_df['generated'][7], sep = '\n')
print()
print(test_df['english'][9], test_df['german'][9], test_df['generated'][9], sep = '\n')
print()
print(test_df['english'][12], test_df['german'][12], test_df['generated'][12], sep = '\n')