In [1]:
import os
import random

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

In [2]:
# Load the two spaCy pipelines whose tokenizers are used by tokenizer_ger / tokenizer_eng.
# NOTE(review): both model packages must already be installed in the kernel's
# environment; this cell does not download them.
spacy_ger = spacy.load('de_core_news_sm')
spacy_eng = spacy.load('en_core_web_sm')

In [3]:
def tokenizer_ger(text):
    """Split ``text`` with the German spaCy tokenizer and return the token strings as a list."""
    german_doc = spacy_ger.tokenizer(text)
    return [piece.text for piece in german_doc]

def tokenizer_eng(text):
    """Split ``text`` with the English spaCy tokenizer and return the token strings as a list."""
    english_doc = spacy_eng.tokenizer(text)
    return [piece.text for piece in english_doc]

# Sanity check: show how the English tokenizer splits a sample sentence
# (punctuation comes out as its own token; case is left unchanged here).
print(tokenizer_eng("Hello are you OK?"))

['Hello', 'are', 'you', 'OK', '?']


In [4]:
# One torchtext Field per language. Each uses its spaCy-based tokenizer, lowercases
# the text (lower=True), and is configured with '<sos>' / '<eos>' as the
# start-of-sequence and end-of-sequence tokens.
german = Field(tokenize=tokenizer_ger, lower=True, init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=tokenizer_eng, lower=True, init_token='<sos>', eos_token='<eos>')

In [5]:
# Load the Multi30k train/validation/test splits; the '.de' side is processed with
# the `german` field and the '.en' side with the `english` field.
train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(german, english))
# Build both vocabularies from the training split only, capped at max_size=10000
# entries and restricted to tokens occurring at least min_freq=2 times.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


In [6]:
class Encoder(nn.Module):
    """LSTM encoder that turns a batch of token-index sequences into a final LSTM state.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        embedding_size: dimensionality of each token embedding.
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout_p: dropout probability, applied to the embeddings and passed to
            the LSTM's own ``dropout`` argument.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p=dropout_p)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_p,
        )

    def forward(self, x):
        """Encode ``x`` of shape (seq_length, N) and return the final ``(hidden, cell)`` pair.

        The per-step LSTM outputs are discarded; only the final state is returned.
        """
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell


class Decoder(nn.Module):
    """Single-step LSTM decoder that maps the previous token to scores over the vocabulary.

    Args:
        input_size: size of the embedding table (the English vocabulary).
        embedding_size: dimensionality of each token embedding.
        hidden_size: number of features in the LSTM hidden state.
        output_size: number of output scores per step; should be the same as
            ``input_size``.
        num_layers: number of stacked LSTM layers.
        dropout_p: dropout probability, applied to the embeddings and passed to
            the LSTM's own ``dropout`` argument.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p=dropout_p)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: token indices of shape (N,), one per batch element.
            hidden: LSTM hidden state to continue from.
            cell: LSTM cell state to continue from.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has shape
            (N, output_size) and the states are the updated LSTM states.
        """
        # The LSTM expects a leading sequence axis: (N,) -> (1, N).
        step_input = x.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))

        # rnn_out: (1, N, hidden_size)
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # (1, N, output_size) -> (N, output_size)
        scores = self.fc(rnn_out).squeeze(0)
        return scores, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence one step at a time.

    Args:
        encoder: module whose ``forward(source)`` returns ``(hidden, cell)``.
        decoder: module whose ``forward(x, hidden, cell)`` returns
            ``(predictions, hidden, cell)`` and which exposes its output layer
            as ``decoder.fc`` (an ``nn.Linear``).
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode ``target`` step by step, conditioned on the encoded ``source``.

        Args:
            source: source token indices, shape (src_len, N).
            target: target token indices, shape (trg_len, N); row 0 holds the
                start tokens.
            teacher_force_ratio: probability, drawn once per time step, of
                feeding the ground-truth token instead of the model's own
                best guess as the next decoder input.

        Returns:
            Tensor of shape (trg_len, N, target_vocab_size). Row 0 is never
            written and stays all zeros.
        """
        batch_size = source.shape[1]
        # (trg_len, N)
        target_len = target.shape[0]
        # Take the vocabulary size from the decoder's output layer and the device
        # from the input, rather than from notebook-level globals, so the module
        # does not depend on which cells were executed before it is called.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # grab start token
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            # output: (N, target_vocab_size) -> index of the highest score per row
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [11]:

from datetime import datetime

def print_time():
    """Print the current local wall-clock time as ``Current Time = HH:MM:SS``."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print("Current Time =", stamp)

# ---- Run configuration -------------------------------------------------------
num_epochs = 16
learning_rate = 0.001
batch_size = 64
load_model = False  # not read anywhere in this cell
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs instead of failing on a CUDA-less machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Seed the RNGs used for weight initialisation, dropout and the teacher-forcing
# coin flips so that a Restart & Run All is repeatable.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

# ---- Logging -----------------------------------------------------------------
writer = SummaryWriter('runs/loss_plot')
step = 0  # global batch counter, used as the TensorBoard x-axis

# ---- Data iterators ----------------------------------------------------------
# Batches are sorted by source length within each batch (sort_within_batch=True
# with sort_key = len(x.src)).
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key = lambda x : len(x.src),
    device=device)

# ---- Model, optimizer, loss --------------------------------------------------
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padding positions in the target do not contribute to the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# ---- Training ----------------------------------------------------------------
model.train()  # make training mode (active dropout) explicit
for epoch in range(num_epochs):
    print(f'Epoch {epoch} / {num_epochs}')
    print_time()
    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)
        output = model(inp_data, target)
        # output shape: (trg_len, batch_size, output_dim)
        # Drop time step 0 (never predicted by the model) and flatten the time
        # and batch axes so the loss sees (positions, vocab) vs (positions,).
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()

        # Clip the global gradient norm to 1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # Log a plain Python float rather than a tensor attached to the graph.
        writer.add_scalar('Training Loss', loss.item(), global_step=step)
        step += 1

# Make sure every logged scalar has been written to disk.
writer.flush()

# final save
print_time()
to_save = {"step": step, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
save_path = f'.models/{step}.save'
# torch.save does not create missing directories, so create the target first.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(to_save, save_path)

        



Epoch 0 / 16
Current Time = 15:01:55
Epoch 1 / 16
Current Time = 15:03:25
Epoch 2 / 16
Current Time = 15:04:56
Epoch 3 / 16
Current Time = 15:06:27
Epoch 4 / 16
Current Time = 15:07:57
Epoch 5 / 16
Current Time = 15:09:29
Epoch 6 / 16
Current Time = 15:11:00
Epoch 7 / 16
Current Time = 15:12:30
Epoch 8 / 16
Current Time = 15:14:01
Epoch 9 / 16
Current Time = 15:15:32
Epoch 10 / 16
Current Time = 15:17:02
Epoch 11 / 16
Current Time = 15:18:33
Epoch 12 / 16
Current Time = 15:20:04
Epoch 13 / 16
Current Time = 15:21:35
Epoch 14 / 16
Current Time = 15:23:06
Epoch 15 / 16
Current Time = 15:24:37
Current Time = 15:26:08
