# **Machine Translation using RNNs**

## **Dataset**

In [None]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ibis-framework 7.1.0 requires pyarrow<15,>=2, but you have pyarrow 15.0.0 which is incompatible.[0m[31m
[0m

In [None]:
from datasets import load_dataset

# Fetch the English-Vietnamese translation corpus (config "iwslt2015-en-vi")
# with its train / validation / test splits.
data = load_dataset(path="mt_eng_vietnamese", name="iwslt2015-en-vi")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133318 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1269 [00:00<?, ? examples/s]

In [None]:
# Inspect the training split: its feature names and number of rows.
data['train']

Dataset({
    features: ['translation'],
    num_rows: 133318
})

In [None]:
# Peek at the first training pair. Indexing the row first avoids materialising
# the entire 'translation' column just to read a single example.
data['train'][0]['translation']

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## **Tokenization**

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Language codes; they double as the keys of every per-language lookup table.
SRC_LANGUAGE, TGT_LANGUAGE = 'en', 'vi'

# Per-language tokenizers. Both sides use torchtext's 'basic_english' tokenizer.
token_transform = {
    SRC_LANGUAGE: get_tokenizer('basic_english'),
    TGT_LANGUAGE: get_tokenizer('basic_english'),
}
# Per-language vocabularies, filled in once they have been built.
vocab_transform = {}

# Special tokens: each symbol's position in the list equals its index constant.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(4)

In [None]:
def yield_tokens(data_iter, lang):
    """Lazily yield the token list of every ``lang`` sentence in a split.

    Args:
        data_iter: Object whose ``'translation'`` entry iterates over dicts
            keyed by language code.
        lang: Language code selecting both the sentence and its tokenizer.

    Yields:
        The tokenizer output for each sentence, in dataset order.
    """
    yield from (
        token_transform[lang](pair[lang])
        for pair in data_iter['translation']
    )


# Vocabularies are built from the training split only.
train_iter = data['train']

for lang in (SRC_LANGUAGE, TGT_LANGUAGE):
    # The special symbols are placed first, so they receive indices 0..3.
    vocab_transform[lang] = build_vocab_from_iterator(
        yield_tokens(train_iter, lang),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )
    # Tokens that are not in the vocabulary fall back to UNK_IDX.
    vocab_transform[lang].set_default_index(UNK_IDX)

In [None]:
# First ten entries of the source vocabulary, in index order.
vocab_transform[SRC_LANGUAGE].get_itos()[:10]

['<unk>', '<pad>', '<bos>', '<eos>', ',', '.', 'the', 'and', 'to', '&apos']

In [None]:
# First ten entries of the target vocabulary, in index order.
vocab_transform[TGT_LANGUAGE].get_itos()[:10]

['<unk>', '<pad>', '<bos>', '<eos>', ',', '.', 'và', 'tôi', 'là', 'một']

In [None]:
# Vocabulary sizes as (source, target); these set the embedding/output dimensions of the model.
len(vocab_transform[SRC_LANGUAGE]), len(vocab_transform[TGT_LANGUAGE])

(47271, 21114)

## **Dataloader**

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Length threshold consulted by truncate() when clipping padded batches.
MAX_LEN = 100

def sequential_transforms(*transforms):
    """Compose callables into a single left-to-right pipeline.

    Args:
        *transforms: Callables applied in the order given; each receives the
            previous callable's return value.

    Returns:
        A one-argument function that threads its input through every
        transform and returns the final result (the input itself when no
        transforms were supplied).
    """
    def pipeline(txt_input):
        value = txt_input
        for step in transforms:
            value = step(value)
        return value

    return pipeline

def tensor_transform(token_ids):
    """Wrap a sequence of token indices with BOS/EOS and return it as one tensor.

    Args:
        token_ids: Token indices for a single sentence.

    Returns:
        A 1-D tensor laid out as ``[BOS_IDX, *token_ids, EOS_IDX]``.
    """
    start = torch.tensor([BOS_IDX])
    body = torch.tensor(token_ids)
    end = torch.tensor([EOS_IDX])
    return torch.cat((start, body, end))

# Per-language pipelines turning a raw sentence into a tensor of token indices:
# tokenize -> look up vocabulary ids -> wrap with BOS/EOS.
text_transform = {
    lang: sequential_transforms(
        token_transform[lang],
        vocab_transform[lang],
        tensor_transform,
    )
    for lang in (SRC_LANGUAGE, TGT_LANGUAGE)
}

def truncate(sample, max_len=None):
    """Clip a padded batch to at most ``max_len`` positions along the sequence axis.

    Args:
        sample: 2-D tensor laid out as (batch, sequence), e.g. the result of
            ``pad_sequence(..., batch_first=True)``.
        max_len: Maximum number of sequence positions to keep. When omitted,
            the module-level ``MAX_LEN`` is used.

    Returns:
        ``sample`` itself when it is already short enough; otherwise a slice
        holding the first ``max_len`` positions of every row.
    """
    if max_len is None:
        max_len = MAX_LEN
    # Dimension 0 is the batch; the padded sequence length lives in dimension 1.
    if sample.size(1) > max_len:
        # Keep the leading tokens of every row. Rows longer than max_len lose
        # their tail, including the trailing EOS marker.
        return sample[:, :max_len]
    return sample

def collate_fn(batch):
    """Turn a list of translation pairs into padded source/target id tensors.

    Args:
        batch: Iterable of dicts holding one sentence per language code.

    Returns:
        ``(src_batch, tgt_batch)``: int64 tensors with the batch on the first
        axis, padded with ``PAD_IDX`` and passed through ``truncate``.
    """
    def _pad_and_clip(sequences):
        # Right-pad every sentence to the longest one in the batch, then clip.
        padded = pad_sequence(sequences, padding_value=PAD_IDX, batch_first=True)
        return truncate(padded)

    src_tensors = []
    tgt_tensors = []
    for pair in batch:
        src_text, tgt_text = pair[SRC_LANGUAGE], pair[TGT_LANGUAGE]
        # The explicit cast keeps the ids int64 whatever dtype the transform produced.
        src_tensors.append(text_transform[SRC_LANGUAGE](src_text).to(dtype=torch.int64))
        tgt_tensors.append(text_transform[TGT_LANGUAGE](tgt_text).to(dtype=torch.int64))

    src_batch = _pad_and_clip(src_tensors)
    tgt_batch = _pad_and_clip(tgt_tensors)
    return src_batch, tgt_batch

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32


def _build_loader(split):
    """Wrap the translation pairs of one dataset split in a batched DataLoader."""
    return DataLoader(
        data[split]['translation'],
        batch_size=BATCH_SIZE,
        collate_fn=collate_fn,
    )


# NOTE(review): no `shuffle` argument is passed, so every loader (including the
# training one) iterates in dataset order.
train_dataloader = _build_loader('train')
valid_dataloader = _build_loader('validation')
test_dataloader = _build_loader('test')

In [None]:
# Pull the first batch from the training loader; it is reused by the smoke tests further down.
src_ids, tgt_ids = next(iter(train_dataloader))

In [None]:
# Shapes of the padded source and target id batches: (batch, padded length).
src_ids.shape, tgt_ids.shape

(torch.Size([32, 52]), torch.Size([32, 78]))

## **Model**

In [None]:
import torch.nn as nn

class EncoderRNN(nn.Module):
    """Embeds source token ids and encodes them with a single GRU.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Width shared by the embeddings and the GRU state.
        dropout_p: Dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=hidden_size
        )
        self.gru = nn.GRU(
            input_size=hidden_size, hidden_size=hidden_size, batch_first=True
        )
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of token ids.

        Args:
            input: Integer tensor of token ids with the batch on the first axis.

        Returns:
            ``(output, hidden)`` exactly as returned by the GRU: the per-step
            outputs and the final hidden state.
        """
        token_vectors = self.embedding(input)
        return self.gru(self.dropout(token_vectors))

In [None]:
# The encoder's embedding table is sized to the source vocabulary;
# hidden_size is used for both the embeddings and the GRU state.
input_size = len(vocab_transform[SRC_LANGUAGE])
hidden_size = 256

encoder = EncoderRNN(input_size, hidden_size)

In [None]:
# Smoke test: run the sample source batch through the encoder and show the
# shapes of the per-step outputs and of the final hidden state.
encoder_output, encoder_hidden = encoder(src_ids)
encoder_output.shape, encoder_hidden.shape

(torch.Size([32, 52, 256]), torch.Size([1, 32, 256]))

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder mapping target token ids and a hidden state to vocabulary logits.

    Args:
        hidden_size: Width shared by the embeddings and the GRU state.
        output_size: Number of target-vocabulary entries (embedding rows and
            logits per position).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=output_size, embedding_dim=hidden_size
        )
        self.gru = nn.GRU(
            input_size=hidden_size, hidden_size=hidden_size, batch_first=True
        )
        self.out = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input, hidden):
        """Decode the given token ids starting from ``hidden``.

        Args:
            input: Integer tensor of token ids with the batch on the first axis.
            hidden: Initial GRU hidden state.

        Returns:
            ``(output, hidden)``: unnormalised vocabulary scores for every
            input position and the updated hidden state.
        """
        embedded = self.embedding(input)
        states, hidden = self.gru(embedded, hidden)
        return self.out(states), hidden

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper trained with teacher forcing.

    Args:
        encoder: Module mapping source ids to ``(encoder_output, hidden)``.
        decoder: Module mapping ``(input_ids, hidden)`` to ``(logits, hidden)``;
            it is called once per target position.
        device: Device the model is meant to run on (stored as ``self.device``).
        BOS_IDX: Index of the begin-of-sequence token (stored as
            ``self.BOS_IDX``). ``forward`` does not inject it, because the
            decoder inputs it receives are expected to start with it already.
    """

    def __init__(self, encoder, decoder, device, BOS_IDX):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.BOS_IDX = BOS_IDX

    def forward(self, src_ids, tgt_ids):
        """Run the encoder, then decode with teacher forcing.

        Args:
            src_ids: Source token ids, shape (batch, src_len).
            tgt_ids: Decoder INPUT ids, shape (batch, tgt_len): the target
                sequence starting with BOS and without its final token.

        Returns:
            ``(decoder_outputs, decoder_hidden)``. ``decoder_outputs`` has shape
            (batch, tgt_len, vocab); position ``i`` is produced after the
            decoder has consumed ``tgt_ids[:, :i + 1]``, so it lines up with
            the target sequence shifted left by one token.
        """
        seq_len = tgt_ids.size(1)

        # Only the encoder's final hidden state seeds the decoder.
        _, decoder_hidden = self.encoder(src_ids)
        decoder_outputs = []

        for i in range(seq_len):
            # Teacher forcing: the input at step i is the ground-truth token i.
            # No extra start token is prepended here: tgt_ids already begins
            # with BOS, and prepending another one would make every prediction
            # lag one token behind the label it is scored against.
            decoder_input = tgt_ids[:, i].unsqueeze(1)
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        return decoder_outputs, decoder_hidden

In [None]:
# Embedding/output sizes come from the two vocabularies; hidden_size is shared
# by the encoder and the decoder so the encoder state can seed the decoder.
input_size = len(vocab_transform[SRC_LANGUAGE])
output_size = len(vocab_transform[TGT_LANGUAGE])
hidden_size = 256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = EncoderRNN(input_size, hidden_size)
decoder = DecoderRNN(hidden_size, output_size)
# The fourth argument is the begin-of-sequence index, so pass BOS_IDX (not EOS_IDX).
model = Seq2Seq(encoder, decoder, device, BOS_IDX)
model.to(device)

Seq2Seq(
  (encoder): EncoderRNN(
    (embedding): Embedding(47271, 256)
    (gru): GRU(256, 256, batch_first=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(21114, 256)
    (gru): GRU(256, 256, batch_first=True)
    (out): Linear(in_features=256, out_features=21114, bias=True)
  )
)

In [None]:
# Shift the target batch by one position: the decoder is fed every token but
# the last, and the loss is measured against every token but the first.
tgt_input = tgt_ids[:, :-1]
tgt_output = tgt_ids[:, 1:]
# Smoke test of the full model on the sample batch.
decoder_outputs, decoder_hidden = model(src_ids.to(device), tgt_input.to(device))

In [None]:
# Shapes of the decoder's per-position outputs and of the final decoder hidden state.
decoder_outputs.shape, decoder_hidden.shape

(torch.Size([32, 77, 21114]), torch.Size([1, 32, 256]))

In [None]:
# The label tensor is one position shorter than tgt_ids because of the one-token shift.
tgt_output.shape

torch.Size([32, 77])

In [None]:
# Cross-entropy loss; positions whose label is PAD_IDX are excluded from it.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
# Flatten the decoder outputs to 2-D, keeping the last (vocabulary) dimension, for the loss.
decoder_outputs.reshape(-1, decoder_outputs.shape[-1]).shape

torch.Size([2464, 21114])

In [None]:
# Flatten the labels to 1-D so they line up row-for-row with the flattened outputs.
tgt_output.reshape(-1).shape

torch.Size([2464])

In [None]:
# Loss of the freshly initialised model on the sample batch (sanity check before training).
loss = criterion(decoder_outputs.reshape(-1, decoder_outputs.shape[-1]), tgt_output.reshape(-1).to(device))
loss

tensor(9.9647, device='cuda:0', grad_fn=<NllLossBackward0>)

## **Trainer**

In [None]:
import time

def train_epoch(model, optimizer, criterion, train_dataloader, device):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        model: Callable as ``model(src_ids, tgt_input)`` returning
            ``(outputs, hidden)``.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss taking flattened outputs and flattened labels.
        train_dataloader: Iterable of ``(src_ids, tgt_ids)`` batches.
        device: Device every batch is moved to.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for src_ids, tgt_ids in train_dataloader:
        src_ids, tgt_ids = src_ids.to(device), tgt_ids.to(device)

        # One-token shift: feed all but the last token, score against all but the first.
        decoder_in = tgt_ids[:, :-1]
        labels = tgt_ids[:, 1:]

        optimizer.zero_grad()
        logits, _ = model(src_ids, decoder_in)

        vocab_dim = logits.shape[-1]
        batch_loss = criterion(logits.reshape(-1, vocab_dim), labels.reshape(-1))
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(batch_losses)

def evaluate(model, data_loader, criterion, device):
    """Compute the mean per-batch loss over ``data_loader`` without updating the model.

    Args:
        model: Callable as ``model(src_ids, tgt_input)`` returning
            ``(outputs, hidden)``.
        data_loader: Iterable of ``(src_ids, tgt_ids)`` batches.
        criterion: Loss taking flattened outputs and flattened labels.
        device: Device every batch is moved to.

    Returns:
        The mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for src_ids, tgt_ids in data_loader:
            src_ids, tgt_ids = src_ids.to(device), tgt_ids.to(device)

            # Same one-token shift as during training.
            decoder_in = tgt_ids[:, :-1]
            labels = tgt_ids[:, 1:]

            logits, _ = model(src_ids, decoder_in)
            vocab_dim = logits.shape[-1]
            batch_loss = criterion(logits.reshape(-1, vocab_dim), labels.reshape(-1))
            batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(batch_losses)

def train(model, train_dataloader, valid_dataloader, optimizer, criterion, device, epochs):
    """Train for ``epochs`` epochs, validating and printing a summary after each one.

    Args:
        model: Model handed to ``train_epoch`` and ``evaluate``.
        train_dataloader: Batches used for optimisation.
        valid_dataloader: Batches used for the validation loss.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss function shared by training and validation.
        device: Device every batch is moved to.
        epochs: Number of epochs to run; epochs are numbered from 1 in the log.
    """
    for epoch in range(1, epochs + 1):
        started_at = time.time()
        train_loss = train_epoch(model, optimizer, criterion, train_dataloader, device)
        valid_loss = evaluate(model, valid_dataloader, criterion, device)
        # The reported time covers both the training pass and the validation pass.
        elapsed = time.time() - started_at

        summary = (
            f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {valid_loss:.3f}, "
            f"Epoch time = {elapsed:.3f}s"
        )
        print(summary)

## **Training**

In [None]:
import torch.optim as optim

# Rebuild the model from scratch so training starts from freshly initialised weights.
input_size = len(vocab_transform[SRC_LANGUAGE])
output_size = len(vocab_transform[TGT_LANGUAGE])
hidden_size = 256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = EncoderRNN(input_size, hidden_size)
decoder = DecoderRNN(hidden_size, output_size)
# The fourth argument is the begin-of-sequence index, so pass BOS_IDX (not EOS_IDX).
model = Seq2Seq(encoder, decoder, device, BOS_IDX)
model.to(device)

epochs = 10
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())
train(model, train_dataloader, valid_dataloader, optimizer, criterion, device, epochs)

Epoch: 1, Train loss: 5.498, Val loss: 5.210, Epoch time = 325.136s
Epoch: 2, Train loss: 4.955, Val loss: 5.052, Epoch time = 325.669s
Epoch: 3, Train loss: 4.728, Val loss: 4.984, Epoch time = 322.890s
Epoch: 4, Train loss: 4.578, Val loss: 4.962, Epoch time = 321.924s
Epoch: 5, Train loss: 4.463, Val loss: 4.944, Epoch time = 325.804s
Epoch: 6, Train loss: 4.371, Val loss: 4.940, Epoch time = 325.295s
Epoch: 7, Train loss: 4.295, Val loss: 4.952, Epoch time = 325.516s
Epoch: 8, Train loss: 4.231, Val loss: 4.959, Epoch time = 324.839s
Epoch: 9, Train loss: 4.176, Val loss: 4.963, Epoch time = 324.653s
Epoch: 10, Train loss: 4.128, Val loss: 4.977, Epoch time = 322.358s
