In [1]:
pip install -U torchtext



In [None]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import *
from torchtext.datasets import IWSLT2016
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
from torch.utils.data import Dataset, DataLoader
from torchtext.legacy.data import Field, BucketIterator
from torch import Tensor
import re
import io
import pandas as pd 
import numpy as np
import random
import math
import time

In [8]:
# Peek at a single raw example. The language pair is given explicitly so this
# probe looks at the same French->English data the rest of the notebook trains
# on, instead of the dataset's default pair.
train_iter, valid_iter, test_iter = IWSLT2016(language_pair=('fr', 'en'))
src_sentence, tgt_sentence = next(train_iter)

2016-01.tgz: 188MB [00:02, 67.0MB/s]


In [12]:
def tokenizer_fr(text):
  """Split a French sentence into word and punctuation tokens.

  The previous body called ``src_sentence.tokenizer(text)``, but
  ``src_sentence`` is a plain string and has no ``tokenizer`` attribute, so
  every call raised AttributeError. This version is self-contained.

  Args:
    text: raw sentence string.

  Returns:
    list of str: runs of word characters (accented letters included) and
    individual punctuation marks, in order. Whitespace is dropped.
  """
  # \w is Unicode-aware for str patterns, so words such as "dès" stay intact.
  return re.findall(r"\w+|[^\w\s]", text)

In [13]:
def tokenizer_eng(text):
  """Split an English sentence into word and punctuation tokens.

  The previous body called ``tgt_sentence.tokenizer(text)``, but
  ``tgt_sentence`` is a plain string and has no ``tokenizer`` attribute, so
  every call raised AttributeError. This version is self-contained.

  Args:
    text: raw sentence string.

  Returns:
    list of str: runs of word characters and individual punctuation marks,
    in order. Whitespace is dropped.
  """
  return re.findall(r"\w+|[^\w\s]", text)

In [14]:
# One Field per language: lower-case the text, tokenize with the language's
# tokenizer, and wrap every sequence in <sos> ... <eos>.
french = Field(
    tokenize=tokenizer_fr,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

english = Field(
    tokenize=tokenizer_eng,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

In [15]:
# French -> English IWSLT2016 splits; tst2013 is used for validation and
# tst2014 for testing.
train_data, valid_data, test_data = IWSLT2016(
    root='.data',
    split=('train', 'valid', 'test'),
    language_pair=('fr', 'en'),
    valid_set='tst2013',
    test_set='tst2014',
)

In [16]:
# Materialise each split into a list so it can be indexed and iterated
# more than once.
train_data, valid_data, test_data = (
    list(split) for split in (train_data, valid_data, test_data)
)

In [17]:
# Inspect the first training example: a (French, English) tuple of raw strings.
train_data[0]

('David Gallo: Voici Bill Lange. Je suis Dave Gallo.\n',
 "David Gallo: This is Bill Lange. I'm Dave Gallo.\n")

In [18]:
# Field.build_vocab counts the *elements* of every example it is given, so
# passing raw sentence strings would count single characters rather than words.
# Feed it token lists instead (lower-cased and whitespace-split, the same
# convention NMTDataset uses when it encodes a sentence).
french.build_vocab((pair[0].lower().split() for pair in train_data), max_size=10000, min_freq=2)
english.build_vocab((pair[1].lower().split() for pair in train_data), max_size=10000, min_freq=2)

In [21]:
class NMTDataset(Dataset):
  """Map-style dataset turning (source, target) sentence pairs into id tensors.

  Each item is a pair of 1-D ``torch.long`` tensors of length ``max_length``.

  Fixes relative to the previous version:
    * padding uses the vocabulary's ``"<pad>"`` index (falling back to 0 only
      when the vocabulary has none) instead of a hard-coded 0, which is the id
      of a real word in an stoi built by ``enumerate``;
    * sentences are wrapped in ``"<s>"`` / ``"</s>"`` when the vocabulary
      defines them, so position 0 really is the start token the seq2seq model
      feeds to its decoder;
    * text is normalised with the same regex the vocabulary-building cell
      uses, so words next to punctuation are no longer mapped to ``"<unk>"``.

  Args:
    data: indexable collection of (source_sentence, target_sentence) strings.
    source_stoi: dict mapping source words to ids; must contain ``"<unk>"``
      if any out-of-vocabulary word can occur.
    target_stoi: same, for the target language.
    max_length: fixed length of every returned tensor.
  """

  # Must stay in sync with the pattern used when counting words for the
  # vocabulary. NOTE(review): the class is ASCII-only, so accented French
  # letters are treated as separators in both places.
  _NON_WORD = re.compile(r"[^A-Za-z0-9']+")

  def __init__(self, data, source_stoi, target_stoi, max_length  = 128):
    self.data = data
    self.source_stoi = source_stoi
    self.target_stoi = target_stoi
    self.max_length = max_length

  def __len__(self):
    return len(self.data)

  def _encode(self, sentence, stoi):
    """Normalise, numericalise, wrap, truncate and pad one sentence."""
    words = self._NON_WORD.sub(" ", sentence).strip().lower().split()
    ids = [stoi[word] if word in stoi else stoi["<unk>"] for word in words]

    sos, eos = stoi.get("<s>"), stoi.get("</s>")
    # Reserve room for the boundary tokens so truncation never drops </s>.
    room = self.max_length - (sos is not None) - (eos is not None)
    ids = ids[:max(room, 0)]
    if sos is not None:
      ids.insert(0, sos)
    if eos is not None:
      ids.append(eos)
    ids = ids[:self.max_length]

    pad = stoi.get("<pad>", 0)
    ids.extend([pad] * (self.max_length - len(ids)))
    return torch.tensor(ids, dtype=torch.long)

  def __getitem__(self, index):
    source_sent, target_sent = self.data[index][0], self.data[index][1]
    source_words = self._encode(source_sent, self.source_stoi)
    target_words = self._encode(target_sent, self.target_stoi)
    return source_words, target_words
  

In [23]:
# Word-frequency tables for both languages, built from the training pairs.
english_words, french_words = Counter(), Counter()
for french_sent, english_sent in train_data:
    # Everything that is not an ASCII letter, digit or apostrophe becomes a space.
    # NOTE(review): the character class is ASCII-only, so accented French
    # letters act as separators here (e.g. "dès" is counted as "d" and "s").
    french_sent = re.sub("[^A-Za-z0-9\']+", " ", french_sent)
    english_sent = re.sub("[^A-Za-z0-9\']+", " ", english_sent)

    for counter, sent in ((french_words, french_sent), (english_words, english_sent)):
        counter.update(sent.strip().lower().split())

In [24]:
# Keep the 10,000 most frequent words per language.
english_most_frequent = english_words.most_common(10000)
french_most_frequent = french_words.most_common(10000)

# Special tokens are placed after the regular words, so they receive the
# highest indices. NOTE: english_words / french_words are rebound from
# Counters to plain word lists here, so this cell cannot be re-run on its own.
specials = ["<s>", "</s>", "<pad>", "<unk>"]
english_words = [word for word, _ in english_most_frequent] + specials
french_words = [word for word, _ in french_most_frequent] + specials

# word -> index lookup tables (index = position in the word list).
english_stoi = dict(zip(english_words, range(len(english_words))))
french_stoi = dict(zip(french_words, range(len(french_words))))

In [25]:
# French is the source language, English the target.
train_dataset = NMTDataset(
    data=train_data,
    source_stoi=french_stoi,
    target_stoi=english_stoi,
)

In [26]:
# Same vocabularies as the training set, applied to the held-out test pairs.
test_dataset = NMTDataset(
    data=test_data,
    source_stoi=french_stoi,
    target_stoi=english_stoi,
)

In [27]:
# Spot-check another raw (French, English) training pair.
train_data[100]

("Donc, dès qu'un crabe les touche ils se rétractent dans leurs coquilles, tout comme vos ongles.\n",
 'So, as soon as a crab touches them, they retract down into their shells, just like your fingernails.\n')

In [30]:
# Sanity check: look up the index assigned to one English word.
english_stoi["they"]

18

In [31]:
# English vocabulary size: the most frequent words (capped at 10,000) plus the 4 special tokens.
len(english_stoi)

10004

In [32]:
# French vocabulary size: the most frequent words (capped at 10,000) plus the 4 special tokens.
len(french_stoi)

10004

In [23]:
# Inspect the first raw (French, English) test pair.
test_data[0]

("Quand j'avais la vingtaine, j'ai vu mes tout premiers clients comme psychothérapeute.\n",
 'When I was in my 20s, I saw my very first psychotherapy client.\n')

In [33]:
# Shuffle the training pairs every epoch; without it the batches are always
# served in the corpus' original order.
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)

In [34]:
# Evaluation batches are served in dataset order (no shuffling).
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
)

In [36]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and
    projects the concatenation of the last two slices of the GRU's final
    hidden state to ``dec_hid_dim`` (through tanh) so it can seed the decoder.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: embedding width.
        enc_hid_dim: GRU hidden width per direction.
        dec_hid_dim: width of the hidden vector handed to the decoder.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        super().__init__()

        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(2 * enc_hid_dim, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                src: Tensor):
        """Encode ``src``.

        Returns:
            (outputs, hidden): the GRU outputs for every position, and the
            projected summary vector of shape [dim-1 size of src, dec_hid_dim].
        """
        token_vectors = self.dropout(self.embedding(src))

        outputs, final_states = self.rnn(token_vectors)

        # Join the last two final-state slices (one per direction) feature-wise.
        both_directions = torch.cat((final_states[-2], final_states[-1]), dim=1)
        hidden = torch.tanh(self.fc(both_directions))

        return outputs, hidden

In [39]:
class Attention(nn.Module):
    """Scores every encoder position against the current decoder state.

    Args:
        enc_hid_dim: encoder hidden width per direction (encoder outputs are
            expected to carry ``2 * enc_hid_dim`` features).
        dec_hid_dim: decoder hidden width.
        attn_dim: width of the intermediate energy vector.
    """

    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # Width of [decoder hidden ; encoder output]; the decoder reads this too.
        self.attn_in = 2 * enc_hid_dim + dec_hid_dim

        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:
        """Return weights of shape [batch, src_len]; each row sums to 1."""
        n_positions = encoder_outputs.size(0)

        # One copy of the decoder state per source position: [batch, src_len, dec_hid].
        tiled_hidden = decoder_hidden.unsqueeze(1).repeat(1, n_positions, 1)

        # Batch-major view of the encoder outputs to line up with tiled_hidden.
        batch_major_outputs = encoder_outputs.permute(1, 0, 2)

        paired = torch.cat((tiled_hidden, batch_major_outputs), dim=2)
        energy = torch.tanh(self.attn(paired))

        # Collapse the energy vector to one scalar score per position.
        scores = energy.sum(dim=2)

        return F.softmax(scores, dim=1)

In [41]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        output_dim: size of the target vocabulary (number of logits).
        emb_dim: embedding width.
        enc_hid_dim: encoder hidden width per direction.
        dec_hid_dim: decoder GRU hidden width.
        dropout: dropout probability applied to the embeddings (a float; it
            was previously annotated ``int``).
        attention: module returning [batch, src_len] weights; it must expose
            an ``attn_in`` attribute, which sizes the output layer.
    """

    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float,
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

        # The raw probability is no longer stored under the same name first;
        # that assignment was immediately overwritten by this module.
        self.dropout = nn.Dropout(dropout)


    def _weighted_encoder_rep(self,
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tensor:
        """Attention-weighted sum of the encoder outputs, shaped [1, batch, feat]."""

        a = self.attention(decoder_hidden, encoder_outputs)

        # [batch, src_len] -> [batch, 1, src_len] so bmm yields one row per sample.
        a = a.unsqueeze(1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        # Back to time-major so it can be concatenated with the embedded input.
        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)

        return weighted_encoder_rep


    def forward(self,
                input: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor):
        """Run one decoding step.

        Args:
            input: token ids for the current step, one per batch element.
            decoder_hidden: previous decoder state, [batch, dec_hid_dim].
            encoder_outputs: encoder outputs, time-major.

        Returns:
            (logits, new_hidden): [batch, output_dim] scores and the updated
            [batch, dec_hid_dim] state.
        """

        # Add a length-1 time axis: the GRU consumes one step here.
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden,
                                                          encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        # The output layer sees the GRU output, the context and the embedding.
        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim = 1))

        return output, decoder_hidden.squeeze(0)

In [42]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder into one trainable model.

    Args:
        encoder: module mapping ``src`` to ``(encoder_outputs, hidden)``.
        decoder: module called once per target step as
            ``decoder(tokens, hidden, encoder_outputs)``; must expose
            ``output_dim``.
        device: device on which the logits buffer is allocated.
    """

    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:
        """Decode ``trg.shape[0] - 1`` steps and return the per-step logits.

        ``src`` and ``trg`` are indexed time-first: ``trg[0]`` is taken as the
        start tokens and ``src.shape[1]`` as the batch size. Row 0 of the
        result is left at zero because nothing is predicted for it.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # The decoder is primed with the first target row (the <sos> tokens).
        decoder_input = trg[0, :]

        for step in range(1, trg_len):
            logits, hidden = self.decoder(decoder_input, hidden, encoder_outputs)
            outputs[step] = logits

            # Per step, feed either the ground-truth token or the model's own guess.
            use_ground_truth = random.random() < teacher_forcing_ratio
            _, greedy_tokens = logits.max(1)
            if use_ground_truth:
                decoder_input = trg[step]
            else:
                decoder_input = greedy_tokens

        return outputs

In [48]:
# Vocabulary sizes fix the embedding / output-layer dimensions.
INPUT_DIM = len(french_stoi)
OUTPUT_DIM = len(english_stoi)
ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed the RNGs that drive weight initialisation, dropout and the teacher-forcing
# coin flips, so that a Restart & Run All reproduces the same run.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)

attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)

dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)


def init_weights(m: nn.Module):
    """Re-initialise every parameter reachable from ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from N(0, 0.01^2);
    all other parameters (e.g. biases) are set to zero.
    """
    for param_name, param in m.named_parameters():
        values = param.data
        if 'weight' not in param_name:
            nn.init.constant_(values, 0)
            continue
        nn.init.normal_(values, mean=0, std=0.01)


# Re-initialise the model's parameters with init_weights, then create an Adam
# optimiser over them using Adam's default hyper-parameters.
model.apply(fn=init_weights)

optimizer = optim.Adam(params=model.parameters())


def count_parameters(model: nn.Module):
    """Return the total number of scalar entries in the trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Report the model size; only parameters with requires_grad=True are counted.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,981,980 trainable parameters


In [49]:
# The targets served by the DataLoaders are encoded with `english_stoi`, not
# with the torchtext Field vocabulary, so the index to ignore must come from
# the same table. `english.vocab.stoi['<pad>']` lives in a different index
# space and would mask an ordinary word of `english_stoi` instead.
# NOTE(review): the loss only skips padding if the dataset pads targets with
# this same index — confirm against NMTDataset.
PAD_IDX = english_stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
import math
import time

def train(model: nn.Module,
          iterator: torch.utils.data.DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: seq2seq model called as ``model(src, trg)`` with time-major
            ``[seq_len, batch]`` inputs, returning ``[seq_len, batch, vocab]``.
        iterator: sized iterable of ``(src, trg)`` batches laid out batch-first
            (``[batch, seq_len]``), as produced by a DataLoader's default
            collation of fixed-length 1-D samples.
        optimizer: optimiser stepping ``model``'s parameters.
        criterion: loss taking ``(logits [N, vocab], targets [N])``.
        clip: maximum gradient norm.

    Returns:
        float: summed batch losses divided by ``len(iterator)``.
    """

    model.train()

    # Follow the model's own device rather than depending on a notebook global.
    device = next(model.parameters()).device

    epoch_loss = 0

    for src, trg in iterator:
        # The model indexes time on dim 0 (trg[0] is the <sos> row and
        # src.shape[1] the batch size), so batch-first batches must be
        # transposed; fed as-is, sentences would be treated as time steps.
        src = src.to(device).transpose(0, 1).contiguous()
        trg = trg.to(device).transpose(0, 1).contiguous()

        optimizer.zero_grad()

        output = model(src, trg)

        # Position 0 is the start token and is never predicted; drop it.
        output = output[1:].reshape(-1, output.shape[-1])
        trg = trg[1:].reshape(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: torch.utils.data.DataLoader,
             criterion: nn.Module):
    """Compute the mean loss per batch without updating the model.

    Args:
        model: seq2seq model called as ``model(src, trg, 0)`` (teacher forcing
            off) with time-major ``[seq_len, batch]`` inputs.
        iterator: sized iterable of ``(src, trg)`` batches laid out batch-first
            (``[batch, seq_len]``), as produced by a DataLoader's default
            collation of fixed-length 1-D samples.
        criterion: loss taking ``(logits [N, vocab], targets [N])``.

    Returns:
        float: summed batch losses divided by ``len(iterator)``.
    """

    model.eval()

    # Follow the model's own device rather than depending on a notebook global.
    device = next(model.parameters()).device

    epoch_loss = 0

    with torch.no_grad():

        for src, trg in iterator:
            # The model is time-major; transpose the batch-first batches so
            # that dim 0 is the sequence position.
            src = src.to(device).transpose(0, 1).contiguous()
            trg = trg.to(device).transpose(0, 1).contiguous()

            output = model(src, trg, 0) #turn off teacher forcing

            # Position 0 is the start token and is never predicted; drop it.
            output = output[1:].reshape(-1, output.shape[-1])
            trg = trg[1:].reshape(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def epoch_time(start_time: int,
               end_time: int):
    """Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds.

    Returns:
        (minutes, seconds) as ints; fractional seconds are truncated.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


N_EPOCHS = 1
CLIP = 1

# Per-epoch validation uses the validation split; the test split is touched
# only once, after training, so the reported test loss stays unbiased.
# (Previously the "validation" loss was computed on test_loader.)
valid_loader = DataLoader(NMTDataset(valid_data, french_stoi, english_stoi), batch_size = 32)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep track of the best validation loss seen so far (it was initialised
    # above but never updated before).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

test_loss = evaluate(model, test_loader, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

In [46]:
import matplotlib.pyplot as plt

In [None]:
# `a` only ever existed as a local variable inside the decoder, so it has to be
# computed here: attention weights of the first decoding step for one test batch.
model.eval()
with torch.no_grad():
    src, _ = next(iter(test_loader))
    # DataLoader batches are [batch, seq_len]; the encoder expects time-major input.
    src = src.to(device).transpose(0, 1).contiguous()
    encoder_outputs, hidden = model.encoder(src)
    a = model.decoder.attention(hidden, encoder_outputs)  # [batch, src_len]

plt.matshow(a.cpu().numpy()) #visualize the attention