# 作業 : 實作英文-德文翻譯機器人
***
## [作業目標]

用 pytorch 實作一個英文-德文翻譯機器人

## [作業目標]

*   語言資料處理
*   使用 LSTM 建構 Encoder: EncoderLSTM
*   使用 LSTM 建構 Decoder: DecoderLSTM
*   搭建 Sequence to Sequence 模型: Seq2Seq
*   撰寫訓練函式
*   撰寫測試函式

## [問題]

在 Colab 實際上執行完這個範例後，請改用 BiLSTM 來建構 Encoder 與 Decoder


## 安裝 spacy

We'll also make use of spaCy to tokenize our data. To install spaCy, follow the instructions here making sure to install both the English and German models with:

In [1]:
# !pip uninstall spacy -y
# !pip install -U spacy

Uninstalling spacy-2.2.4:
  Successfully uninstalled spacy-2.2.4
Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 201kB/s 
Collecting typer<0.4.0,>=0.3.0
  Downloading https://files.pythonhosted.org/packages/90/34/d138832f6945432c638f32137e6c79a3b682f06a63c488dcfaca6b166c64/typer-0.3.2-py3-none-any.whl
Collecting catalogue<2.1.0,>=2.0.3
  Downloading https://files.pythonhosted.org/packages/82/a5/b5021c74c04cac35a27d34cbf3146d86eb8e173b4491888bc4908c4c8b3b/catalogue-2.0.3-py3-none-any.whl
Collecting spacy-legacy<3.1.0,>=3.0.4
  Downloading https://files.pythonhosted.org/packages/8d/67/d4002a18e26bf29b17ab563ddb55232b445ab6a02f97bf17d1345ff34d3f/spacy_legacy-3.0.5-py2.py3-none-any.whl
Collecting pathy>=0.3.5
[?25l  Downloading https://files.pythonhosted.org/packages/13/87/5991d87be8ed60be

In [7]:
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

2021-04-29 07:32:55.154439: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
2021-04-29 07:33:01.062098: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## 引用需要的模組

In [29]:
import io
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import random
from collections import Counter

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchtext.data.utils import get_tokenizer
from torchtext.data.metrics import bleu_score
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive

## 下載英文及德文語料

In [9]:
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

train.de.gz: 100%|██████████| 637k/637k [00:00<00:00, 21.8MB/s]
train.en.gz: 100%|██████████| 569k/569k [00:00<00:00, 21.8MB/s]
val.de.gz: 100%|██████████| 24.7k/24.7k [00:00<00:00, 6.44MB/s]
val.en.gz: 100%|██████████| 21.6k/21.6k [00:00<00:00, 5.96MB/s]
test_2016_flickr.de.gz: 100%|██████████| 22.9k/22.9k [00:00<00:00, 5.49MB/s]
test_2016_flickr.en.gz: 100%|██████████| 21.1k/21.1k [00:00<00:00, 4.97MB/s]


In [10]:
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def build_vocab(filepath, tokenizer):
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

In [11]:
raw_en_data = []
raw_de_data = []

def data_process(filepaths):
    raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
    raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
    data = []
    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
        raw_de = raw_de.strip()
        raw_en = raw_en.strip()
        raw_en_data.append(raw_en)
        raw_de_data.append(raw_de)
        de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                                dtype=torch.long)
        en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                                dtype=torch.long)
        data.append((de_tensor_, en_tensor_))
    return data

train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

In [12]:
def tokenize_de(text):
    return [token for token in de_tokenizer(text)]

def tokenize_english(text):
    return [token for token in en_tokenizer(text)]

### Sample Run ###
sample_text = "I love machine learning"
print(tokenize_english(sample_text))

print(f"Unique tokens in source (german) vocabulary: {len(de_vocab.stoi)}")
print(f"Unique tokens in target (en) vocabulary: {len(en_vocab.stoi)}")

['I', 'love', 'machine', 'learning']
Unique tokens in source (german) vocabulary: 19215
Unique tokens in target (en) vocabulary: 10838


In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128
PAD_IDX = de_vocab['<pad>']
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']

def generate_batch(data_batch):
    de_batch, en_batch = [], []
    for (de_item, en_item) in data_batch:
        de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0))
        en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
        
    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    return de_batch, en_batch

train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=True, collate_fn=generate_batch)

In [14]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(val_data)}")
print(f"Number of testing examples: {len(test_data)}")

german = " ".join([de_vocab.itos[i] for i in train_data[0][0]])
english = " ".join([en_vocab.itos[i] for i in train_data[0][1]])

print(f"German example: {german}")
print(f"English example: {english}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000
German example: Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche .
English example: Two young , White males are outside near many bushes .


In [15]:
count = 0
max_len_eng = []
max_len_ger = []
for german_seq, en_seq in train_data:
    german_sentence = " ".join([de_vocab.itos[i] for i in german_seq])
    en_sentence = " ".join([en_vocab.itos[i] for i in en_seq])
    if count < 10 :
        print("German - ",german_sentence, " Length - ", len(german_seq))
        print("English - ",en_sentence, " Length - ", len(en_seq))
        print()
    count += 1

# train_data = [(german_sentence, english_sentence), ...]
en_max_length = len(max(train_data, key=lambda pair: len(pair[1]))[1])
en_min_length = len(min(train_data, key=lambda pair: len(pair[1]))[1])
german_max_length = len(max(train_data, key=lambda pair: len(pair[0]))[0])
german_min_length = len(min(train_data, key=lambda pair: len(pair[0]))[0])

print("Maximum Length of English sentence {} and German sentence {} in the dataset".format(en_max_length, german_max_length))
print("Minimum Length of English sentence {} and German sentence {} in the dataset".format(en_min_length, german_min_length))

German -  Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche .  Length -  13
English -  Two young , White males are outside near many bushes .  Length -  11

German -  Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem .  Length -  8
English -  Several men in hard hats are operating a giant pulley system .  Length -  12

German -  Ein kleines Mädchen klettert in ein Spielhaus aus Holz .  Length -  10
English -  A little girl climbing into a wooden playhouse .  Length -  9

German -  Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster .  Length -  15
English -  A man in a blue shirt is standing on a ladder cleaning a window .  Length -  15

German -  Zwei Männer stehen am Herd und bereiten Essen zu .  Length -  10
English -  Two men are at the stove preparing food .  Length -  9

German -  Ein Mann in grün hält eine Gitarre , während der andere Mann sein Hemd ansieht .  Length -  16
English -  A man in green holds a guitar while the other

## 用 LSTM 搭建的 Encoder 類別: EncoderLSTM



In [16]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(EncoderLSTM, self).__init__()

        # Size of the one hot vectors that will be the input to the encoder
        #self.input_size = input_size

        # Output size of the word embedding NN
        #self.embedding_size = embedding_size

        # Dimension of the NN's inside the lstm cell/ (hs,cs)'s dimension.
        self.hidden_size = hidden_size

        # Number of layers in the lstm
        self.num_layers = num_layers

        # Regularization parameter
        self.dropout = nn.Dropout(p)
        self.tag = True

        # Shape --------------------> (19219, 300) [input size, embedding dims]
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Shape -----------> (300, 1024, 2) [embedding dims, hidden size, num layers]
        self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)

    # Shape of x (26, 128) [Sequence_length, batch_size]
    def forward(self, x):
        # Shape -----------> (26, 128, 300) [Sequence_length , batch_size , embedding dims]
        embedding = self.dropout(self.embedding(x))

        # Shape --> outputs (26, 128, 1024) [Sequence_length , batch_size , hidden_size]
        # Shape --> (hs, cs) (2, 128, 1024) , (2, 128, 1024) [num_layers, batch_size size, hidden_size]
        outputs, (hidden_state, cell_state) = self.LSTM(embedding)

        return hidden_state, cell_state

## 用 LSTM 搭建的 decoder 類別: DecoderLSTM


In [17]:
class DecoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
        super(DecoderLSTM, self).__init__()

        # Size of the one hot vectors that will be the input to the encoder
        #self.input_size = input_size

        # Output size of the word embedding NN
        #self.embedding_size = embedding_size

        # Dimension of the NN's inside the lstm cell/ (hs,cs)'s dimension.
        self.hidden_size = hidden_size

        # Number of layers in the lstm
        self.num_layers = num_layers

        # Size of the one hot vectors that will be the output to the encoder (English Vocab Size)
        self.output_size = output_size

        # Regularization parameter
        self.dropout = nn.Dropout(p)

        # Shape --------------------> (10838, 300) [input size, embedding dims]
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Shape -----------> (300, 1024, 2) [embedding dims, hidden size, num layers]
        self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)

        # Shape -----------> (1024, 10838) [hidden size, output size]
        self.fc = nn.Linear(hidden_size, output_size)

    # Shape of x (128) [batch_size]
    def forward(self, x, hidden_state, cell_state):
        # Shape of x (1, 128) [1, batch_size]
        x = x.unsqueeze(0)

        # Shape -----------> (1, 128, 300) [1, batch_size, embedding dims]
        embedding = self.dropout(self.embedding(x))

        # Shape --> outputs (1, 128, 1024) [1, batch_size , hidden_size]
        # Shape --> (hs, cs) (2, 128, 1024) , (2, 128, 1024) [num_layers, batch_size size, hidden_size] 
        # (passing encoder's hs, cs - context vectors)
        outputs, (hidden_state, cell_state) = self.LSTM(embedding, (hidden_state, cell_state))

        # Shape --> predictions (1, 128, 10838) [ 1, batch_size , output_size]
        predictions = self.fc(outputs)

        # Shape --> predictions (128, 10838) [batch_size , output_size]
        predictions = predictions.squeeze(0)

        return predictions, hidden_state, cell_state

In [18]:
# Encoder
encoder_input_size = len(de_vocab.stoi)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

encoder_lstm = EncoderLSTM(encoder_input_size, encoder_embedding_size,
                           hidden_size, num_layers, encoder_dropout).to(device)

# Decoder
decoder_input_size = len(en_vocab.stoi)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = 0.5
output_size = len(en_vocab.stoi)

decoder_lstm = DecoderLSTM(decoder_input_size, decoder_embedding_size,
                           hidden_size, num_layers, decoder_dropout, output_size).to(device)

print("Encoder:", encoder_lstm, "\n")
print("Decoder:", decoder_lstm)

Encoder: EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(19215, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
) 

Decoder: DecoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(10838, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=10838, bias=True)
)


# Sequence to Sequence 類別

In [19]:
class Seq2Seq(nn.Module):
    def __init__(self, Encoder_LSTM, Decoder_LSTM):
        super(Seq2Seq, self).__init__()
        self.Encoder_LSTM = Encoder_LSTM
        self.Decoder_LSTM = Decoder_LSTM

    def forward(self, source, target, tfr=0.5):
        # Shape - Source : (10, 128) [(Sentence length german + some padding), Number of Sentences]
        batch_size = source.shape[1]

        # Shape - Source : (14, 128) [(Sentence length English + some padding), Number of Sentences]
        target_len = target.shape[0]
        target_vocab_size = len(en_vocab.itos)

        # Shape --> outputs (14, 128, 10838) 
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        # Shape --> (hs, cs) (2, 128, 1024) ,(2, 128, 1024) [num_layers, batch_size size, hidden_size] 
        # (contains encoder's hs, cs - context vectors)
        hidden_state, cell_state = self.Encoder_LSTM(source)

        # [<bos> * 128] (128 elements)
        x = target[0] # Trigger token <bos>

        for i in range(1, target_len):
            # Shape --> output (128, 10838) 
            output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)
            outputs[i] = output
            best_guess = output.argmax(1) # 0th dimension is batch size, 1st dimension is word embedding
            x = target[i] if random.random() < tfr else best_guess # Either pass the next word correctly from the dataset or use the earlier predicted word

        # Shape --> outputs (14, 128, 10838) 
        return outputs

In [20]:
# Hyperparameters

learning_rate = 0.001
writer = SummaryWriter(f"runs/loss_plot")
step = 0

model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = en_vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [21]:
model

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(19215, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10838, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=10838, bias=True)
  )
)

In [27]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    spacy_ger = spacy.load("de_core_news_sm")

    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [german.itos[idx] for idx in sentence]
    text_to_indices = [german.stoi['<bos>']] + [german.stoi[token] for token in tokens] + [german.stoi['<eos>']]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

    outputs = [english.stoi["<bos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if best_guess == english.stoi["<eos>"]:
            break

    translated_sentence = [english.itos[idx] for idx in outputs]
    return translated_sentence[1:]

# 用來評估模型的函式: bleu
def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = example[0]
        trg = [english.itos[idx] for idx in example[1][1:]]

        prediction = translate_sentence(model, src, german, english, device)
        if len(prediction) == 49 and prediction[-1] == english.stoi['<eos>']:
            prediction = prediction[:-1]  # remove <eos> token

        targets.append(trg)
        outputs.append(prediction)

    return bleu_score(outputs, targets)

def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    print('saving')
    print()
    state = {'model': model,'best_loss': best_loss,'epoch': epoch,'rng_state': torch.get_rng_state(), 'optimizer': optimizer.state_dict(),}
    torch.save(state, './checkpoint-NMT')
    torch.save(model.state_dict(),'./checkpoint-NMT-SD')

In [24]:
epoch_loss = 0.0
num_epochs = 100
best_loss = 999999
best_epoch = -1
sentence1 = "ein mann in einem blauen hemd steht auf einer leiter und putzt ein fenster"
ts1  = []

for epoch in range(num_epochs):
    print("Epoch - {} / {}".format(epoch+1, num_epochs))
    model.eval()
    translated_sentence1 = translate_sentence(model, sentence1, de_vocab, en_vocab, device, max_length=50)
    print(f"Translated example sentence 1: \n {translated_sentence1}")
    ts1.append(translated_sentence1)

    model.train(True)
    for batch_idx, batch in enumerate(train_iter):
        input_german = batch[0].to(device)
        target_en = batch[1].to(device)

        # Pass the input and target for model's forward method
        output = model(input_german, target_en)
        output = output[1:].reshape(-1, output.shape[2])
        target_en = target_en[1:].reshape(-1)

        # Clear the accumulating gradients
        optimizer.zero_grad()

        # Calculate the loss value for every epoch
        loss = criterion(output, target_en)

        # Calculate the gradients for weights & biases using back-propagation
        loss.backward()

        # Clip the gradient value is it exceeds > 1
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the weights values using the gradients we calculated using bp 
        optimizer.step()
        step += 1
        epoch_loss += loss.item()
        writer.add_scalar("Training loss", loss, global_step=step)

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        epoch_loss = 0.0
        best_epoch = epoch
        checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss) 
        if ((epoch - best_epoch) >= 10):
            print("no improvement in 10 epochs, break")
            break
    print("Epoch_Loss - {}".format(loss.item()))
    print()
    
print(epoch_loss / len(train_iter))

score = bleu(test_data[:100], model, de_vocab, en_vocab, device)
print(f"Bleu score {score*100:.4f}")

Epoch - 1 / 100
Translated example sentence 1: 
 ['A', 'man', 'a', 'a', 'a', 'a', '.', '.', '.', '<eos>']
saving

Epoch_Loss - 4.6899285316467285

Epoch - 2 / 100
Translated example sentence 1: 
 ['A', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch_Loss - 4.134108543395996

Epoch - 3 / 100
Translated example sentence 1: 
 ['The', 'in', 'a', 'red', 'and', 'and', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch_Loss - 3.983480930328369

Epoch - 4 / 100
Translated example sentence 1: 
 ['A', 'in', 'a', 'in', 'a', 'a', ',', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch_Loss - 3.8228628635406494

Epoch - 5 / 100
Translated example sentence 1: 
 ['A', 'in', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch_Loss - 3.5762898921966553

Epoch - 6 / 100
Translated example sentence 1: 
 ['A', 'worker', 'in', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch_Loss - 3.3658337593078613

Epoch - 7 / 100
Trans

NameError: ignored

In [30]:
print(epoch_loss / len(train_iter))

score = bleu(test_data[:10], model, de_vocab, en_vocab, device)
print(f"Bleu score {score*100:.4f}")

105.08028613762839
Bleu score 0.0000
