In [250]:
import spacy
###### spaCy has model for each language ("de_core_news_sm" for German and "en_core_web_sm" for English) which need to be loaded so we can access the tokenizer of each model
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')
import torch
import random
import evaluate
import torchtext
import pandas as pd
from jiwer import wer
from tqdm import tqdm
from datasets import load_metric 
from torchtext.datasets import Multi30k


SEED = 1234
BATCH_SIZE = 32
random.seed(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#DEVICE ='cpu'

# **1) Text Preprocessing and Bucketing**

In [240]:
EN_Field = torchtext.data.Field(tokenize=lambda text: [token.text for token in spacy_en(text)], eos_token="<eos>", init_token="<bos>", lower=True, batch_first=False, include_lengths=True)
DE_Field = torchtext.data.Field(tokenize=lambda text: [token.text for token in spacy_de(text)], eos_token="<eos>", init_token="<bos>", lower=True, batch_first=False, include_lengths=True)

fields = [("de_text", DE_Field), ("en_text", EN_Field)]

train_en_data = pd.read_csv("./multi30k-dataset/data/task1/raw/train.en", delimiter='\t', header=None)
train_de_data = pd.read_csv("./multi30k-dataset/data/task1/raw/train.de", delimiter='\t', header=None)

df = pd.concat([train_de_data, train_en_data], axis=1)
df.to_csv("data_de_en.csv", index=False)
TabularData = torchtext.data.TabularDataset(path="data_de_en.csv",
                                            format="CSV",
                                            fields=fields,
                                            skip_header=True)
train_data, valid_data, test_data = TabularData.split(split_ratio=[0.9, 0.05, 0.05], random_state=random.seed(SEED))
train_Iterator = torchtext.data.BucketIterator(dataset=train_data,
                                               batch_size=BATCH_SIZE,
                                               device=DEVICE,
                                               sort_key=lambda x: len(x.en_text),
                                               sort_within_batch=True)
test_Iterator = torchtext.data.BucketIterator(dataset=test_data,
                                              batch_size=BATCH_SIZE,
                                              device=DEVICE,
                                              sort_key=lambda x: len(x.en_text),
                                              sort_within_batch=True)
valid_Iterator = torchtext.data.BucketIterator(dataset=valid_data,
                                              batch_size=BATCH_SIZE,
                                              device=DEVICE,
                                              sort_key=lambda x: len(x.en_text),
                                              sort_within_batch=True)

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validating examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 26100
Number of validating examples: 1450
Number of testing examples: 1450


# **2) Creating Vocabulary**

In [241]:
EN_Field.build_vocab(train_data, min_freq=2)
DE_Field.build_vocab(train_data, min_freq=2)
print(EN_Field.vocab["young"])
print("Size of English vocabulary: ", len(EN_Field.vocab))
print("Size of Dutch vocabulary: ", len(DE_Field.vocab))


24
Size of English vocabulary:  5641
Size of Dutch vocabulary:  7344


# **3) Encoder**

In [242]:
class Encoder(torch.nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, bidirectional, dropout):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.dropout = torch.nn.Dropout(dropout)
        
        self.embedding = torch.nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = torch.nn.LSTM(input_size=emb_dim, hidden_size=hid_dim, num_layers=n_layers, batch_first=False, bidirectional=bidirectional, dropout=dropout)

    def forward(self, src):
        ##### src shape: (seq_length, batch_size)
        #print(f"src_shape: {src.shape}")
        embedded = self.dropout(self.embedding(src))

        ##### embedding shape: (Seq_length, batch_size, embedding_size)
        #print(f"embedding_shape: {embedded.shape}")
        out, (h_n, c_n) = self.lstm(embedded)
        
        ##### The first layer will receive a hidden and cell state from the previous time-step (It should be passed to the decoder as the h_0, and c_0)
        return h_n, c_n

# **4) Decoder**

In [243]:
class Decoder(torch.nn.Module):
    def __init__(self, out_dim, emb_dim, hid_dim, n_layers, bidirectional, dropout):
        super(Decoder, self).__init__()
        self.out_dim = out_dim
        
        self.embedding = torch.nn.Embedding(num_embeddings=out_dim, embedding_dim=emb_dim)
        self.lstm = torch.nn.LSTM(input_size=emb_dim, hidden_size=hid_dim, num_layers=n_layers, batch_first=False, bidirectional=bidirectional, dropout=dropout)
        self.fc = torch.nn.Linear(hid_dim, out_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        ###### shape of input: (batch_size) but we want to be (1, batch_size) so we use unsqueeze
        input = input.unsqueeze(0)
        ###### shape of input: (1, batch_size)

        embedded = self.dropout(self.embedding(input))
        ###### shape of embedded: (1, batch_size, embedding_size)
        
        out, (h_n, c_n) = self.lstm(embedded, (hidden, cell))
        ###### shape of out: (1, batch_size, hidden_size)
        
        prediction = self.fc(out.squeeze(0))
        ###### shape of Predictions: (1, batch_size, length_of_EN_vocabulary)
        
        prediction = prediction.squeeze()
        
        return prediction, h_n, c_n

# **5) Seq2Seq**

In [244]:
class Seq2Seq(torch.nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing=True):
        ##### trg shape: (trg_len, Batch_size)
        trg_len = trg.shape[0]
        N = trg.shape[1]
        Y_hat = torch.zeros((trg_len, N, self.decoder.out_dim))

        s_0, c_0 = self.encoder(src)

        ## Selecting the first word in each target sentence in the batch to start decoding. It is "<bos>" token.
        input = trg[0]

        for i in range(1, trg_len):
            y_hat, s_0, c_0 = self.decoder(input, s_0, c_0)

            ##### y_hat shape: (Btach_size, length_of_EN_vocabulary)
            Y_hat[i] = y_hat

            
            if(teacher_forcing):
                input = trg[i]
            else:
                ##### argmax(1) because we want to apply the argmax over the second dim of y_hat ----> y_hat.argmax(1): (Batch_size, 1)
                input = y_hat.argmax(1).squeeze()
        return Y_hat

# **6) Parameter initialization**

In [245]:
INPUT_DIM = len(DE_Field.vocab)
OUTPUT_DIM = len(EN_Field.vocab)
ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
HID_DIM = 1024
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
BIDIRECTIONAL = False

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, BIDIRECTIONAL, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, BIDIRECTIONAL, DEC_DROPOUT)

model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)
print(model)

print(f"Total number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")


Optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
# by passing the index of the <pad> token as the ignore_index argument we ignore the loss whenever the target token is a padding token.
criterion = torch.nn.CrossEntropyLoss(ignore_index = EN_Field.vocab.stoi["<pad>"])


print(f"unk_token: {EN_Field.vocab.stoi[EN_Field.unk_token]}")
print(f"pad_token: {EN_Field.vocab.stoi[EN_Field.pad_token]}")
print(f"eos_token: {EN_Field.vocab.stoi[EN_Field.eos_token]}")
print(EN_Field.vocab.stoi["."])

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7344, 300)
    (lstm): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(5641, 300)
    (lstm): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=5641, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)
Total number of parameters: 37333717
unk_token: 0
pad_token: 1
eos_token: 3
5


# **7) Train function**

In [246]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()

    epoch_loss = 0

    for batch in iterator:
        src = batch.de_text[0]
        trg = batch.en_text[0]

        optimizer.zero_grad()
        output = model(src, trg)

        output = output[1:].view(-1, output.shape[2])
        trg = trg[1:].view(-1)
        output = output.to(DEVICE)
        trg = trg.to(DEVICE)

        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

# **8) Evaluation function**

In [247]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0

    criterion = torch.nn.CrossEntropyLoss()

    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            

            src = batch.de_text[0]
            trg = batch.en_text[0]

            output = model(src, trg, False) #turn off teacher forcing

            output = output[1:].view(-1, output.shape[2])
            trg = trg[1:].view(-1)
            output = output.to(DEVICE)
            trg = trg.to(DEVICE)

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [248]:
import math 
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_Iterator, Optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_Iterator, criterion)
    
    
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f"Epoch: {epoch+1}")
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\tVal. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 1
	Train Loss: 4.597 | Train PPL:  99.209
	Val. Loss: 5.762 |  Val. PPL: 318.070
Epoch: 2
	Train Loss: 3.852 | Train PPL:  47.109
	Val. Loss: 5.966 |  Val. PPL: 389.993
Epoch: 3
	Train Loss: 3.563 | Train PPL:  35.276
	Val. Loss: 5.886 |  Val. PPL: 360.127
Epoch: 4
	Train Loss: 3.329 | Train PPL:  27.912
	Val. Loss: 5.951 |  Val. PPL: 384.308
Epoch: 5
	Train Loss: 3.142 | Train PPL:  23.158
	Val. Loss: 5.786 |  Val. PPL: 325.851
Epoch: 6
	Train Loss: 2.994 | Train PPL:  19.966
	Val. Loss: 5.720 |  Val. PPL: 304.792
Epoch: 7
	Train Loss: 2.868 | Train PPL:  17.598
	Val. Loss: 5.671 |  Val. PPL: 290.256
Epoch: 8
	Train Loss: 2.755 | Train PPL:  15.715
	Val. Loss: 5.648 |  Val. PPL: 283.636
Epoch: 9
	Train Loss: 2.653 | Train PPL:  14.195
	Val. Loss: 5.653 |  Val. PPL: 285.233
Epoch: 10
	Train Loss: 2.557 | Train PPL:  12.900
	Val. Loss: 5.608 |  Val. PPL: 272.486


# **9) Computing Metrics (Bleu Score, Rouge, WER)**

In [251]:
# Function to translate a sentence using argmax
def translate_sentence(model, sentence, src_field, tgt_field, device, max_len=50):
    model.eval()
    tokens = src_field.tokenize(sentence)
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
    src_len = torch.LongTensor([len(src_indexes)])

    with torch.no_grad():
        h_0_decoder, c_0_decoder = model.encoder(src_tensor)

    trg_indexes = [tgt_field.vocab.stoi[tgt_field.init_token]]
    for i in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, h_0_decoder, c_0_decoder = model.decoder(trg_tensor, h_0_decoder, c_0_decoder)

        pred_token = output.argmax(0).item()
        trg_indexes.append(pred_token)

        if pred_token == tgt_field.vocab.stoi[tgt_field.eos_token]:
            break
    

    trg_tokens = [tgt_field.vocab.itos[i] for i in trg_indexes]
    return trg_tokens[1:-1], trg_indexes[1:-1]

# Translate each test data point using argmax and print the English sentence, the ground truth Dutch translation, and the model's generated Dutch translation
predicted_output = []
ground_truth_output = []
bleu = load_metric("bleu") 
rouge = evaluate.load('rouge')
Bleu_total = 0
rouge1_total = 0
rouge2_total = 0
rougeL_total = 0
rougeLsum_total = 0
WER_total = 0
n = 0
for batch in tqdm(valid_Iterator):
    src = batch.en_text[0]
    trg = batch.de_text[0]


    for idx in range(src.shape[1]):
        src_sent = ' '.join([EN_Field.vocab.itos[i] for i in src[1:src.shape[0]-1, idx]])
        trg_sent = ' '.join([DE_Field.vocab.itos[i] for i in trg[1:, idx]])
        
        trg_sent = trg_sent.split(" ")

        final_trg = [] 
        for w in trg_sent:
            if w != '<pad>' and w != '<eos>':
                final_trg.append(w)

        final_trg = " ".join(w for w in final_trg)

        translated_sent, _ = translate_sentence(model, src_sent, EN_Field, DE_Field, DEVICE)

        predicted_output = translated_sent
        ground_truth_output = final_trg.split()

        ## Computing the Rouge
        results = rouge.compute(predictions=[' '.join(predicted_output)], references=[final_trg])
        rouge1_total += results['rouge1']
        rouge2_total += results['rouge2']
        rougeL_total += results['rougeL']
        rougeLsum_total += results['rougeLsum']

        ## Computing the Word Error Rate
        WER_total += wer(final_trg, ' '.join(predicted_output))

        ## Computing the BLEU score
        predicted_output = [predicted_output]
        ground_truth_output = [[ground_truth_output]]

        Bleu_total += bleu.compute(predictions=predicted_output, references=ground_truth_output)['bleu']
        n += 1


print(f"The Average Testing Bleu Score is: {Bleu_total / n}")
print(f"The Average Testing Rouge-1 Score is: {rouge1_total / n}")
print(f"The Average Testing Rouge-2 Score is: {rouge2_total / n}")
print(f"The Average Testing Rouge-L Score is: {rougeL_total / n}")
print(f"The Average Testing Rouge-Lsum Score is: {rougeLsum_total / n}")
print(f"Average Word Error Rate: {WER_total / n}")
        

100%|██████████| 46/46 [27:48<00:00, 36.28s/it]

The Average Testing Bleu Score is: 0.0
The Average Testing Rouge-1 Score is: 0.1391844802309793
The Average Testing Rouge-2 Score is: 0.006443801035686695
The Average Testing Rouge-L Score is: 0.10549167452135848
The Average Testing Rouge-Lsum Score is: 0.10549167452135848
Average Word Error Rate: 1.0090863994647892





In [21]:
import gc
gc.collect()
torch.cuda.empty_cache()