In [129]:
import os
import sys
import math
import spacy
###### spaCy has model for each language ("de_core_news_sm" for German and "en_core_web_sm" for English) which need to be loaded so we can access the tokenizer of each model
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')
import torch
import random
import evaluate
import torchtext
import numpy as np
import pandas as pd
from tqdm import tqdm
from jiwer import wer
from datasets import load_metric
from torchtext.datasets import Multi30k


SEED = 1234
BATCH_SIZE = 32
random.seed(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#DEVICE ='cpu'

# **1) Text Preprocessing and Bucketing**

In [130]:
trg_Field = torchtext.data.Field(tokenize=lambda text: [token.text for token in spacy_en(text)], eos_token="<eos>", init_token="<bos>", lower=True, include_lengths=True, batch_first=True)
src_Field = torchtext.data.Field(tokenize=lambda text: [token.text for token in spacy_de(text)], eos_token="<eos>", init_token="<bos>", lower=True, include_lengths=True, batch_first=True)

fields = [("src_field", src_Field), ("trg_field", trg_Field)]


if (not os.path.exists("data_de_en.csv")):
    train_en_data = pd.read_csv("./multi30k-dataset/data/task1/raw/train.en", delimiter='\t', header=None)
    train_de_data = pd.read_csv("./multi30k-dataset/data/task1/raw/train.de", delimiter='\t', header=None)

    df = pd.concat([train_de_data, train_en_data], axis=1)

    df.to_csv("data_de_en.csv")

TabularData = torchtext.data.TabularDataset(path="data_de_en.csv",
                                            format="CSV",
                                            fields = fields,
                                            skip_header=True)
train_data, valid_data, test_data = TabularData.split(split_ratio=[0.9, 0.05, 0.05], random_state=random.seed(SEED))


## Bucket Iterator
train_Iterator = torchtext.data.BucketIterator(dataset=train_data,
                                               batch_size=BATCH_SIZE,
                                               device=DEVICE,
                                               sort_key=lambda x: len(x.src_field),
                                               sort_within_batch=True)
test_Iterator = torchtext.data.BucketIterator(dataset=test_data,
                                              batch_size=BATCH_SIZE,
                                              device=DEVICE,
                                              sort_key=lambda x: len(x.src_field),
                                              sort_within_batch=True)
valid_Iterator = torchtext.data.BucketIterator(dataset=valid_data,
                                               batch_size=BATCH_SIZE,
                                               sort_key=lambda x: len(x.src_field),
                                               sort_within_batch= True)

## Building Vocabulary
src_Field.build_vocab(train_data, min_freq=2)
trg_Field.build_vocab(train_data, min_freq=2)
print(f"Size of Dutch Vocabulary: {len(src_Field.vocab)}")
print(f"Size of English Vocabulary: {len(trg_Field.vocab)}")

Size of Dutch Vocabulary: 7344
Size of English Vocabulary: 5641


# **2) Positional Encoding**

In [131]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)

        ##### position shape: (max_seq_length, 1)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        ##### position shape: (d_model/2, )
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
                
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# **3) Multi-Head Attention Layer**

In [132]:
class MultiHeadAttentionLayer(torch.nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super(MultiHeadAttentionLayer, self).__init__()

        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = self.hid_dim // self.n_heads

        self.fc_q = torch.nn.Linear(self.hid_dim, self.hid_dim)
        self.fc_k = torch.nn.Linear(self.hid_dim, self.hid_dim)
        self.fc_v = torch.nn.Linear(self.hid_dim, self.hid_dim)

        self.fc_o = torch.nn.Linear(self.hid_dim, self.hid_dim)

        self.dropout = torch.nn.Dropout(dropout)
        
        self.softmax = torch.nn.Softmax(dim=-1)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask=None):
        ##### query: (batch_size, seq_len, hid_dim)
        batch_size = query.shape[0]

        ##### Q: (batch_size, seq_len, hid_dim)
        Q = self.fc_q(query)
        K = self.fc_q(key)
        V = self.fc_q(value)

        ##### Q: (batch_size, n_heads, seq_len, head_dim)
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        ##### Unnormalized Attention Weights: (batch_size, n_heads, query_len, key_len)  ---- query_len = seq_len, key_len = seq_len
        energy = torch.matmul(Q, K.permute(0,1, 3, 2)) / self.scale

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = self.softmax(energy)

        ##### Attention: (batch_size, n_heads, query_len, head_dim)
        x = torch.matmul(self.dropout(energy), V)

        ##### Attention: (batch_size, query_len, n_heads, head_dim)
        x = x.permute(0, 2, 1, 3).contiguous()

        ##### size: (batch_size, seq_len, hid_dim)
        x = x.view(batch_size, -1, self.hid_dim)

        ##### size: (batch_size, seq_len, hid_dim)
        x = self.fc_o(x)

        return x, attention

        


        


att = MultiHeadAttentionLayer(4, 2, 0.5, DEVICE).to(DEVICE)
src = torch.ones((1,3,4)).to(DEVICE)
print(src.shape)

a, b = att(src, src, src)
print(a)
print(b)

torch.Size([1, 3, 4])
tensor([[[ 7.2706,  7.1466,  8.4439, -8.3005],
         [ 2.5720,  2.5876,  2.7680, -2.5002],
         [ 4.9907,  4.8779,  5.7770, -5.5623]]], device='cuda:0',
       grad_fn=<ViewBackward0>)
tensor([[[[0.3333, 0.3333, 0.3333],
          [0.3333, 0.3333, 0.3333],
          [0.3333, 0.3333, 0.3333]],

         [[0.3333, 0.3333, 0.3333],
          [0.3333, 0.3333, 0.3333],
          [0.3333, 0.3333, 0.3333]]]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)


# **4) Position-wise Feedforward Layer**

In [133]:
class PositionwiseFeedforwardLayer(torch.nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super(PositionwiseFeedforwardLayer, self).__init__()

        self.fc1 = torch.nn.Linear(hid_dim, pf_dim)
        self.fc2 = torch.nn.Linear(pf_dim, hid_dim)

        self.dropout = torch.nn.Dropout(dropout)
    
    def forward(self, x):
        x = self.dropout(torch.relu(self.fc1(x)))

        x = self.fc2(x)

        return x

# **5) Encoder**

In [134]:
class EncoderLayer(torch.nn.Module):
    def __init__(self, hid_dim, n_heads, pf_hid, dropout, device):
        super(EncoderLayer, self).__init__()

        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_ff = PositionwiseFeedforwardLayer(hid_dim, pf_hid, dropout)
        self.atten_layre_norm = torch.nn.LayerNorm(hid_dim)
        self.pw_ff_layer_norm = torch.nn.LayerNorm(hid_dim)

        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, src, src_mask):
        ##### src = (batch_size, src_len, hid_dim)
        ##### src_mask = (batch_size, 1, 1, src_len)
        ##### att_out: (batch_size, seq_len, hid_dim)
        att_out, _ = self.self_attention(src, src, src, src_mask)
        pw_ff_in = self.atten_layre_norm(self.dropout(att_out) + src)
        
        ##### out: (batch_size, seq_len, hid_dim)
        pw_ff_out = self.positionwise_ff(pw_ff_in)
        out = self.pw_ff_layer_norm(pw_ff_in + self.dropout(pw_ff_out))

        return out


class Encoder(torch.nn.Module):
    def __init__ (self, input_dim, n_layers, hid_dim, n_heads, pf_hid, dropout, device, max_seq_length=100):
        super(Encoder, self).__init__()

        self.device = device
        self.token_embedding = torch.nn.Embedding(input_dim, hid_dim)
        self.position_embedding = torch.nn.Embedding(max_seq_length, hid_dim)
        self.layers = torch.nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_hid, dropout, device) for _ in range(n_layers)])

        self.dropout = torch.nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, src, src_mask):
        ##### src = (batch size, src len)
        batch_size = src.shape[0]
        src_len = src.shape[1]

        ##### pos: (batch_size, seq_len)
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        ##### src: (batch_size, seq_len, hid_dim)
        src = self.dropout(self.token_embedding(src)*self.scale + self.position_embedding(pos))

        for layer in self.layers:
            src = layer(src, src_mask)

        return src


src = torch.ones((5, 10), dtype=int).to(DEVICE)
src_mask = torch.ones((5, 1, 1, 10), dtype=int).to(DEVICE)
encoder = Encoder(15, 2, 16, 2, 64, 0.5, DEVICE, 100).to(DEVICE)
a = encoder(src, src_mask)


# **6) Decoder**

In [135]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super(DecoderLayer, self).__init__()

        self.self_att_layer_norm = torch.nn.LayerNorm(hid_dim)
        self.enc_att_layer_norm = torch.nn.LayerNorm(hid_dim)
        self.ff_layer_norm = torch.nn.LayerNorm(hid_dim)

        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.enc_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_FF = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)

        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        ##### trg: (batch_size, trg_len, hid_dim)
        ##### enc_src: (batch_size, src_len, hid_dim)
        ##### src_mask: (batch_size, 1, 1, trg_len)
        ##### trg_mask: (batch_size, 1, trg_len, trg_len)
        ##### self_att_out: (batch_size, trg_len, hid_dim)
        self_att_out, _ = self.self_attention(trg, trg, trg, trg_mask)
        out = self.self_att_layer_norm(self.dropout(self_att_out) + trg)

        enc_att_out, attention = self.enc_attention(out, enc_src, enc_src, src_mask)
        out = self.enc_att_layer_norm(self.dropout(enc_att_out) + out)

        pw_ff_out = self.positionwise_FF(out)
        out = self.ff_layer_norm(self.dropout(pw_ff_out) + out)

        return out, attention
    


class Decoder(torch.nn.Module):
    def __init__(self, output_dim, n_layers, hid_dim, n_heads, pf_hid, dropout, device, max_seq_length=100):
        super(Decoder, self).__init__()

        self.device = device
        self.token_embedding = torch.nn.Embedding(output_dim, hid_dim)
        self.position_embedding = torch.nn.Embedding(max_seq_length, hid_dim)
        self.layers = torch.nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_hid, dropout, device) for _ in range(n_layers)])

        self.fc_out = torch.nn.Linear(hid_dim, output_dim)

        self.dropout = torch.nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        ##### trg: (batch_size, trg_len)
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        ##### pos: (batch_size, seq_len)
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        ##### src: (batch_size, seq_len, hid_dim)
        """print(f"pos: {pos.shape}")
        print(self.token_embedding(trg).shape)
        print(self.position_embedding(pos).shape)"""
        trg = self.dropout(self.token_embedding(trg)*self.scale + self.position_embedding(pos))

        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        ##### output = (batch size, trg len, output dim)
        output = self.fc_out(trg)

        return output, attention

# **7) Seq2Seq**

In [136]:
class Seq2Seq(torch.nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask
    
    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()

        trg_mask = trg_pad_mask & trg_sub_mask

        return trg_mask
    
    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        enc_src = self.encoder(src, src_mask)
        #print(enc_src.shape)

        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)

        #print(output.shape)

        return output, attention
        



# **6) Parameter initialization**

In [137]:
Input_Dim = len(src_Field.vocab)
Output_Dim = len(trg_Field.vocab)
Hid_Dim = 256
Encoder_N_Layer = 3
Decoder_N_Layer = 3
Encoder_N_Heads = 8
Decoder_N_Heads = 8
Encoder_PF_Dim = 512
Decoder_PF_Dim = 512
Encoder_Dropout = 0.1
Decoder_Dropout = 0.1
Src_Pad_Idx = src_Field.vocab.stoi[src_Field.pad_token]
Trg_Pad_Idx = trg_Field.vocab.stoi[trg_Field.pad_token]


encoder = Encoder(input_dim=Input_Dim, n_layers=Encoder_N_Layer, hid_dim=Hid_Dim, n_heads=Encoder_N_Heads, pf_hid=Encoder_PF_Dim, dropout=Decoder_Dropout, device=DEVICE)
decoder = Decoder(output_dim=Output_Dim, n_layers=Decoder_N_Layer, hid_dim=Hid_Dim, n_heads=Decoder_N_Heads, pf_hid=Decoder_PF_Dim, dropout=Encoder_Dropout, device=DEVICE)
seq2seq = Seq2Seq(encoder=encoder, decoder=decoder, src_pad_idx=Src_Pad_Idx, trg_pad_idx=Trg_Pad_Idx, device=DEVICE).to(DEVICE)

Learning_Rate = 0.0005
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=Learning_Rate)
criterion = torch.nn.CrossEntropyLoss(ignore_index=Trg_Pad_Idx)

In [138]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        torch.nn.init.xavier_uniform_(m.weight.data)

seq2seq.apply(initialize_weights);


# **7) Train function**

In [139]:
def train(model, iterator, optimzer, criterion, clip):
    model.train()
    epoch_loss = 0

    for i, batch in enumerate(iterator):
        #print(src)
        src, src_len = batch.src_field
        trg, trg_len = batch.trg_field

        optimizer.zero_grad()
        output, _ = model(src, trg[:,:-1])

        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:,1:].contiguous().view(-1)

        loss = criterion(output, trg)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# **8) Evaluation function**

In [140]:
def evaluatef(model, iterator, criterion):
    model.eval()

    epoch_loss = 0

    for i, batch in enumerate(iterator):
        src, src_len = batch.src_field
        trg, trg_len = batch.trg_field

        output, _ = model(src, trg[:,:-1])

        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output, trg)
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [141]:
N_EPOCHS = 25
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):    
    train_loss = train(seq2seq, train_Iterator, optimizer, criterion, CLIP)
    valid_loss = evaluatef(seq2seq, test_Iterator, criterion)
            
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(seq2seq.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01
	Train Loss: 5.360 | Train PPL: 212.825
	 Val. Loss: 5.228 |  Val. PPL: 186.453
Epoch: 02
	Train Loss: 4.926 | Train PPL: 137.886
	 Val. Loss: 4.795 |  Val. PPL: 120.919
Epoch: 03
	Train Loss: 5.030 | Train PPL: 152.995
	 Val. Loss: 5.059 |  Val. PPL: 157.397
Epoch: 04
	Train Loss: 4.967 | Train PPL: 143.526
	 Val. Loss: 5.010 |  Val. PPL: 149.914
Epoch: 05
	Train Loss: 4.899 | Train PPL: 134.120
	 Val. Loss: 4.941 |  Val. PPL: 139.882
Epoch: 06
	Train Loss: 4.817 | Train PPL: 123.572
	 Val. Loss: 4.899 |  Val. PPL: 134.215
Epoch: 07
	Train Loss: 4.736 | Train PPL: 113.930
	 Val. Loss: 4.844 |  Val. PPL: 126.942
Epoch: 08
	Train Loss: 4.671 | Train PPL: 106.833
	 Val. Loss: 4.811 |  Val. PPL: 122.881
Epoch: 09
	Train Loss: 4.626 | Train PPL: 102.145
	 Val. Loss: 4.780 |  Val. PPL: 119.107
Epoch: 10
	Train Loss: 4.575 | Train PPL:  97.010
	 Val. Loss: 4.796 |  Val. PPL: 121.082
Epoch: 11
	Train Loss: 4.538 | Train PPL:  93.541
	 Val. Loss: 4.737 |  Val. PPL: 114.113
Epoch: 12


# **9) Computing Metrics (Bleu Score, Rouge, WER)**

In [142]:
# Function to translate a sentence using argmax
def translate_sentence(model, sentence, trg_sent, src_field, tgt_field, device, max_len=50):
    model.eval()


    tokens = src_field.tokenize(sentence)
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    tokens = tgt_field.tokenize(trg_sent)
    tokens = [tgt_field.init_token] + tokens + [tgt_field.eos_token]
    src_indexes = [tgt_field.vocab.stoi[token] for token in tokens]
    trg_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)


    src_mask = model.make_src_mask(src_tensor)
    trg_mask = model.make_trg_mask(trg_tensor)
    with torch.no_grad():
        encoder_outputs = model.encoder(src_tensor, src_mask)


    
    """for i in range(max_len):"""

    with torch.no_grad():
        output, _ = model.decoder(trg_tensor, encoder_outputs, trg_mask, src_mask)


    output = output.squeeze()

    pred_token = output.argmax(1)


    """if pred_token == tgt_field.vocab.stoi[tgt_field.eos_token]:
        break"""
    
    trg_indexes = list(np.array(pred_token.cpu()))

    trg_tokens = [tgt_field.vocab.itos[i] for i in trg_indexes[:-1]]
    return trg_tokens[1:-1], trg_indexes[1:-1]

# Translate each test data point using argmax and print the English sentence, the ground truth Dutch translation, and the model's generated Dutch translation
predicted_output = []
ground_truth_output = []
bleu = load_metric("bleu") 
rouge = evaluate.load('rouge')
Bleu_total = 0
rouge1_total = 0
rouge2_total = 0
rougeL_total = 0
rougeLsum_total = 0
WER_total = 0

n = 0
for batch in tqdm(valid_Iterator):
    src = batch.src_field[0]
    trg = batch.trg_field[0]



    for idx in range(src.shape[0]):

        src_sent = ' '.join([src_Field.vocab.itos[i] for i in src[idx, 1:src.shape[1]-1]])
        trg_sent = ' '.join([trg_Field.vocab.itos[i] for i in trg[idx, 1:trg.shape[1]-1]])
        trg_sent = trg_sent.split(" ")
        final_trg = [] 
        for w in trg_sent:
            if w != '<pad>' and w != '<eos>':
                final_trg.append(w)

        
        
        translated_sent, _ = translate_sentence(seq2seq, src_sent, ' '.join(final_trg), src_Field, trg_Field, DEVICE)

        final_trg = [] 
        for w in trg_sent:
            if w != '<pad>' and w != '<eos>':
                final_trg.append(w)
        final_trg = " ".join(w for w in final_trg)
        
        """print(f"English: {src_sent}")
        print(f"Ground Truth Dutch: {final_trg}")
        print(f"Generated Dutch: {' '.join(translated_sent)}")"""
        predicted_output = translated_sent
        ground_truth_output = final_trg.split()

        ## Computing the Rouge
        results = rouge.compute(predictions=[' '.join(predicted_output)], references=[final_trg])
        rouge1_total += results['rouge1']
        rouge2_total += results['rouge2']
        rougeL_total += results['rougeL']
        rougeLsum_total += results['rougeLsum']

        ## Computing the Word Error Rate
        WER_total += wer(final_trg, ' '.join(predicted_output))

        ## Computing the BLEU score
        predicted_output = [predicted_output]
        ground_truth_output = [[ground_truth_output]]

        Bleu_total += bleu.compute(predictions=predicted_output, references=ground_truth_output)['bleu']
        n += 1



print(f"The Average Testing Bleu Score is: {Bleu_total / n}")
print(f"The Average Testing Rouge-1 Score is: {rouge1_total / n}")
print(f"The Average Testing Rouge-2 Score is: {rouge2_total / n}")
print(f"The Average Testing Rouge-L Score is: {rougeL_total / n}")
print(f"The Average Testing Rouge-Lsum Score is: {rougeLsum_total / n}")
print(f"Average Word Error Rate: {WER_total / n}")


        

100%|██████████| 46/46 [03:07<00:00,  4.07s/it]

The Average Testing Bleu Score is: 0.0015744735271415128
The Average Testing Rouge-1 Score is: 0.25114728547952536
The Average Testing Rouge-2 Score is: 0.028272034809479856
The Average Testing Rouge-L Score is: 0.22628870846405386
The Average Testing Rouge-Lsum Score is: 0.22628870846405386
Average Word Error Rate: 0.7800511333513602





In [38]:
import gc
gc.collect()
torch.cuda.empty_cache()