In [1]:
### Imports ###

import matplotlib.pyplot as plt
plt.switch_backend('TkAgg')
import matplotlib.ticker as ticker
import numpy as np
import random

img_path = 'imgs/'

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import pandas as pd
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, random_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
SOS_token = 0
EOS_token = 1

class Chars:
    def __init__(self, name):
        self.name = name
        self.char2index = {}
        self.char2count = {}
        self.index2char = {0: "SOS", 1: "EOS"}
        self.n_chars = 2

    def addWord(self, word):
        for char in word:
            self.addChar(char)

    def addChar(self, char):
        if char not in self.char2index:
            self.char2index[char] = self.n_chars
            self.char2count[char] = 1
            self.index2char[self.n_chars] = char
            self.n_chars += 1
        else:
            self.char2count[char] += 1

In [8]:
def readChars(lang1, lang2, reverse=False):
    path = lang1 + '-' + lang2
    data_path = 'data/'+path+'.csv'

    df = pd.read_csv(data_path)
    source_words = []
    target_words = []
    pairs = []

    for i in range(len(df)):
        pair = df.iloc[i][[lang1, lang2]]
        pairs.append(pair)
        source_words.append(pair[0])
        target_words.append(pair[1])

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Chars(lang2)
        output_lang = Chars(lang1)
    else:
        input_lang = Chars(lang1)
        output_lang = Chars(lang2)

    return input_lang, output_lang, pairs

In [4]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readChars(lang1, lang2, reverse)
    print("Read {} sentence pairs".format(len(pairs)))
    print("Counting chars...")
    for pair in pairs:
        input_lang.addWord(pair[0])
        output_lang.addWord(pair[1])
    print("Counted chars:")
    print(input_lang.name, input_lang.n_chars)
    print(output_lang.name, output_lang.n_chars)
    return input_lang, output_lang, pairs

In [9]:
input_lang, output_lang, pairs = prepareData('ina', 'fin', reverse=True)
print(random.choice(pairs))

Read 350 sentence pairs
Counting chars...
Counted chars:
fin 27
ina 38
['lisæːntyæ', 'lɑsɑnið']


In [67]:
MAX_LENGTH_INPUT = max(len(pair[0]) for pair in pairs)
MAX_LENGTH_OUTPUT = max(len(pair[1]) for pair in pairs)
MAX_LENGTH = max(MAX_LENGTH_INPUT, MAX_LENGTH_OUTPUT)+1

In [68]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [69]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [70]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1) 
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [71]:
def indexesFromWord(lang, word):
    return [lang.char2index[char] for char in word]

def tensorFromWord(lang, word):
    indexes = indexesFromWord(lang, word)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromWord(input_lang, pair[0])
    target_tensor = tensorFromWord(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData('ina', 'fin')

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromWord(input_lang, inp)
        tgt_ids = indexesFromWord(output_lang, tgt)
        
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    all_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_test_gen = torch.Generator().manual_seed(42)
    train_data, test_data = random_split(all_data, [0.8, 0.2], generator=train_test_gen)
    
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    return input_lang, output_lang, train_dataloader, test_dataloader, test_data

In [72]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [73]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [74]:
def checkpoint(encoder, decoder, epoch, hidden_size, batch_size, learning_rate, final=False):
    if final:
        torch.save(encoder.state_dict(), 'ckpt/encoder_'+str(epoch)+'_'+str(hidden_size)+'_'+str(batch_size)+'_'+str(learning_rate)+'_final.dict')
        torch.save(decoder.state_dict(), 'ckpt/decoder_'+str(epoch)+'_'+str(hidden_size)+'_'+str(batch_size)+'_'+str(learning_rate)+'_final.dict')
    else:    
        torch.save(encoder.state_dict(), 'ckpt/encoder_'+str(epoch)+'_'+str(hidden_size)+'_'+str(batch_size)+'_'+str(learning_rate)+'.dict')
        torch.save(decoder.state_dict(), 'ckpt/decoder_'+str(epoch)+'_'+str(hidden_size)+'_'+str(batch_size)+'_'+str(learning_rate)+'.dict')
    
def resume(epoch, hidden_size, batch_size, learning_rate, dropout=0.1):
    encoder = EncoderRNN(input_lang.n_chars, hidden_size).to(device)
    decoder = AttnDecoderRNN(hidden_size, output_lang.n_chars, dropout=dropout).to(device)

    encoder.load_state_dict(torch.load('ckpt/encoder_'+str(epoch)+'_'+str(hidden_size)+'_'+str(batch_size)+'_'+str(learning_rate)+'.dict'))
    decoder.load_state_dict(torch.load('ckpt/decoder_'+str(epoch)+'_'+str(hidden_size)+'_'+str(batch_size)+'_'+str(learning_rate)+'.dict'))
    return encoder, decoder

In [91]:
encoder, decoder = resume(15, 512, 64, 0.001)

[codecarbon INFO @ 15:04:50] Energy consumed for RAM : 0.004363 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 15:04:50] Energy consumed for all CPUs : 0.025376 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 15:04:50] 0.029739 kWh of electricity used since the beginning.


In [76]:
def is_decreasing_with_patience(lst, patience=5):
    non_decreasing_count = 0

    for i in range(len(lst) - 1):
        if lst[i] <= lst[i + 1]:
            non_decreasing_count += 1
            if non_decreasing_count > patience:
                return False
        else:
            non_decreasing_count = 0

    return True

In [77]:
def train(train_dataloader, test_dataloader, encoder, decoder, n_epochs, learning_rate,
          hidden_size, batch_size, patience=5):
    start = time.time()
    plot_losses = []
    print_loss_total = 0 
    plot_loss_total = 0 
    
    val_losses = []
    validation_list = []
    print_val_loss_total = 0
    plot_val_loss_total = 0
    bleu_total = []
    chrf_total = []

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    best_val_loss = float('inf')
    epochs_without_improvement = 0
    
    for epoch in range(1, n_epochs + 1):
        start = time.time()
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        checkpoint(encoder, decoder, epoch, hidden_size, batch_size, learning_rate)
        print_loss_total += loss
        plot_loss_total += loss
        
        val_loss = validate(test_dataloader, encoder, decoder)

        validation_list.append(val_loss)
        
        print_val_loss_total += val_loss
        plot_val_loss_total += val_loss
        bleu, chrf = calculate_scores(test_data)

        bleu_total.append(bleu)
        chrf_total.append(chrf)

        print_loss_avg = print_loss_total
        print_loss_total = 0
            
        print_val_loss_avg = print_val_loss_total
        print_val_loss_total = 0
            
        print('Epoch: {}/{},\tTime Taken: {:.2f} seconds,\tTraining Loss: {:.4f},\tValidation Loss: {:.4f}'.format(epoch, n_epochs, time.time()-start, print_loss_avg, print_val_loss_avg))
        
        plot_loss_avg = plot_loss_total
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0
            
        plot_val_loss_avg = plot_val_loss_total
        val_losses.append(plot_val_loss_avg)
        plot_val_loss_total = 0

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement == patience:
            print("Validation loss has not gone down for " + str(patience) + " epochs. Implementing Early Stopping")
            encoder, decoder = resume(validation_list.index(best_val_loss)+1, hidden_size, batch_size, learning_rate) #get the epoch needed
            checkpoint(encoder, decoder, validation_list.index(best_val_loss)+1, hidden_size, batch_size, learning_rate, final=True)
            break

    showPlot(plot_losses, val_losses, hidden_size, batch_size, learning_rate)
    showMetricPlot(bleu_total, chrf_total, hidden_size, batch_size, learning_rate)

In [78]:
def validate(test_dataloader, encoder, decoder):
    encoder.eval()
    decoder.eval()
    
    criterion = nn.NLLLoss()
    
    total_loss = 0
    for data in test_dataloader:
    
        input_tensor, target_tensor = data
    
        encoder_outputs, encoder_hidden = encoder(input_tensor) 
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        
        total_loss += loss.item()
        
    encoder.train()
    decoder.train()
    
    return total_loss/len(test_dataloader)
    

In [79]:
def showPlot(train_loss, val_loss, hs, bs, lr):
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(train_loss, label='Training Loss', color='blue')
    ax.plot(val_loss, label='Validation Loss',color='red')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.set_title("Training and Validation Loss Curve")
    plt.savefig('imgs/Train_Val_Loss_Graph'+str(hs)+'_'+str(bs)+'_'+str(lr)+'.png')
    plt.close(fig)

def showMetricPlot(b, c, hs, bs, lr):
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(b, label='Bleu', color='green')
    ax.plot(c, label='CHRF', color='orange')
    ax.set_xlabel('Epochs')
    #ax.set_ylabel('Score')
    ax.legend()
    ax.set_title("Bleu and CHRF Scores while Training")
    plt.savefig('imgs/Bleu_CHRF_Score_Graph'+str(hs)+'_'+str(bs)+'_'+str(lr)+'.png')
    plt.close(fig)

In [80]:
def evaluater(encoder, decoder, word, input_lang, output_lang):
    with torch.no_grad():
        #input_tensor = tensorFromWord(input_lang, word)
        input_tensor = word
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_chars = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_chars.append('<EOS>')
                break
            decoded_chars.append(output_lang.index2char[idx.item()])
        
    return decoded_chars, decoder_attn

In [105]:
def showAttention(input_word, output_chars, correct_output, attentions):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.cpu().numpy(), cmap='bone', extent=[-0.5, len(input_word)-0.5, len(output_chars)-0.5, -0.5])
    fig.colorbar(cax)

    ax.set_title(input_word + ' -> ' + correct_output)
    
    ax.set_xticklabels([''] + [*input_word] + ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_chars)
    
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.savefig(img_path+input_word+'_attention.png', bbox_inches='tight')
    plt.close(fig)


In [103]:
def evaluateRandomly(encoder, decoder, n=10, eval_all=False):
    if eval_all:
        for i, pair in enumerate(test_data):
            pair = (torch.unsqueeze(pair[0], 0), torch.unsqueeze(pair[1], 0))
            output_chars, attentions = evaluater(encoder, decoder, pair[0], input_lang, output_lang)
            output_word = ''.join(output_chars)
            decoded_input = []
        
            in0=[]
            for idx in torch.squeeze(pair[0],0):
                if idx.item() == EOS_token:
                    in0.append('<EOS>')
                    break
                in0.append(input_lang.index2char[idx.item()])
            
            in1=[]
            for idx in torch.squeeze(pair[1],0):
                if idx.item() == EOS_token:
                    in1.append('<EOS>')
                    break
                in1.append(output_lang.index2char[idx.item()])
            
            input_word = ''.join(decoded_input)
            print('>', ''.join(in0))
            print('=', ''.join(in1))
            print('<', output_word)
            print('')
            showAttention(''.join(in0[0:-1]), output_chars, ''.join(in1[0:-1]), attentions[0, :len(output_chars), :])
    for i in range(n):
        random_index = random.randint(0, len(test_data))
        pair = test_data.dataset[random_index]
        pair = (torch.unsqueeze(pair[0], 0), torch.unsqueeze(pair[1], 0))
        
        output_chars, attentions = evaluater(encoder, decoder, pair[0], input_lang, output_lang)
        output_word = ''.join(output_chars)
        decoded_input = []
        
        in0=[]
        for idx in torch.squeeze(pair[0],0):
            if idx.item() == EOS_token:
                in0.append('<EOS>')
                break
            in0.append(input_lang.index2char[idx.item()])
            
        in1=[]
        for idx in torch.squeeze(pair[1],0):
            if idx.item() == EOS_token:
                in1.append('<EOS>')
                break
            in1.append(output_lang.index2char[idx.item()])
            
        input_word = ''.join(decoded_input)
        print('>', ''.join(in0))
        print('=', ''.join(in1))
        print('<', output_word)
        print('')
        showAttention(''.join(in0[0:-1]), output_chars, ''.join(in1[0:-1]), attentions[0, :len(output_chars), :])

In [83]:
from collections import Counter

def modified_precision(candidate, references, n):

    cand_counts = Counter(candidate)
    max_ref_counts = Counter(references)

    for ref in references:
        max_ref_counts |= Counter(ref)

    clipped_counts = {ngram: min(count, max_ref_counts[ngram]) for ngram, count in cand_counts.items()}

    return sum(clipped_counts.values()) / (len(candidate) or 1)

In [84]:
def generate_ngrams(s, n):
    if len(s) < n:
        return []

    ngrams = [s[i:i+n] for i in range(len(s)-n+1)]
    return ngrams

[codecarbon INFO @ 13:23:19] Energy consumed for RAM : 0.000291 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 13:23:19] Energy consumed for all CPUs : 0.001692 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 13:23:19] 0.001982 kWh of electricity used since the beginning.


In [85]:
from numpy import mean
import math

def bleu_score(can, ref, weights=[0.25, 0.25, 0.25, 0.25]):
    scores = {}
    for which_grammy in range(1, len(weights) + 1):
        scores_temp = []
        canngrams = generate_ngrams(can, which_grammy)
        refngrams = generate_ngrams(ref, which_grammy)
        bp = math.e**(1 - (len(ref) / len(can)))
        scores[which_grammy] = bp * modified_precision(canngrams, refngrams, which_grammy)
    total_score = 0
    for score, weight in zip(scores, weights):
        total_score = total_score + scores[score] * weight
    return scores, total_score

print(bleu_score('aaabdhd','abcdaaks'))

({1: 0.6191984998215584, 2: 0.2889592999167272, 3: 0.0, 4: 0.0}, 0.2270394499345714)


In [86]:
def levenshtein_distance_and_alignment(s, t):
    rows, cols = len(s) + 1, len(t) + 1
    dist = [[0 for _ in range(cols)] for _ in range(rows)]

    for i in range(1, rows):
        dist[i][0] = i
    for j in range(1, cols):
        dist[0][j] = j

    for i in range(1, rows):
        for j in range(1, cols):
            if s[i - 1] == t[j - 1]:
                cost = 0
            else:
                cost = 1
            dist[i][j] = min(dist[i - 1][j] + 1,
                             dist[i][j - 1] + 1,
                             dist[i - 1][j - 1] + cost)

    aligned_s, aligned_t = "", ""
    i, j = len(s), len(t)
    while i > 0 or j > 0:
        if i > 0 and j > 0 and dist[i][j] == dist[i - 1][j - 1] + (s[i - 1] != t[j - 1]):
            aligned_s = s[i - 1] + aligned_s
            aligned_t = t[j - 1] + aligned_t
            i, j = i - 1, j - 1
        elif i > 0 and dist[i][j] == dist[i - 1][j] + 1:
            aligned_s = s[i - 1] + aligned_s
            aligned_t = "$" + aligned_t
            i -= 1
        else:
            aligned_s = "#" + aligned_s
            aligned_t = t[j - 1] + aligned_t
            j -= 1

    return aligned_s, aligned_t, dist[len(s)][len(t)]

In [87]:
def chrf(reference, candidate, max_n=6, beta=2):
    def ngrams(s, n):
        return [s[i:i+n] for i in range(len(s) - n + 1)]

    def count_ngrams(ngram_list):
        ngram_count = {}
        for ngram in ngram_list:
            ngram_count[ngram] = ngram_count.get(ngram, 0) + 1
        return ngram_count

    chrp_total = 0
    chrr_total = 0

    for n in range(1, max_n + 1):
        ref_ngrams = count_ngrams(ngrams(reference, n))
        cand_ngrams = count_ngrams(ngrams(candidate, n))

        overlap = sum(min(ref_ngrams.get(ng, 0), cand_ngrams.get(ng, 0)) for ng in cand_ngrams)

        chrp = overlap / sum(cand_ngrams.values()) if cand_ngrams else 0
        chrr = overlap / sum(ref_ngrams.values()) if ref_ngrams else 0

        chrp_total += chrp
        chrr_total += chrr

    avg_chrp = chrp_total / max_n
    avg_chrr = chrr_total / max_n

    if avg_chrp + avg_chrr == 0:
        return 0
    else:
        return (1 + beta**2) * (avg_chrp * avg_chrr) / (beta**2 * avg_chrp + avg_chrr)

In [93]:
def calculate_scores(test_data):
    total = 0
    length = len(test_data)
    total_bleu_1 = 0
    total_chrf = 0
    for pair in test_data:
        pair = (torch.unsqueeze(pair[0], 0), torch.unsqueeze(pair[1], 0))
        
        output_chars, _ = evaluater(encoder, decoder, pair[0], input_lang, output_lang)
        
        in1 = []
        for idx in torch.squeeze(pair[1], 0):
            if idx.item() == EOS_token:
                break
            in1.append(output_lang.index2char[idx.item()])

        ref_string = ''.join(in1)
        candidate_string = ''.join(output_chars[:-1])

        r, c, _ = levenshtein_distance_and_alignment(ref_string, candidate_string)
        
        scores, total_score = bleu_score(candidate_string, ref_string)
        chrf_temp = chrf(ref_string, candidate_string, max_n=4)
        total_bleu_1 = total_bleu_1 + scores[1]
        total = total + total_score
        total_chrf = total_chrf + chrf_temp
    return total / length, total_chrf / length

print(calculate_scores(test_data))

(0.30184083777607607, 0.3121011564380961)


[codecarbon INFO @ 16:28:59] Energy consumed for RAM : 0.007442 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 16:28:59] Energy consumed for all CPUs : 0.043283 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 16:28:59] 0.050725 kWh of electricity used since the beginning.
[codecarbon INFO @ 16:28:59] Energy consumed for RAM : 0.007738 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 16:28:59] Energy consumed for all CPUs : 0.045008 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 16:28:59] 0.052747 kWh of electricity used since the beginning.


In [89]:
hidden_size = 512
batch_size = 8
learning_rate = 0.001
dropout = 0.1

input_lang, output_lang, train_dataloader, test_dataloader, test_data = get_dataloader(batch_size)

encoder = EncoderRNN(input_lang.n_chars, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_chars, dropout=dropout).to(device)

from codecarbon import EmissionsTracker

tracker = EmissionsTracker(output_dir="emissions", output_file="emissions.csv")

tracker.start()

train(train_dataloader, test_dataloader, encoder, decoder, 1000, learning_rate=learning_rate, patience=5, hidden_size=hidden_size, batch_size=batch_size)

emissions = tracker.stop()

print(f"Emissions: {emissions} kg")

[codecarbon INFO @ 13:23:23] [setup] RAM Tracking...
[codecarbon INFO @ 13:23:23] [setup] GPU Tracking...
[codecarbon INFO @ 13:23:23] No GPU found.
[codecarbon INFO @ 13:23:23] [setup] CPU Tracking...


Read 350 sentence pairs
Counting chars...
Counted chars:
ina 38
fin 27


[codecarbon INFO @ 13:23:24] CPU Model on constant consumption mode: 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
[codecarbon INFO @ 13:23:24] >>> Tracker's metadata:
[codecarbon INFO @ 13:23:24]   Platform system: Linux-6.1.55-06877-gc83437f2949f-x86_64-with-glibc2.31
[codecarbon INFO @ 13:23:24]   Python version: 3.9.2
[codecarbon INFO @ 13:23:24]   CodeCarbon version: 2.3.2
[codecarbon INFO @ 13:23:24]   Available RAM : 6.421 GB
[codecarbon INFO @ 13:23:24]   CPU count: 8
[codecarbon INFO @ 13:23:24]   CPU model: 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
[codecarbon INFO @ 13:23:24]   GPU count: None
[codecarbon INFO @ 13:23:24]   GPU model: None


Epoch: 1/1000,	Time Taken: 4.80 seconds,	Training Loss: 1.2583,	Validation Loss: 1.0043


[codecarbon INFO @ 13:23:34] Energy consumed for RAM : 0.000301 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 13:23:34] Energy consumed for all CPUs : 0.001750 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 13:23:34] 0.002051 kWh of electricity used since the beginning.


Epoch: 2/1000,	Time Taken: 4.52 seconds,	Training Loss: 0.8209,	Validation Loss: 0.8219
Epoch: 3/1000,	Time Taken: 4.46 seconds,	Training Loss: 0.5757,	Validation Loss: 0.7502


[codecarbon INFO @ 13:23:42] Energy consumed for RAM : 0.000010 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 13:23:42] Energy consumed for all CPUs : 0.000058 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 13:23:42] 0.000068 kWh of electricity used since the beginning.


Epoch: 4/1000,	Time Taken: 4.22 seconds,	Training Loss: 0.3934,	Validation Loss: 0.6593


[codecarbon INFO @ 13:23:49] Energy consumed for RAM : 0.000311 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 13:23:49] Energy consumed for all CPUs : 0.001808 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 13:23:49] 0.002119 kWh of electricity used since the beginning.


Epoch: 5/1000,	Time Taken: 4.55 seconds,	Training Loss: 0.2500,	Validation Loss: 0.6585
Epoch: 6/1000,	Time Taken: 4.09 seconds,	Training Loss: 0.1639,	Validation Loss: 0.6612


[codecarbon INFO @ 13:23:57] Energy consumed for RAM : 0.000020 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 13:23:57] Energy consumed for all CPUs : 0.000117 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 13:23:57] 0.000137 kWh of electricity used since the beginning.


Epoch: 7/1000,	Time Taken: 4.36 seconds,	Training Loss: 0.1097,	Validation Loss: 0.6419
Epoch: 8/1000,	Time Taken: 4.74 seconds,	Training Loss: 0.0731,	Validation Loss: 0.6896


[codecarbon INFO @ 13:24:04] Energy consumed for RAM : 0.000321 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 13:24:04] Energy consumed for all CPUs : 0.001867 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 13:24:04] 0.002188 kWh of electricity used since the beginning.


Epoch: 9/1000,	Time Taken: 4.68 seconds,	Training Loss: 0.0442,	Validation Loss: 0.6990


[codecarbon INFO @ 13:24:12] Energy consumed for RAM : 0.000030 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 13:24:12] Energy consumed for all CPUs : 0.000175 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 13:24:12] 0.000205 kWh of electricity used since the beginning.


Epoch: 10/1000,	Time Taken: 4.98 seconds,	Training Loss: 0.0266,	Validation Loss: 0.7181
Epoch: 11/1000,	Time Taken: 4.72 seconds,	Training Loss: 0.0170,	Validation Loss: 0.7337


KeyboardInterrupt: 

In [100]:
r_list=[]
c_list=[]
r_word_list = []
c_word_list = []
for pair in test_data:
        pair = (torch.unsqueeze(pair[0], 0), torch.unsqueeze(pair[1], 0))
        
        output_chars, _ = evaluater(encoder, decoder, pair[0], input_lang, output_lang)
        
        in1 = []
        for idx in torch.squeeze(pair[1], 0):
            if idx.item() == EOS_token:
                break
            in1.append(output_lang.index2char[idx.item()])

        r = ''.join(in1)
        c = ''.join(output_chars[:-1])

        r_list.append([ch for ch in r])
        c_list.append([ch for ch in c])
        r_word_list.append(r)
        c_word_list.append(c)
print('REP', r_word_list)
print('CAN', c_word_list)

REP ['rintɑ', 'liːtːæː', 'liːkːuɑ', 'terʋeys', 'kɑtːo', 'tuo̯mi', 'ei̯', 'tolɑ', 'rɑu̯tɑ', 'nɑpɑ', 'luo̯to', 'lɑu̯kːu', 'meri', 'luʋɑtɑ', 'kuːnːelːɑ', 'nie̯lːæ', 'kuo̯ri', 'ɑlkɑː', 'ʋene', 'kæsiʋɑrsi', 'syʋæ', 'niskɑ', 'syli', 'oksɑ', 'lisæːntyæ', 'hɑlːitɑ', 'ihme', 'kolme', 'tunːistɑː', 'kɑlɑstɑː', 'iloi̯nen', 'suo̯mu', 'mikæ', 'hei̯lutːɑː', 'muː', 'jɑ', 'noi̯tɑ', 'isæ', 'kirjɑʋɑ', 'kuːsi', 'jæː', 'juo̯dɑ', 'rɑu̯hɑ', 'læmsæ', 'jæːtyæ', 'syntyæ', 'ei̯ koskɑːn', 'ʋɑlmistɑː', 'mɑːilmɑ', 'sæːski', 'sɑlko', 'elæmæ', 'nenæ', 'ɑntɑː', 'ʋuo̯ri', 'ʋiholːinen', 'kɑt͡soɑ', 'pyhæ', 'et͡siæ', 'heʋonen', 'onsi', 'pyː', 'keskus', 'uːtinen', 'tosi', 'kynsi', 'pɑi̯stɑː', 'tæstæ', 'lɑu̯tɑ', 'pilʋi']
CAN ['rɑntɑ', 'liːtɑː', 'liːkɑtɑ', 'tæris', 'kɑto', 'tomi', 'ijo', 'tɑlɑ', 'rɑnɑtɑ', 'næi̯n', 'lutɑ', 'lɑi̯ne', 'mere', 'lɑpːɑtɑ', 'kulːɑ', 'nelæː', 'kuri', 'ɑlkɑː', 'ʋinen', 'kerhenen', 'tyæ', 'neskæ', 'suli', 'okɑs', 'liːɑtɑ', 'hiltɑ', 'omistɑ', 'kulmi', 'tuntuɑ', 'kɑlstɑː', 'ilo', 'somi', 'min', 'hei̯lyt

[codecarbon INFO @ 12:53:58] Energy consumed for RAM : 0.056893 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 12:53:58] Energy consumed for all CPUs : 0.330831 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 12:53:58] 0.387724 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:54:01] Energy consumed for RAM : 0.056598 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 12:54:01] Energy consumed for all CPUs : 0.329115 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 12:54:01] 0.385713 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:54:13] Energy consumed for RAM : 0.056903 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 12:54:13] Energy consumed for all CPUs : 0.330889 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 12:54:13] 0.387792 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:54:16] Energy consumed for RAM : 0.056608 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 12:54:16] Energy consumed f

In [107]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder, eval_all=True)

> rɑdːe<EOS>
= rintɑ<EOS>
< rɑntɑ<EOS>



  ax.set_xticklabels([''] + [*input_word] + ['<EOS>'], rotation=90)
  ax.set_yticklabels([''] + output_chars)


> lɑhteð<EOS>
= liːtːæː<EOS>
< liːtɑː<EOS>

> lihɐdið<EOS>
= liːkːuɑ<EOS>
< liːkɑtɑ<EOS>

> tiervɐsvuotɐ<EOS>
= terʋeys<EOS>
< tæris<EOS>

> kætːu<EOS>
= kɑtːo<EOS>
< kɑto<EOS>

> tuomɐ<EOS>
= tuo̯mi<EOS>
< tomi<EOS>

> ij<EOS>
= ei̯<EOS>
< ijo<EOS>

> tu̯æli<EOS>
= tolɑ<EOS>
< tɑlɑ<EOS>

> ryevdi<EOS>
= rɑu̯tɑ<EOS>
< rɑnɑtɑ<EOS>

> næːpi<EOS>
= nɑpɑ<EOS>
< næi̯n<EOS>

> lu̯ætu<EOS>
= luo̯to<EOS>
< lutɑ<EOS>

> lɑvkːɐ<EOS>
= lɑu̯kːu<EOS>
< lɑi̯ne<EOS>

> meːrɐ<EOS>
= meri<EOS>
< mere<EOS>

> lopedið<EOS>
= luʋɑtɑ<EOS>
< lɑpːɑtɑ<EOS>

> kuldɐlið<EOS>
= kuːnːelːɑ<EOS>
< kulːɑ<EOS>

> ɲielːɐð<EOS>
= nie̯lːæ<EOS>
< nelæː<EOS>

> korːɐ<EOS>
= kuo̯ri<EOS>
< kuri<EOS>

> ælɡið<EOS>
= ɑlkɑː<EOS>
< ɑlkɑː<EOS>

> voːnɐs<EOS>
= ʋene<EOS>
< ʋinen<EOS>

> kietɐverdi<EOS>
= kæsiʋɑrsi<EOS>
< kerhenen<EOS>

> tɑve<EOS>
= syʋæ<EOS>
< tyæ<EOS>

> niske<EOS>
= niskɑ<EOS>
< neskæ<EOS>

> solːɐ<EOS>
= syli<EOS>
< suli<EOS>

> u̯æksi<EOS>
= oksɑ<EOS>
< okɑs<EOS>

> lɑsɑnið<EOS>
= lisæːntyæ<EOS>
< liːɑtɑ<EOS

[codecarbon INFO @ 17:15:08] Energy consumed for RAM : 0.124853 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 17:15:08] Energy consumed for all CPUs : 0.726019 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 17:15:08] Energy consumed for RAM : 0.125150 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 17:15:08] 0.850872 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:15:08] Energy consumed for all CPUs : 0.727744 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 17:15:08] 0.852895 kWh of electricity used since the beginning.


> miː<EOS>
= mikæ<EOS>
< min<EOS>

> hi̯æilutːið<EOS>
= hei̯lutːɑː<EOS>
< hei̯lytːæː<EOS>

> nubːe<EOS>
= muː<EOS>
< ninæ<EOS>

> jæ<EOS>
= jɑ<EOS>
< jo<EOS>

> nu̯æidi<EOS>
= noi̯tɑ<EOS>
< niːtɑ<EOS>

> eːt͡ʃi<EOS>
= isæ<EOS>
< setæ<EOS>

> kirjæː<EOS>
= kirjɑʋɑ<EOS>
< kirjɑ<EOS>

> kuosɐ<EOS>
= kuːsi<EOS>
< kusi<EOS>

> jieŋɐ<EOS>
= jæː<EOS>
< jæŋkæ<EOS>

> juːḥɐð<EOS>
= juo̯dɑ<EOS>
< juo̯dɑ<EOS>

> ræːvhu<EOS>
= rɑu̯hɑ<EOS>
< rɑu̯kɑ<EOS>

> læbd͡ʒi<EOS>
= læmsæ<EOS>
< lɑi̯pɑlɑ<EOS>

> ji̯æŋːuð<EOS>
= jæːtyæ<EOS>
< joi̯dɑ<EOS>

> ʃodːɐð<EOS>
= syntyæ<EOS>
< soi̯tɑː<EOS>

> ij ku̯æssin<EOS>
= ei̯ koskɑːn<EOS>
< kentæ<EOS>

> vɑlmɐʃtið<EOS>
= ʋɑlmistɑː<EOS>
< ʋɑlmistɑː<EOS>

> mɑːilm<EOS>
= mɑːilmɑ<EOS>
< milmæ<EOS>

> t͡ʃuoʃkɐ<EOS>
= sæːski<EOS>
< sokis<EOS>

> t͡ʃu̯ælɡui<EOS>
= sɑlko<EOS>
< sɑlkɑ<EOS>

> eːlːim<EOS>
= elæmæ<EOS>
< elæmæ<EOS>

> ɲune<EOS>
= nenæ<EOS>
< nuo̯ni<EOS>

> vyebdið<EOS>
= ɑntɑː<EOS>
< ʋɑntɑː<EOS>

> væːri<EOS>
= ʋuo̯ri<EOS>
< ʋɑrjɑ<EOS>

> vɑjɑlɐʃ<EOS>
= ʋih

[codecarbon INFO @ 17:15:23] Energy consumed for RAM : 0.124863 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 17:15:23] Energy consumed for all CPUs : 0.726078 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 17:15:23] 0.850941 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:15:23] Energy consumed for RAM : 0.125160 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 17:15:23] Energy consumed for all CPUs : 0.727803 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 17:15:23] 0.852963 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:15:38] Energy consumed for RAM : 0.124873 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 17:15:38] Energy consumed for all CPUs : 0.726136 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 17:15:38] Energy consumed for RAM : 0.125170 kWh. RAM Power : 2.4078168869018555 W
[codecarbon INFO @ 17:15:38] 0.851009 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:15:38] Energy consumed f