In [None]:
# Imports
import os
import pickle
import random

import matplotlib.pyplot as plt
import torch
from torchtext.data.metrics import bleu_score
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Helper functions
from helpers.preprocessing import preprocess_data, split_data, normalize_data
from helpers.model import Encoder, AttentionDecoder
from helpers.training import prepare_dataloader, train
from helpers.evaluation import generate_translation, evaluate

In [None]:
# Global Variables
# Language codes to choose from; the selected pair determines the data file.
languages = ['eng', 'fra', 'spa', 'deu', 'por']
LANGUAGE1 = languages[0]  # 'eng'
LANGUAGE2 = languages[2]  # 'spa'
FILEPATH = f'data/{LANGUAGE1}-{LANGUAGE2}.txt'

# Model tuning parameters
HIDDEN_SIZE = 128
BATCH_SIZE = 32
LEARNING_RATE = 0.001
DROPOUT = 0.1

# Reproducibility: seed the random number generators this notebook can reach
# so that a Restart & Run All repeats the same run.
# NOTE(review): the helpers may draw from other generators as well (e.g. NumPy);
# seed those too if that turns out to be the case — TODO confirm.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)


Data preprocessing

In [None]:
# Read the corpus file for the selected language pair. The call yields the two
# language objects (their n_words attribute sizes the models later on) and the
# list of sentence pairs.
input_language, output_language, line_pairs = preprocess_data(
    FILEPATH,
    LANGUAGE1,
    LANGUAGE2,
)

# Hold out 20% of the sentence pairs for testing.
train_pairs, test_pairs = split_data(line_pairs, test_size=0.2)

Model

In [None]:
# Build the encoder and the attention decoder, sized from the vocabularies'
# n_words and the shared hidden size.
encoder = Encoder(input_language.n_words, HIDDEN_SIZE, DROPOUT)
decoder = AttentionDecoder(HIDDEN_SIZE, output_language.n_words, DROPOUT)

# Move both networks to the selected device.
encoder = encoder.to(device)
decoder = decoder.to(device)

# Show both modules as the cell output.
(encoder, decoder)

Training

In [None]:
# Build the training DataLoader. prepare_dataloader also hands back the two
# language objects, which are rebound to the same names here.
# NOTE(review): the encoder/decoder were sized from n_words before this call;
# if prepare_dataloader alters the vocabularies, confirm the sizes still match.
input_language, output_language, train_dataloader = prepare_dataloader(
    input_language,
    output_language,
    train_pairs,
    BATCH_SIZE,
)

# Train both networks; the result is unpacked into the loss and accuracy
# histories that are printed, plotted and pickled further down.
losses_list, accuracies_list = train(
    train_dataloader,
    encoder,
    decoder,
    epochs=80,
    learning_rate=LEARNING_RATE,
    print_every=5,
    plot_every=5,
)


In [None]:
# Show the recorded training accuracies, then the losses, one list per line.
print(accuracies_list, losses_list, sep='\n')

Save Model

In [None]:
# Save models
# Only the learned parameters (state_dict) of each network are written.

encoder_filepath = f'saved_models/encoder-{LANGUAGE1}-{LANGUAGE2}-mx15.pth'
decoder_filepath = f'saved_models/decoder-{LANGUAGE1}-{LANGUAGE2}-mx15.pth'

# Create the target directory up front so that a missing folder cannot abort
# the save after training has already finished.
os.makedirs(os.path.dirname(encoder_filepath), exist_ok=True)
os.makedirs(os.path.dirname(decoder_filepath), exist_ok=True)

torch.save(encoder.state_dict(), encoder_filepath)
torch.save(decoder.state_dict(), decoder_filepath)

Evaluation

In [None]:
# Evaluate random training pairs

# Switch both networks to evaluation mode before generating translations.
encoder.eval()
decoder.eval()

# Unpack in the same (candidate, references) order as the test-set evaluation
# cell, whose results are passed to bleu_score in that order.
# NOTE(review): assumes evaluate returns (candidate_corpus, references_corpus)
# — TODO confirm against helpers.evaluation.
candidate_corpus, references_corpus = evaluate(
    input_language, output_language, encoder, decoder,
    train_pairs, evaluate_train=True,
)

In [None]:
# Evaluate testing pairs

# Put both networks in evaluation mode here as well, so this cell does not
# depend on the training-pair evaluation cell having been run first.
encoder.eval()
decoder.eval()

# One iteration per held-out pair; the two corpora feed the BLEU computation.
candidate_corpus, references_corpus = evaluate(
    input_language, output_language, encoder, decoder,
    test_pairs, evaluate_train=False, iterations=len(test_pairs),
)

Benchmark

In [None]:
# GRU with Bahdanau attention
# BLEU of the generated translations against their references.
bleu = bleu_score(candidate_corpus, references_corpus)

# Report the score as a percentage with two decimals.
bleu_score_percentage = 100 * bleu
print(f"BLEU Score: {bleu_score_percentage:.2f}%")

Analysis

In [None]:

def _plot_training_curve(values, label, title, filename):
    """Plot one training series, save it to ``filename`` and display it.

    ``label`` is used both for the legend entry and for the y-axis.
    """
    plt.plot(values, label=label)

    plt.xlabel('Epochs')
    plt.ylabel(label)
    plt.legend()
    plt.title(title)
    plt.savefig(filename)  # write the image to disk before showing it
    plt.show()
    plt.close()  # close the figure so the next curve starts on a fresh one


# Loss curve for the trained language pair
_plot_training_curve(losses_list, 'Loss', 'Training Loss', 'training_loss.png')

# Accuracy curve for the trained language pair
_plot_training_curve(accuracies_list, 'Accuracy', 'Training Accuracy', 'training_accuracy.png')


In [None]:
# Save Lists

# Persist the training loss and accuracy histories as pickle objects.
# NOTE(review): these names order the languages as LANGUAGE2_LANGUAGE1, whereas
# the saved model files use LANGUAGE1-LANGUAGE2 — confirm this is intended.

loss_filename = f'objects/bahdanau_loss_{LANGUAGE2}_{LANGUAGE1}.pkl'
accuracy_filename = f'objects/bahdanau_accuracy_{LANGUAGE2}_{LANGUAGE1}.pkl'

# Create the output directory first so a missing folder does not raise
# FileNotFoundError when the files are opened for writing.
os.makedirs(os.path.dirname(loss_filename), exist_ok=True)
os.makedirs(os.path.dirname(accuracy_filename), exist_ok=True)

with open(loss_filename, 'wb') as file:
    pickle.dump(losses_list, file)

with open(accuracy_filename, 'wb') as file:
    pickle.dump(accuracies_list, file)