In [1]:
# Imports

import os
import pickle

import torch
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import matplotlib.pyplot as plt
from torchtext.data.metrics import bleu_score

In [2]:
# # Mount google drive

# from google.colab import drive
# drive.mount('/content/drive/')

In [3]:
# %cd /content/drive/MyDrive/Colab Notebooks/CSE-676
# %ls

In [2]:
# Helper functions

from helpers.preprocessing import preprocess_data, split_data, normalize_data
from helpers.model import Encoder, AttentionDecoder
from helpers.training import prepare_dataloader, train
from helpers.evaluation import generate_translation, evaluate

In [4]:
# Global configuration

# Language codes available for pairing; the selected pair names the corpus file.
languages = ['eng', 'fra', 'spa', 'deu', 'por']
LANGUAGE1, LANGUAGE2 = languages[0], languages[3]  # 'eng' and 'deu'
FILEPATH = f'data/{LANGUAGE1}-{LANGUAGE2}.txt'

# Model tuning parameters
HIDDEN_SIZE = 128      # passed to both Encoder and AttentionDecoder
BATCH_SIZE = 32        # passed to prepare_dataloader
LEARNING_RATE = 0.001  # passed to train
DROPOUT = 0.1          # passed to both Encoder and AttentionDecoder

Data preprocessing

In [5]:
# Parse the corpus file into two language objects plus the filtered sentence pairs.
# NOTE(review): judging by the printed vocabulary sizes and the model summary,
# input_language appears to be 'deu' and output_language 'eng' for this
# configuration, i.e. the reverse of LANGUAGE1/LANGUAGE2 -- confirm in
# helpers.preprocessing.
input_language, output_language, line_pairs = preprocess_data(
    FILEPATH,
    LANGUAGE1,
    LANGUAGE2,
)

# Hold out 20% of the pairs for testing.
train_pairs, test_pairs = split_data(
    line_pairs,
    test_size=0.2,
)

Number of translation pairs: 267186
Number of translation pairs after filter: 223849
Vocabulary - deu: 30894
Vocabulary - eng: 14835
Number of training pairs: 179079
Number of testing pairs: 44770


Model

In [6]:
# TODO

# Variants to try: basic, GRU, LSTM, decoder without attention, different attention mechanisms

In [7]:
# Build the encoder/decoder pair and move both to the selected device.
encoder = Encoder(
    input_language.n_words,
    HIDDEN_SIZE,
    DROPOUT,
).to(device)
decoder = AttentionDecoder(
    HIDDEN_SIZE,
    output_language.n_words,
    DROPOUT,
).to(device)

# Show both module summaries as the cell output.
(encoder, decoder)

(Encoder(
   (encoder_layers): Sequential(
     (0): Embedding(30894, 128)
     (1): Dropout(p=0.1, inplace=False)
     (2): GRU(128, 128, batch_first=True)
   )
 ),
 AttentionDecoder(
   (embedding): Embedding(14835, 128)
   (attention): BahdanauAttention(
     (Wa): Linear(in_features=128, out_features=128, bias=True)
     (Ua): Linear(in_features=128, out_features=128, bias=True)
     (Va): Linear(in_features=128, out_features=1, bias=True)
   )
   (gru): GRU(256, 128, batch_first=True)
   (out): Linear(in_features=128, out_features=14835, bias=True)
   (dropout): Dropout(p=0.1, inplace=False)
 ))

Training

In [9]:
# Wrap the training pairs in a batched dataloader; the helper also hands back the
# two language objects.
input_language, output_language, train_dataloader = prepare_dataloader(
    input_language,
    output_language,
    train_pairs,
    BATCH_SIZE,
)

# Training settings gathered in one place so they are easy to spot and tweak.
training_options = dict(
    epochs=10,
    learning_rate=LEARNING_RATE,
    print_every=5,
    plot_every=5,
)

# train() returns the loss and accuracy lists that are printed, plotted and pickled later.
losses_list, accuracies_list = train(train_dataloader, encoder, decoder, **training_options)


Epoch [5/10], Avg. Loss: 0.8786, Avg. Accuracy: 0.7352, Time for 5 Epochs: 31:56 minutes
Epoch [10/10], Avg. Loss: 0.6352, Avg. Accuracy: 0.7865, Time for 5 Epochs: 31:07 minutes


In [None]:
# Dump the recorded training histories: accuracies first, then losses, one per line.
print(accuracies_list, losses_list, sep='\n')

Save Model

In [None]:
# Save models

# Checkpoint paths; the file names use the LANGUAGE1-LANGUAGE2 order.
encoder_filepath = f'models/bahdanau_gru_model/encoder-{LANGUAGE1}-{LANGUAGE2}-mx10.pth'
decoder_filepath = f'models/bahdanau_gru_model/decoder-{LANGUAGE1}-{LANGUAGE2}-mx10.pth'

# Create the checkpoint directory if it is missing, so a long training run is not
# lost to a FileNotFoundError at save time. Both files live in the same directory.
os.makedirs(os.path.dirname(encoder_filepath), exist_ok=True)

# Only the state dicts are stored, not the module objects themselves.
torch.save(encoder.state_dict(), encoder_filepath)
torch.save(decoder.state_dict(), decoder_filepath)

# # Load models
# encoder.load_state_dict(torch.load(encoder_filepath))
# decoder.load_state_dict(torch.load(decoder_filepath))

Save Lists

Evaluation

In [None]:
# Evaluate random training pairs

# Switch both networks to inference mode so dropout is disabled during evaluation.
encoder.eval()
decoder.eval()

# Unpack as (candidate, references): the same order used by the test-set evaluation
# cell and expected by bleu_score(candidate_corpus, references_corpus).
# NOTE(review): confirm the return order against helpers.evaluation.evaluate.
candidate_corpus, references_corpus = evaluate(input_language, output_language, encoder, decoder,
                                               train_pairs, evaluate_train=True)

In [None]:
# Evaluate testing pairs

# Put both networks in inference mode here as well, so this cell produces
# dropout-free results even when the training-pair evaluation cell was skipped.
encoder.eval()
decoder.eval()

# iterations is set to the size of the test split (presumably one pass over every
# held-out pair -- confirm in helpers.evaluation.evaluate).
candidate_corpus, references_corpus = evaluate(input_language, output_language, encoder, decoder,
                                                test_pairs, evaluate_train=False, iterations=len(test_pairs))

Benchmark

In [None]:
# GRU with Bahdanau attention: BLEU over the corpora from the latest evaluate() call.
bleu = bleu_score(candidate_corpus, references_corpus)

# Report the score as a percentage with two decimals.
bleu_score_percentage = 100 * bleu
print(f"BLEU Score: {bleu_score_percentage:.2f}%")

Analysis

In [None]:

# Draw, save and show one figure per training metric: loss first, then accuracy.
curve_specs = (
    (losses_list, 'Loss', 'Training Loss', 'training_loss.png'),
    (accuracies_list, 'Accuracy', 'Training Accuracy', 'training_accuracy.png'),
)

for metric_values, metric_label, figure_title, image_path in curve_specs:
    fig, ax = plt.subplots()
    ax.plot(metric_values, label=metric_label)

    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric_label)
    ax.legend()
    ax.set_title(figure_title)

    fig.savefig(image_path)  # Keep a copy of the plot on disk
    plt.show()
    plt.close(fig)  # Release the figure before drawing the next metric


In [None]:
# Save the training histories as pickle objects

# File names use the LANGUAGE2_LANGUAGE1 order (e.g. 'deu_eng').
loss_filename = f'objects/bahdanau_gru_obj/bahdanau_loss_{LANGUAGE2}_{LANGUAGE1}.pkl'
accuracy_filename = f'objects/bahdanau_gru_obj/bahdanau_accuracy_{LANGUAGE2}_{LANGUAGE1}.pkl'

# open(..., 'wb') does not create parent directories, so make sure the target
# directory exists before writing. Both files live in the same directory.
os.makedirs(os.path.dirname(loss_filename), exist_ok=True)

for history, history_path in ((losses_list, loss_filename), (accuracies_list, accuracy_filename)):
    with open(history_path, 'wb') as file:
        pickle.dump(history, file)

In [None]:

# Reload the pickled training histories and plot them.
# The paths are relative to the notebook and match the layout the save cell writes
# to, replacing the absolute, machine-specific paths so the cell runs on any checkout.
loss_filename = f'objects/bahdanau_gru_obj/bahdanau_loss_{LANGUAGE2}_{LANGUAGE1}.pkl'
accuracy_filename = f'objects/bahdanau_gru_obj/bahdanau_accuracy_{LANGUAGE2}_{LANGUAGE1}.pkl'

# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.

# Load data from the pickle file ACCURACY
# The result goes into accuracies_list, so the plot shows the data just loaded.
with open(accuracy_filename, 'rb') as file:
    accuracies_list = pickle.load(file)

# Plotting the data ('accuracies_list' is the list of accuracy values saved earlier)
plt.plot(accuracies_list)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy over Epochs')
plt.show()

# Load data from the pickle file LOSS
with open(loss_filename, 'rb') as file:
    losses_list = pickle.load(file)

# Plotting the data ('losses_list' is the list of loss values saved earlier)
plt.plot(losses_list)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss over Epochs')
plt.show()