# In [None]:
# Standard library imports
import os
import random
import sys
import time

# Related third-party imports
from IPython.display import clear_output
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
import pygame
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import Callback, EarlyStopping, TensorBoard
from tensorflow.keras.layers import Bidirectional, Concatenate, Dense, Embedding, Input, Layer, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# --- File system locations -------------------------------------------------
# Training data file, per-model weight checkpoints, and TensorBoard log folder
DATA_PATH = './Data/data.txt'
ENCODER_PATH = './Model/encoder_weights.h5'
DECODER_PATH = './Model/decoder_weights.h5'
LOG_DIR = './Logs'

# --- Model size ------------------------------------------------------------
# Padded sequence length, embedding width, and LSTM width (per direction in the encoder)
MAX_SEQ_LENGTH = 100
EMBEDDING_DIM = 1024
LSTM_UNITS = 1024

# --- Training schedule -----------------------------------------------------
# Upper bound on epochs, mini-batch size, and early-stopping patience (in epochs)
NUM_EPOCHS = 1000
BATCH_SIZE = 16
EARLY_STOPPING_PATIENCE = 5

# --- Train/validation split ------------------------------------------------
# Fraction of records held out for validation and the seed used for the split
TEST_SPLIT_SIZE = 0.2
RANDOM_STATE = 42

# --- Evaluation ------------------------------------------------------------
# Number of validation records sampled for BLEU testing after each epoch
BLEU_TEST_COUNT = 1

# --- Hardware --------------------------------------------------------------
# Training device: 'CPU' or 'GPU'
DEVICE = 'CPU'

# Custom Keras layer implementing additive attention
class Attention(Layer):
    """Additive attention: score = V(tanh(W1(features) + W2(hidden))).

    The scores are normalised with a softmax over axis 1 of ``features`` and
    used to build a weighted sum of ``features`` along that same axis.
    """

    def __init__(self, units):
        """Create the three projection layers.

        Args:
            units: Output width of the ``W1`` and ``W2`` projections.
        """
        super().__init__()
        # Creation order is kept (W1, W2, V) so automatic layer naming is unchanged
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, features, hidden):
        """Compute the context vector and the attention weights.

        Args:
            features: Tensor attended over; axis 1 is the axis that gets
                weighted and summed away.
            hidden: Query tensor; an axis is inserted at position 1 so it can
                be added to the projected ``features``.

        Returns:
            Tuple ``(context_vector, attention_weights)``.
        """
        # Insert an axis at position 1 so the query broadcasts against features
        query = tf.expand_dims(hidden, 1)
        # One unnormalised score per position along axis 1
        energies = self.V(tf.nn.tanh(self.W1(features) + self.W2(query)))
        attention_weights = tf.nn.softmax(energies, axis=1)
        # Weighted sum of the features along axis 1
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

# Custom Keras callback that saves encoder/decoder weights when validation loss improves
class ModelCheckpoint(Callback):
    """Save the encoder and decoder weights whenever ``val_loss`` improves.

    Weights are written to ``ENCODER_PATH`` and ``DECODER_PATH``. The best
    validation loss is reset at the start of every ``fit()`` call, so the
    first epoch of a run that reports a finite ``val_loss`` always saves.
    """

    def __init__(self, encoder_model, decoder_model):
        """Store the two models whose weights are checkpointed.

        Args:
            encoder_model: Model saved to ``ENCODER_PATH``.
            decoder_model: Model saved to ``DECODER_PATH``.
        """
        super().__init__()
        self.encoder_model = encoder_model
        self.decoder_model = decoder_model
        # Initialised here so the attribute always exists, even before training
        self.best_val_loss = float('inf')

    def on_train_begin(self, logs=None):
        """Reset the best loss so each training run starts a fresh comparison."""
        self.best_val_loss = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        """Save both models if this epoch's ``val_loss`` beats the best so far.

        Args:
            epoch: Zero-based epoch index supplied by Keras (unused).
            logs: Metrics dictionary supplied by Keras; may be ``None``.
        """
        current_val_loss = (logs or {}).get('val_loss')
        # Without validation data there is nothing to compare against
        if current_val_loss is None:
            return
        if current_val_loss < self.best_val_loss:
            self.best_val_loss = current_val_loss
            self.encoder_model.save_weights(ENCODER_PATH)
            self.decoder_model.save_weights(DECODER_PATH)

class BleuScoreCallback(Callback):
    """Report a BLEU-1 score on randomly sampled validation records after each epoch.

    Each sampled input is decoded with the module-level ``decode_sequence``
    function and compared against its reference output. Progress and the
    cumulative corpus-level BLEU-1 score are written to stdout.
    """

    def __init__(self, tokenizer_output, sequences_input_val, sequences_output_val, num_records):
        """Store the data needed for scoring.

        Args:
            tokenizer_output: Fitted tokenizer for the output texts; its
                ``word_index`` is used to map token ids back to words.
            sequences_input_val: Validation input sequences. Must support
                indexing with a list of indices (e.g. a NumPy array).
            sequences_output_val: Validation output sequences, aligned with
                ``sequences_input_val``.
            num_records: Number of records to sample per epoch.
        """
        super().__init__()
        self.tokenizer_output = tokenizer_output
        self.sequences_input_val = sequences_input_val
        self.sequences_output_val = sequences_output_val
        self.num_records = num_records

    def on_epoch_end(self, epoch, logs=None):
        """Decode a random sample of validation records and print BLEU-1.

        Args:
            epoch: Epoch index supplied by Keras (unused).
            logs: Metrics dictionary supplied by Keras (unused).
        """
        # Map integer tokens back to words; index 1 is the out-of-vocabulary token
        int_to_word_decoder = {i: word for word, i in self.tokenizer_output.word_index.items()}
        int_to_word_decoder[1] = '<OOV>'

        # Never request more records than exist; random.sample would raise ValueError
        available_records = len(self.sequences_input_val)
        total_sequences = min(self.num_records, available_records)
        if total_sequences <= 0:
            return
        selected_indices = random.sample(range(available_records), total_sequences)
        selected_sequences_input_val = self.sequences_input_val[selected_indices]
        selected_sequences_output_val = self.sequences_output_val[selected_indices]

        references = []
        candidates = []
        start_time = time.time()
        # Print new line to help with BLEU score formatting
        print()
        sampled_pairs = zip(selected_sequences_input_val, selected_sequences_output_val)
        for processed_sequences, (seq_in, seq_out) in enumerate(sampled_pairs, start=1):
            predicted_sequence = decode_sequence(np.array([seq_in]))
            # Flatten first: targets may carry a trailing axis, which would
            # otherwise hand 1-element arrays to int()
            reference_sequence = ' '.join(
                int_to_word_decoder.get(int(token), '<OOV>')
                for token in np.ravel(seq_out) if token > 0
            )
            # Strip '<start>' and '<end>' tokens if they exist
            predicted_sequence = predicted_sequence.replace('<start>', '').replace('<end>', '').strip()
            reference_sequence = reference_sequence.replace('<start>', '').replace('<end>', '').strip()
            # corpus_bleu expects a list of reference lists per candidate
            references.append([reference_sequence.split()])
            candidates.append(predicted_sequence.split())

            progress = processed_sequences / total_sequences * 100
            elapsed_time = time.time() - start_time
            # Corpus-level score over everything processed so far. Unigram-only
            # weights make the value match the "BLEU-1" label; the library
            # default would compute a 4-gram score instead.
            bleu_score_average = corpus_bleu(references, candidates, weights=(1.0, 0.0, 0.0, 0.0))
            # Carriage return overwrites the previous progress line
            sys.stdout.write(f'\rProcessing sequences: {processed_sequences}/{total_sequences} ({progress:.2f}%), Elapsed Time: {elapsed_time:.2f}s, BLEU-1 Average: {bleu_score_average}')
            sys.stdout.flush()
        # Print new line to help with BLEU score formatting
        print()
            
# Sends an audible ping to the user
def play_ping():
    """Initialise pygame's mixer, load the notification sound, and start playback."""
    sound_file = "./Sounds/notification_by_UNIVERSFIELD.mp3"  # replace with the path to your sound file
    mixer = pygame.mixer
    mixer.init()
    mixer.music.load(sound_file)
    mixer.music.play()

# Prompts the user for input after announcing the prompt with an audible ping
def input_with_notification(prompt):
    """Play the notification sound, then read one line of user input.

    Args:
        prompt: Text shown to the user by ``input``.

    Returns:
        The string the user entered.
    """
    # Ping first so the user knows input is being requested
    play_ping()
    return input(prompt)

# Function for loading and preprocessing the data
def load_and_preprocess_data(data_file):
    """Read ``description|subject`` pairs from a text file and shuffle them.

    Each non-blank line must contain exactly one ``|`` separator. Lines that
    do not are reported on stdout and skipped; blank lines are skipped
    silently.

    Args:
        data_file: Path to the UTF-8 encoded data file.

    Returns:
        Tuple ``(descriptions, subjects)`` of two equally long lists, where
        ``descriptions[i]`` and ``subjects[i]`` come from the same line.

    Raises:
        FileNotFoundError: If ``data_file`` does not exist.
    """
    if not os.path.exists(data_file):
        raise FileNotFoundError(f'The file {data_file} does not exist.')
    data = []
    # Explicit encoding so decoding does not depend on the platform default
    with open(data_file, 'r', encoding='utf-8') as file:
        for line in file:
            record = line.strip()
            if not record:
                # Blank lines carry no data and are not worth a warning
                continue
            try:
                # Unpacking fails unless the line has exactly one '|'
                description, subject = record.split('|')
            except ValueError:
                print(f'Skipped line due to wrong format: {line}')
            else:
                data.append((description, subject))
    # Shuffle pairs together so descriptions and subjects stay aligned
    np.random.shuffle(data)
    return [sample[0] for sample in data], [sample[1] for sample in data]

# Tokenizes texts and pads the resulting sequences to a fixed length
def tokenize_and_pad(texts, max_seq_length):
    """Fit a tokenizer on ``texts`` and return it with the padded sequences.

    Args:
        texts: Texts used both to fit the tokenizer and to be encoded.
        max_seq_length: Length every encoded sequence is padded to.

    Returns:
        Tuple ``(tokenizer, padded_sequences)``.
    """
    # No vocabulary cap; unknown words map to the '<OOV>' token
    text_tokenizer = Tokenizer(oov_token='<OOV>', num_words=None)
    text_tokenizer.fit_on_texts(texts)
    # Encode to integer ids and pad at the end of each sequence
    encoded = pad_sequences(
        text_tokenizer.texts_to_sequences(texts),
        padding='post',
        maxlen=max_seq_length,
    )
    return text_tokenizer, encoded

# Function for building the model
def build_model(max_seq_length, vocab_size_input, vocab_size_output, embedding_dim):
    """Build the encoder-decoder training model with attention.

    The encoder is an embedding followed by a bidirectional LSTM with
    ``LSTM_UNITS`` units per direction. Its forward and backward states are
    concatenated and used as the initial state of a decoder LSTM with
    ``LSTM_UNITS * 2`` units. A single context vector is computed from the
    encoder outputs and the encoder's final hidden state, repeated along the
    time axis, and concatenated with the decoder embeddings.

    Args:
        max_seq_length: Length of both the encoder and decoder input sequences.
        vocab_size_input: Size of the encoder embedding vocabulary.
        vocab_size_output: Size of the decoder embedding vocabulary and of the
            softmax output.
        embedding_dim: Width of both embedding layers.

    Returns:
        Tuple ``(model, encoder_input, encoder_output, state_h, state_c,
        decoder_input, decoder_embedding_layer, decoder_lstm, attention_layer)``.
        The final softmax ``Dense`` layer is not returned separately; it is
        the last layer of ``model``.
    """
    # Encoder: embedding followed by a bidirectional LSTM
    encoder_input = Input(shape=(max_seq_length,))
    encoder_embedding_layer = Embedding(vocab_size_input, embedding_dim)
    encoder_embedding = encoder_embedding_layer(encoder_input)
    encoder_lstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, return_state=True))
    encoder_output, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
    # Concatenate the forward and backward states so they match the decoder width
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])

    # Decoder: embedding, attention context, and a unidirectional LSTM
    decoder_input = Input(shape=(max_seq_length,))
    decoder_embedding_layer = Embedding(vocab_size_output, embedding_dim)
    decoder_embedding = decoder_embedding_layer(decoder_input)
    decoder_lstm = LSTM(LSTM_UNITS*2, return_sequences=True, return_state=True)
    attention_layer = Attention(LSTM_UNITS*2)
    context_vector, attention_weights = attention_layer(encoder_output, state_h)
    # Repeat the context once per decoder timestep. Uses the max_seq_length
    # argument (not the module constant) so it always matches decoder_input.
    repeated_context = tf.repeat(tf.expand_dims(context_vector, 1), repeats=max_seq_length, axis=1)
    decoder_concat_input = Concatenate(axis=-1)([decoder_embedding, repeated_context])
    decoder_output, _, _ = decoder_lstm(decoder_concat_input, initial_state=[state_h, state_c])

    # Per-timestep distribution over the output vocabulary
    decoder_output = Dense(vocab_size_output, activation='softmax')(decoder_output)
    model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)
    return model, encoder_input, encoder_output, state_h, state_c, decoder_input, decoder_embedding_layer, decoder_lstm, attention_layer

# Function for decoding a sequence of tokens into text
def decode_sequence(input_sequence):
    """Greedily decode one padded input sequence into a string of words.

    Uses the module-level ``encoder_model``, ``decoder_model`` and
    ``int_to_word_decoder``. Decoding starts from token id 1 and stops when
    the ``'<end>'`` word is produced, when the padding id 0 is predicted, or
    after ``MAX_SEQ_LENGTH`` words.

    Args:
        input_sequence: Array holding a single encoded, padded input sequence
            (batch of one), as passed to ``encoder_model.predict``.

    Returns:
        The decoded words joined by single spaces (empty string if nothing
        was decoded).
    """
    # Encode once; the encoder outputs are reused at every decoding step
    e_out, e_h, e_c = encoder_model.predict(input_sequence, verbose=0)
    states_value = [e_h, e_c]
    # Seed the decoder with token id 1
    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = 1
    decoded_words = []
    # Bound the loop by the number of words, not the number of characters
    while len(decoded_words) < MAX_SEQ_LENGTH:
        output_tokens, h, c = decoder_model.predict([target_sequence] + states_value + [e_out], verbose=0)
        # Greedy choice: most probable token at the last timestep
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Id 0 is the padding value and never maps to a word, so treat it as
        # end of output rather than emitting a spurious '<OOV>'
        if sampled_token_index == 0:
            break
        sampled_word = int_to_word_decoder.get(sampled_token_index, '<OOV>')
        decoded_words.append(sampled_word)
        if sampled_word == '<end>':
            break
        # Feed the chosen token and the new states into the next step
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = sampled_token_index
        states_value = [h, c]
    return ' '.join(decoded_words)

# Refresh all outputs before we begin
clear_output()

# Configure tensorflow to use the defined device for training
if DEVICE == 'CPU':
    # Hiding every GPU is what actually forces execution onto the CPU
    tf.config.set_visible_devices([], 'GPU')
else:
    physical_devices = tf.config.list_physical_devices(DEVICE)
    if not physical_devices:
        raise RuntimeError(f'No {DEVICE} device is available to TensorFlow.')
    tf.config.set_visible_devices(physical_devices[0], DEVICE)

# Load and preprocess the data
descriptions, subjects = load_and_preprocess_data(DATA_PATH)

# Tokenize and pad the descriptions and subjects
tokenizer_input, sequences_input = tokenize_and_pad(descriptions, MAX_SEQ_LENGTH)
tokenizer_output, sequences_output = tokenize_and_pad(subjects, MAX_SEQ_LENGTH)

# Get the vocab sizes for the input and output (+1 for the reserved index 0)
vocab_size_input = len(tokenizer_input.word_index) + 1
vocab_size_output = len(tokenizer_output.word_index) + 1

# Create a dictionary for converting integer tokens back into words
int_to_word_decoder = {i: word for word, i in tokenizer_output.word_index.items()}
int_to_word_decoder[1] = '<OOV>'

# Build the model
model, encoder_input, encoder_output, state_h, state_c, decoder_input, decoder_embedding_layer, decoder_lstm, attention_layer = build_model(
    MAX_SEQ_LENGTH, vocab_size_input, vocab_size_output, EMBEDDING_DIM)

# Compile the model with a loss function and optimizer
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Split the sequences into training and validation sets.
# The targets get a trailing axis for sparse_categorical_crossentropy.
sequences_input_train, sequences_input_val, sequences_output_train, sequences_output_val = train_test_split(
    sequences_input, np.expand_dims(sequences_output, -1), test_size=TEST_SPLIT_SIZE, random_state=RANDOM_STATE)

# Create directories for saving model weights if they do not exist
for weights_path in (ENCODER_PATH, DECODER_PATH):
    weights_dir = os.path.dirname(weights_path)
    # A bare filename has no directory component to create
    if weights_dir:
        os.makedirs(weights_dir, exist_ok=True)

# Define TensorBoard and early stopping callbacks
tensorboard_callback = TensorBoard(log_dir=LOG_DIR)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=EARLY_STOPPING_PATIENCE)

# Ask the user whether to train a new model, continue training, or load an existing model.
# Keep asking until a valid option is entered, so an accidental keystroke does not
# silently fall through to inference with untrained weights.
user_input = ''
while user_input not in ('1', '2', '3'):
    user_input = input_with_notification("Enter '1' to Train New Model, '2' to Continue Training from Existing Model, or '3' to Load Existing Model for Testing Only: ").strip()

# Define the encoder and decoder models for inference.
# They reuse the layers of the training model, so they share its weights.
encoder_model = Model(encoder_input, [encoder_output, state_h, state_c])
decoder_state_input_h = Input(shape=(LSTM_UNITS*2,))
decoder_state_input_c = Input(shape=(LSTM_UNITS*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_hidden_state_input = Input(shape=(MAX_SEQ_LENGTH, LSTM_UNITS*2))
decoder_input_inf = Input(shape=(1,))
decoder_embedding_inf = decoder_embedding_layer(decoder_input_inf)
context_vector, _ = attention_layer(decoder_hidden_state_input, decoder_state_input_h)
decoder_concat_input = Concatenate(axis=-1)([decoder_embedding_inf, tf.expand_dims(context_vector, 1)])
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(decoder_concat_input, initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]
# Reuse the trained softmax layer (the last layer of the training model).
# Creating a new Dense here would give the inference decoder an untrained,
# randomly initialised output projection.
decoder_dense = model.layers[-1]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)
decoder_model = Model([decoder_input_inf] + decoder_states_inputs + [decoder_hidden_state_input], [decoder_outputs_inf] + decoder_states_inf)

# Define the model checkpoint callback
checkpoint_callback = ModelCheckpoint(encoder_model, decoder_model)

if user_input in ('1', '2'):
    # Continuing training requires the exact same architecture as the one
    # that was used when the weights were saved
    if user_input == '2':
        encoder_model.load_weights(ENCODER_PATH)
        decoder_model.load_weights(DECODER_PATH)
    # Create a BLEU score callback object with specified parameters
    bleu_score_callback = BleuScoreCallback(tokenizer_output, sequences_input_val, sequences_output_val, BLEU_TEST_COUNT)
    # NOTE(review): the decoder input passed to fit() is the same array as the
    # target, with no one-step shift, so at each timestep the decoder is given
    # the token it is asked to predict. Confirm whether a shifted decoder input
    # (teacher forcing) was intended.
    model.fit([sequences_input_train, sequences_output_train], sequences_output_train,
            batch_size=BATCH_SIZE,
            epochs=NUM_EPOCHS,
            validation_data=([sequences_input_val, sequences_output_val], sequences_output_val),
            callbacks=[checkpoint_callback, tensorboard_callback, bleu_score_callback, early_stopping_callback])
else:
    # Testing only: load the saved weights
    encoder_model.load_weights(ENCODER_PATH)
    decoder_model.load_weights(DECODER_PATH)

# Loop for getting user input and predicting the output
while True:
    # Ask the user for a description
    input_description = input_with_notification("Enter a new description (type 'quit' to exit): ")
    # Break the loop if the user types 'quit'
    if input_description.lower() == 'quit':
        break
    # Tokenize and pad the user's description
    input_sequence = tokenizer_input.texts_to_sequences([input_description])
    input_sequence = pad_sequences(input_sequence, maxlen=MAX_SEQ_LENGTH, padding='post')
    # Decode the model's output sequence
    predicted_sequence = decode_sequence(input_sequence)
    # Print the predicted subject
    print('Predicted Subject:', predicted_sequence)
    