# In [None]:
# Standard library imports
import os
import random
import sys
import time
import requests
import json

# Related third-party imports
from IPython.display import clear_output
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
from pathlib import Path
import pickle
import pygame
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import Callback, EarlyStopping, TensorBoard
from tensorflow.keras.layers import Bidirectional, Concatenate, Dense, Embedding, Input, Layer, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Filesystem locations for logs, raw data, cached tokenizers, and model weights
LOG_DIR = './Logs'
DATA_PATH = './Data'
TOKENIZER_DIR = './Tokenizer'
TRAINING_DIR = './Training'
MODEL_DIR = './Model'
# Weights written after every training batch (loaded when continuing training)
ENCODER_TRAINING_PATH = './Training/encoder_weights.keras'
DECODER_TRAINING_PATH = './Training/decoder_weights.keras'
# Weights written when the validation loss improves (loaded for testing only)
ENCODER_PATH = './Model/encoder_weights.keras'
DECODER_PATH = './Model/decoder_weights.keras'

# Hyperparameters for the model and the training run
MAX_SEQ_LENGTH = 32000  # Length every input and output token sequence is padded to
EMBEDDING_DIM = 256  # Size of the encoder and decoder embedding vectors
NUM_EPOCHS = 1000  # Upper bound on epochs; early stopping may end training sooner
BATCH_SIZE = 8
TEST_SPLIT_SIZE = 0.2  # Fraction of the records held out for validation
RANDOM_STATE = 42  # Seed passed to train_test_split
EARLY_STOPPING_PATIENCE = 5  # Patience (in epochs) of the val_loss early-stopping callback
LSTM_UNITS = 256  # Units per encoder direction; the decoder LSTM uses twice this

# Number of validation records sampled for the BLEU score after each epoch
BLEU_TEST_COUNT = 4

# Device type used for training: 'CPU' or 'GPU'
DEVICE = 'CPU'

# Custom Keras layer implementing additive (tanh-scored) attention
class Attention(Layer):
    """Score a sequence of features against a hidden state and pool them.

    `call` returns the attention-weighted sum of `features` over axis 1
    together with the attention weights themselves.
    """

    def __init__(self, units):
        super().__init__()
        # Projections for the features (W1), the hidden state (W2), and the scalar score (V)
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, features, hidden):
        # Insert an axis at position 1 so the hidden state broadcasts against the features
        query = tf.expand_dims(hidden, 1)
        # One unnormalised score per position along axis 1
        energies = self.V(tf.nn.tanh(self.W1(features) + self.W2(query)))
        # Normalise the scores over axis 1
        attention_weights = tf.nn.softmax(energies, axis=1)
        # Weighted sum of the features over axis 1
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

# Custom Keras callback that saves the encoder/decoder weights during training
class ModelCheckpoint(Callback):
    """Persist the inference models' weights while the training model is fitted.

    After every batch the weights are written to the "training" paths so an
    interrupted run can be resumed. After every epoch they are also written to
    the "model" paths when it is the first epoch or the validation loss improved.

    Args:
        encoder_model: Model whose weights are saved to the encoder paths.
        decoder_model: Model whose weights are saved to the decoder paths.
    """

    def __init__(self, encoder_model, decoder_model):
        super().__init__()
        self.encoder_model = encoder_model
        self.decoder_model = decoder_model
        # Initialised here so on_epoch_end never reads an undefined attribute
        self.best_val_loss = float('inf')

    def on_batch_end(self, batch, logs=None):
        # Save the weights after each batch so training can be resumed
        self.encoder_model.save_weights(ENCODER_TRAINING_PATH)
        self.decoder_model.save_weights(DECODER_TRAINING_PATH)

    def on_epoch_end(self, epoch, logs=None):
        current_val_loss = (logs or {}).get('val_loss')
        # Without a validation loss there is nothing to compare against
        if current_val_loss is None:
            return
        # Save the weights if it's the first epoch or if the validation loss has improved
        if epoch == 0 or current_val_loss < self.best_val_loss:
            self.best_val_loss = current_val_loss
            self.encoder_model.save_weights(ENCODER_PATH)
            self.decoder_model.save_weights(DECODER_PATH)

class BleuScoreCallback(Callback):
    """Report a BLEU-1 score on a random validation sample after each epoch.

    Each sampled input is decoded with `decode_sequence`, compared with its
    reference output, and the corpus-level BLEU-1 score of the sample is
    printed and written to TensorBoard.

    Args:
        tokenizer_output: Fitted tokenizer for the output sequences.
        sequences_input_val: Validation input sequences (list or array).
        sequences_output_val: Validation output sequences (list or array).
        num_records: Number of validation records to sample per epoch.
        log_dir: Directory the TensorBoard summaries are written to.
    """

    def __init__(self, tokenizer_output, sequences_input_val, sequences_output_val, num_records, log_dir):
        super().__init__()
        # Store the tokenizer for output sequences
        self.tokenizer_output = tokenizer_output
        # Store the input validation sequences
        self.sequences_input_val = sequences_input_val
        # Store the output validation sequences
        self.sequences_output_val = sequences_output_val
        # Store the number of records to select for BLEU score calculation
        self.num_records = num_records
        # TensorBoard logging
        self.log_dir = log_dir
        self.file_writer = tf.summary.create_file_writer(self.log_dir)

    def on_epoch_end(self, epoch, logs=None):
        # Create a mapping from integer tokens to words
        int_to_word_decoder = {i: word for word, i in self.tokenizer_output.word_index.items()}
        # Set the mapping for the out-of-vocabulary token
        int_to_word_decoder[1] = '<OOV>'
        # Reference and predicted token lists accumulated over the sample
        references = []
        candidates = []
        # Never ask for more records than the validation set holds
        sample_size = max(0, min(self.num_records, len(self.sequences_input_val)))
        selected_indices = random.sample(range(len(self.sequences_input_val)), sample_size)
        # Index one record at a time so plain lists work as well as arrays
        selected_sequences_input_val = [self.sequences_input_val[i] for i in selected_indices]
        selected_sequences_output_val = [self.sequences_output_val[i] for i in selected_indices]
        total_sequences = len(selected_indices)
        processed_sequences = 0
        bleu_score_average = 0
        # Record the start time for elapsed time calculation
        start_time = time.time()
        # Print new line to help with BLEU score formatting
        print()
        # BLEU score testing and calculation loop
        for seq_in, seq_out in zip(selected_sequences_input_val, selected_sequences_output_val):
            # Decode the model's prediction for this input
            predicted_sequence = decode_sequence(np.array([seq_in]))
            # Flatten so a trailing axis of size 1 does not leak into the tokens,
            # then map the non-padding indices back to words
            reference_tokens = np.asarray(seq_out).ravel()
            reference_sequence = ' '.join(
                int_to_word_decoder.get(int(i), '<OOV>') for i in reference_tokens if i > 0)
            # Strip '<start>' and '<end>' tokens if they exist
            predicted_sequence = predicted_sequence.replace('<start>', '').replace('<end>', '').strip()
            reference_sequence = reference_sequence.replace('<start>', '').replace('<end>', '').strip()
            # corpus_bleu expects a list of reference token lists per candidate
            references.append([reference_sequence.split()])
            candidates.append(predicted_sequence.split())
            processed_sequences += 1
            progress = processed_sequences / total_sequences * 100
            elapsed_time = time.time() - start_time
            # Corpus-level score over everything processed so far; unigram-only
            # weights make this BLEU-1, matching the label that is reported
            bleu_score_average = corpus_bleu(references, candidates, weights=(1.0, 0.0, 0.0, 0.0))
            # Print progress with carriage return to overwrite the previous line
            sys.stdout.write(f'\rProcessing sequences: {processed_sequences}/{total_sequences} ({progress:.2f}%), Elapsed Time: {elapsed_time:.2f}s, BLEU-1 Average: {bleu_score_average}')
            sys.stdout.flush()
        # Print new line to help with BLEU score formatting
        print()
        # Tensorboard logging
        with self.file_writer.as_default():
            tf.summary.scalar('BLEU-1 score', bleu_score_average, step=epoch)
        # Push the summary to disk so it is visible while training continues
        self.file_writer.flush()
            
# Function for sending an audible ping to the user
def play_ping(sound_path="./Sounds/notification_by_UNIVERSFIELD.mp3"):
    """Play a notification sound without blocking.

    The ping is only a convenience, so a missing audio device or sound file is
    reported as a warning instead of aborting the script.

    Args:
        sound_path: Path of the sound file to play.
    """
    try:
        pygame.mixer.init()
        pygame.mixer.music.load(sound_path)
        pygame.mixer.music.play()
    except (pygame.error, FileNotFoundError) as error:
        print(f'Warning: could not play notification sound: {error}', file=sys.stderr)

# Prompt helper: ping the user audibly, then read and return their answer
def input_with_notification(prompt):
    """Play the notification sound, show `prompt`, and return what the user typed."""
    # Let the user know that input is being requested
    play_ping()
    return input(prompt)

# Ensure a directory exists and holds a catch-all .gitignore
def create_directory_with_gitignore(directory_path):
    """Create `directory_path` if needed and add a '.gitignore' containing '*'.

    An existing '.gitignore' is left untouched.
    """
    directory_missing = not os.path.exists(directory_path)
    if directory_missing:
        os.makedirs(directory_path)

    gitignore_path = os.path.join(directory_path, '.gitignore')
    # Nothing more to do when the ignore file is already in place
    if os.path.exists(gitignore_path):
        return
    with open(gitignore_path, 'w') as gitignore_file:
        gitignore_file.write('*')

# Function for loading and preprocessing the data
def load_and_preprocess_data(DATA_PATH):
    """Load (description, subject) training pairs from every file in a folder.

    Each line is split on its last '<HTML+JSON DELIMITER>'; the right-hand part
    must be JSON. When the JSON object has an 'Image Url List' key, that key
    and 'VDP Url' (if present) are dropped. Malformed lines are reported and
    skipped. The records are shuffled before being returned.

    Args:
        DATA_PATH: Folder containing the data files.

    Returns:
        A tuple `(descriptions, subjects)` of two equally long lists, where
        each subject is the re-serialised JSON string.

    Raises:
        FileNotFoundError: If `DATA_PATH` does not exist.
    """
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f'The folder {DATA_PATH} does not exist.')
    data = []
    for file_name in os.listdir(DATA_PATH):
        file_path = os.path.join(DATA_PATH, file_name)
        # The '.gitignore' placed in the data folder is not a data file
        if file_name != '.gitignore' and os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as data_file:
                for line_numer, line in enumerate(data_file, start=1):
                    try:
                        description, subject = line.strip().rsplit('<HTML+JSON DELIMITER>', 1)
                        json_obj = json.loads(subject)
                    except ValueError:
                        # Covers both a missing delimiter and invalid JSON
                        print(f'\nSkipped line in {file_path} due to wrong format. Line Number: {line_numer}')
                        continue
                    if isinstance(json_obj, dict) and 'Image Url List' in json_obj:
                        # pop() tolerates records that lack the 'VDP Url' key
                        json_obj.pop('VDP Url', None)
                        json_obj.pop('Image Url List', None)
                    data.append((description, json.dumps(json_obj)))
        print(f"\rNumber of Training Records Loaded: {len(data)}", end='', flush=True)
    np.random.shuffle(data)
    print("\n")  # Move to the next line after everything is loaded
    return [sample[0] for sample in data], [sample[1] for sample in data]

# Tokenizer subclass that reports fitting progress on the console
class MonitoredTokenizer(Tokenizer):
    """Tokenizer whose `fit_on_texts` prints a percentage as it works through the texts."""

    def fit_on_texts(self, texts):
        text_count = len(texts)
        for position, text in enumerate(texts, start=1):
            # Fit one text at a time so progress can be reported in between
            super().fit_on_texts([text])
            progress = position / text_count * 100
            print(f"\rFitting Progress: {progress:.2f}%", end='', flush=True)
        print("\n")  # Leave the progress line

# Fit a tokenizer on the texts and turn them into fixed-length token sequences
def tokenize_and_pad(texts, max_seq_length):
    """Tokenize `texts` and post-pad every sequence to `max_seq_length`.

    Returns:
        A tuple `(tokenizer, padded_sequences)` where `padded_sequences` is a
        list with one padded sequence per text.
    """
    # Tokenizer with a dedicated out-of-vocabulary token and no vocabulary cap
    tokenizer = MonitoredTokenizer(num_words=None, oov_token='<OOV>')
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    total = len(sequences)
    padded_sequences = []
    # Pad one sequence at a time so progress can be shown
    for counter, seq in enumerate(sequences, start=1):
        padded_sequences.append(pad_sequences([seq], maxlen=max_seq_length, padding='post')[0])
        print(f"\rPadding Progress: {counter}/{total}", end='', flush=True)
    print("\n")  # Leave the progress line

    return tokenizer, padded_sequences

# Function for building the model
def build_model(max_seq_length, vocab_size_input, vocab_size_output, embedding_dim):
    """Build the encoder-decoder training model.

    The encoder is an embedding followed by a bidirectional LSTM. One
    attention context vector is computed from the encoder outputs and the
    encoder's final hidden state, repeated along the time axis, concatenated
    with the decoder embeddings, and fed to the decoder LSTM.

    Args:
        max_seq_length: Length of the encoder and decoder input sequences.
        vocab_size_input: Size of the encoder embedding vocabulary.
        vocab_size_output: Size of the decoder embedding and softmax output.
        embedding_dim: Dimension of both embedding layers.

    Returns:
        A tuple `(model, encoder_input, encoder_output, state_h, state_c,
        decoder_input, decoder_embedding_layer, decoder_lstm, attention_layer)`
        so the caller can assemble inference models from the same layers.
    """
    # Define the encoder input layer
    encoder_input = Input(shape=(max_seq_length,))
    encoder_embedding_layer = Embedding(vocab_size_input, embedding_dim)
    encoder_embedding = encoder_embedding_layer(encoder_input)
    # Define the encoder LSTM layer
    encoder_lstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, return_state=True))
    encoder_output, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
    # Concatenate the forward and backward states
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    # Define the decoder input layer
    decoder_input = Input(shape=(max_seq_length,))
    decoder_embedding_layer = Embedding(vocab_size_output, embedding_dim)
    decoder_embedding = decoder_embedding_layer(decoder_input)
    # The decoder LSTM is twice as wide to match the concatenated encoder states
    decoder_lstm = LSTM(LSTM_UNITS*2, return_sequences=True, return_state=True)
    # Define the attention layer
    attention_layer = Attention(LSTM_UNITS*2)
    context_vector, attention_weights = attention_layer(encoder_output, state_h)
    # Repeat the context vector for every decoder time step; the count must
    # follow the max_seq_length argument, not the module-level constant
    repeated_context = tf.repeat(tf.expand_dims(context_vector, 1), repeats=max_seq_length, axis=1)
    decoder_concat_input = Concatenate(axis=-1)([decoder_embedding, repeated_context])
    # Pass the concatenated input to the decoder LSTM
    decoder_output, _, _ = decoder_lstm(decoder_concat_input, initial_state=[state_h, state_c])
    # Define the output layer
    decoder_output = Dense(vocab_size_output, activation='softmax')(decoder_output)
    # Define the model with the encoder and decoder inputs and the decoder output
    model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)
    return model, encoder_input, encoder_output, state_h, state_c, decoder_input, decoder_embedding_layer, decoder_lstm, attention_layer

# Function for decoding a sequence of tokens into text
def decode_sequence(input_sequence):
    """Greedily decode one padded input sequence into text.

    Uses the module-level `encoder_model`, `decoder_model`, and
    `int_to_word_decoder`. Decoding stops at an '<end>' word, at the padding
    index 0, or after MAX_SEQ_LENGTH words.

    Args:
        input_sequence: Array holding a single padded input sequence.

    Returns:
        The decoded words, each preceded by a single space (an empty string
        when nothing was decoded).
    """
    # Pass the input sequence to the encoder model and get the output and states
    e_out, e_h, e_c = encoder_model.predict(input_sequence, verbose=0)
    states_value = [e_h, e_c]
    # Seed the decoder with token index 1
    # NOTE(review): index 1 is mapped to '<OOV>' elsewhere in this file, not to a
    # dedicated start token — confirm this seed is intended
    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = 1
    decoded_words = []
    # Bound the loop by the number of decoded words, not by characters
    while len(decoded_words) < MAX_SEQ_LENGTH:
        # Pass the target sequence and states to the decoder model and get the output tokens and new states
        output_tokens, h, c = decoder_model.predict([target_sequence] + states_value + [e_out], verbose=0)
        # Get the token with the highest probability
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is the padding value, so it marks the end of the content
        if sampled_token_index == 0:
            break
        # Convert the token into a word, falling back to the OOV marker
        sampled_word = int_to_word_decoder.get(sampled_token_index, '<OOV>')
        decoded_words.append(sampled_word)
        if sampled_word == '<end>':
            break
        # Update the target sequence and states for the next iteration
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = sampled_token_index
        states_value = [h, c]
    # Keep the historical format: every word is preceded by one space
    return ''.join(' ' + word for word in decoded_words)

# Refresh all outputs before we begin
clear_output()

# Create data, logging, and model directories
create_directory_with_gitignore(DATA_PATH)
create_directory_with_gitignore(TOKENIZER_DIR)
create_directory_with_gitignore(TRAINING_DIR)
create_directory_with_gitignore(MODEL_DIR)
create_directory_with_gitignore(LOG_DIR)

# Configure tensorflow to use the defined device for training
if DEVICE == 'CPU':
    # Selecting the CPU does not hide accelerators; GPUs must be hidden
    # explicitly for training to run on the CPU
    tf.config.set_visible_devices([], 'GPU')
else:
    available_devices = tf.config.list_physical_devices(DEVICE)
    if not available_devices:
        raise RuntimeError(f'No physical device of type {DEVICE} is available.')
    # Restrict tensorflow to the first device of the requested type
    tf.config.set_visible_devices(available_devices[0], DEVICE)

# Locations of the cached tokenizers and padded sequences
TOKENIZER_INPUT_PATH = os.path.join(TOKENIZER_DIR, 'tokenizer_input.pkl')
SEQUENCES_INPUT_PATH = os.path.join(TOKENIZER_DIR, 'sequences_input.pkl')
TOKENIZER_OUTPUT_PATH = os.path.join(TOKENIZER_DIR, 'tokenizer_output.pkl')
SEQUENCES_OUTPUT_PATH = os.path.join(TOKENIZER_DIR, 'sequences_output.pkl')

# Ask the user whether to re-tokenize and pad the sequences or load the saved files
token_input = input("Enter '1' to Re-tokenize and Pad Sequences or '2' to Load Saved Files: ").strip()
# Fail early: any other answer would leave the tokenizers undefined further down
if token_input not in ('1', '2'):
    raise ValueError(f"Unrecognized tokenization option {token_input!r}; expected '1' or '2'.")

# Ask the user whether to train a new model, load an existing model, or continue training from an existing model
train_input = input_with_notification("Enter '1' to Train New Model, '2' to Continue Training from Existing Model, or '3' to Load Existing Model for Testing Only: ").strip()
# Fail early: any other answer would silently test a model with untrained weights
if train_input not in ('1', '2', '3'):
    raise ValueError(f"Unrecognized training option {train_input!r}; expected '1', '2', or '3'.")

if token_input == '1':
    # Load and preprocess the data
    descriptions, subjects = load_and_preprocess_data(DATA_PATH)

    # Tokenize and pad the descriptions and subjects
    tokenizer_input, sequences_input = tokenize_and_pad(descriptions, MAX_SEQ_LENGTH)
    tokenizer_output, sequences_output = tokenize_and_pad(subjects, MAX_SEQ_LENGTH)

    # Dispose of the descriptions and subjects variables to free up memory
    del descriptions
    del subjects

    # Save tokenizers and sequences to files
    with open(TOKENIZER_INPUT_PATH, 'wb') as f:
        pickle.dump(tokenizer_input, f)
    with open(SEQUENCES_INPUT_PATH, 'wb') as f:
        pickle.dump(sequences_input, f)
    with open(TOKENIZER_OUTPUT_PATH, 'wb') as f:
        pickle.dump(tokenizer_output, f)
    with open(SEQUENCES_OUTPUT_PATH, 'wb') as f:
        pickle.dump(sequences_output, f)

else:
    # Load tokenizers and sequences from files
    # NOTE: unpickling can execute arbitrary code; only load caches this script wrote itself
    with open(TOKENIZER_INPUT_PATH, 'rb') as f:
        tokenizer_input = pickle.load(f)
    with open(SEQUENCES_INPUT_PATH, 'rb') as f:
        sequences_input = pickle.load(f)
    with open(TOKENIZER_OUTPUT_PATH, 'rb') as f:
        tokenizer_output = pickle.load(f)
    with open(SEQUENCES_OUTPUT_PATH, 'rb') as f:
        sequences_output = pickle.load(f)

# Get the vocab sizes for the input and output (+1 for the padding index 0)
vocab_size_input = len(tokenizer_input.word_index) + 1
print('Input Vocab Size:', vocab_size_input)
vocab_size_output = len(tokenizer_output.word_index) + 1
print('Output Vocab Size:', vocab_size_output)

# Create a dictionary for converting integer tokens back into words
int_to_word_decoder = {i: word for word, i in tokenizer_output.word_index.items()}
int_to_word_decoder[1] = '<OOV>'

# Build the model
model, encoder_input, encoder_output, state_h, state_c, decoder_input, decoder_embedding_layer, decoder_lstm, attention_layer = build_model(
    MAX_SEQ_LENGTH, vocab_size_input, vocab_size_output, EMBEDDING_DIM)

# Compile the model with a loss function and an optimizer
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# The padded sequences arrive as Python lists of arrays; stack them into 2-D
# arrays so the split yields arrays that can be fed to the model as one tensor
# and indexed with a list of positions
sequences_input = np.asarray(sequences_input)
sequences_output = np.asarray(sequences_output)

# Split the sequences into training and validation sets
sequences_input_train, sequences_input_val, sequences_output_train, sequences_output_val = train_test_split(
    sequences_input, np.expand_dims(sequences_output, -1), test_size=TEST_SPLIT_SIZE, random_state=RANDOM_STATE)

# Create directories for saving model weights if they do not exist
os.makedirs(os.path.dirname(ENCODER_PATH), exist_ok=True)
os.makedirs(os.path.dirname(DECODER_PATH), exist_ok=True)

# Define TensorBoard and early stopping callbacks
tensorboard_callback = TensorBoard(log_dir=LOG_DIR)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=EARLY_STOPPING_PATIENCE)

# Define the encoder and decoder models for inference; they reuse the layers of
# the training model so that trained weights carry over

# Encoder: maps an input sequence to the encoder outputs and the final states
encoder_model = Model(encoder_input, [encoder_output, state_h, state_c])

# Decoder inputs: previous hidden/cell state, the encoder outputs, and one token
decoder_state_input_h = Input(shape=(LSTM_UNITS*2,))
decoder_state_input_c = Input(shape=(LSTM_UNITS*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_hidden_state_input = Input(shape=(MAX_SEQ_LENGTH, LSTM_UNITS*2))
decoder_input_inf = Input(shape=(1,))

# Embed the token and attend over the encoder outputs with the current hidden state
decoder_embedding_inf = decoder_embedding_layer(decoder_input_inf)
context_vector_inf, _ = attention_layer(decoder_hidden_state_input, decoder_state_input_h)
decoder_concat_input_inf = Concatenate(axis=-1)([decoder_embedding_inf, tf.expand_dims(context_vector_inf, 1)])

# Run a single decoder step and collect the new states
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(decoder_concat_input_inf, initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]

# Reuse the training model's softmax output layer; creating a new Dense layer
# here would leave the inference decoder with an untrained projection
# NOTE(review): decoder weights saved by earlier versions of this script hold
# that untrained projection — retrain rather than reload them
decoder_dense = next(layer for layer in reversed(model.layers) if isinstance(layer, Dense))
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

# Decoder model: (token, states, encoder outputs) -> (token probabilities, new states)
decoder_model = Model([decoder_input_inf] + decoder_states_inputs + [decoder_hidden_state_input], [decoder_outputs_inf] + decoder_states_inf)

# Define the model checkpoint callback
checkpoint_callback = ModelCheckpoint(encoder_model, decoder_model)

# Options '1' (new model) and '2' (continue training) run the same training
# loop; option '2' first restores the weights saved during the previous run
# Please be aware that continuing requires the exact same architecture as the one that was used when the weights were saved
if train_input in ('1', '2'):
    if train_input == '2':
        # Load existing weights
        encoder_model.load_weights(ENCODER_TRAINING_PATH)
        decoder_model.load_weights(DECODER_TRAINING_PATH)
    # Create a BLEU score callback object with specified parameters
    bleu_score_callback = BleuScoreCallback(tokenizer_output, sequences_input_val, sequences_output_val, BLEU_TEST_COUNT, LOG_DIR)
    # Begin training
    training_callbacks = [checkpoint_callback, tensorboard_callback, bleu_score_callback, early_stopping_callback]
    model.fit([sequences_input_train, sequences_output_train], sequences_output_train,
            batch_size=BATCH_SIZE,
            epochs=NUM_EPOCHS,
            validation_data=([sequences_input_val, sequences_output_val], sequences_output_val),
            callbacks=training_callbacks)

# Option '3' only restores the best saved weights for testing
elif train_input == '3':
    encoder_model.load_weights(ENCODER_PATH)
    decoder_model.load_weights(DECODER_PATH)

# Ask the user whether to fetch the HTML to test from a URL or to enter it manually
test_input = input_with_notification("Enter '1' to Get HTML From URL or '2' to Manually Input HTML: ")

if test_input == '1':
    while True:
        # Ask the user for a URL
        url = input_with_notification("Enter a URL (type 'quit' to exit): ")
        # Break the loop if the user types 'quit'
        if url.lower() == 'quit':
            break

        try:
            # A timeout keeps an unresponsive server from hanging the prompt
            response = requests.get(url, timeout=30)
            # Treat HTTP error statuses as failures instead of feeding an error page to the model
            response.raise_for_status()
        except requests.exceptions.RequestException:
            print('Error: Failed to retrieve HTML content from the URL.')
            continue
        html_content = response.text

        # Tokenize and pad the HTML content
        input_sequence = tokenizer_input.texts_to_sequences([html_content])
        input_sequence = pad_sequences(input_sequence, maxlen=MAX_SEQ_LENGTH, padding='post')

        # Decode the model's output sequence
        predicted_sequence = decode_sequence(input_sequence)

        # Print the predicted subject
        print('Predicted Subject:', predicted_sequence)

elif test_input == '2':
    # Loop for getting user input and predicting the output
    while True:
        # Ask the user for a description
        input_description = input_with_notification("Enter a new description (type 'quit' to exit): ")
        # Break the loop if the user types 'quit'
        if input_description.lower() == 'quit':
            break
        # Tokenize and pad the user's description
        input_sequence = tokenizer_input.texts_to_sequences([input_description])
        input_sequence = pad_sequences(input_sequence, maxlen=MAX_SEQ_LENGTH, padding='post')
        # Decode the model's output sequence
        predicted_sequence = decode_sequence(input_sequence)
        # Print the predicted subject
        print('Predicted Subject:', predicted_sequence)