# In [None]:
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, Input, Concatenate, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import Callback, TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
import os

# Global configuration variables
DATA_PATH = './Data/data.txt'  # Path to the data file; each line is parsed as "description|subject"
ENCODER_PATH = './Model/encoder_weights.h5'  # File the inference encoder's weights are saved to / loaded from
DECODER_PATH = './Model/decoder_weights.h5'  # File the inference decoder's weights are saved to / loaded from
MAX_SEQ_LENGTH = 100  # Length every token sequence is padded to (also the model's input length)
EMBEDDING_DIM = 256  # Dimension of the word embeddings
NUM_EPOCHS = 100  # Maximum number of training epochs (early stopping may end training sooner)
BATCH_SIZE = 32  # Size of the mini-batches for training
TEST_SPLIT_SIZE = 0.2  # Fraction of the data held out as validation data
RANDOM_STATE = 42  # Seed passed to train_test_split so the train/validation split is reproducible
EARLY_STOPPING_PATIENCE = 5  # Number of epochs without val_loss improvement after which training is stopped
LSTM_UNITS = 256  # Units per direction of the bidirectional encoder LSTM; the decoder LSTM uses twice as many
LOG_DIR = './Logs'  # Directory to save TensorBoard logs

class Attention(Layer):
    """Additive attention over a sequence of feature vectors.

    Every step along axis 1 of ``features`` is scored against a single
    ``hidden`` query vector; the scores are softmax-normalised along that axis
    and used to form a weighted sum of the features.
    """

    def __init__(self, units):
        super().__init__()
        # Projections applied to the features and to the query before they are summed.
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        # Reduces each projected vector to a single unnormalised score.
        self.V = Dense(1)

    def call(self, features, hidden):
        """Return ``(context_vector, attention_weights)`` for one query.

        ``hidden`` receives a new axis 1 so that it broadcasts against every
        step of ``features``.
        """
        query = tf.expand_dims(hidden, 1)
        energies = tf.nn.tanh(self.W1(features) + self.W2(query))
        # Normalise over axis 1 so the weights of one example sum to 1.
        attention_weights = tf.nn.softmax(self.V(energies), axis=1)
        # Weighted sum of the features, collapsing axis 1.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights
    
class ModelCheckpoint(Callback):
    """Save encoder and decoder weights whenever the validation loss improves.

    This is the file's own callback (it is not the Keras class of the same
    name): it writes the weights of the two inference models to ENCODER_PATH
    and DECODER_PATH instead of checkpointing the training model.

    Args:
        encoder_model: Model whose weights are written to ENCODER_PATH.
        decoder_model: Model whose weights are written to DECODER_PATH.
    """

    def __init__(self, encoder_model, decoder_model):
        super().__init__()
        self.encoder_model = encoder_model
        self.decoder_model = decoder_model
        # Initialised here so the attribute always exists, even if training
        # does not start at epoch 0.
        self.best_val_loss = float('inf')

    def on_train_begin(self, logs=None):
        # Each call to fit() starts a fresh comparison, as the first epoch of
        # a run has always been saved unconditionally.
        self.best_val_loss = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        # Modify this if you want to use another metric for model comparison
        current_val_loss = (logs or {}).get('val_loss')

        # Without validation data there is nothing to compare against.
        if current_val_loss is None:
            return

        # Save the model
        if current_val_loss < self.best_val_loss:
            self.best_val_loss = current_val_loss
            self.encoder_model.save_weights(ENCODER_PATH)
            self.decoder_model.save_weights(DECODER_PATH)


def load_and_preprocess_data(data_file, seed=None):
    """Read ``description|subject`` pairs from a text file and shuffle them.

    Each non-blank line must contain exactly one ``|``. Lines that do not are
    reported on stdout and skipped; blank lines are skipped silently.

    The shuffle is seeded so that repeated runs see the samples in the same
    order. This matters because a tokenizer fitted on the result assigns
    indices that depend on the order in which words first appear, and saved
    weights are only meaningful with the vocabulary they were trained on.

    Args:
        data_file: Path of the file to read (decoded as UTF-8).
        seed: Seed for the shuffle. ``None`` (default) uses RANDOM_STATE.

    Returns:
        A pair of lists ``(descriptions, subjects)`` of equal length.

    Raises:
        FileNotFoundError: If ``data_file`` does not exist.
    """
    if not os.path.exists(data_file):
        raise FileNotFoundError(f'The file {data_file} does not exist.')

    data = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(data_file, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue  # a blank line is not a malformed record
            try:
                description, subject = stripped.split('|')
            except ValueError:
                print(f'Skipped line due to wrong format: {line}')
            else:
                data.append((description, subject))

    rng = np.random.default_rng(RANDOM_STATE if seed is None else seed)
    rng.shuffle(data)

    return [sample[0] for sample in data], [sample[1] for sample in data]

def tokenize_and_pad(texts, max_seq_length):
    """Fit a new tokenizer on ``texts`` and encode them as fixed-length sequences.

    The tokenizer keeps the whole vocabulary (``num_words=None``) and uses
    ``'<OOV>'`` as its out-of-vocabulary token. The encoded texts are passed
    through ``pad_sequences`` with ``maxlen=max_seq_length`` and padding added
    at the end.

    Returns:
        ``(tokenizer, padded_sequences)``.
    """
    text_tokenizer = Tokenizer(oov_token='<OOV>', num_words=None)
    text_tokenizer.fit_on_texts(texts)
    padded = pad_sequences(
        text_tokenizer.texts_to_sequences(texts),
        maxlen=max_seq_length,
        padding='post',
    )
    return text_tokenizer, padded

def build_model(max_seq_length, vocab_size_input, vocab_size_output, embedding_dim):
    """Build the training encoder-decoder model with attention.

    The encoder is an embedding followed by a bidirectional LSTM. Its forward
    and backward final states are concatenated and serve both as the decoder
    LSTM's initial state and as the attention query. The attention context is
    computed once per example and repeated along the time axis so it can be
    concatenated with the decoder embedding at every step.

    Args:
        max_seq_length: Length of the encoder and decoder input sequences.
        vocab_size_input: Size of the encoder embedding's vocabulary.
        vocab_size_output: Size of the decoder embedding's vocabulary and of
            the softmax output.
        embedding_dim: Dimension of both embeddings.

    Returns:
        A 9-tuple ``(model, encoder_input, encoder_output, state_h, state_c,
        decoder_input, decoder_embedding_layer, decoder_lstm,
        attention_layer)``. The tensors and layers are returned so that
        inference models sharing the same weights can be built by the caller.
        The softmax output ``Dense`` layer is not part of the tuple; it is
        only reachable through ``model``.
    """
    encoder_input = Input(shape=(max_seq_length,))
    encoder_embedding_layer = Embedding(vocab_size_input, embedding_dim)
    encoder_embedding = encoder_embedding_layer(encoder_input)
    encoder_lstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, return_state=True))
    encoder_output, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
    # Merge both directions so the states match the decoder's LSTM_UNITS*2 width.
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])

    decoder_input = Input(shape=(max_seq_length,))
    decoder_embedding_layer = Embedding(vocab_size_output, embedding_dim)
    decoder_embedding = decoder_embedding_layer(decoder_input)
    decoder_lstm = LSTM(LSTM_UNITS*2, return_sequences=True, return_state=True)

    attention_layer = Attention(LSTM_UNITS*2)
    context_vector, attention_weights = attention_layer(encoder_output, state_h)
    # Repeat the context for every decoder step. Use the function argument,
    # not the module-level MAX_SEQ_LENGTH, so the model is consistent for any
    # requested sequence length.
    repeated_context = tf.repeat(tf.expand_dims(context_vector, 1), repeats=max_seq_length, axis=1)
    decoder_concat_input = Concatenate(axis=-1)([decoder_embedding, repeated_context])

    decoder_output, _, _ = decoder_lstm(decoder_concat_input, initial_state=[state_h, state_c])
    decoder_output = Dense(vocab_size_output, activation='softmax')(decoder_output)

    model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)

    return model, encoder_input, encoder_output, state_h, state_c, decoder_input, decoder_embedding_layer, decoder_lstm, attention_layer

def decode_sequence(input_sequence):
    """Greedily decode the predicted subject for one encoded description.

    Uses the module-level ``encoder_model``, ``decoder_model`` and
    ``int_to_word_decoder``. Decoding starts from token index 1 and feeds the
    most probable token back into the decoder one step at a time. It stops
    when the padding index 0 is predicted, when ``'<end>'`` is produced, or
    after MAX_SEQ_LENGTH tokens.

    Args:
        input_sequence: Padded input batch containing a single sequence, as
            expected by ``encoder_model``.

    Returns:
        The decoded words joined by single spaces (empty string if the first
        prediction is padding).
    """
    e_out, e_h, e_c = encoder_model.predict(input_sequence, verbose=0)
    states_value = [e_h, e_c]
    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = 1  # index used as the start marker
    decoded_words = []
    # Bound the loop by the number of tokens, not by the number of characters.
    while len(decoded_words) < MAX_SEQ_LENGTH:
        output_tokens, h, c = decoder_model.predict(
            [target_sequence] + states_value + [e_out], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        if sampled_token_index == 0:
            # Index 0 is padding, not a word: the sequence is finished.
            break
        sampled_word = int_to_word_decoder.get(sampled_token_index, '<OOV>')  # dealing with the OOV token
        decoded_words.append(sampled_word)
        if sampled_word == '<end>':
            break
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = sampled_token_index
        states_value = [h, c]
    return ' '.join(decoded_words)

# Load the (description, subject) pairs and encode each side as padded index sequences.
descriptions, subjects = load_and_preprocess_data(DATA_PATH)

tokenizer_input, sequences_input = tokenize_and_pad(descriptions, MAX_SEQ_LENGTH)
tokenizer_output, sequences_output = tokenize_and_pad(subjects, MAX_SEQ_LENGTH)

# +1 leaves room for index 0, which is not assigned to any word.
vocab_size_input = len(tokenizer_input.word_index) + 1
vocab_size_output = len(tokenizer_output.word_index) + 1
model, encoder_input, encoder_output, state_h, state_c, decoder_input, decoder_embedding_layer, decoder_lstm, attention_layer = build_model(
    MAX_SEQ_LENGTH, vocab_size_input, vocab_size_output, EMBEDDING_DIM)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The targets get a trailing axis of size 1, the layout used with the sparse loss above.
sequences_input_train, sequences_input_val, sequences_output_train, sequences_output_val = train_test_split(
    sequences_input, np.expand_dims(sequences_output, -1), test_size=TEST_SPLIT_SIZE, random_state=RANDOM_STATE)

# Make sure the directories for both weight files exist. exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
for weights_path in (ENCODER_PATH, DECODER_PATH):
    os.makedirs(os.path.dirname(weights_path), exist_ok=True)

tensorboard_callback = TensorBoard(log_dir=LOG_DIR)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=EARLY_STOPPING_PATIENCE)

user_input = input("Enter '1' to Train New Model or '2' to Load Existing Model: ")

# Create the encoder and decoder models for inference whether it's a new training or loading an existing model.
# Both are built from the training model's tensors and layers, so they share its weights.
encoder_model = Model(encoder_input, [encoder_output, state_h, state_c])

# The decoder runs one step at a time: its previous states and the encoder outputs are fed in explicitly.
decoder_state_input_h = Input(shape=(LSTM_UNITS*2,))
decoder_state_input_c = Input(shape=(LSTM_UNITS*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_hidden_state_input = Input(shape=(MAX_SEQ_LENGTH, LSTM_UNITS*2))

decoder_input_inf = Input(shape=(1,))
decoder_embedding_inf = decoder_embedding_layer(decoder_input_inf)

context_vector, _ = attention_layer(decoder_hidden_state_input, decoder_state_input_h)
decoder_concat_input = Concatenate(axis=-1)([decoder_embedding_inf, tf.expand_dims(context_vector, 1)])

decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(decoder_concat_input, initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]
# Reuse the training model's softmax projection. A newly created Dense layer
# would be randomly initialised and never trained, so predictions (and the
# saved decoder weights) would not reflect what the model learned.
decoder_output_layer = next(layer for layer in reversed(model.layers) if isinstance(layer, Dense))
decoder_outputs_inf = decoder_output_layer(decoder_outputs_inf)
decoder_model = Model([decoder_input_inf] + decoder_states_inputs + [decoder_hidden_state_input], [decoder_outputs_inf] + decoder_states_inf)

checkpoint_callback = ModelCheckpoint(encoder_model, decoder_model)

def _teacher_forcing_inputs(targets, start_index=1):
    """Return decoder inputs for ``targets``: the targets shifted right by one step.

    ``targets`` has a trailing axis of size 1. Position 0 of the result holds
    ``start_index`` (the index decoding starts from at inference) and position
    ``t`` holds the target token of step ``t - 1``, so the decoder always
    predicts the next token instead of copying its own input.
    """
    tokens = np.squeeze(targets, axis=-1)
    shifted = np.full_like(tokens, start_index)
    shifted[:, 1:] = tokens[:, :-1]
    return shifted


choice = user_input.strip()

if choice == '1':
    # Train a new model; checkpoint_callback saves the best encoder/decoder weights as training progresses.
    model.fit([sequences_input_train, _teacher_forcing_inputs(sequences_output_train)], sequences_output_train,
            batch_size=BATCH_SIZE,
            epochs=NUM_EPOCHS,
            validation_data=([sequences_input_val, _teacher_forcing_inputs(sequences_output_val)], sequences_output_val),
            callbacks=[checkpoint_callback, tensorboard_callback, early_stopping_callback])
elif choice == '2':
    # Load previously saved weights into the inference models.
    encoder_model.load_weights(ENCODER_PATH)
    decoder_model.load_weights(DECODER_PATH)
else:
    # Continuing would run predictions with randomly initialised weights.
    raise ValueError(f"Unknown choice {user_input!r}; expected '1' or '2'.")

# Reverse vocabulary (index -> word) used to turn predicted indices back into text.
int_to_word_decoder = {index: token for token, index in tokenizer_output.word_index.items()}
int_to_word_decoder.update({1: '<OOV>'})

# Interactive prediction loop: runs until the user types 'quit' (in any letter case).
while True:
    input_description = input("Enter a new description (type 'quit' to exit): ")
    if input_description.lower() == 'quit':
        break
    # Encode the description the same way the training inputs were encoded.
    input_sequence = pad_sequences(
        tokenizer_input.texts_to_sequences([input_description]),
        maxlen=MAX_SEQ_LENGTH,
        padding='post',
    )
    predicted_sequence = decode_sequence(input_sequence)
    print('Predicted Subject:', predicted_sequence)
