In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint

# Step 1: Pre-processing the data
# Every non-blank line of the data file must have the form "description|subject".
data_file = "E:/GitHub/ThinkTAI/ThinkTAI/Data/data.txt"
data = []
# Decode explicitly so parsing does not depend on the platform's locale
# encoding; "utf-8-sig" also tolerates a leading byte-order mark.
with open(data_file, 'r', encoding="utf-8-sig") as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            # Blank lines carry no sample; skip them instead of failing to unpack.
            continue
        fields = line.split("|")
        if len(fields) != 2:
            # Same exception type as a failed tuple unpack, but it names the
            # offending line so the data file can be fixed.
            raise ValueError(
                f"{data_file}:{line_number}: expected 'description|subject', "
                f"found {len(fields)} field(s)"
            )
        description, subject = fields
        data.append((description, subject))

# Shuffle the data (in place) so sample order does not follow file order
np.random.shuffle(data)

# Step 2: Building the encoder-decoder model
# Define the input sequence length and vocabulary size
max_seq_length = 100
vocab_size = 1000

descriptions = [sample[0] for sample in data]
subjects = [sample[1] for sample in data]

# Tokenize the descriptions and subjects.
# The same tokenizer later encodes the target subjects and decodes the model's
# predictions, so its vocabulary must be built from both text columns;
# fitting on descriptions alone would turn every subject-only word into "<OOV>".
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(descriptions + subjects)
sequences = tokenizer.texts_to_sequences(descriptions)

# Pad the sequences to ensure equal length
sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

# Create the encoder network.
# Each timestep carries a single feature (the token id), hence the trailing
# dimension of 1; callers add that axis before feeding padded sequences in.
encoder_input = Input(shape=(max_seq_length, 1))
encoder_lstm = Bidirectional(LSTM(256, return_sequences=True))
encoder_output = encoder_lstm(encoder_input)

# Create the decoder network: an LSTM over the encoder's per-timestep outputs,
# followed by a softmax over the vocabulary at every timestep.
decoder_lstm = LSTM(256, return_sequences=True)
decoder_output = decoder_lstm(encoder_output)
decoder_output = Dense(vocab_size, activation='softmax')(decoder_output)

# Create the encoder-decoder model
model = Model(inputs=encoder_input, outputs=decoder_output)

# Step 3: Training the model
# Encode inputs and targets once. Tokenising and padding are deterministic, so
# repeating them every epoch only costs time; per-epoch shuffling is delegated
# to model.fit below.
sequences = tokenizer.texts_to_sequences([sample[0] for sample in data])
sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
target_sequences = tokenizer.texts_to_sequences([sample[1] for sample in data])
target_sequences = pad_sequences(target_sequences, maxlen=max_seq_length, padding='post')

# The sparse loss takes integer token ids directly and yields the same value as
# categorical cross-entropy on one-hot targets, without materialising a
# (samples, max_seq_length, vocab_size) one-hot tensor in memory.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Set the number of epochs and batch size
num_epochs = 10
batch_size = 32

# Define the checkpoint path
checkpoint_path = r"E:/GitHub/ThinkTAI/ThinkTAI/Model/model_checkpoint.h5"

# Create a ModelCheckpoint callback to save the model weights after each epoch
# NOTE(review): newer Keras releases may require a ".weights.h5" suffix when
# save_weights_only=True - confirm against the installed version.
checkpoint_callback = ModelCheckpoint(checkpoint_path, save_weights_only=True)

# A single fit call runs all epochs; shuffle=True reorders the samples before
# each epoch, and the callback overwrites the checkpoint as each epoch ends.
model.fit(
    sequences[..., tf.newaxis],  # add the single-feature axis the model input expects
    target_sequences,
    batch_size=batch_size,
    epochs=num_epochs,
    shuffle=True,
    callbacks=[checkpoint_callback],
)

# Step 4: Generating subject sequence for a new description
# Prompt repeatedly until the user types "quit".
input_description = ""
while True:
    input_description = input("Enter a new description (type 'quit' to exit): ")
    if input_description == "quit":
        break
    if not input_description.strip():
        # Nothing to encode; ask again rather than predicting on pure padding.
        continue

    # Encode and pad the description exactly as the training inputs were.
    input_sequence = tokenizer.texts_to_sequences([input_description])
    input_sequence = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post')

    # The model returns one probability distribution per timestep; take the
    # most likely token id at each position of the single sample in the batch.
    predicted_sequence = model.predict(input_sequence[..., tf.newaxis])
    predicted_ids = predicted_sequence.argmax(axis=-1)[0]

    # Index 0 is the padding value and is never assigned to a word. With an
    # oov_token configured, sequences_to_texts would render every predicted
    # padding position as "<OOV>", so drop those positions before decoding.
    token_ids = [int(token_id) for token_id in predicted_ids if token_id != 0]
    predicted_subject = tokenizer.sequences_to_texts([token_ids])[0]

    print("Predicted Subject:", predicted_subject)
