In [91]:
# For the encoder-decoder structure in a seq2seq model using LSTM, here are the basic steps you might want to follow:

# 1. Prepare Data:
# Load and preprocess your dataset, tokenizing text into sequences.
# Split the data into training and testing sets.
# 2. Build the Encoder:
# Create an LSTM-based encoder that processes input sequences.
# Embed the input sequences.
# Pass the embedded sequences through an LSTM layer to get the encoder output and states.
# 3. Build the Decoder:
# Create another LSTM-based model as the decoder.
# Embed the output sequences.
# Use the encoder states as initial states for the decoder.
# Train the decoder to predict the target sequences based on the encoder states and target inputs.
# 4. Create the Seq2Seq Model:
# Combine the encoder and decoder models to form the full seq2seq model.
# Here's a simplified example using Keras:

 

# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import LSTM, Input, Embedding, Dense

# # Define your sequence lengths, vocabulary sizes and latent dimension
# max_encoder_seq_length = ...
# max_decoder_seq_length = ...
# num_encoder_tokens = ...
# num_decoder_tokens = ...
# latent_dim = ...  # size of the embedding vectors and of the LSTM state used below

# # Encoder
# encoder_inputs = Input(shape=(max_encoder_seq_length,))
# encoder_embedding = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
# encoder_lstm = LSTM(latent_dim, return_state=True)
# encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# encoder_states = [state_h, state_c]

# # Decoder
# decoder_inputs = Input(shape=(max_decoder_seq_length,))
# decoder_embedding = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
# decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# decoder_dense = Dense(num_decoder_tokens, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)

# # Define the model
# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# # Compile the model and specify the optimizer, loss function, etc.
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# # Train the model using your prepared training data
# model.fit([encoder_input_data, decoder_input_data], decoder_target_data, ...)
 

 

# This is a high-level overview and a basic structure of an LSTM-based seq2seq model. You'll need to adapt this to your specific dataset and requirements.

# If you're facing issues with your current code or implementation, please share specific parts of your code or explain the problem you're facing, and I'll be glad to assist you further!

In [92]:
#importing library-

import numpy as np

import re

import time

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

In [93]:
# Importing data: read both corpus files fully and split them into raw rows.
# Context managers make sure the file handles are closed as soon as the text is read.
with open('movie_lines.tsv', encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')

with open('movie_conversations.tsv', encoding='utf-8', errors='ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

In [94]:
#lines

In [95]:
#conversations

In [96]:
# Map every line id (first tab-separated field) to its text (fifth field).
# Rows with fewer than five fields are skipped.
id2line = {}

# Use the same encoding and error policy as the raw read of this file above, so the
# result does not depend on the platform's default encoding.
with open('movie_lines.tsv', 'r', encoding='utf-8', errors='ignore') as file:
    for line in file:
        _line = line.strip().split('\t')
        if len(_line) >= 5:
            id2line[_line[0]] = _line[4]


In [97]:
#'L1045': 'They do not!',
#id2line

In [98]:
# Build (conversation id, raw line-id list string) pairs from the conversation rows.
# The last raw row is skipped, and so is every row with fewer than two tab-separated fields.
split_rows = (raw_row.strip().split('\t') for raw_row in conversations[:-1])
conversations_ids = [
    (fields[0], fields[-1])  # first field is the id, last field holds the line ids
    for fields in split_rows
    if len(fields) >= 2
]


In [99]:
#('u0', "['L194' 'L195' 'L196' 'L197']")
#conversations_ids

In [100]:
# questions = []
# answers = []
# for conversation in conversations_ids:
#     if len(conversation) > 1:  # Ensure there is more than one element in the conversation
#         for i in range(len(conversation) - 1):
#             current_question = id2line.get(conversation[i])
#             next_answer = id2line.get(conversation[i + 1])

#             # Check for None values before appending
#             if current_question is not None and next_answer is not None:
#                 questions.append(current_question)
#                 answers.append(next_answer)


In [101]:
# Pair every line of a conversation with the line that follows it:
# the earlier line becomes the question, the later one the answer.
questions = []
answers = []

# Characters that only belong to the textual list representation, e.g. "['L194' 'L195']".
list_markup = str.maketrans('', '', "'[]")

for conversation in conversations_ids:
    if len(conversation) <= 1:
        continue

    line_ids = conversation[1].translate(list_markup).split()

    for question_id, answer_id in zip(line_ids, line_ids[1:]):
        question_text = id2line.get(question_id)
        answer_text = id2line.get(answer_id)

        # Drop the pair when either id is unknown to id2line.
        if question_text is None or answer_text is None:
            continue
        questions.append(question_text)
        answers.append(answer_text)


In [102]:
#questions

In [103]:
#answers

In [104]:
def clean_text(text):
    """Lower-case ``text``, expand a fixed set of contractions and strip punctuation.

    The substitutions are applied one after another in the order listed, so
    earlier rules can affect what later rules see. Characters that are not in
    the final punctuation class (for example ``!`` and ``'``) are left in place.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


In [105]:
# Cleaning the questions
# One normalised string per raw question, in the same order.
clean_questions = [clean_text(question) for question in questions]
 

In [106]:
#clean_questions

In [107]:
# Cleaning the answers
# One normalised string per raw answer, in the same order.
clean_answers = [clean_text(answer) for answer in answers]

In [108]:
#clean_answers

In [109]:
# Creating a dictionary that maps each word to its number of occurrences
# Start a fresh counter and tally every whitespace-separated word of the cleaned questions.
word2count = {}
for question in clean_questions:
    for word in question.split():
        word2count[word] = word2count.get(word, 0) + 1

In [110]:
#word2count

In [111]:
# Add the words of the cleaned answers to the same counter.
for answer in clean_answers:
    for word in answer.split():
        word2count[word] = word2count.get(word, 0) + 1

In [112]:
# Creating two dictionaries that map the questions words and the answers words to a unique integer
# Words seen at least `threshold_questions` times get consecutive ids starting at 0,
# in the iteration order of word2count.
threshold_questions = 20
questionswords2int = {}
for word, count in word2count.items():
    if count < threshold_questions:
        continue
    questionswords2int[word] = len(questionswords2int)
word_number = len(questionswords2int)


In [113]:
# Words seen at least `threshold_answers` times get consecutive ids starting at 0,
# in the iteration order of word2count.
threshold_answers = 20
answerswords2int = {}
for word, count in word2count.items():
    if count < threshold_answers:
        continue
    answerswords2int[word] = len(answerswords2int)
word_number = len(answerswords2int)

In [114]:
# Special tokens that are added to both vocabularies in the next cell.
# '<OUT>' is the id substituted for words missing from a vocabulary, '<EOS>' is appended
# to every answer, '<SOS>' is prepended to the targets in preprocess_targets and '<PAD>'
# is used by apply_padding.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']

In [115]:
# Each special token receives the id len(vocabulary) + 1 at the moment it is inserted,
# first for the question vocabulary, then for the answer vocabulary.
for vocabulary in (questionswords2int, answerswords2int):
    for token in tokens:
        vocabulary[token] = len(vocabulary) + 1

In [116]:
 # Creating the inverse dictionary of the answerswords2int dictionary
answersints2word = {word_id: word for word, word_id in answerswords2int.items()}

In [117]:
# Adding the End Of String token to the end of every answer.
# The list is updated in place. Answers that already end with ' <EOS>' are left alone so
# that re-running this cell does not append a second marker. clean_text lower-cases its
# input and strips '<' and '>', so a cleaned answer cannot end with this marker by itself.
clean_answers[:] = [
    answer if answer.endswith(' <EOS>') else answer + ' <EOS>'
    for answer in clean_answers
]
 

In [118]:
# Translate every cleaned question into a list of integer ids.
# Words that are missing from questionswords2int are replaced by the id of '<OUT>'.
questions_into_int = [
    [
        questionswords2int[word if word in questionswords2int else '<OUT>']
        for word in question.split()
    ]
    for question in clean_questions
]


In [119]:
# Translate every cleaned answer into a list of integer ids.
# Words that are missing from answerswords2int are replaced by the id of '<OUT>'.
answers_into_int = [
    [
        answerswords2int[word if word in answerswords2int else '<OUT>']
        for word in answer.split()
    ]
    for answer in clean_answers
]

In [120]:
# Keep only the pairs whose question has between 1 and 25 tokens and order them by
# question length. The sort is stable, so pairs of equal length keep their original
# relative order. Answers are carried along with their question; their own length is
# not checked.
kept_indices = sorted(
    (index for index, question in enumerate(questions_into_int) if 1 <= len(question) <= 25),
    key=lambda index: len(questions_into_int[index]),
)
sorted_clean_questions = [questions_into_int[index] for index in kept_indices]
sorted_clean_answers = [answers_into_int[index] for index in kept_indices]

PART 2 - BUILDING THE SEQ2SEQ MODEL

In [121]:
# inputs and the targets-
def model_inputs():
    """Create the four symbolic Keras inputs of the model.

    Returns:
        A 4-tuple ``(inputs, targets, lr, keep_prob)``: two int32 inputs of
        shape ``(None,)`` named 'input' and 'target', followed by two float32
        inputs of shape ``()`` named 'learning_rate' and 'keep_prob'.
    """
    make_input = tf.keras.layers.Input
    return (
        make_input(shape=(None,), dtype=tf.int32, name='input'),
        make_input(shape=(None,), dtype=tf.int32, name='target'),
        make_input(shape=(), dtype=tf.float32, name='learning_rate'),
        make_input(shape=(), dtype=tf.float32, name='keep_prob'),
    )

In [122]:
# Preprocessing the targets
def preprocess_targets(targets, word2int, batch_size):
    """Prepend an '<SOS>' column to ``targets`` and drop its last column.

    Args:
        targets: 2-D tensor that is sliced as ``[0:batch_size, 0:-1]``.
        word2int: Mapping that must contain the key '<SOS>'.
        batch_size: Number of rows in the filled '<SOS>' column and upper row
            bound of the slice.

    Returns:
        The '<SOS>' column concatenated along axis 1 with ``targets`` minus
        its last column.
    """
    sos_column = tf.fill([batch_size, 1], word2int['<SOS>'])
    targets_without_last = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    return tf.concat([sos_column, targets_without_last], 1)
 

In [123]:
# Creating the Encoder RNN
def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    """Build a bidirectional multi-layer LSTM encoder and return its state.

    Args:
        rnn_inputs: Passed as ``inputs`` to ``tf.nn.bidirectional_dynamic_rnn``.
        rnn_size: Size given to ``BasicLSTMCell``.
        num_layers: How many times the dropout-wrapped cell is repeated inside
            the ``MultiRNNCell``.
        keep_prob: Used as ``input_keep_prob`` of the ``DropoutWrapper``.
        sequence_length: Forwarded as ``sequence_length`` to the dynamic RNN.

    Returns:
        The second value returned by ``tf.nn.bidirectional_dynamic_rnn``; the
        first value (the outputs) is discarded.

    NOTE(review): ``tf.contrib`` and ``tf.nn.bidirectional_dynamic_rnn`` are
    TF 1.x APIs — confirm which TensorFlow version this cell targets.
    NOTE(review): a later cell of this notebook defines another ``encoder_rnn``
    that returns ``(encoder_output, encoder_state)``; whichever cell ran last
    is the one ``seq2seq_model`` calls.
    """
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    # The same wrapped cell object is repeated num_layers times.
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    # The same cell stack is handed to both the forward and the backward direction.
    encoder_output, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = encoder_cell,
                                                                    cell_bw = encoder_cell,
                                                                    sequence_length = sequence_length,
                                                                    inputs = rnn_inputs,
                                                                    dtype = tf.float32)
    return encoder_state

In [124]:
 
# Decoding the training set
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Decode the embedded targets with an attention decoder (training path).

    Args:
        encoder_state: Indexable state; only ``encoder_state[0]`` is used.
        decoder_cell: RNN cell; its ``output_size`` sizes the attention tensors.
        decoder_embedded_input: Passed to ``dynamic_rnn_decoder`` as its inputs.
        sequence_length: Passed to ``dynamic_rnn_decoder``.
        decoding_scope: Variable scope passed as ``scope``.
        output_function: Callable applied to the decoder output after dropout.
        keep_prob: Second argument of ``tf.nn.dropout`` on the decoder output.
        batch_size: First dimension of the zero-filled attention states.

    Returns:
        ``output_function`` applied to the decoder output after dropout.

    NOTE(review): every ``tf.contrib.seq2seq`` helper used here is a TF 1.x
    API — confirm which TensorFlow version this cell targets.
    """
    # Attention states are all zeros with shape [batch_size, 1, decoder_cell.output_size].
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              name = "attn_dec_train")
    # Only decoder_output is used below; the two final states are ignored.
    decoder_output, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                                              training_decoder_function,
                                                                                                              decoder_embedded_input,
                                                                                                              sequence_length,
                                                                                                              scope = decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
    return output_function(decoder_output_dropout)

In [125]:
# Decoding the test/validation set
def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words, decoding_scope, output_function, keep_prob, batch_size):
    """Decode with an attention decoder in inference mode (test/validation path).

    Args:
        encoder_state: Indexable state; only ``encoder_state[0]`` is used.
        decoder_cell: RNN cell; its ``output_size`` sizes the attention tensors.
        decoder_embeddings_matrix: Passed to ``attention_decoder_fn_inference``.
        sos_id: Passed to ``attention_decoder_fn_inference``.
        eos_id: Passed to ``attention_decoder_fn_inference``.
        maximum_length: Passed to ``attention_decoder_fn_inference``.
        num_words: Passed to ``attention_decoder_fn_inference``.
        decoding_scope: Variable scope passed as ``scope``.
        output_function: Passed as the first argument of the inference
            decoder function.
        keep_prob: Not used in this function body.
        batch_size: First dimension of the zero-filled attention states.

    Returns:
        The first value returned by ``dynamic_rnn_decoder``.

    NOTE(review): every ``tf.contrib.seq2seq`` helper used here is a TF 1.x
    API — confirm which TensorFlow version this cell targets.
    """
    # Attention states are all zeros with shape [batch_size, 1, decoder_cell.output_size].
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
    test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                              encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              decoder_embeddings_matrix,
                                                                              sos_id,
                                                                              eos_id,
                                                                              maximum_length,
                                                                              num_words,
                                                                              name = "attn_dec_inf")
    # Unlike the training path, no embedded input and no sequence length are passed here.
    test_predictions, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                                                test_decoder_function,
                                                                                                                scope = decoding_scope)
    return test_predictions

In [126]:
# Creating the Decoder RNN
def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
    """Build the decoder and return its training and test predictions.

    Everything is created inside the "decoding" variable scope. The training
    decoder is built first; variable reuse is then switched on for the scope
    before the test decoder is built.

    Args:
        decoder_embedded_input: Forwarded to ``decode_training_set``.
        decoder_embeddings_matrix: Forwarded to ``decode_test_set``.
        encoder_state: Forwarded to both decode functions.
        num_words: Output width of the fully connected layer; also forwarded
            to ``decode_test_set``.
        sequence_length: Forwarded to ``decode_training_set``;
            ``sequence_length - 1`` is forwarded to ``decode_test_set`` as
            ``maximum_length``.
        rnn_size: Size given to ``BasicLSTMCell``.
        num_layers: How many times the dropout-wrapped cell is repeated.
        word2int: Mapping that must contain '<SOS>' and '<EOS>'.
        keep_prob: ``input_keep_prob`` of the ``DropoutWrapper``; also
            forwarded to both decode functions.
        batch_size: Forwarded to both decode functions.

    Returns:
        ``(training_predictions, test_predictions)``.

    NOTE(review): ``tf.variable_scope``, ``tf.contrib`` and
    ``tf.truncated_normal_initializer`` are TF 1.x names — confirm which
    TensorFlow version this cell targets.
    """
    with tf.variable_scope("decoding") as decoding_scope:
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
        # The same wrapped cell object is repeated num_layers times.
        decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
        weights = tf.truncated_normal_initializer(stddev = 0.1)
        biases = tf.zeros_initializer()
        # Fully connected projection to num_words outputs; the third positional argument is None.
        output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                      num_words,
                                                                      None,
                                                                      scope = decoding_scope,
                                                                      weights_initializer = weights,
                                                                      biases_initializer = biases)
        training_predictions = decode_training_set(encoder_state,
                                                   decoder_cell,
                                                   decoder_embedded_input,
                                                   sequence_length,
                                                   decoding_scope,
                                                   output_function,
                                                   keep_prob,
                                                   batch_size)
        # Switch the scope to reuse mode before building the test decoder.
        decoding_scope.reuse_variables()
        test_predictions = decode_test_set(encoder_state,
                                           decoder_cell,
                                           decoder_embeddings_matrix,
                                           word2int['<SOS>'],
                                           word2int['<EOS>'],
                                           sequence_length - 1,
                                           num_words,
                                           decoding_scope,
                                           output_function,
                                           keep_prob,
                                           batch_size)
    return training_predictions, test_predictions

In [127]:
# #Building the seq2seq model--
# def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
#     encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs,
#                                                               answers_num_words + 1,
#                                                               encoder_embedding_size,
#                                                               initializer = tf.random_uniform_initializer(0, 1))
#     encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
#     preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
#     decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
#     decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
#     training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
#                                                          decoder_embeddings_matrix,
#                                                          encoder_state,
#                                                          questions_num_words,
#                                                          sequence_length,
#                                                          rnn_size,
#                                                          num_layers,
#                                                          questionswords2int,
#                                                          keep_prob,
#                                                          batch_size)
#     return training_predictions, test_predictions

In [128]:
#Building the seq2seq model--
def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
    """Wire the embeddings, the encoder and the decoder together.

    Args:
        inputs: Integer tensor fed to the encoder embedding layer.
        targets: Integer tensor handed to ``preprocess_targets``.
        keep_prob: Forwarded to ``encoder_rnn`` and ``decoder_rnn``.
        batch_size: Forwarded to ``preprocess_targets`` and ``decoder_rnn``.
        sequence_length: Forwarded to ``encoder_rnn`` and ``decoder_rnn``.
        answers_num_words: The encoder embedding has ``answers_num_words + 1`` rows.
        questions_num_words: The decoder embedding matrix has
            ``questions_num_words + 1`` rows; also the decoder's ``num_words``.
        encoder_embedding_size: Width of the encoder embedding.
        decoder_embedding_size: Width of the decoder embedding matrix.
        rnn_size: Forwarded to ``encoder_rnn`` and ``decoder_rnn``.
        num_layers: Forwarded to ``encoder_rnn`` and ``decoder_rnn``.
        questionswords2int: Mapping used for the '<SOS>'/'<EOS>' ids of the targets.

    Returns:
        ``(training_predictions, test_predictions)`` as returned by ``decoder_rnn``.

    NOTE(review): the encoder (which embeds the questions) is sized by
    ``answers_num_words`` while the decoder is sized by ``questions_num_words``
    and uses ``questionswords2int`` — confirm whether these are swapped on purpose.
    NOTE(review): whatever ``encoder_rnn`` returns is passed on unchanged as
    ``encoder_state``; this notebook defines two ``encoder_rnn`` variants with
    different return values — confirm which one is intended here.
    """
    embedding_layer = tf.keras.layers.Embedding(input_dim=answers_num_words + 1,
                                                output_dim=encoder_embedding_size,
                                                embeddings_initializer=tf.keras.initializers.RandomUniform(0, 1))
    encoder_embedded_input = embedding_layer(inputs)
    encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
    preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
    # tf.random_uniform only exists in TF 1.x; tf.random.uniform is the name TF 2.x provides.
    decoder_embeddings_matrix = tf.Variable(tf.random.uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
    decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
    training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                         decoder_embeddings_matrix,
                                                         encoder_state,
                                                         questions_num_words,
                                                         sequence_length,
                                                         rnn_size,
                                                         num_layers,
                                                         questionswords2int,
                                                         keep_prob,
                                                         batch_size)
    return training_predictions, test_predictions



PART 3 - TRAINING THE SEQ2SEQ MODEL

In [129]:
# Setting the Hyperparameters
epochs = 100  # not referenced in the cells shown here
batch_size = 64  # passed to seq2seq_model as batch_size
rnn_size = 512  # passed to seq2seq_model as rnn_size
num_layers = 3  # passed to seq2seq_model as num_layers
encoding_embedding_size = 512  # passed to seq2seq_model as encoder_embedding_size
decoding_embedding_size = 512  # passed to seq2seq_model as decoder_embedding_size
learning_rate = 0.01  # handed to tf.keras.optimizers.Adam in the optimization cell
learning_rate_decay = 0.9  # not referenced in the cells shown here
min_learning_rate = 0.0001  # not referenced in the cells shown here
keep_probability = 0.5  # not referenced in the cells shown here; the model call uses the symbolic keep_prob input
 

In [130]:
# Loading the model inputs
def model_inputs():
    """Create the symbolic Keras inputs for the model.

    This repeats the ``model_inputs`` definition from the earlier
    "inputs and the targets" cell; defining it again rebinds the name.

    Returns:
        ``(inputs, targets, lr, keep_prob)``: two int32 inputs of shape
        ``(None,)`` and two scalar float32 inputs.
    """
    sequence_spec = dict(shape=(None,), dtype=tf.int32)
    scalar_spec = dict(shape=(), dtype=tf.float32)

    inputs = tf.keras.layers.Input(name='input', **sequence_spec)
    targets = tf.keras.layers.Input(name='target', **sequence_spec)
    lr = tf.keras.layers.Input(name='learning_rate', **scalar_spec)
    keep_prob = tf.keras.layers.Input(name='keep_prob', **scalar_spec)
    return inputs, targets, lr, keep_prob

# Setting the sequence length
sequence_length = 25  # passed to seq2seq_model; same value as the upper bound used when sorting questions by length

# Getting the shape of the inputs tensor
input_shape = (None, None)  # only referenced by the commented-out tf.contrib loss cell in the cells shown here


In [131]:
def encoder_rnn(inputs, rnn_size, num_layers, keep_prob, sequence_length):
    """Run ``inputs`` through a stack of ``num_layers`` LSTM cells.

    Args:
        inputs: Embedded input sequences fed to the recurrent layer.
        rnn_size: Number of units of every LSTM cell.
        num_layers: Number of stacked LSTM cells.
        keep_prob: Probability of KEEPING a unit. It is converted to the drop
            rate that Keras expects, so it has to be a plain Python number
            (not a symbolic tensor).
        sequence_length: Unused; kept so the signature stays unchanged.

    Returns:
        ``(encoder_output, encoder_state)`` where ``encoder_output`` is the
        full output sequence and ``encoder_state`` is the list of final
        states that the recurrent layer returns after its output.
    """
    # Keras dropout arguments are the fraction to drop, keep_prob is the fraction to keep.
    drop_rate = 1.0 - keep_prob

    # One independent cell per layer: tf.keras.layers.RNN stacks a list of *cells*
    # itself, and repeating a single object would make every layer share its weights.
    stacked_cells = [
        tf.keras.layers.LSTMCell(rnn_size, dropout=drop_rate, recurrent_dropout=drop_rate)
        for _ in range(num_layers)
    ]
    rnn_layer = tf.keras.layers.RNN(stacked_cells, return_sequences=True, return_state=True)

    # With return_state=True the layer returns the output followed by the states,
    # so everything after the first element is collected as the encoder state.
    encoder_output, *encoder_state = rnn_layer(inputs)

    return encoder_output, encoder_state


In [132]:
# Create the symbolic inputs; `lr` is not referenced again in the cells shown here.
inputs, targets, lr, keep_prob = model_inputs()


In [133]:
# Reverse every input along its last axis before it is handed to the model.
# NOTE(review): this step was added on ChatGPT's suggestion — confirm that reversing
# the whole axis is what the encoder should receive.
reversed_inputs = tf.reverse(inputs, axis=[-1])


In [135]:
# Build the training and test predictions. The arguments are positional and follow the
# seq2seq_model signature: len(answerswords2int) is answers_num_words and
# len(questionswords2int) is questions_num_words.
# NOTE(review): keep_prob here is the symbolic Keras input created by model_inputs(), not
# the float keep_probability from the hyperparameter cell — the recorded run of this cell
# ended in the TypeError shown below it.
training_predictions, test_predictions = seq2seq_model(reversed_inputs,
                                                       targets,
                                                       keep_prob,
                                                       batch_size,
                                                       sequence_length,
                                                       len(answerswords2int),
                                                       len(questionswords2int),
                                                       encoding_embedding_size,
                                                       decoding_embedding_size,
                                                       rnn_size,
                                                       num_layers,
                                                       questionswords2int)


TypeError: Keras symbolic inputs/outputs do not implement `__len__`. You may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model. This error will also get raised if you try asserting a symbolic input/output directly.

In [None]:

# # Setting up the Loss Error, the Optimizer and Gradient Clipping
# with tf.name_scope("optimization"):
#     loss_error = tf.contrib.seq2seq.sequence_loss(training_predictions,
#                                                   targets,
#                                                   tf.ones([input_shape[0], sequence_length]))
#     optimizer = tf.train.AdamOptimizer(learning_rate)
#     gradients = optimizer.compute_gradients(loss_error)
#     clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
#     optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

In [None]:
# Masked sparse cross-entropy loss, Adam optimizer and global-norm gradient clipping.
with tf.name_scope("optimization"):
    # Per-element loss (reduction='none') computed from logits.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    # NOTE(review): the mask treats id 0 as padding, but this notebook gives '<PAD>' the id
    # len(vocabulary) + 1 and id 0 to the first counted word — confirm which id to mask.
    mask = tf.math.logical_not(tf.math.equal(targets, 0))  # Create a mask for non-zero targets
    loss_ = loss_object(targets, training_predictions)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask  # Apply the mask to the loss
    # The mean is taken over every position, including the ones zeroed by the mask.
    loss_error = tf.reduce_mean(loss_)

    optimizer = tf.keras.optimizers.Adam(learning_rate)

    # Get model trainable variables
    # NOTE(review): seq2seq_model is defined in this notebook as a plain function, which has
    # no trainable_variables attribute — this line needs a Keras model object instead.
    variables = seq2seq_model.trainable_variables

    # NOTE(review): gradients are taken of the unreduced loss_ rather than of loss_error —
    # confirm that this is intended.
    gradients = tf.gradients(loss_, variables)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)

    optimizer_gradient_clipping = optimizer.apply_gradients(zip(clipped_gradients, variables))

# I have to define `targets` before executing this cell so that the mask can select all non-zero targets.

In [None]:
# Padding the sequences with the <PAD> token
def apply_padding(batch_of_sequences, word2int):
    """Right-pad every sequence of a batch to the length of the longest one.

    Args:
        batch_of_sequences: Iterable of sequences of token ids. It is
            materialised once, so a generator is accepted as well.
        word2int: Mapping that must contain the key '<PAD>'; its value is the
            padding id.

    Returns:
        A new list with one new list per input sequence, all of equal length.
        An empty batch yields an empty list.

    Raises:
        KeyError: If '<PAD>' is missing from ``word2int``.
    """
    batch = list(batch_of_sequences)
    pad_id = word2int['<PAD>']
    # default=0 keeps max() from raising ValueError on an empty batch.
    max_sequence_length = max((len(sequence) for sequence in batch), default=0)
    return [
        list(sequence) + [pad_id] * (max_sequence_length - len(sequence))
        for sequence in batch
    ]

In [None]:
# Splitting the data into batches of questions and answers
def split_into_batches(questions, answers, batch_size):
    """Yield padded (questions, answers) array pairs, one pair per full batch.

    Only ``len(questions) // batch_size`` batches are produced, so a trailing
    partial batch is dropped. Questions are padded with the '<PAD>' id of
    ``questionswords2int`` and answers with that of ``answerswords2int``.

    Args:
        questions: Sequence of integer-encoded questions.
        answers: Sequence of integer-encoded answers, aligned with ``questions``.
        batch_size: Number of pairs per batch.

    Yields:
        ``(padded_questions, padded_answers)`` as NumPy arrays.
    """
    full_batches = len(questions) // batch_size
    for start_index in range(0, full_batches * batch_size, batch_size):
        stop_index = start_index + batch_size
        padded_questions_in_batch = np.array(
            apply_padding(questions[start_index:stop_index], questionswords2int)
        )
        padded_answers_in_batch = np.array(
            apply_padding(answers[start_index:stop_index], answerswords2int)
        )
        yield padded_questions_in_batch, padded_answers_in_batch

In [None]:
# Split the length-sorted pairs: the first 15% become the validation set and the rest
# the training set. Because the lists are ordered by question length, the validation
# set holds the shortest questions.
training_validation_split = int(len(sorted_clean_questions) * 0.15)
validation_questions, training_questions = (
    sorted_clean_questions[:training_validation_split],
    sorted_clean_questions[training_validation_split:],
)
validation_answers, training_answers = (
    sorted_clean_answers[:training_validation_split],
    sorted_clean_answers[training_validation_split:],
)