# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
#import tensorflow as tf
import re 
import time 

In [None]:
#%tensorflow_version 1.x

In [None]:
def _strip_apostrophes(sentences):
    """Return a new list with every straight apostrophe removed from each sentence."""
    return [text.replace("'", "") for text in sentences]


# NOTE(review): `question` and `answer` are not defined in any visible cell;
# presumably they are iterables of raw strings loaded earlier - confirm.
questions = _strip_apostrophes(question)
answers = _strip_apostrophes(answer)


In [None]:
#conversations[:5]
questions[:5]

['[What does this section show different approaches to working with chained maps?',
 ' What is an example of simulating a Python’s internal lookup chain?',
 ' What do environment variables take precedence over?',
 ' What class can be used to simulate nested contexts?',
 ' What does the ChainMap class only make updates to?']

In [None]:
#lines[:5]
answers[:5]

['[various approaches to working with chained maps.',
 ' Example',
 ' default values',
 ' ChainMap',
 ' the  first mapping in the chain']

In [None]:
#len(id2line.keys())

In [None]:
#conversations_id[:10]

In [None]:
len(questions)

151

In [None]:
len(answers)

167

In [None]:
len(answers)

167

In [None]:
# Keep exactly one answer per question so the two lists can be indexed in
# parallel (the recorded run has 151 questions but 167 answers).
# NOTE(review): this assumes the surplus answers sit at the tail of the list;
# if they are interleaved, the question/answer pairs are misaligned - confirm
# against the source data.
answers = answers[:len(questions)]

In [None]:
# Doing a first cleaning of the texts

def clean_text(text):
    """Normalise a sentence for tokenisation.

    The text is lower-cased, a fixed list of English contractions is expanded
    and a fixed set of punctuation characters is deleted. The substitutions
    are applied one after another in the order listed below, so earlier
    rewrites are visible to later patterns.

    Square brackets, exclamation marks and typographic apostrophes are not
    part of any pattern and therefore survive unchanged.

    Args:
        text: the raw sentence.

    Returns:
        The cleaned, lower-case sentence.
    """
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        # Punctuation is dropped last so the apostrophe-based rules run first.
        (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
# Run every question and every answer through clean_text.
# Comprehensions are used on purpose: a `for question in ...` / `for answer in ...`
# loop would rebind the names `question` and `answer`, which the cell that
# builds `questions` / `answers` reads as its raw inputs, and so break a
# re-run of that cell.
questions_clean = [clean_text(sentence) for sentence in questions]

answers_clean = [clean_text(sentence) for sentence in answers]

In [None]:

# Creating a dictionary that maps each word to its number of occurrences.
# Questions are counted first and answers second, so the insertion order of
# word2count (which later decides the word ids) is: first appearance in the
# questions, then first appearance in the answers.
# The loop variable is deliberately not called `question` / `answer`, because
# those names are the raw inputs of the cell that builds `questions` / `answers`.
word2count = {}
for sentence in questions_clean + answers_clean:
    for word in sentence.split():
        word2count[word] = word2count.get(word, 0) + 1

In [None]:
#word2count

In [None]:

# Creating two dictionaries that map the questions words and the answers words to a unique integer.
# Ids start at 1, not 0: both Embedding layers built in create_seq2seq_model
# use mask_zero=True, which reserves id 0 for "no token", so a real word must
# never be given id 0. The special tokens are added afterwards with
# `len(dictionary) + 1`, which continues the numbering right after the last word.
def _index_words(word_counts, threshold, first_index=1):
    """Map every word seen at least `threshold` times to a consecutive integer id.

    Ids are handed out in the iteration order of `word_counts`, starting at
    `first_index`.
    """
    kept_words = (word for word, count in word_counts.items() if count >= threshold)
    return {word: index for index, word in enumerate(kept_words, start=first_index)}


threshold_questions = 1
questionswords2int = _index_words(word2count, threshold_questions)
threshold_answers = 1
answerswords2int = _index_words(word2count, threshold_answers)

In [None]:
#answerswords2int

In [None]:
# Adding the special tokens to both dictionaries, for use in encoding and decoding.
# setdefault only assigns an id the first time a token is seen, so running
# this cell again leaves the ids untouched instead of moving every token to a
# new, larger id.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for vocabulary in (questionswords2int, answerswords2int):
    for token in tokens:
        vocabulary.setdefault(token, len(vocabulary) + 1)

In [None]:
# Inverse lookup of answerswords2int (integer id -> word), used to turn the
# ids predicted by the seq2seq model back into text.
answersints2word = dict((index, word) for word, index in answerswords2int.items())

In [None]:
# Adding the End Of String token to the end of every answer.
# The endswith guard keeps the cell safe to re-run: an answer that already
# carries the marker does not receive a second one.
for i, answer_text in enumerate(answers_clean):
    if not answer_text.endswith(' <EOS>'):
        answers_clean[i] = answer_text + ' <EOS>'

In [None]:
answers_clean[:3]

['[various approaches to working with chained maps <EOS>',
 ' example <EOS>',
 ' default values <EOS>']

In [None]:
# Translating all the questions and the answers into integers
# and replacing all the words that were filtered out by <OUT>.
def _encode_sentences(sentences, word2int):
    """Turn each whitespace-separated sentence into a list of word ids.

    Words missing from `word2int` are encoded with the id of '<OUT>'.
    """
    return [
        [word2int[word] if word in word2int else word2int['<OUT>'] for word in sentence.split()]
        for sentence in sentences
    ]


# One shared helper instead of two copies of the same loop; it also avoids
# rebinding `question` / `answer`, the raw inputs of the cell that builds
# `questions` / `answers`.
questions_into_int = _encode_sentences(questions_clean, questionswords2int)

answers_into_int = _encode_sentences(answers_clean, answerswords2int)

In [None]:
# Sorting questions and answers by the length of questions.
# Only questions of 1 to 25 tokens are kept; pairs with an empty or longer
# question are dropped. sorted() is stable, so pairs whose questions have the
# same length keep their original relative order.
kept_indices = [
    index
    for index in sorted(range(len(questions_into_int)), key=lambda position: len(questions_into_int[position]))
    if 1 <= len(questions_into_int[index]) <= 25
]
sorted_clean_questions = [questions_into_int[index] for index in kept_indices]
sorted_clean_answers = [answers_into_int[index] for index in kept_indices]

# BUILDING THE SEQ2SEQ MODEL

In [None]:
%%capture
!pip install tensorflow-addons

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow_addons.seq2seq import BahdanauAttention, Decoder, Sampler


# Creating placeholders for the inputs and the targets
def model_inputs():
    """Create the four symbolic Keras inputs of the notebook.

    Returns:
        A tuple (inputs, targets, lr, keep_prob):
        two int32 inputs named 'input' and 'target' declared with
        shape=(None,), followed by two float32 inputs named 'learning_rate'
        and 'keep_prob' declared with shape=().
    """
    def _token_input(name):
        return Input(shape=(None, ), dtype=tf.int32, name=name)

    def _scalar_input(name):
        return Input(shape=(), dtype=tf.float32, name=name)

    return (
        _token_input('input'),
        _token_input('target'),
        _scalar_input('learning_rate'),
        _scalar_input('keep_prob'),
    )

# Preprocessing the targets
def preprocess_targets(targets, word2int, batch_size):
    """Shift the targets one step to the right and start every row with <SOS>.

    Args:
        targets: 2-D tensor indexed as [row, position].
        word2int: vocabulary containing the '<SOS>' token.
        batch_size: number of rows of the <SOS> column that is prepended.

    Returns:
        The <SOS> column concatenated (along axis 1) with `targets` minus its
        last column.
    """
    sos_column = tf.fill([batch_size, 1], word2int['<SOS>'])
    return tf.concat([sos_column, targets[:, :-1]], axis=1)

# Creating the Encoder
class Encoder(tf.keras.layers.Layer):
    """Embedding followed by a stack of LSTM layers.

    Every LSTM returns its full output sequence and its final states; each
    layer consumes the sequence produced by the previous one.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units, num_layers, dropout_rate):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.rnn_layers = [
            LSTM(rnn_units, return_sequences=True, return_state=True, dropout=dropout_rate)
            for _ in range(num_layers)
        ]

    def call(self, inputs, training=None):
        """Encode a batch of token ids.

        Args:
            inputs: token ids passed to the embedding layer.
            training: optional flag forwarded to every LSTM so that their
                dropout follows the caller's mode.

        Returns:
            (sequence, states): the output sequence of the last LSTM and the
            list of states returned by that last LSTM (an empty list when the
            stack has no layers).
        """
        x = self.embedding(inputs)
        states = []  # defined up front so an empty stack does not leave it unbound
        for layer in self.rnn_layers:
            x, *states = layer(x, training=training)
        return x, states

# Creating the Decoder
class CustomDecoder(Decoder):
    """Decoder built from an embedding, a stack of LSTMs, an attention layer and a Dense projection.

    NOTE(review): this class is not instantiated in any visible cell;
    create_seq2seq_model builds its decoder from plain Keras layers instead.
    NOTE(review): the base class comes from tensorflow_addons.seq2seq; whether
    it can be subclassed without also providing members such as `step` or
    `batch_size` is not visible from this notebook - confirm before using.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units, num_layers, dropout_rate):
        """Create the sub-layers; `num_layers` LSTMs share the same size and dropout."""
        super(CustomDecoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.rnn_layers = [LSTM(rnn_units, return_sequences=True, return_state=True, dropout=dropout_rate) for _ in range(num_layers)]
        self.attention = BahdanauAttention(rnn_units)
        # "linear" activation: the projection returns unnormalised scores.
        self.dense = Dense(vocab_size, activation="linear")

    def initialize(self, initial_state, **kwargs):
        """Store `initial_state` for the next `call` and return it unchanged."""
        self.initial_state = initial_state
        return initial_state

    def call(self, inputs, training=None, mask=None, **kwargs):
        """Embed `inputs`, run them through every LSTM with attention and project the result.

        `training` and `mask` are accepted but not used in the body; any extra
        keyword arguments are forwarded to the attention layer.
        """
        x = self.embedding(inputs)
        outputs = []
        for layer in self.rnn_layers:
            # Each LSTM starts from the states left by the previous one
            # (or from the value stored by `initialize` for the first layer).
            x, *states = layer(x, initial_state=self.initial_state)
            self.initial_state = states
            # NOTE(review): the attention layer is called with the LSTM output
            # only; confirm this matches the call signature it expects.
            context_vector, _ = self.attention(x, **kwargs)
            # The context vector is appended to the features, so the next
            # LSTM receives a wider input than the first one.
            x = tf.concat([x, context_vector], axis=-1)
            outputs.append(x)
        # The per-layer outputs are joined along axis 1 before the projection.
        # NOTE(review): axis 1 is presumably the time axis, which would make
        # the result num_layers times longer than the input - confirm intent.
        x = self.dense(tf.concat(outputs, axis=1))
        return x


def create_seq2seq_model(batch_size, answers_vocab_size, questions_vocab_size, encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers, questionswords2int):
    """Build the encoder-decoder Keras model used for training and chatting.

    A single LSTM encodes the embedded question; its final hidden and cell
    states initialise a single decoder LSTM that runs over the embedded
    decoder input. A Dense layer turns every decoder step into one score per
    answer-vocabulary id.

    Args:
        batch_size: fixed batch dimension of both inputs (batch_shape is used,
            so every call must supply exactly this many rows).
        answers_vocab_size: size of the answers vocabulary; one extra row is
            added to the embedding and to the output layer.
        questions_vocab_size: size of the questions vocabulary; one extra row
            is added to the embedding.
        encoding_embedding_size: width of the question embeddings.
        decoding_embedding_size: width of the answer embeddings.
        rnn_size: number of units of both LSTMs.
        num_layers: accepted for interface compatibility; not used, the model
            always has one encoder LSTM and one decoder LSTM.
        questionswords2int: accepted for interface compatibility; not used.

    Returns:
        A tf.keras.Model mapping [encoder_inputs, decoder_inputs] to
        unnormalised scores (logits) over the answer vocabulary for every
        decoder step.
    """
    # Encoder
    encoder_inputs = Input(batch_shape=(batch_size, None), name='encoder_inputs')
    # mask_zero=True: id 0 is treated as padding by the layers downstream.
    encoder_embeddings = Embedding(questions_vocab_size + 1, encoding_embedding_size, mask_zero=True, name='encoder_embeddings')(encoder_inputs)
    encoder_lstm = LSTM(rnn_size, return_sequences=True, return_state=True, name='encoder_lstm')
    _, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embeddings)

    # Decoder, started from the encoder's final states
    decoder_inputs = Input(batch_shape=(batch_size, None), name='decoder_inputs')
    decoder_embeddings = Embedding(answers_vocab_size + 1, decoding_embedding_size, mask_zero=True, name='decoder_embeddings')(decoder_inputs)
    decoder_lstm = LSTM(rnn_size, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(decoder_embeddings, initial_state=[encoder_state_h, encoder_state_c])

    # Output layer.
    # No activation here: every loss in this notebook is configured with
    # from_logits=True and applies the softmax itself, so a softmax in the
    # model would normalise the scores twice. argmax over logits picks the
    # same id as argmax over probabilities.
    decoder_dense = Dense(answers_vocab_size + 1, activation=None, name='decoder_dense')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Create the seq2seq model
    seq2seq_model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return seq2seq_model


# TRAINING THE SEQ2SEQ MODEL

In [None]:
from tensorflow.keras import backend as K


# Setting the Hyperparameters
epochs = 100
batch_size = 32
rnn_size = 1024
num_layers = 3
encoding_embedding_size = 1024
decoding_embedding_size = 1024
learning_rate = 0.001
learning_rate_decay = 0.9
min_learning_rate = 0.0001
keep_probability = 0.5

# Loading the model inputs
# NOTE(review): these symbolic inputs and `input_shape` below are not read by
# any later visible cell - confirm whether they are still needed.
inputs, targets, lr, keep_prob = model_inputs()

# Getting the shape of the inputs tensor
input_shape = tf.shape(inputs)

# Create the seq2seq model
seq2seq_model = create_seq2seq_model(batch_size, len(answerswords2int), len(questionswords2int), encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers, questionswords2int)
# Compile the model.
# The optimizer starts from the `learning_rate` hyperparameter above instead
# of a second hard-coded copy of the value, so tuning it in one place is enough.
# NOTE(review): from_logits=True expects the model to output unnormalised
# scores; check it against the activation of the model's final Dense layer.
seq2seq_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))



# Padding the sequences with the <PAD> token
def apply_padding(batch_of_sequences, word2int):
    """Right-pad every sequence of a batch to the length of its longest member.

    Args:
        batch_of_sequences: list of lists of token ids.
        word2int: vocabulary containing the '<PAD>' token.

    Returns:
        A new list of new lists, all of the same length; the inputs are not
        modified. An empty batch yields an empty list.
    """
    pad_id = word2int['<PAD>']
    # default=0 makes an empty batch return [] instead of raising ValueError.
    max_sequence_length = max((len(sequence) for sequence in batch_of_sequences), default=0)
    return [sequence + [pad_id] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

# Splitting the data into batches of questions and answers
def split_into_batches(questions, answers, batch_size, drop_remainder=True):
    """Yield padded (questions, answers) numpy batches of `batch_size` pairs.

    Questions are padded with the '<PAD>' id of the global questionswords2int,
    answers with the one of the global answerswords2int; each batch is padded
    to its own longest sequence.

    Args:
        questions: list of encoded questions (lists of ids).
        answers: list of encoded answers, index-aligned with `questions`.
        batch_size: number of pairs per batch.
        drop_remainder: when True (the default, and the historical behaviour)
            only full batches are produced, so a data set smaller than
            `batch_size` yields nothing at all - this is why a 22-pair
            validation set with batch_size=32 produces no validation batch.
            Pass False to also receive the final, shorter batch.
            NOTE(review): the model is built with a fixed batch dimension, so
            a shorter batch may need to be padded up to `batch_size` rows
            before it is fed to the model - confirm.

    Yields:
        (padded_questions_in_batch, padded_answers_in_batch) numpy arrays.
    """
    number_of_full_batches = len(questions) // batch_size
    stop_index = number_of_full_batches * batch_size if drop_remainder else len(questions)
    for start_index in range(0, stop_index, batch_size):
        questions_in_batch = questions[start_index : start_index + batch_size]
        answers_in_batch = answers[start_index : start_index + batch_size]
        padded_questions_in_batch = np.array(apply_padding(questions_in_batch, questionswords2int))
        padded_answers_in_batch = np.array(apply_padding(answers_in_batch, answerswords2int))
        yield padded_questions_in_batch, padded_answers_in_batch

# Splitting the questions and answers into training and validation sets:
# the first 15% of the pairs are held out for validation, the rest is used
# for training.
# NOTE(review): the lists are sorted by question length, so the validation
# set consists of the shortest questions only - confirm this is intended.
training_validation_split = int(len(sorted_clean_questions) * 0.15)
validation_questions, training_questions = (
    sorted_clean_questions[:training_validation_split],
    sorted_clean_questions[training_validation_split:],
)
validation_answers, training_answers = (
    sorted_clean_answers[:training_validation_split],
    sorted_clean_answers[training_validation_split:],
)





from tensorflow.keras.layers import Layer

class LossCalculationLayer(Layer):
    """Masked mean of the per-token sparse categorical cross-entropy."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        """Compute the average loss per unmasked token.

        Args:
            inputs: a list [logits, targets, sequence_length] where `logits`
                holds unnormalised scores per token, `targets` the expected
                ids and `sequence_length` either one length for the whole
                batch (as passed by the training loop) or one length per row.

        Returns:
            A scalar: the summed loss of all unmasked tokens divided by the
            number of unmasked tokens (0 when every token is masked).
        """
        logits, targets, sequence_length = inputs
        loss = tf.keras.backend.sparse_categorical_crossentropy(targets, logits, from_logits=True)
        # maxlen pins the mask width to the loss width, so per-row lengths
        # that are all shorter than the padded width still line up.
        mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(loss)[-1], dtype=tf.float32)
        # A single length produces a 1-D mask; expand it to the shape of the
        # loss so the denominator counts every token of every row. Without
        # this the sum over the whole batch would be divided by the number of
        # time steps only, making the loss grow with the batch size.
        mask = tf.broadcast_to(mask, tf.shape(loss))
        # NOTE(review): with a single length for the whole batch, padded
        # positions of shorter answers are still counted in the loss.
        return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

# Instantiate the loss calculation layer
loss_calculation_layer = LossCalculationLayer()

optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)

# File the best weights are written to. It has to exist before the loop: the
# first improvement of the validation loss saves to it.
checkpoint = "./chatbot_weights.ckpt"

best_validation_loss = float('inf')
early_stopping_check = 0
early_stopping_stop = 2000

# Training
for epoch in range(1, epochs + 1):
    for batch_index, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(training_questions, training_answers, batch_size)):
        # Teacher forcing: the decoder reads the answer without its last
        # token and has to predict the answer without its first token, i.e.
        # the input is always one step behind the label. Feeding the full
        # answer as both input and label would let the model copy its input.
        decoder_inputs_in_batch = padded_answers_in_batch[:, :-1]
        decoder_targets_in_batch = padded_answers_in_batch[:, 1:]

        # Manual update with gradients clipped element-wise to [-5, 5].
        with tf.GradientTape() as tape:
            logits = seq2seq_model([padded_questions_in_batch, decoder_inputs_in_batch])
            loss_error = loss_calculation_layer([logits, decoder_targets_in_batch, decoder_targets_in_batch.shape[1]])
        gradients = tape.gradient(loss_error, seq2seq_model.trainable_variables)
        clipped_gradients = [tf.clip_by_value(grad, -5., 5.) for grad in gradients]
        optimizer.apply_gradients(zip(clipped_gradients, seq2seq_model.trainable_variables))

        # NOTE(review): fit() below performs a second update on the same
        # batch with the optimizer given to compile(); confirm that two
        # updates per batch with two optimizers are intended.
        history = seq2seq_model.fit([padded_questions_in_batch, decoder_inputs_in_batch], decoder_targets_in_batch, batch_size=batch_size, epochs=1, verbose=0)
        training_loss = history.history['loss'][0]
        print('Epoch: {:>3}/{}, Batch: {:>4}/{}, Training Loss Error: {:>6.3f}'.format(epoch, epochs, batch_index, len(training_questions) // batch_size, training_loss))

    # Validation (same input/label shift as in training)
    total_validation_loss = 0
    total_batches = 0
    for batch_index, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(validation_questions, validation_answers, batch_size)):
        validation_loss = seq2seq_model.evaluate([padded_questions_in_batch, padded_answers_in_batch[:, :-1]], padded_answers_in_batch[:, 1:], batch_size=batch_size, verbose=0)
        total_validation_loss += validation_loss
        total_batches += 1

    # split_into_batches only yields full batches, so a validation set smaller
    # than batch_size produces no batch at all.
    if total_batches > 0:
        average_validation_loss = total_validation_loss / total_batches
    else:
        average_validation_loss = None
    if average_validation_loss is not None:
        print('Validation Loss Error: {:>6.3f}'.format(average_validation_loss))
    else:
        print('No validation batches were processed.')

    # Decay the learning rate after every epoch (never below the minimum) and
    # push the new value to both optimizers on every epoch, not only once the
    # minimum has been reached.
    learning_rate *= learning_rate_decay
    if learning_rate < min_learning_rate:
        learning_rate = min_learning_rate
    K.set_value(seq2seq_model.optimizer.learning_rate, learning_rate)
    K.set_value(optimizer.learning_rate, learning_rate)

    if average_validation_loss is not None and average_validation_loss <= best_validation_loss:
        print('I speak better now!!')
        early_stopping_check = 0
        best_validation_loss = average_validation_loss
        seq2seq_model.save_weights(checkpoint)
    else:
        print("Sorry I do not speak better, I need to practice more.")
        early_stopping_check += 1
        if early_stopping_check == early_stopping_stop:
            break

if early_stopping_check == early_stopping_stop:
    print("My apologies, I cannot speak better anymore. This is the best I can do.")
else:
    print("Game Over")

Epoch:   1/100, Batch:    0/4, Training Loss Error:  5.092
Epoch:   1/100, Batch:    1/4, Training Loss Error:  2.488
Epoch:   1/100, Batch:    2/4, Training Loss Error:  2.142
Epoch:   1/100, Batch:    3/4, Training Loss Error:  2.742
No validation batches were processed.
Sorry I do not speak better, I need to practice more.
Epoch:   2/100, Batch:    0/4, Training Loss Error:  0.930
Epoch:   2/100, Batch:    1/4, Training Loss Error:  2.005
Epoch:   2/100, Batch:    2/4, Training Loss Error:  1.451
Epoch:   2/100, Batch:    3/4, Training Loss Error:  1.989
No validation batches were processed.
Sorry I do not speak better, I need to practice more.
Epoch:   3/100, Batch:    0/4, Training Loss Error:  1.283
Epoch:   3/100, Batch:    1/4, Training Loss Error:  1.748
Epoch:   3/100, Batch:    2/4, Training Loss Error:  1.330
Epoch:   3/100, Batch:    3/4, Training Loss Error:  1.989
No validation batches were processed.
Sorry I do not speak better, I need to practice more.
Epoch:   4/100, 

In [None]:
# Write the model's current weights to disk; the testing section loads them
# back from this same path.
seq2seq_model.save_weights("./chatbot_weights.ckpt")


# TESTING THE SEQ2SEQ MODEL

In [None]:
import numpy as np

# Loading the weights
# Same path as the one used by the save_weights cell of the training section.
checkpoint = "./chatbot_weights.ckpt"
seq2seq_model.load_weights(checkpoint)

def convert_string2int(question, word2int):
    """Clean a raw sentence and encode it as a list of word ids.

    Args:
        question: the raw text typed by the user.
        word2int: vocabulary containing the '<OUT>' token.

    Returns:
        One id per whitespace-separated word of the cleaned text; words that
        are not in `word2int` get the id of '<OUT>'.
    """
    encoded = []
    for word in clean_text(question).split():
        encoded.append(word2int.get(word, word2int['<OUT>']))
    return encoded

# Width every chat input is padded to; 25 is also the longest question kept
# when the training pairs were sorted by length.
sequence_length = 25
# Number of rows fed to predict(); the model was created with a fixed batch
# dimension, so this has to stay equal to the batch_size used to build it.
batch_size = 32

def chat_with_model(input_text):
    """Return the model's reply to one line of user text.

    The text is encoded with questionswords2int, cut or padded to
    `sequence_length` ids and placed in the first row of an otherwise
    all-zero batch of `batch_size` rows. The reply is the arg-max id of every
    decoder step of that first row, decoded until the first <EOS>.

    NOTE(review): the decoder input is an all-zero array, so the reply is not
    generated token by token from the model's own previous outputs - confirm
    whether a step-by-step decoding loop was intended.

    Args:
        input_text: the raw question.

    Returns:
        The decoded reply; <EOS> is rendered as '.' and ends the reply.
    """
    # Cut over-long questions: a row of the batch holds sequence_length ids
    # and assigning a longer list to it would raise.
    input_ints = convert_string2int(input_text, questionswords2int)[:sequence_length]
    input_padded = input_ints + [questionswords2int['<PAD>']] * (sequence_length - len(input_ints))

    fake_batch = np.zeros((batch_size, sequence_length))
    fake_batch[0] = input_padded

    logits = seq2seq_model.predict([fake_batch, np.zeros((batch_size, sequence_length - 1))])
    response_indices = np.argmax(logits, axis=-1)[0]

    response_tokens = []
    for index in response_indices:
        # The output layer has more units than there are vocabulary entries,
        # so the arg-max can be an id without a word; treat it as <OUT>.
        word = answersints2word.get(int(index), '<OUT>')
        if word == 'i':
            token = ' I'
        elif word == '<EOS>':
            token = '.'
        elif word == '<OUT>':
            token = 'out'
        else:
            token = ' ' + word
        response_tokens.append(token)
        if token == '.':
            break

    response_text = ''.join(response_tokens)
    return response_text

# Setting up the chat
# Keeps answering until the user types "goodbye" (in any letter case).
keep_chatting = True
while keep_chatting:
    question = input("You: ")
    keep_chatting = question.lower() != 'goodbye'
    if keep_chatting:
        response = chat_with_model(question)
        print('ChatBot:', response)


You: What does this section show different approaches to working with chained maps?
ChatBot: .
You: What class can be used to simulate nested contexts?
ChatBot: .
You: What does the ChainMap class only make updates to?
ChatBot: .


KeyboardInterrupt: ignored