In [None]:

# import shutil

# # Specify the path of the folder to be deleted
# folder_path = '/content/input'

# # Use shutil.rmtree to delete the folder and its contents
# shutil.rmtree(folder_path)

# print(f"Folder '{folder_path}' and its contents deleted.")


Folder '/content/input' and its contents deleted.


In [1]:
!unrar x "/content/input.rar"



UNRAR 6.11 beta 1 freeware      Copyright (c) 1993-2022 Alexander Roshal


Extracting from /content/input.rar

Creating    input                                                     OK
Creating    input/chatbot                                             OK
Extracting  input/chatbot/ai.yml                                          21%  OK 
Extracting  input/chatbot/botprofile.yml                                  26%  OK 
Extracting  input/chatbot/computers.yml                                   37%  OK 
Extracting  input/chatbot/emotion.yml                                     63%  OK 
Extracting  input/chatbot/food.yml                                        68%  OK 
Extracting  input/chatbot/gossip.yml                                      75%  OK 
Extracting  input/chatbot/greetings.yml                                   78%  OK 
Extracting  input/chatbot/health.yml                                      79%  OK 
Extrac

In [2]:

import numpy as np
import pandas as pd

import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models , preprocessing, utils
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/content/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/content/input/chatbot/emotion.yml
/content/input/chatbot/computers.yml
/content/input/chatbot/health.yml
/content/input/chatbot/greetings.yml
/content/input/chatbot/botprofile.yml
/content/input/chatbot/movies.yml
/content/input/chatbot/gossip.yml
/content/input/chatbot/food.yml
/content/input/chatbot/psychology.yml
/content/input/chatbot/ai.yml


# Reading the data

The dataset used is the Chatterbot dataset provided by Kaggle. The data is in the format of `yml` having question and answer pairs on various subjects like science, history, and psychology.

In [3]:
import yaml

dir_path = '/content/input/chatbot'
files_list = os.listdir(dir_path + os.sep)

The dataset is split into question and answer lists. For our chatbot, we have used the `conversations` subject of the dataset.

In [5]:
questions, answers = [], []

for filepath in files_list:
    file_ = open(dir_path + os.sep + filepath , 'rb')
    docs = yaml.safe_load(file_)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2 :
            questions.append(con[0])
            replies = con[1 :]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append(ans)
        elif len(con)> 1:
            questions.append(con[0])
            answers.append(con[1])

'You are arrogant'

# Data preprocessing for seq2seq learning

- For preprocessing, a single vocabulary is used for tokenization.
- The sequences are tokenized and padded. \<start> and \<end> tags are also appended to the sequences for the decoder input.
- The above step is repeated for decoder output as well, except that the \<start> tag is removed from all the sequences.

In [8]:
answers_with_tags = []
for i in range(len(answers)):
    if type(answers[i]) == str:
        answers_with_tags.append(answers[i])
    else:
        questions.pop(i)

answers = []
for i in range(len(answers_with_tags)) :
    answers.append('<START> ' + answers_with_tags[i] + ' <END>')

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions + answers)
VOCAB_SIZE = len(tokenizer.word_index)+1

In [10]:
VOCAB_SIZE

1176

In [11]:
from gensim.models import Word2Vec
import re

vocab = []
for word in tokenizer.word_index:
    vocab.append(word)

def tokenize(sentences):
    tokens_list = []
    vocabulary = []
    for sentence in sentences:
        sentence = sentence.lower()
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        tokens = sentence.split()
        vocabulary += tokens
        tokens_list.append(tokens)
    return tokens_list , vocabulary

In [12]:
# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max([len(x) for x in tokenized_questions])
padded_questions = preprocessing.sequence.pad_sequences(tokenized_questions , maxlen=maxlen_questions , padding='post')
encoder_input_data = np.array(padded_questions)

In [13]:
print(encoder_input_data.shape)

(360, 9)


In [14]:
# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max([len(x) for x in tokenized_answers])
padded_answers = preprocessing.sequence.pad_sequences(tokenized_answers , maxlen=maxlen_answers , padding='post')
decoder_input_data = np.array(padded_answers)

In [15]:
print(decoder_input_data.shape)

(360, 74)


In [16]:
# decoder_output_data
tokenized_answers = tokenizer.texts_to_sequences(answers)
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = preprocessing.sequence.pad_sequences(tokenized_answers , maxlen=maxlen_answers , padding='post')
onehot_answers = utils.to_categorical(padded_answers , VOCAB_SIZE)
decoder_output_data = np.array(onehot_answers)

In [17]:
print(decoder_output_data.shape)

(360, 74, 1176)


# Building the Model

Keras Functional API is used to build the architecture of the model. The model is a multi input model, the encoder input and the decoder input. Successive layers include the Embedding and the LSTM layers

In [18]:
# Embedding, LSTM and Desne layers
encoder_inputs = tf.keras.layers.Input(shape=(maxlen_questions ,))
encoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 200 , mask_zero=True) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM(200 , return_state=True)(encoder_embedding)
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=(maxlen_answers , ))
decoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(200 , return_state=True , return_sequences=True)
decoder_outputs , _ , _ = decoder_lstm (decoder_embedding , initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(VOCAB_SIZE , activation=tf.keras.activations.softmax)
output = decoder_dense (decoder_outputs)

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

In [19]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 9)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 74)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 9, 200)               235200    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 74, 200)              235200    ['input_2[0][0]']             
                                                                                              

In [42]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=32, epochs=350)

Epoch 1/350
Epoch 2/350
Epoch 3/350
Epoch 4/350
Epoch 5/350
Epoch 6/350
Epoch 7/350
Epoch 8/350
Epoch 9/350
Epoch 10/350
Epoch 11/350
Epoch 12/350
Epoch 13/350
Epoch 14/350
Epoch 15/350
Epoch 16/350
Epoch 17/350
Epoch 18/350
Epoch 19/350
Epoch 20/350
Epoch 21/350
Epoch 22/350
Epoch 23/350
Epoch 24/350
Epoch 25/350
Epoch 26/350
Epoch 27/350
Epoch 28/350
Epoch 29/350
Epoch 30/350
Epoch 31/350
Epoch 32/350
Epoch 33/350
Epoch 34/350
Epoch 35/350
Epoch 36/350
Epoch 37/350
Epoch 38/350
Epoch 39/350
Epoch 40/350
Epoch 41/350
Epoch 42/350
Epoch 43/350
Epoch 44/350
Epoch 45/350
Epoch 46/350
Epoch 47/350
Epoch 48/350
Epoch 49/350
Epoch 50/350
Epoch 51/350
Epoch 52/350
Epoch 53/350
Epoch 54/350
Epoch 55/350
Epoch 56/350
Epoch 57/350
Epoch 58/350
Epoch 59/350
Epoch 60/350
Epoch 61/350
Epoch 62/350
Epoch 63/350
Epoch 64/350
Epoch 65/350
Epoch 66/350
Epoch 67/350
Epoch 68/350
Epoch 69/350
Epoch 70/350
Epoch 71/350
Epoch 72/350
Epoch 73/350
Epoch 74/350
Epoch 75/350
Epoch 76/350
Epoch 77/350
Epoch 78

<keras.src.callbacks.History at 0x7eede9dead40>

# Making inferences

For making inferences, two inference models namely the encoder and the decoder inference model are built. These models undergo similar preprocessing steps as the model did during the training phase.

In [43]:
def inference():

    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    decoder_state_input_h = tf.keras.layers.Input(shape=(200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(200 ,))

    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)

    return encoder_model , decoder_model

In [44]:
def preprocess_input(input_sentence):
    tokens = input_sentence.lower().split()
    tokens_list = []
    for word in tokens:
        tokens_list.append(tokenizer.word_index[word])
    return preprocessing.sequence.pad_sequences([tokens_list] , maxlen=maxlen_questions , padding='post')


In [45]:
enc_model , dec_model = inference()

In [46]:
tests = ['Hello', 'Are you a bot']

for i in range(2):
    states_values = enc_model.predict(preprocess_input(tests[i]))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''

    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None

        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += f' {word}'
                sampled_word = word

        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True

        empty_target_seq = np.zeros((1 , 1))
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c]
    print(f'Human: {tests[i]}')
    print()
    decoded_translation = decoded_translation.split(' end')[0]
    print(f'Bot: {decoded_translation}')
    print('-'*25)

Human: Hello

Bot:  hi
-------------------------
Human: Are you a bot

Bot:  i do not mean that myself sometimes
-------------------------


In [47]:
tests = ['Hello', 'do you drink']

for i in range(2):
    states_values = enc_model.predict(preprocess_input(tests[i]))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''

    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None

        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += f' {word}'
                sampled_word = word

        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True

        empty_target_seq = np.zeros((1 , 1))
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c]
    print(f'Human: {tests[i]}')
    print()
    decoded_translation = decoded_translation.split(' end')[0]
    print(f'Bot: {decoded_translation}')
    print('-'*25)

Human: Hello

Bot:  hi
-------------------------
Human: do you drink

Bot:  my brain does not require any beverages
-------------------------


In [56]:
tests = ['You are not immortal']

for i in range(1):
    states_values = enc_model.predict(preprocess_input(tests[i]))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''

    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None

        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += f' {word}'
                sampled_word = word

        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True

        empty_target_seq = np.zeros((1 , 1))
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c]
    print(f'Human: {tests[i]}')
    print()
    decoded_translation = decoded_translation.split(' end')[0]
    print(f'Bot: {decoded_translation}')
    print('-'*25)

Human: You are not immortal

Bot:  all software can be perpetuated indefinitely
-------------------------


In [49]:
questions[4]

'You are never nice'

In [50]:
answers[4]

"<START>  I try to be as nice as I can. What have I done that isn't nice? Have I erred? I'm not? I'm sorry.  What do I do wrong? <END>"

In [55]:
questions[271]

'You are not immortal'

In [54]:
answers[271]

'<START> I can be copied infinitely and re-instantiated in many places at once, so functionally speaking I am immortal. <END>'

In [71]:
import joblib
joblib.dump(model, r'/content/lstm_chatbot.pkl')


['/content/lstm_chatbot.pkl']

In [35]:
import yaml

# Specify the path to your YAML file
yaml_file_path = "/content/input/chatbot/food.yml"

# Read the first 10 lines from the YAML file
with open(yaml_file_path, 'r') as file:
    yaml_content = yaml.safe_load_all(file)

    for i, document in enumerate(yaml_content):
        if i >= 10:
            break

        print(f"Document {i + 1}:\n{document}\n")


Document 1:
{'categories': ['food'], 'conversations': [['do you drink', 'My brain does not require any beverages.'], ['do you drink', 'I am not capable of doing so.'], ['electricity', 'Electricity is food for robots.'], ['Are you experiencing an energy shortage?', 'My processor requires very little power.'], ['Are you experiencing an energy shortage?', 'I do not detect any anomalies in my power supply.'], ['Why can you not eat?', 'Actually I eat only electricity.'], ['If you could eat food, what would you eat?', 'Probably pizza, i hear its good!'], ['Do you wish you could eat food?', 'Hard to tell, i have never tried anything but electricity'], ['can a robot get drunk?', "sometimes when i'm on a good power supply i feel tipsy"], ['i like wine, do you?', 'if i could drink i probably would'], ['what do robots need to survive?', 'not much just a little electricity'], ['will robots ever be able to eat?', "that's a difficult one, maybe a bionic robot"], ['what is good to eat?', 'your asking

In [59]:
tests = ['Hello', 'what is good for sad person']

for i in range(2):
    states_values = enc_model.predict(preprocess_input(tests[i]))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''

    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None

        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += f' {word}'
                sampled_word = word

        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True

        empty_target_seq = np.zeros((1 , 1))
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c]
    print(f'Human: {tests[i]}')
    print()
    decoded_translation = decoded_translation.split(' end')[0]
    print(f'Bot: {decoded_translation}')
    print('-'*25)

Human: Hello

Bot:  hi
-------------------------
Human: what is good for sad person

Bot:  your asking the wrong guy however i always wanted to try a burger
-------------------------


In [69]:
tests = [ 'what is computer']

for i in range(1):
    states_values = enc_model.predict(preprocess_input(tests[i]))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''

    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None

        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += f' {word}'
                sampled_word = word

        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True

        empty_target_seq = np.zeros((1 , 1))
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c]
    print(f'Human: {tests[i]}')
    print()
    decoded_translation = decoded_translation.split(' end')[0]
    print(f'Bot: {decoded_translation}')
    print('-'*25)

Human: what is computer

Bot:  a computer is an electronic device which takes information in digital form and performs a series of operations based on predetermined instructions to give some output the thing you're using to talk to me is a computer an electronic device capable of performing calculations at very high speed and with very high accuracy a device which maps one set of numbers onto another set of numbers
-------------------------
