# Creating a conversation AI chatbot

- using encoder-decoder framework

# 1)- Importing key modules

In [1]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# 2)-Loading Dataset

source : https://www.kaggle.com/mushaya/chatbot-1

In [3]:
print(os.listdir('raw_data'))

['bot.ipynb', 'botprofile.yml', 'politics.yml']


In [4]:
import os
import yaml

dir_path = 'raw_data'
files_list = os.listdir(dir_path + os.sep)

In [5]:
files_list

['bot.ipynb', 'botprofile.yml', 'politics.yml']

### reading tags in data

In [6]:
for filepath in files_list:
    stream = open(os.path.join(dir_path, filepath), 'rb')
    docs = yaml.safe_load(stream)
print(docs)

{'categories': ['politics'], 'conversations': [['have you read the communist', 'yes, marx had made some interesting observations.'], ['what is a government', 'ideally it is a representative of the people.'], ['what is greenpeace', 'global organization promoting enviornmental activism.'], ['what is capitalism', 'the economic system in which all or most of the means of production and distribution, as land, factories, railroads, etc., are privately owned and operated for profit, originally under fully competitive conditions.'], ['what is socialism', 'communism from people who want to keep their volvos. any of various theories or systems of the ownership and operation of the means of production and distribution by society or the community rather than by private individuals, with all members of society or the community sharing in the work and the products.'], ['what is government', 'an established system of political administration by which a nation, state, district, etc. is governed.'], ['

In [7]:
docs["categories"]

['politics']

In [8]:
docs["conversations"]

[['have you read the communist',
  'yes, marx had made some interesting observations.'],
 ['what is a government', 'ideally it is a representative of the people.'],
 ['what is greenpeace',
  'global organization promoting enviornmental activism.'],
 ['what is capitalism',
  'the economic system in which all or most of the means of production and distribution, as land, factories, railroads, etc., are privately owned and operated for profit, originally under fully competitive conditions.'],
 ['what is socialism',
  'communism from people who want to keep their volvos. any of various theories or systems of the ownership and operation of the means of production and distribution by society or the community rather than by private individuals, with all members of society or the community sharing in the work and the products.'],
 ['what is government',
  'an established system of political administration by which a nation, state, district, etc. is governed.'],
 ['what is communism',
  'a socio

In [9]:
con_access=docs["conversations"]
type(con_access)

list

In [10]:
con_access

[['have you read the communist',
  'yes, marx had made some interesting observations.'],
 ['what is a government', 'ideally it is a representative of the people.'],
 ['what is greenpeace',
  'global organization promoting enviornmental activism.'],
 ['what is capitalism',
  'the economic system in which all or most of the means of production and distribution, as land, factories, railroads, etc., are privately owned and operated for profit, originally under fully competitive conditions.'],
 ['what is socialism',
  'communism from people who want to keep their volvos. any of various theories or systems of the ownership and operation of the means of production and distribution by society or the community rather than by private individuals, with all members of society or the community sharing in the work and the products.'],
 ['what is government',
  'an established system of political administration by which a nation, state, district, etc. is governed.'],
 ['what is communism',
  'a socio

### Separating Questions and answers

In [11]:
questions = list()
answers = list()
for con in con_access:
    if len(con)>2:
        questions.append(con[0])
        replies=con[1:]
        ans=""
        for rep in replies:
            ans+=' '+rep
            answers.append(ans)
    elif len(con)>1:
        questions.append(con[0])
        answers.append(con[1])

In [12]:
questions

['have you read the communist',
 'what is a government',
 'what is greenpeace',
 'what is capitalism',
 'what is socialism',
 'what is government',
 'what is communism',
 'what is impeached',
 'i do not like guns',
 'i do not like guns',
 'do you like guns',
 'why guns',
 'who was the first impeached president',
 'who is the governor',
 'who is the governor',
 'guns']

In [13]:
answers

['yes, marx had made some interesting observations.',
 'ideally it is a representative of the people.',
 'global organization promoting enviornmental activism.',
 'the economic system in which all or most of the means of production and distribution, as land, factories, railroads, etc., are privately owned and operated for profit, originally under fully competitive conditions.',
 'communism from people who want to keep their volvos. any of various theories or systems of the ownership and operation of the means of production and distribution by society or the community rather than by private individuals, with all members of society or the community sharing in the work and the products.',
 'an established system of political administration by which a nation, state, district, etc. is governed.',
 'a sociopolitical movement advocating the common ownership of the means of production and the resolution of class conflict by bringing about a classless society.',
 "when a person's honor or reput

### Add start and end tag with answers

In [14]:
answers_with_tags=list()
for i in range( len(answers)):
    if type(answers[i])==str:
        answers_with_tags.append(answers[i])
    else:
        questions.pop(i)

answers=list()
for ans in answers_with_tags:
    answers.append('<START> '+ans+' <END>')

In [15]:
questions

['have you read the communist',
 'what is a government',
 'what is greenpeace',
 'what is capitalism',
 'what is socialism',
 'what is government',
 'what is communism',
 'what is impeached',
 'i do not like guns',
 'i do not like guns',
 'do you like guns',
 'why guns',
 'who was the first impeached president',
 'who is the governor',
 'who is the governor',
 'guns']

In [16]:
answers

['<START> yes, marx had made some interesting observations. <END>',
 '<START> ideally it is a representative of the people. <END>',
 '<START> global organization promoting enviornmental activism. <END>',
 '<START> the economic system in which all or most of the means of production and distribution, as land, factories, railroads, etc., are privately owned and operated for profit, originally under fully competitive conditions. <END>',
 '<START> communism from people who want to keep their volvos. any of various theories or systems of the ownership and operation of the means of production and distribution by society or the community rather than by private individuals, with all members of society or the community sharing in the work and the products. <END>',
 '<START> an established system of political administration by which a nation, state, district, etc. is governed. <END>',
 '<START> a sociopolitical movement advocating the common ownership of the means of production and the resolution

# 3)- Preprocessing Data

In [17]:
import tensorflow as tf
from tensorflow.keras import layers, preprocessing, utils
import yaml

print( tf.version)

<module 'tensorflow_core._api.v2.version' from 'C:\\Users\\hassan.sherwani\\Miniconda3\\envs\\tensorflow\\lib\\site-packages\\tensorflow_core\\_api\\v2\\version\\__init__.py'>


In [18]:
tokenizer=preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions+answers)
VOCAB_SIZE= len(tokenizer.word_index)+1
print('VOCAB SIZE : {}'.format(VOCAB_SIZE))

VOCAB SIZE : 140


### 3.1.tokenizer

In [19]:
from gensim.models import Word2Vec
import re

vocab=[]
for word in tokenizer.word_index:
    vocab.append(word)
    
def tokenize(sentences):
    tokens_list=[]
    vocabulary=[]
    for sentence in sentences:
        sentence = sentence.lower()
        sentence = re.sub( '^a-zA-Z',' ',sentence)
        tokens = sentence.split()
        vocabulary+=tokens
        tokens_list.append(tokens)
    return tokens_list,vocabulary

p=tokenize(questions+answers)
model = Word2Vec(p[0])

#### 3.2.a.tokenized_questions

In [20]:
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max([len(x) for x in tokenized_questions])
padded_questions = preprocessing.sequence.pad_sequences(tokenized_questions, maxlen=maxlen_questions, padding='post')
encoder_input_data = np.array( padded_questions )
print(encoder_input_data.shape, maxlen_questions)

(16, 6) 6


#### 3.2.b.tokenized_answers

In [21]:
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max([len(x) for x in tokenized_answers])
padded_answers = preprocessing.sequence.pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')
decoder_input_data = np.array(padded_answers)
print(decoder_input_data.shape, maxlen_answers)

(16, 54) 54


In [22]:
tokenized_answers = tokenizer.texts_to_sequences(answers)
for i in range(len(tokenized_answers)):
    tokenized_answers[i]=tokenized_answers[i][1:]
padded_answers = preprocessing.sequence.pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')
one_hot_answers = utils.to_categorical(padded_answers, VOCAB_SIZE)
decoder_output_data = np.array(one_hot_answers)
print(decoder_output_data.shape)

(16, 54, 140)


# 4)- Model Building

In [23]:
encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 200)    28000       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    28000       input_2[0][0]                    
______________________________________________________________________________________________

### fit and save model

In [24]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=100 ) 
model.save( 'model.h5' )

Train on 16 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
E

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


**Accuracy value started from 9% and eventually improved with epochs upto value of 82.51 %**

### interface model

In [25]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [26]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

# 5)-Evaluation

In [27]:
enc_model , dec_model = make_inference_models()

for _ in range(10):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )

Enter question : have you read the communist
 yes marx had made some interesting observations end
Enter question : what is a government
 ideally it is a representative of the people end
Enter question : what is socialism
 communism from who who want to keep their volvos any of various theories or of the of the ownership of the of the of the of the and the community society or the community rather society or the community society or the the the sharing the the work the work and the the work end
Enter question : what is capitalism
 a a movement advocating the common of the means of the of production and distribution resolution of class conflict by bringing about a classless end
Enter question :  what is impeached
 a a person's honor or or or or or or or the means and and and and and distribution etc etc end
Enter question : who was the first impeached president
 andrew jackson end
Enter question : do you like guns
 not especially i am not into violence end
Enter question : what is back t

KeyError: 'back'

any question that is not trained,  will not be entertained.

# END of NOTEBOOK