In [4]:
#importing the libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input,Embedding,Bidirectional,LSTM,Dense,Concatenate
from keras.models import Model

In [5]:
  >>> import nltk
  >>> nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
dataset = pd.read_csv("dialogs.txt",names=["question","answer"],sep='\t')
print(dataset.shape)
dataset.head()

(3725, 2)


Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [7]:
#checking and removing any duplicated rows
print(dataset.isna().sum())
print("---"*10)
dataset.drop_duplicates(inplace=True)
print("Duplicate data present in dataframe :",dataset.duplicated().sum())

question    0
answer      0
dtype: int64
------------------------------
Duplicate data present in dataframe : 0


In [8]:
#convert the question and answer to lower case

dataset['question']=dataset['question'].str.lower()
dataset['answer']=dataset['answer'].str.lower()

#add SOS and EOS token for the encoder and decoder structure 
dataset['decoder_input'] = dataset.answer.apply(lambda x: 'sos '+x)
dataset['decoder_label'] = dataset.answer.apply(lambda x: x+' eo>')
dataset.head()

Unnamed: 0,question,answer,decoder_input,decoder_label
0,"hi, how are you doing?",i'm fine. how about yourself?,sos i'm fine. how about yourself?,i'm fine. how about yourself? eo>
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,sos i'm pretty good. thanks for asking.,i'm pretty good. thanks for asking. eo>
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,sos no problem. so how have you been?,no problem. so how have you been? eo>
3,no problem. so how have you been?,i've been great. what about you?,sos i've been great. what about you?,i've been great. what about you? eo>
4,i've been great. what about you?,i've been good. i'm in school right now.,sos i've been good. i'm in school right now.,i've been good. i'm in school right now. eo>


In [10]:
question=[]
for i in dataset['question']:
    question.append(word_tokenize(i))
    
decoder_input=[]
for i in dataset['decoder_input']:
    decoder_input.append(word_tokenize(i))
    

    
decoder_label=[]
for i in dataset['decoder_label']:
    decoder_label.append(word_tokenize(i))
    


print(len(question),len(decoder_input),len(decoder_label))

#create a vocabulary set

vocab = set()
for ques in question:
    vocab=vocab.union(set(ques))

for ans in decoder_input:
    vocab=vocab.union(set(ans))
    
for ans in decoder_label:
    vocab=vocab.union(set(ans))
    

    
#print(vocab)
print("Length of the vocab :",len(vocab))

3724 3724 3724
Length of the vocab : 2511


In [11]:
#increase the vocab size + 1  for padding 
vocab_size = len(vocab) + 1
print(vocab_size)

2512


In [12]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

print(type(tokenizer.word_index))
disp=list(tokenizer.word_index.items())[:30]
print("First 30 tokens with their respective indexes ")
print(disp)


<class 'dict'>
First 30 tokens with their respective indexes 
[('candy', 1), ('nation', 2), ('predictable', 3), ('news', 4), ('leftovers', 5), ('blows', 6), ('as', 7), ('passenger', 8), ('angry', 9), ('no', 10), ('offer', 11), ('barking', 12), ('theaters', 13), ('finger', 14), ('talent', 15), ('jobs', 16), ('answered', 17), ('drawer', 18), ('vacuumed', 19), ('office', 20), ('broke', 21), ('scrub', 22), ('lessons', 23), ('sleeve', 24), ('waiter', 25), ('needs', 26), ('rained', 27), ('boy', 28), ('genes', 29), ('oranges', 30)]


In [14]:
#mapping the tokens with the id
train_encoder_input = tokenizer.texts_to_sequences(question)
train_decoder_input=tokenizer.texts_to_sequences(decoder_input)
train_decoder_label=tokenizer.texts_to_sequences(decoder_label)

print(question[0])
print(train_encoder_input[0])
print("---"*5)
print(decoder_input[0])
print(train_decoder_input[0])

['hi', ',', 'how', 'are', 'you', 'doing', '?']
[2270, 1259, 1113, 991, 793, 2485, 1510]
---------------
['sos', 'i', "'m", 'fine', '.', 'how', 'about', 'yourself', '?']
[326, 1083, 719, 2246, 1508, 1113, 1764, 2024, 1510]


In [15]:

maximum_encoder_input=np.array([len(s) for s in train_encoder_input]).max()
maximum_decoder_input=np.array([len(s) for s in train_decoder_input]).max()
maximum__decoder_label=np.array([len(s) for s in train_decoder_label]).max()
print(maximum_encoder_input,maximum_decoder_input,maximum__decoder_label)

# padding the vectors so that all vectors are of equal length
train_encoder_input = pad_sequences(train_encoder_input,maxlen=maximum_encoder_input)
train_decoder_input = pad_sequences(train_decoder_input,maxlen=maximum_decoder_input)
train_decoder_label = pad_sequences(train_decoder_label,maxlen=maximum_decoder_input)

print(train_encoder_input.shape)
print(train_decoder_input.shape)
print(train_decoder_label.shape)

22 23 24
(3724, 22)
(3724, 23)
(3724, 23)


Encoder input --- > Encoder embedding   ----> LSTM ---->LSTM ---->LSTM ---->Dense layer

In [16]:
embedding_size=128
hidden_size = 128

encoder_input = Input(shape=[maximum_encoder_input])
#print("encoder_input shape : ",encoder_input)
#print("---")
encoder_embedding = Embedding(vocab_size,embedding_size,mask_zero=True)
encoder_embedded = encoder_embedding(encoder_input)
#print("encoder_embedded shape : ",encoder_embedded)
#print("---")
lstm1 = LSTM(hidden_size,return_sequences=True,return_state=True)
encoder_output1,_,_ = lstm1(encoder_embedded)

lstm2 = LSTM(hidden_size,return_sequences=True,return_state=True)
encoder_output2,_,_ = lstm2(encoder_output1)
print("manin : ", lstm2(encoder_output1))
print("----------")
lstm3 = LSTM(hidden_size,return_sequences=True,return_state=True)
encoder_output3,hs,cs = lstm3(encoder_output2)
print("manin : ",encoder_output3)

encoder_dense_c = Dense(hidden_size)
encoder_c3 = encoder_dense_c(cs)
print(encoder_c3)
encoder_dense_h = Dense(hidden_size)
encoder_h3 = encoder_dense_h(cs)
print(encoder_h3)

manin :  [<KerasTensor: shape=(None, 22, 128) dtype=float32 (created by layer 'lstm_1')>, <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'lstm_1')>, <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'lstm_1')>]
----------
manin :  KerasTensor(type_spec=TensorSpec(shape=(None, 22, 128), dtype=tf.float32, name=None), name='lstm_2/PartitionedCall:1', description="created by layer 'lstm_2'")
KerasTensor(type_spec=TensorSpec(shape=(None, 128), dtype=tf.float32, name=None), name='dense/BiasAdd:0', description="created by layer 'dense'")
KerasTensor(type_spec=TensorSpec(shape=(None, 128), dtype=tf.float32, name=None), name='dense_1/BiasAdd:0', description="created by layer 'dense_1'")


In [17]:
decoder_input = Input(shape=(None,))

decoder_embedding = Embedding(vocab_size,embedding_size,mask_zero=True)
decoder_embedded = decoder_embedding(decoder_input)

print("decoder_embedded : ",decoder_embedded)
print("-------------------")

decoder_lstm = LSTM(hidden_size,return_sequences=True,return_state=True)
decoder_output,dh,dc = decoder_lstm(decoder_embedded,initial_state=[encoder_h3,encoder_c3])

decoder_lstm1 = LSTM(hidden_size,return_sequences=True,return_state=True)
decoder_output,dh,dc = decoder_lstm1(decoder_output)

dense1 = Dense(200,activation='relu')
decoder_output = dense1(decoder_output)

softmax = Dense(vocab_size,activation='softmax')
decoder_output = softmax(decoder_output)


decoder_embedded :  KerasTensor(type_spec=TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None), name='embedding_1/embedding_lookup/Identity_1:0', description="created by layer 'embedding_1'")
-------------------


In [18]:
#defining an optimizer and training a model
trainer = Model([encoder_input,decoder_input],decoder_output)
trainer.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
train_hist = trainer.fit([train_encoder_input,train_decoder_input],train_decoder_label,epochs=30,validation_split=0.1,batch_size=128)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
