In [None]:
import re
import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Loading The Dataset

In [None]:
# Read both Cornell movie-dialog files fully into memory, one list entry per
# text line. `with` closes each handle as soon as it has been read; the
# original left both files open for the lifetime of the kernel.
with open('../input/chatbot-data/cornell movie-dialogs corpus/movie_lines.txt', encoding='utf-8',
          errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')

with open('../input/chatbot-data/cornell movie-dialogs corpus/movie_conversations.txt', encoding='utf-8',
          errors='ignore') as convers_file:
    convers = convers_file.read().split('\n')


# Data Preprocessing

In [None]:
# exchn: for every conversation record, the last ' +++$+++ ' field with its
# first and last character removed, quotes and commas stripped, split on
# whitespace (i.e. the list of line ids that make up the conversation).
exchn = [
    conver.split(' +++$+++ ')[-1][1:-1].replace("'", " ").replace(",","").split()
    for conver in convers
]

# diag: first field of each movie line (its id) -> last field (the utterance).
diag = {
    fields[0]: fields[-1]
    for fields in (line.split(' +++$+++ ') for line in lines)
}

## delete the raw file contents, they are no longer needed
del(lines, convers)

# Creating List of Questions and Answers

In [None]:
# Each pair of consecutive line ids in a conversation yields one
# (question, answer) training example.
turn_pairs = [
    (diag[asked_id], diag[replied_id])
    for dialogue in exchn
    for asked_id, replied_id in zip(dialogue, dialogue[1:])
]
questions = [asked for asked, _ in turn_pairs]
answers = [replied for _, replied in turn_pairs]
del(diag, exchn, turn_pairs)

# More preprocessing of QnA
# Keep only the pairs whose question string is shorter than 13.
# NOTE(review): len() on a string counts characters, not words, while the
# padding further down uses 13 tokens - confirm which limit was intended.
short_pairs = [
    (question, answer)
    for question, answer in zip(questions, answers)
    if len(question) < 13
]
sorted_ques = [question for question, _ in short_pairs]
sorted_ans = [answer for _, answer in short_pairs]
del(short_pairs)


# Cleaning of Dataset

In [None]:
# Ordered (pattern, replacement) rules applied by clean_text. Order matters:
# the rules run top to bottom on the already lower-cased text.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "can not"),
)


def clean_text(txt):
    """Normalise one utterance for tokenisation.

    Lower-cases ``txt``, expands the contractions listed in
    ``_CONTRACTION_RULES`` and finally removes every character that is
    neither a word character nor whitespace.

    Returns the cleaned string.
    """
    normalised = txt.lower()
    for pattern, expansion in _CONTRACTION_RULES:
        normalised = re.sub(pattern, expansion, normalised)
    # Drop all remaining punctuation (anything not \w or \s).
    return re.sub(r"[^\w\s]", "", normalised)

# Clean every question; clean every answer and cap it at its first 13 words.
clean_ques = [clean_text(question) for question in sorted_ques]
clean_ans = [' '.join(clean_text(answer).split()[:13]) for answer in sorted_ans]

del(answers, questions, sorted_ans, sorted_ques)


# trimming: keep at most the first 35000 pairs
clean_ans = clean_ans[:35000]
clean_ques = clean_ques[:35000]

# Creating Vocabulary

In [None]:
# Count occurrences of every word, questions first and then answers, so the
# dictionary's insertion order is the order in which words are first seen.
word2count = {}

for corpus in (clean_ques, clean_ans):
    for line in corpus:
        for word in line.split():
            word2count[word] = word2count.get(word, 0) + 1

del(word, line, corpus)


# Remove less frequent words: only words seen at least `thresh` times get an
# id. Ids are consecutive integers starting at 0, in first-seen order.
thresh = 5
frequent_words = (word for word, count in word2count.items() if count >= thresh)
vocab = {word: word_num for word_num, word in enumerate(frequent_words)}

## delete
del(word2count, thresh, frequent_words)
   


# Adding SOS and EOS

In [None]:
# Wrap every answer in start/end-of-sequence markers for the decoder.
for i, answer in enumerate(clean_ans):
    clean_ans[i] = '<SOS> ' + answer + ' <EOS>'

# Append the special tokens after the regular words.
next_id = len(vocab)
for token in ('<PAD>', '<EOS>', '<OUT>', '<SOS>'):
    vocab[token] = next_id
    next_id += 1

# pad_sequences() fills with 0 by default, so <PAD> must own id 0. Swap ids
# with whichever word currently holds 0. The original hard-coded 'cameron'
# here, which only works when that exact word happens to be the first
# vocabulary entry; otherwise two words ended up sharing id 0 and a spurious
# 'cameron' entry was added.
zero_word = next((word for word, idx in vocab.items() if idx == 0), None)
if zero_word is not None and zero_word != '<PAD>':
    vocab[zero_word] = vocab['<PAD>']
vocab['<PAD>'] = 0

# Inverse dictionary: id -> word, used to turn predictions back into text.
inv_vocab = {idx: word for word, idx in vocab.items()}

del(i, answer, next_id, token, zero_word)

# Creating Encoder and Decoder Inputs

In [None]:
# Turn every cleaned sentence into a list of vocabulary ids; words that did
# not make it into the vocabulary are replaced by the <OUT> id.
encoder_inp = [
    [vocab.get(word, vocab['<OUT>']) for word in line.split()]
    for line in clean_ques
]

decoder_inp = [
    [vocab.get(word, vocab['<OUT>']) for word in line.split()]
    for line in clean_ans
]

del(clean_ans, clean_ques)

# Padding the inputs for LSTM Model: every sequence becomes exactly 13 ids,
# padded and truncated at the end.
from tensorflow.keras.preprocessing.sequence import pad_sequences

encoder_inp = pad_sequences(encoder_inp, maxlen=13, padding='post', truncating='post')
decoder_inp = pad_sequences(decoder_inp, maxlen=13, padding='post', truncating='post')

# Decoder target = decoder input shifted left by one step (the first id of
# each row is dropped), then padded back to length 13.
decoder_final_output = pad_sequences(
    [row[1:] for row in decoder_inp],
    maxlen=13, padding='post', truncating='post',
)

# Label Encoding: one-hot over the whole vocabulary
decoder_final_output = to_categorical(decoder_final_output, num_classes=len(vocab))
print(decoder_final_output.shape)


# Creating the Encoder-Decoder Training Model Using LSTM

In [None]:
# Build and train the seq2seq model: an encoder LSTM whose final states
# initialise a decoder LSTM, followed by a softmax over the vocabulary.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input

# Both inputs are sequences of 13 token ids.
enc_inp = Input(shape=(13, ))
dec_inp = Input(shape=(13, ))

VOCAB_SIZE = len(vocab)
# One embedding layer, shared by the encoder and the decoder (it is called on
# both inputs below). It is sized VOCAB_SIZE+1 while the output layer uses
# VOCAB_SIZE.
embed = Embedding(VOCAB_SIZE+1, output_dim=50, 
                  input_length=13,
                  trainable=True                  
                  )

# Encoder: only the final hidden/cell states (h, c) are used downstream.
enc_embed = embed(enc_inp)
enc_lstm = LSTM(400, return_sequences=True, return_state=True)
enc_op, h, c = enc_lstm(enc_embed)
enc_states = [h, c]

# Decoder: starts from the encoder states and emits one output per time step.
dec_embed = embed(dec_inp)
dec_lstm = LSTM(400, return_sequences=True, return_state=True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

# Per-time-step probability distribution over the vocabulary.
dense = Dense(VOCAB_SIZE, activation='softmax')

dense_op = dense(dec_op)
#model = tf.keras.models.load_model("../input/models/model.h5")
model = Model([enc_inp, dec_inp], dense_op)

model.compile(loss='categorical_crossentropy',metrics=['acc'],optimizer='adam')

# Inputs are the padded question/answer ids, targets the one-hot
# `decoder_final_output`.
# NOTE(review): 300 epochs makes a full "Restart & Run All" very slow -
# consider loading saved weights instead.
model.fit([encoder_inp, decoder_inp],decoder_final_output,epochs=300)


In [None]:
#model.save("dec_model.h5")
#model.save("model.h5")

# Creating Decoding Model Using LSTM

In [None]:
# Build the two models used at inference time. They reuse the layers created
# for the training model (enc_inp, enc_states, dec_inp, dec_embed, dec_lstm).
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
import tensorflow as tf
#model = tf.keras.models.load_model("../input/models/model.h5")

# Encoder model: question ids -> [h, c] encoder states.
enc_model = Model([enc_inp], enc_states)

# decoder Model
# The decoder states are fed in explicitly so decoding can advance one step at
# a time, carrying the states from one call to the next.
decoder_state_input_h = Input(shape=(400,))
decoder_state_input_c = Input(shape=(400,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = dec_lstm(dec_embed , 
                                    initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
#dec_model = tf.keras.models.load_model("../input/models/dec_model.h5")
# Outputs are the raw LSTM outputs plus the new states; the softmax layer
# `dense` is NOT part of this model and is applied separately by the cells
# that decode replies.
dec_model = Model([dec_inp]+ decoder_states_inputs,[decoder_outputs]+ decoder_states)

# NOTE(review): dec_model is only used through predict() in this notebook, so
# this compile() looks unnecessary - confirm before removing.
# NOTE(review): dec_inp is declared with shape (13,), but the decoding loops
# feed a (1, 1) array - confirm this is accepted by the Keras version in use.
dec_model.compile(loss='categorical_crossentropy',metrics=['acc'],optimizer='adam')

# Inference Model for Frontend

In [None]:
import numpy as np

# Same import path as the training cells, so one pad_sequences is used throughout.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Replay the scripted conversations in Test.txt through the chatbot and
# collect per-conversation metrics. Conversations are delimited by "\nq".
with open('../input/testgg/Test.txt', encoding='utf-8',
          errors='ignore') as test_file:
    conversations = test_file.read().split("\nq")
# Drop whatever follows the last "\nq" delimiter.
conversations.pop()
print("##########################################")
print("#       start chatting ver. 1.0          #")
print("##########################################")
print("")

# Every cleaned user line, in order; reused later for the MFAQ ranking.
Lines = []
# Per conversation: number of user lines processed.
CHATBOT_ACTIVITY_VOLUME = [0] * len(conversations)
# Per conversation: percentage of lines containing at least one word that is
# not in the vocabulary.
NON_RESPONSE_RATE = [0] * len(conversations)
# Per conversation: 100 - NON_RESPONSE_RATE.
RESPONSE_RATE = [0] * len(conversations)

for i, conversation in enumerate(conversations):
    print("Conversation",i+1,": ")
    print("")
    k = 0  # lines of this conversation with an out-of-vocabulary word
    lines = conversation.split('\n')
    if i != 0:
        # Every chunk after the first begins with the remainder of the
        # delimiter line; skip it.
        lines.pop(0)
    for raw_line in lines:
        prepro1 = clean_text(raw_line)

        # Encode the line; remember whether any word was unknown.
        l = 0
        lst = []
        for y in prepro1.split():
            if y in vocab:
                lst.append(vocab[y])
            else:
                lst.append(vocab['<OUT>'])
                l = 1
        txt = pad_sequences([lst], 13, padding='post')

        # Greedy decoding: start from <SOS>, feed each predicted id back in
        # together with the updated LSTM states.
        stat = enc_model.predict( txt )
        empty_target_seq = np.zeros( ( 1 , 1) )
        empty_target_seq[0, 0] = vocab['<SOS>']

        stop_condition = False
        decoded_translation = ''

        while not stop_condition :
            dec_outputs , h, c= dec_model.predict([ empty_target_seq] + stat )
            decoder_concat_input = dense(dec_outputs)

            sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )
            sampled_word = inv_vocab[sampled_word_index] + ' '

            if sampled_word != '<EOS> ':
                decoded_translation += sampled_word

            # Stop on <EOS> or once the reply exceeds 13 words.
            if sampled_word == '<EOS> ' or len(decoded_translation.split()) > 13:
                stop_condition = True

            empty_target_seq = np.zeros( ( 1 , 1 ) )
            empty_target_seq[ 0 , 0 ] = sampled_word_index
            stat = [h, c]

        print("you : ",prepro1)
        print("chatbot attention : ", decoded_translation )
        Lines.append(prepro1)

        # Update the metrics only AFTER this line has been classified. The
        # original computed the rates at the top of the iteration, so the
        # last line of every conversation was never reflected in them.
        if l == 1:
            k += 1
        CHATBOT_ACTIVITY_VOLUME[i] += 1
        NON_RESPONSE_RATE[i] = k*100/CHATBOT_ACTIVITY_VOLUME[i]
        RESPONSE_RATE[i] = 100 - NON_RESPONSE_RATE[i]
    print("==============================================")
    print("")

In [None]:
# One jargon entry per line of Jargon.txt.
with open('../input/testgg/Jargon.txt', encoding='utf-8',
          errors='ignore') as jargon_file:
    jargon = jargon_file.read().split("\n")
print("Jargon used is : ",jargon)

# COMPREHENSION_LEVEL[j] is 100 when every word of the cleaned jargon entry is
# in the vocabulary and 0 as soon as one word is unknown.
# The original also ran the encoder/decoder for every entry, but the decoded
# reply was never printed or stored, so that work is dropped here; the
# resulting list is unchanged. An explicit membership test replaces the bare
# `except:` that used to detect unknown words.
COMPREHENSION_LEVEL = [
    100 if all(word in vocab for word in clean_text(entry).split()) else 0
    for entry in jargon
]

In [None]:
# Report the collected metrics, one per paragraph. The label text is kept
# exactly as before because it is part of the emitted output.
print("CHATBOT_ACTIVITY_VOLUME = ", CHATBOT_ACTIVITY_VOLUME, end="\n\n")
print("RESPONSE_RATE = ", RESPONSE_RATE, end="\n\n")
print("NON_RESPONSE_RATE = ", NON_RESPONSE_RATE, end="\n\n")
# Mean number of processed lines per conversation.
INTERACTION_RATE = sum(CHATBOT_ACTIVITY_VOLUME)/len(CHATBOT_ACTIVITY_VOLUME)
print("")
print("INERACTION RATE = ",INTERACTION_RATE, end="\n\n")
print("COMPREHENSION LEVEL = ",COMPREHENSION_LEVEL)

In [None]:
def lcs(S,T):
    """Return the longest common substring of ``S`` and ``T``, ignoring case.

    Both strings are upper-cased before comparison, so the returned substring
    is upper-case as well.

    Parameters
    ----------
    S, T : str
        The two strings to compare.

    Returns
    -------
    set
        A set holding the first longest common substring found (scanning
        ``S`` left to right), or an empty set when the strings share no
        character. Later substrings of equal length are not added.
    """
    # str.upper() returns a new string; the original discarded the result, so
    # the comparison silently stayed case-sensitive.
    S = S.upper()
    T = T.upper()
    m = len(S)
    n = len(T)
    # counter[i+1][j+1] = length of the common suffix of S[:i+1] and T[:j+1].
    counter = [[0]*(n+1) for _ in range(m+1)]
    longest = 0
    lcs_set = set()
    for i in range(m):
        for j in range(n):
            if S[i] == T[j]:
                c = counter[i][j] + 1
                counter[i+1][j+1] = c
                if c > longest:
                    # Strictly longer match: replace the previous best.
                    longest = c
                    lcs_set = {S[i-c+1:i+1]}

    return lcs_set

In [None]:
# Most frequently asked questions: for every distinct user line, count how
# many OTHER lines share a common substring longer than 51% of that line.

# Strip, drop blank lines (they would cause a division by zero below) and
# de-duplicate while keeping first-seen order.
Lines = list(dict.fromkeys(line.strip() for line in Lines))
Lines = [line for line in Lines if line]
upper_lines = [line.upper() for line in Lines]

MFAQ = [0] * len(Lines)
for j, current in enumerate(upper_lines):
    for k, other in enumerate(upper_lines):
        if j == k:
            continue
        common = lcs(current, other)
        # Take the substring length directly. The original used
        # len(repr(set)) - 4, which yields 1 instead of 0 for an empty set
        # ("set()") and so over-counted lines with nothing in common.
        shared_len = len(next(iter(common))) if common else 0
        if shared_len * 100 / len(current) > 51:
            MFAQ[j] += 1

# Sort both lists together by descending frequency. sorted() is stable, so
# ties keep their original relative order, exactly like the hand-written
# bubble sort it replaces.
ranked = sorted(zip(Lines, MFAQ), key=lambda pair: pair[1], reverse=True)
Lines = [line for line, _ in ranked]
MFAQ = [count for _, count in ranked]

print("Most frequently asked questions are : ")
print(Lines[0:20])
mdf = pd.DataFrame(ranked,
               columns =['Lines', 'Frequency'])

In [None]:
# Display the `mdf` table (columns 'Lines' and 'Frequency').
mdf

In [None]:
# Bar chart of `mdf`: one bar per line, bar height = its 'Frequency' value.
# Earlier seaborn attempt, kept for reference:
#sns.countplot(x ='Lines',y='Frequency', hue = 'Frequency', data = mdf)
 
# Show the plot
#plt.show()
mdf.plot.bar(x='Lines',y='Frequency')

In [None]:
# Combine the three metric lists into one table, one row per zipped position.
# NOTE(review): CHATBOT_ACTIVITY_VOLUME and RESPONSE_RATE hold one value per
# test conversation, while COMPREHENSION_LEVEL holds one value per jargon
# line. zip() stops at the shortest list, so rows pair unrelated items and
# surplus entries are silently dropped - confirm this is intended.
df = pd.DataFrame(list(zip(CHATBOT_ACTIVITY_VOLUME,RESPONSE_RATE,COMPREHENSION_LEVEL)),
               columns =['CHATBOT_ACTIVITY_VOLUME', 'RESPONSE_RATE', 'COMPREHENSION_LEVEL' ])
df

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Summary statistics for every column of the metrics table.
df.describe(include='all')

In [None]:
# One histogram per metric column of `df`.
df.hist('CHATBOT_ACTIVITY_VOLUME')
df.hist('RESPONSE_RATE')
df.hist('COMPREHENSION_LEVEL')

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `df`.
plt.figure(figsize=(12, 10)) # Set the figure size
sns.heatmap(df.corr(), annot=True) # Draw the heatmap with the values shown in each cell

# **Live questions**

In [None]:
import numpy as np

# Same import path as the training cells, so one pad_sequences is used throughout.
from tensorflow.keras.preprocessing.sequence import pad_sequences
print("##########################################")
print("#       start chatting ver. 1.0          #")
print("##########################################")


# Interactive loop: read a line, answer it, repeat until the user types "q".
while True:
    prepro1 = clean_text(input("you : "))
    if prepro1 == 'q':
        # Leave before running the model. The original only checked the
        # loop condition afterwards, so the quit command itself was answered
        # by the chatbot.
        break

    # Encode the input; unknown words map to <OUT>.
    lst = [vocab[y] if y in vocab else vocab['<OUT>'] for y in prepro1.split()]
    txt = pad_sequences([lst], 13, padding='post')

    # Greedy decoding: start from <SOS>, feed each predicted id back in
    # together with the updated LSTM states.
    stat = enc_model.predict( txt )

    empty_target_seq = np.zeros( ( 1 , 1) )
    empty_target_seq[0, 0] = vocab['<SOS>']

    stop_condition = False
    decoded_translation = ''

    while not stop_condition :

        dec_outputs , h, c= dec_model.predict([ empty_target_seq] + stat )
        decoder_concat_input = dense(dec_outputs)

        sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )

        sampled_word = inv_vocab[sampled_word_index] + ' '

        if sampled_word != '<EOS> ':
            decoded_translation += sampled_word

        # Stop on <EOS> or once the reply exceeds 13 words.
        if sampled_word == '<EOS> ' or len(decoded_translation.split()) > 13:
            stop_condition = True

        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        stat = [h, c]

    print("chatbot attention : ", decoded_translation )
    print("==============================================")