In [133]:
from keras.models import load_model,Model
from keras.layers import Input
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import re

Cornell Movie dataset: https://www.kaggle.com/code/programminghut/seq2seq-chatbot-keras-with-attention/input?scriptVersionId=40597744

Youtube Tutorial: https://www.youtube.com/watch?v=Xg2wOBU9v90&list=PLTuKYqpidPXbulRHl8HL7JLRQXwDlqpLO&ab_channel=ProgrammingHut

In [134]:
# get the data, and ignore errors, split each new line as new data entry
lines = open('cornell movie-dialogs corpus/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conversations = open('cornell movie-dialogs corpus/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

# Pre processing the Text data

In [135]:
# create nested lists by splitting for exchanges
exchanges =[]
for conver in conversations:
    exchanges.append({conver.split(' +++$+++ ')[-1]: [item[1:-1].replace("'", "").replace(",", "") for 
    item in conver.split(' +++$+++ ')[-1][1:-1].split()]})
#exchanges


In [136]:
#create a dictionary
dialog ={}
for line in lines:
    dialog[line.split(' +++$+++ ')[0]] = line.split(' +++$+++ ')[-1]
#dialog

In [137]:
# Create a list for questions and answers
questions = []
answers = []

# Iterate through the conversation list
for exchange in exchanges:
    # Extract the list of conversation exchanges from the dictionary
    conversation = list(exchange.values())[0]
    
    # Iterate through the conversation exchanges
    for i in range(len(conversation) - 1):
        answer_key = conversation[i]
        question_key = conversation[i + 1]
        
        # Use dialog dictionary to map keys to actual text
        question_text = dialog.get(question_key, '')
        answer_text = dialog.get(answer_key, '')
        
        # Append the mapped text to questions and answers
        questions.append(question_text)
        answers.append(answer_text)

# Now 'questions' and 'answers' lists should contain the extracted conversation text
#print(questions)
#print(answers)


In [138]:
#delete variables that we are no longer using

del(conversation, i, dialog, conversations, line, lines, exchanges, conver)

# fix comment

cleaning the text data

In [139]:
# put questions and answers by fix length of questions
sorted_questions =[]
sorted_answers =[]
for i in range(len(questions)):
    if len(questions[i])<16:
        sorted_questions.append(questions[i])
        sorted_answers.append(answers[i])

In [140]:
# clean the text data
#create a regular expression
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm","i am",text)
    text = re.sub(r"he's","he is",text)
    text = re.sub(r"she's","she is",text)
    text = re.sub(r"that's","that is",text)
    text= re.sub(r"what's","what is",text)
    text = re.sub(r"where's","where is",text)
    text = re.sub(r"\'ll", "will",text)
    text = re.sub(r"\'ve", "have",text)
    text = re.sub(r"\'re","are",text)
    text = re.sub(r"\d","would",text)
    text = re.sub(r"won't","will not",text)
    text = re.sub(r"can't","can not",text)
    text = re.sub(r"[^\w\s]", "", text)
    return(text)


In [141]:
#apply cleaning to our lists

clean_questions=[]
clean_answers=[]

for line in sorted_questions:
    clean_questions.append(clean_text(line))

for line in sorted_answers:
    clean_answers.append(clean_text(line))

#clean_answers

In [142]:
#max length of 11
for i in range(len(clean_answers)):
    clean_answers[i] = ' '.join(clean_answers[i].split()[:11])

In [143]:
#delete variables that are done
del(answers, i, line, questions, sorted_answers, sorted_questions)

In [144]:
#lessening amount of data
#clean_answers = clean_answers[:30000]
#clean_questions = clean_questions[:30000]

In [145]:
# create vocabulary
#sentances split into  words and counted 
#dictionary
word2count={}
for line in clean_questions:
    for word in line.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1
for line in clean_answers:
    for word in line.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

#delete
del(word,line)
#print(word2count)

In [146]:
#remove less used words
threshold =5

vocabulary ={}
word_number =0
for word,count in word2count.items():
    if count>= threshold:
        vocabulary[word] =word_number
        word_number +=1
#delete
del(word2count, word, count, threshold, word_number)
#print(vocabulary)

In [147]:
# we need to create a beginning of string and end of string

for i in range(len(clean_answers)):
    clean_answers[i]= ' <SOS> ' + clean_answers[i] + ' <EOS> ' 

clean_answers

[' <SOS> youare asking me out that is so cute what is your <EOS> ',
 ' <SOS> no no its my fault we didnt have a proper introduction <EOS> ',
 ' <SOS> unsolved mystery she used to be really popular when she started <EOS> ',
 ' <SOS> that is because its such a nice one <EOS> ',
 ' <SOS> there <EOS> ',
 ' <SOS> you have my word as a gentleman <EOS> ',
 ' <SOS> have fun tonight <EOS> ',
 ' <SOS> i looked for you back at the party but you always <EOS> ',
 ' <SOS> then that is all you had to say <EOS> ',
 ' <SOS> then guillermo says if you go any lighter youare gonna look <EOS> ',
 ' <SOS> do you listen to this crap <EOS> ',
 ' <SOS> what good stuff <EOS> ',
 ' <SOS> i am kidding you know how sometimes you just become this <EOS> ',
 ' <SOS> wow <EOS> ',
 ' <SOS> she okay <EOS> ',
 ' <SOS> they do to <EOS> ',
 ' <SOS> did you change your hair <EOS> ',
 ' <SOS> where did he go he was just here <EOS> ',
 ' <SOS> who <EOS> ',
 ' <SOS> you think you re the only sophomore at the prom <EOS> ',
 ' <

In [148]:
#add tokens to vocabulary eg pad is padding 
tokens= ['<PAD>','<EOS>','<OUT>','<SOS>']
X= len(vocabulary)
for token in tokens:
    vocabulary[token] = X 
    X += 1
vocabulary['cameron'] = vocabulary['<PAD>']
vocabulary['<PAD>']=0
#delete
del(token, tokens ,X)

In [149]:
# inverse answers
inverse_vocabulary={w:v for v,w in vocabulary.items()}
#inverse_vocabulary
#vocabulary

#creating inputs

In [150]:
#encoder input - used to calculate the input 
encoder_inp = []
for line in clean_questions:
    list=[]
    for word in line.split():
        if word not in vocabulary:
            #check if word is in vocabulary
            list.append(vocabulary['<OUT>'])
        else:
            #add word to list
            list.append(vocabulary[word])
    encoder_inp.append(list)

encoder_inp

[[0, 1],
 [4040],
 [3, 4, 5, 6],
 [0, 7],
 [8],
 [9, 10],
 [11],
 [12, 13],
 [14],
 [15],
 [16, 17],
 [18, 19, 20],
 [15],
 [21, 22],
 [12, 23, 24],
 [25, 26, 27],
 [15],
 [28],
 [29],
 [12, 30],
 [31],
 [4042],
 [32, 33],
 [34, 29],
 [16, 4, 35],
 [36, 37, 22, 38],
 [9, 39],
 [40, 20, 41],
 [16],
 [42],
 [43, 44],
 [20, 30, 16],
 [24, 30, 20],
 [20, 45, 46, 47],
 [48, 49, 20],
 [4042],
 [50, 15],
 [42],
 [15, 14],
 [51],
 [4042],
 [52, 53],
 [28, 4, 54],
 [42, 27],
 [15, 12, 55, 27],
 [56],
 [54, 57, 46],
 [8],
 [27, 58, 59],
 [60],
 [20, 49, 61],
 [62, 63],
 [64, 65],
 [12, 66, 67, 1],
 [16],
 [0, 1],
 [26, 16],
 [68, 20, 69],
 [37, 70, 4042],
 [71, 27],
 [20, 72, 4042],
 [73, 74],
 [49, 5, 75],
 [76, 1, 77],
 [15, 78],
 [28, 79],
 [20, 80],
 [28],
 [4042],
 [34],
 [],
 [42, 70, 20],
 [12, 66, 64, 12, 66],
 [12, 81, 20],
 [56, 12, 26],
 [42],
 [28],
 [4042],
 [12, 82, 83],
 [16],
 [84],
 [85],
 [15, 20, 86],
 [87, 20, 88],
 [26, 16],
 [89],
 [28],
 [4042],
 [20, 68, 24],
 [90],
 [404

In [151]:
#decoder input - used to calculate the output
decoder_inp = []
for line in clean_answers:
    list=[]
    for word in line.split():
        if word not in vocabulary:
            #check if word is in vocabulary
            list.append(vocabulary['<OUT>'])
        else:
            #add word to list
            list.append(vocabulary[word])
    decoder_inp.append(list)

decoder_inp

[[4043, 9, 281, 46, 53, 3, 4, 24, 686, 16, 4, 203, 4041],
 [4043, 15, 15, 48, 147, 2316, 37, 80, 414, 5, 3019, 4042, 4041],
 [4043, 4042, 1384, 54, 1557, 297, 135, 205, 4042, 106, 54, 2070, 4041],
 [4043, 3, 4, 255, 48, 537, 5, 357, 249, 4041],
 [4043, 452, 4041],
 [4043, 20, 414, 147, 1269, 43, 5, 3244, 4041],
 [4043, 414, 341, 176, 4041],
 [4043, 12, 492, 94, 20, 235, 58, 18, 1596, 14, 20, 918, 4041],
 [4043, 138, 3, 4, 59, 20, 392, 297, 569, 4041],
 [4043, 138, 4042, 124, 331, 20, 22, 153, 4042, 9, 2839, 191, 4041],
 [4043, 26, 20, 1675, 297, 89, 17, 4041],
 [4043, 16, 91, 339, 4041],
 [4043, 12, 66, 208, 20, 114, 62, 755, 20, 49, 3245, 89, 4041],
 [4043, 814, 4041],
 [4043, 54, 155, 4041],
 [4043, 25, 26, 297, 4041],
 [4043, 30, 20, 986, 203, 2014, 4041],
 [4043, 8, 30, 125, 22, 125, 13, 49, 132, 4041],
 [4043, 28, 4041],
 [4043, 20, 332, 20, 4042, 18, 717, 4042, 58, 18, 2708, 4041],
 [4043, 48, 209, 4041],
 [4043, 254, 24, 20, 248, 297, 4042, 4042, 580, 67, 573, 4041],
 [4043, 24,

In [152]:
#delete
del(clean_answers, clean_questions, line, list,word)

# keras

In [153]:
# fix length of 16 - 0 means padding 0 will point to padding value

encoder_inp = pad_sequences(encoder_inp,13, padding='post', truncating='post')
decoder_inp = pad_sequences(decoder_inp,13, padding='post', truncating='post')

In [154]:

decoder_final_output=[]
for i in decoder_inp:
    decoder_final_output.append(i[1:])

decoder_final_output=pad_sequences(decoder_final_output,13, padding='post',truncating='post')

#decoder_final_output

Training and testing

In [155]:

# Load the saved weights
weights_filename = 'lstm_model_weights.h5'
new_lstm_model=load_model(weights_filename)

# Now, new_lstm_model contains the same architecture as the trained model and is initialized with the saved weights.


# Inverence

### building decoder model

In [156]:

# Create an inference model for the decoder part
decoder_inputs = Input(shape=(3,))  # Input for the previous predicted word index
decoder_state_input_h = Input(shape=(500,))  # Hidden state
decoder_state_input_c = Input(shape=(500,))  # Cell state
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Get the embedding layer (shared with training model)
dec_embed = new_lstm_model.layers[2](decoder_inputs)

# Use the LSTM layer (shared with training model)
decoder_lstm = new_lstm_model.layers[4]
dec_op, state_h, state_c = decoder_lstm(dec_embed, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

# Output layer (shared with training model)
decoder_dense = new_lstm_model.layers[5]
decoder_outputs = decoder_dense(dec_op)

# Create the inference model
inference_decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)


In [157]:
#adding temperature sampling to improve performance

def temperature_sampling(predictions, temperature=1.0):
    # Adjust the predictions using temperature
    predictions = np.log(predictions) / temperature
    exp_predictions = np.exp(predictions)
    
    # Normalize to get a probability distribution
    adjusted_predictions = exp_predictions / np.sum(exp_predictions)
    
    # Sample from the adjusted distribution
    sampled_index = np.random.choice(len(adjusted_predictions), p=adjusted_predictions)
    
    return sampled_index


In [159]:
print("##############################")
print("#######  ChatBot   ##########")
print("##############################")

def generate_response(input_text, model):
    # Preprocess the input text
    preprocessed_input = clean_text(input_text)
     # Print the preprocessed input
    #print("Preprocessed input:", preprocessed_input)
    # Tokenize the input
   # Tokenize and pad the input
    input_sequence = [vocabulary.get(word, vocabulary['<OUT>']) for word in preprocessed_input.split()]
    input_sequence = pad_sequences([input_sequence], maxlen=13, padding='post')
    
    # Print the input sequence
   # print("Input sequence:", input_sequence)
    
    # Initialize the target sequence with <SOS>
    #target_sequence = np.zeros((1, 1))
    #target_sequence[0, 0] = vocabulary['<SOS>']

    
    # Initialize variables
    stop_condition = False
    decoded_response = ''
     # Initial values for the decoder states
    initial_decoder_states = [np.zeros((1, 500)), np.zeros((1, 500))]
    # Encode the input using the loaded model
    #encoder_states = model.layers[2].predict(input_sequence)  # Assuming the encoder is at index 2
    # Encode the input using the loaded model (embedding layer)
    encoded_input = model.layers[2](np.array(input_sequence).reshape(1, -1))  # Use the embedding layer directly
    
    # Print the encoded input
   # print("Encoded input:", encoded_input)
    
    while not stop_condition:
        # Initialize the target sequence with <SOS> at the start of each loop iteration
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = vocabulary['<SOS>']

        # Predict the next word index
        dec_outputs, h, c = inference_decoder_model.predict([np.array([target_sequence[-1]])] + initial_decoder_states, verbose=0)
         # Print the decoder outputs
        #print("Decoder outputs:", dec_outputs)
        
        # Get the predicted word index
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        
        # Get the word associated with the index
        sampled_word = inverse_vocabulary[sampled_word_index]
          # Print the sampled word
        print("Sampled word:", sampled_word)
        if sampled_word != '<EOS>' and len(decoded_response.split()) < 13:
            decoded_response += sampled_word + ' '
        
        if sampled_word == '<EOS>' or len(decoded_response.split()) >= 13:
            stop_condition = True
        
        # Update the target sequence for the next prediction
        target_sequence = np.append(target_sequence, sampled_word_index)
        
         # Update the decoder states for the next prediction
        initial_decoder_states = [h, c]
    
    return decoded_response

# Initialize variables for conversation
conversation = []

# Start a conversation loop
while True:
    user_input = input("You: ")
    
    if user_input.lower() == 'q':
        break
    
    # Generate a response from the chatbot
    chatbot_response = generate_response(user_input, new_lstm_model)
    
    # Print the chatbot's response
    print("ChatBot: " + chatbot_response)

    # Add the user's input and the chatbot's response to the conversation history
    conversation.append("You: " + user_input)
    conversation.append("ChatBot: " + chatbot_response)

# End the conversation
print("ChatBot: Goodbye!")

##############################
#######  ChatBot   ##########
##############################
Sampled word: stay
Sampled word: frankie
Sampled word: adorable
Sampled word: adorable


KeyError: 4069