<a href="https://colab.research.google.com/github/M-PRERNA/NLP-CHATBOT-2020/blob/main/chatybot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PART 1: NLP**

# importing the libraries

In [1]:
import numpy as np
import tensorflow as tf
import re
import time


# importing the data sets

In [2]:
# Read both corpus files into lists of raw records (one list element per
# '\n'-separated line). The `with` blocks close each file handle as soon as its
# content has been read instead of leaving it open until garbage collection.
with open('movie_lines.txt', encoding='utf-8', errors='ignore') as lines_file:
  lines = lines_file.read().split('\n')
with open('movie_conversations.txt', encoding='utf-8', errors='ignore') as conversations_file:
  conversations = conversations_file.read().split('\n')


# Creating a dictionary that maps each line id to its text


In [3]:
# Split every raw record on the ' +++$+++ ' field separator. Records that do
# not yield exactly five fields are ignored; for the rest, the first field
# (the line id) is mapped to the fifth field (the line's text).
split_lines = (raw_line.split(' +++$+++ ') for raw_line in lines)
id2line = {fields[0]: fields[4] for fields in split_lines if len(fields) == 5}

# creating a list of all the conversations

In [4]:
# For every conversation record except the last list element:
#   1. keep only the final ' +++$+++ '-separated field,
#   2. drop its first and last characters (the surrounding square brackets),
#   3. remove single quotes and spaces,
#   4. split what remains on commas to get the list of line ids.
conversation_ids = [
  record.split(' +++$+++ ')[-1][1:-1]
        .replace("'", "")
        .replace(" ", "")
        .split(",")
  for record in conversations[:-1]
]

# conversation_ids

# get separately the question and the answers

In [5]:
questions, answers = [], []

for line_ids in conversation_ids:
  # Pair each line id with the one that follows it in the same conversation:
  # the earlier line is the question, the later one is its answer.
  for asked_id, replied_id in zip(line_ids, line_ids[1:]):
    questions.append(id2line[asked_id])
    answers.append(id2line[replied_id])

# Doing a first cleaning of the text

In [6]:
def clean_text(text):
  """Lower-case ``text``, expand common contractions and strip punctuation.

  Args:
    text: the raw sentence (str).

  Returns:
    The cleaned sentence (str): lower-cased, with the contractions listed
    below expanded and the punctuation characters of the final pattern removed.
  """
  text = text.lower()
  # (pattern, replacement) pairs, applied in this order. Specific contractions
  # come before the generic suffix rules so they are not partially rewritten.
  replacements = (
    (r"i'am", "i am"),    # historical spelling, kept for backward compatibility
    (r"i'm", "i am"),     # fix: the original only matched the typo "i'am"
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"there's", "there is"),
    (r"who's", "who is"),
    (r"where's", "where is"),
    (r"what's", "what is"),  # fix: was replaced by itself (a no-op)
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'d", " would"),
    (r"\'re", " are"),
    (r"won't", "will not"),
    (r"shouldn't", "should not"),
    (r"hasn't", "has not"),
    (r"can't", "can not"),
    (r"don't", "do not"),
    (r"isn't", "is not"),
  )
  for pattern, replacement in replacements:
    text = re.sub(pattern, replacement, text)
  # Punctuation is removed last so the apostrophe-based rules above still match.
  text = re.sub(r"[-()\"{}/@#%;<>:*$+=~`?.,|]","",text)
  return text


# cleaning the questions and answers

In [7]:
# Run every raw question and every raw answer through clean_text, keeping the
# two result lists in the same order as their sources.
clean_questions = [clean_text(raw_question) for raw_question in questions]

clean_answers = [clean_text(raw_answer) for raw_answer in answers]



## process to remove the not so frequent words from the data sets

# creating a dictionary that maps each word to its number of occurrences

In [8]:
# Count how often each whitespace-separated word occurs across the cleaned
# questions and answers. Questions are tallied before answers, so the order in
# which words first enter the dictionary is the same as with two separate loops.
word2count = {}
for corpus in (clean_questions, clean_answers):
  for sentence in corpus:
    for token in sentence.split():
      word2count[token] = word2count.get(token, 0) + 1

#creating two dictionaries that map the question words and answer words to a unique integer


In [9]:
# Only words seen at least `threshold` times receive an integer id.
# (The original note says this keeps about 95% of the most frequent words;
# that figure is not verified here.)
threshold = 20
frequent_words = [word for word, count in word2count.items() if count >= threshold]

# Both vocabularies are built from the same filtered word list, numbering the
# words from 0 in word2count's iteration order. They are separate dict objects.
questionsword2int = {word: index for index, word in enumerate(frequent_words)}
answersword2int = {word: index for index, word in enumerate(frequent_words)}


# adding the last tokens to the two dictionaries

In [10]:
# # these special tokens are used in seq2seq models in python 
# GO - the same as <start> on the picture below - the first token which is fed to the decoder along with the thought vector in order to start generating tokens of the answer
# EOS - "end of sentence" - the same as <end> on the picture below - as soon as decoder generates this token we consider the answer to be complete (you can't use usual punctuation marks for this purpose cause their meaning can be different)
# UNK - "unknown token" - is used to replace the rare words that did not fit in your vocabulary. So your sentence My name is guotong1988 will be translated into My name is _unk_.
# PAD - your GPU (or CPU at worst) processes your training data in batches and all the sequences in your batch should have the same length. If the max length of your sequence is 8, your sentence My name is guotong1988 will be padded from either side to fit this length: My name is guotong1988 _pad_ _pad_ _pad_ _pad_


In [11]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']

# Give every special token the next free integer id in each vocabulary.
# Word ids were assigned from 0 upwards, so len(vocabulary) is already the
# first unused id. The previous `len(...) + 1` skipped one id and gave the last
# token an id equal to len(vocabulary), which is outside 0..len-1 if the
# vocabulary size is later taken as len(vocabulary).
# setdefault leaves an existing token untouched, so re-running this cell does
# not reassign (and collapse) the ids.
for token in tokens:
  questionsword2int.setdefault(token, len(questionsword2int))

for token in tokens:
  answersword2int.setdefault(token, len(answersword2int))

# creating an inverse map of the answers dictionary for mapping

###### note the trick to invert a dictionary




In [12]:
# Reverse lookup table for the answers vocabulary: integer id -> word.
answersint2word = dict(zip(answersword2int.values(), answersword2int.keys()))

In [None]:
# answersint2word

# to add the end of string token to the end of every answer

In [13]:
# Append the end-of-sentence marker to every cleaned answer, in place.
# The endswith guard makes the cell safe to re-run: an answer that already
# carries the marker is left alone instead of receiving a second ' <EOS>'.
# (clean_text strips '<' and '>', so a cleaned answer cannot end in the marker
# on its own.)
for i, answer in enumerate(clean_answers):
  if not answer.endswith(' <EOS>'):
    clean_answers[i] = answer + ' <EOS>'

# translating all the questions and answers into integers
# and replacing all the words that were filtered by OUT

In [14]:
# Encode each cleaned question as a list of integer ids, one per
# whitespace-separated word. Words missing from the vocabulary are replaced by
# the id of the '<OUT>' token.
questions_to_int = [
  [questionsword2int[word] if word in questionsword2int else questionsword2int['<OUT>']
   for word in question.split()]
  for question in clean_questions
]


# Same encoding for the cleaned answers, using the answers vocabulary.
answers_to_int = [
  [answersword2int[word] if word in answersword2int else answersword2int['<OUT>']
   for word in answer.split()]
  for answer in clean_answers
]


# sorting questions and answers based upon the length of the questions

In [15]:
# Keep only the question/answer pairs whose encoded question has between 1 and
# max_question_length tokens, ordered by question length.
max_question_length = 25

# Positions of the qualifying questions, in their original order.
kept_positions = [
  position
  for position, question in enumerate(questions_to_int)
  if 1 <= len(question) <= max_question_length
]
# list.sort is stable, so questions of equal length keep their original
# relative order. This gives the same result as scanning the whole data set
# once per length, in a single filter plus one sort.
kept_positions.sort(key=lambda position: len(questions_to_int[position]))

sorted_clean_questions = [questions_to_int[position] for position in kept_positions]
sorted_clean_answers = [answers_to_int[position] for position in kept_positions]



# **PART 2: BUILDING THE SEQ2SEQ MODEL**

 # Creating placeholders for the inputs and the targets

In [17]:
def model_inputs():
  """Create the placeholders that feed the model.

  NOTE(review): tf.placeholder is a TensorFlow 1.x graph-mode API -- confirm
  the runtime pins TF 1.x.

  Returns:
    A tuple ``(inputs, targets, lr, keep_prob)``:
      inputs    -- int32 placeholder named 'input', shape [None, None]
                   (presumably batch x sequence length -- confirm with callers)
      targets   -- int32 placeholder named 'target', shape [None, None];
                   the answers the model output is compared against
      lr        -- float32 placeholder named 'learning_rate', no shape given
      keep_prob -- float32 placeholder named 'keep_prob', no shape given
  """
  def int_matrix(name):
    # Two dimensions, both left unspecified.
    return tf.placeholder(tf.int32, [None, None], name=name)

  def float_value(name):
    return tf.placeholder(tf.float32, name=name)

  inputs = int_matrix('input')
  targets = int_matrix('target')
  lr = float_value('learning_rate')
  keep_prob = float_value('keep_prob')
  return inputs, targets, lr, keep_prob


In [20]:
def preprocess_targets(targets, word2int, batch_size):
  """Shift the target sequences right by one step and prepend '<SOS>'.

  A column filled with the '<SOS>' id is placed in front of each row and the
  last column of ``targets`` is dropped, so each row keeps its length.

  Args:
    targets: tensor of target token ids; assumed to be
      (batch_size, sequence length) -- confirm with callers.
    word2int: vocabulary dict; must contain the '<SOS>' key.
    batch_size: number of rows to fill and to slice.

  Returns:
    The concatenation, along axis 1, of the '<SOS>' column and ``targets``
    without its last column.
  """
  sos_id = word2int['<SOS>']
  sos_column = tf.fill([batch_size, 1], sos_id)
  # Rows 0..batch_size, columns 0..-1 (end exclusive): every column but the last.
  all_but_last_column = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
  return tf.concat([sos_column, all_but_last_column], 1)

# Architecture of Seq2Seq model

# Creating the encoder RNN layer

In [None]:
def encoder_rnn_layer(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
  lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
# new object of the dropout wrapper class because 20% of the neurons are non-existent during the training
  lstm_dropout = 