In [1]:
# Building a deep NLP chatbot using seq2seq models

# importing the required libraries
import os
import re
import time

import numpy as np
import tensorflow as tf

DATASET_PATH = 'dataset/cornell movie-dialogs corpus/'

In [2]:
# Load the two raw corpus files as lists of text lines.
# `with` guarantees each handle is closed even if read() raises.
# split('\n') (rather than splitlines()) is kept on purpose: it leaves a
# trailing empty string when the file ends with a newline, and the
# question/answer extraction further down skips the last conversation entry.
with open(os.path.join(DATASET_PATH, 'movie_lines.txt'), mode='r', encoding='latin-1') as lines_buff:
    lines = lines_buff.read().split('\n')

with open(os.path.join(DATASET_PATH, 'movie_conversations.txt'), mode='r', encoding='latin-1') as converasations_buff:
    converasations = converasations_buff.read().split('\n')

In [3]:
# Map each dialogue code to its dialogue text for easy lookup.
# Every raw record is split on the ' +++$+++ ' separator; only records with
# exactly five fields contribute (key = first field, value = last field).
# NOTE: `lines` is rebound from raw strings to lists of fields here, so this
# cell cannot be re-run without first re-running the cell that loads `lines`.
lines = [record.split(' +++$+++ ') for record in lines]
mapping_id2line = {fields[0]: fields[-1] for fields in lines if len(fields) == 5}

In [4]:
# mapping_id2line

In [5]:
# List of all conversations: one list of comma-separated entries per record.
# For each record we take the last ' +++$+++ '-separated field, drop its first
# and last characters (presumably the surrounding brackets -- verify against
# the data file), remove quotes and spaces, and split on commas.
conversations_list = []
for converasation in converasations:
    id_field = converasation.split(' +++$+++ ')[-1]
    ids_csv = id_field[1:-1].strip().replace("'", "").replace(" ", "")
    conversations_list.append(ids_csv.split(','))

In [6]:
# conversations_list

In [7]:
# Build aligned question/answer lists: within a conversation, each entry is
# the "question" and the entry right after it is the "answer".
# The last element of conversations_list is skipped (presumably the entry
# produced by the trailing newline of the conversations file -- verify).
questions, answers = [], []

for conversation in conversations_list[:-1]:
    for asked_id, replied_id in zip(conversation, conversation[1:]):
        questions.append(mapping_id2line[asked_id])
        answers.append(mapping_id2line[replied_id])

In [8]:
len(questions)

221616

In [9]:
len(answers)

221616

In [10]:
questions

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "You're asking me out.  That's so cute. What's your name again?",
 "No, no, it's my fault -- we didn't have a proper introduction ---",
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Why?',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 'Gosh, if only we could find Kat a boyfriend...',
 "C'esc ma tete. This is my head",
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have n

In [11]:
answers

["Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?",
 'Forget it.',
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Seems like she could get a date easy enough...',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 "That's a shame.",
 'Let me see what I can do.',
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have never in my life had to point out my head to someone.",
 "That's because it's such a nice one.",
 'Forget French.',
 "Well, there's someone I think might be --",
 'Where?',
 "I 

###  Cleaning

In [12]:
def clean_text(text):
    """Lower-case `text`, expand a fixed set of contractions, strip punctuation.

    The contraction rules are applied one after another in the order listed
    below. Afterwards every character in the punctuation class is deleted
    without inserting a space, so "break-up" becomes "breakup". Characters
    outside that class (for example "!" or apostrophes of contractions that
    are not listed) are left untouched.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The cleaned sentence.
    """
    # Ordered (pattern, replacement) pairs -- order is part of the behavior.
    contraction_rules = (
        (r"i'm", "i am"),
        (r"don't", "do not"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
    )

    cleaned = text.lower()
    for pattern, replacement in contraction_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Delete the listed punctuation characters outright.
    return re.sub(r"[-{}\"#/@;:<>()+=`|.?,]", "", cleaned)

In [13]:
# Run every raw question through clean_text, keeping the original order.
questions_cleaned = list(map(clean_text, questions))

In [14]:
# question_cleaned

In [15]:
# Run every raw answer through clean_text, keeping the original order.
answers_cleaned = list(map(clean_text, answers))

In [16]:
# answers_cleaned

In [17]:
# Shared word -> occurrence-count table, filled in by data_imporvement().
mapping_dict = {}


def data_imporvement(text):
    """Add the whitespace-separated words of `text` to `mapping_dict`.

    Each word's count in the module-level `mapping_dict` is incremented by
    one per occurrence; unseen words start at 1. Nothing is returned.
    Counts accumulate across calls, so calling this twice on the same text
    counts its words twice.
    """
    for token in text.split():
        mapping_dict[token] = mapping_dict.get(token, 0) + 1

In [19]:
# Fold the words of every cleaned question into the shared frequency table.
# Counts accumulate, so re-running this cell double-counts the questions.
for question_text in questions_cleaned:
    data_imporvement(question_text)

In [20]:
# Fold the words of every cleaned answer into the shared frequency table.
# Counts accumulate, so re-running this cell double-counts the answers.
for answer_text in answers_cleaned:
    data_imporvement(answer_text)

In [21]:
mapping_dict

{'can': 15975,
 'we': 40500,
 'make': 6747,
 'this': 33573,
 'quick': 337,
 'roxanne': 1,
 'korrine': 1,
 'and': 65586,
 'andrew': 56,
 'barrett': 19,
 'are': 54579,
 'having': 1217,
 'an': 9482,
 'incredibly': 60,
 'horrendous': 4,
 'public': 364,
 'break': 895,
 'up': 16050,
 'on': 27234,
 'the': 140638,
 'quad': 2,
 'again': 3193,
 'well': 14090,
 'i': 204499,
 'thought': 4550,
 'would': 20007,
 'start': 1656,
 'with': 24954,
 'pronunciation': 2,
 'if': 18949,
 'that': 67068,
 'is': 79603,
 'okay': 6096,
 'you': 212918,
 'not': 67918,
 'hacking': 18,
 'gagging': 9,
 'spitting': 16,
 'part': 1419,
 'please': 3208,
 'asking': 746,
 'me': 44888,
 'out': 18466,
 'so': 19059,
 'cute': 272,
 'what': 55200,
 'your': 29939,
 'name': 3122,
 'no': 27578,
 "it's": 25843,
 'my': 29684,
 'fault': 482,
 "didn't": 8733,
 'have': 46593,
 'a': 102001,
 'proper': 138,
 'introduction': 19,
 'cameron': 35,
 'thing': 5731,
 'am': 37860,
 'at': 15290,
 'mercy': 68,
 'of': 56294,
 'particularly': 111,
 'h

In [37]:
# Vocabulary for questions: keep only words that occur more than `threshold`
# times and number them 0, 1, 2, ... in mapping_dict's iteration order.
threshold = 20
questions2int = {
    word: index
    for index, word in enumerate(
        word for word, count in mapping_dict.items() if count > threshold
    )
}
# Number of words kept (same final value the counter-based loop produced).
word_number = len(questions2int)

# questions2int = dict((w,i) for i,w in enumerate(sorted(list(set(' '.join(question_cleaned).split())))) if mapping_dict[w] > 20)

In [38]:
questions2int

{'can': 0,
 'we': 1,
 'make': 2,
 'this': 3,
 'quick': 4,
 'and': 5,
 'andrew': 6,
 'are': 7,
 'having': 8,
 'an': 9,
 'incredibly': 10,
 'public': 11,
 'break': 12,
 'up': 13,
 'on': 14,
 'the': 15,
 'again': 16,
 'well': 17,
 'i': 18,
 'thought': 19,
 'would': 20,
 'start': 21,
 'with': 22,
 'if': 23,
 'that': 24,
 'is': 25,
 'okay': 26,
 'you': 27,
 'not': 28,
 'part': 29,
 'please': 30,
 'asking': 31,
 'me': 32,
 'out': 33,
 'so': 34,
 'cute': 35,
 'what': 36,
 'your': 37,
 'name': 38,
 'no': 39,
 "it's": 40,
 'my': 41,
 'fault': 42,
 "didn't": 43,
 'have': 44,
 'a': 45,
 'proper': 46,
 'cameron': 47,
 'thing': 48,
 'am': 49,
 'at': 50,
 'mercy': 51,
 'of': 52,
 'particularly': 53,
 'breed': 54,
 'loser': 55,
 'sister': 56,
 'cannot': 57,
 'date': 58,
 'until': 59,
 'she': 60,
 'does': 61,
 'why': 62,
 'mystery': 63,
 'used': 64,
 'to': 65,
 'be': 66,
 'really': 67,
 'popular': 68,
 'when': 69,
 'started': 70,
 'high': 71,
 'school': 72,
 'then': 73,
 'it': 74,
 'was': 75,
 'just':

In [39]:
# Vocabulary for answers: keep only words that occur more than `threshold`
# times and number them 0, 1, 2, ... in mapping_dict's iteration order.
threshold = 20
answers2int = {
    word: index
    for index, word in enumerate(
        word for word, count in mapping_dict.items() if count > threshold
    )
}
# Number of words kept (same final value the counter-based loop produced).
word_number = len(answers2int)

In [40]:
answers2int

{'can': 0,
 'we': 1,
 'make': 2,
 'this': 3,
 'quick': 4,
 'and': 5,
 'andrew': 6,
 'are': 7,
 'having': 8,
 'an': 9,
 'incredibly': 10,
 'public': 11,
 'break': 12,
 'up': 13,
 'on': 14,
 'the': 15,
 'again': 16,
 'well': 17,
 'i': 18,
 'thought': 19,
 'would': 20,
 'start': 21,
 'with': 22,
 'if': 23,
 'that': 24,
 'is': 25,
 'okay': 26,
 'you': 27,
 'not': 28,
 'part': 29,
 'please': 30,
 'asking': 31,
 'me': 32,
 'out': 33,
 'so': 34,
 'cute': 35,
 'what': 36,
 'your': 37,
 'name': 38,
 'no': 39,
 "it's": 40,
 'my': 41,
 'fault': 42,
 "didn't": 43,
 'have': 44,
 'a': 45,
 'proper': 46,
 'cameron': 47,
 'thing': 48,
 'am': 49,
 'at': 50,
 'mercy': 51,
 'of': 52,
 'particularly': 53,
 'breed': 54,
 'loser': 55,
 'sister': 56,
 'cannot': 57,
 'date': 58,
 'until': 59,
 'she': 60,
 'does': 61,
 'why': 62,
 'mystery': 63,
 'used': 64,
 'to': 65,
 'be': 66,
 'really': 67,
 'popular': 68,
 'when': 69,
 'started': 70,
 'high': 71,
 'school': 72,
 'then': 73,
 'it': 74,
 'was': 75,
 'just':

In [41]:
# Adding the special tokens to the end of both dictionaries

In [42]:
# Tokens are used while encoding and decoding

In [43]:
# Special tokens appended to the question vocabulary:
#   <SOS> start of string
#   <EOS> end of string
#   <PAD> padding, for maintaining the length of the input
#   <OUT> stands in for words that were filtered out of the vocabulary
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    # Skip tokens that are already present: reassigning them on a re-run of
    # this cell would give every token the same id (the dict no longer grows).
    if token not in questions2int:
        # Next free id = largest id in use + 1. The previous `len(...) + 1`
        # skipped one id, leaving the largest id equal to len(questions2int),
        # i.e. out of range for anything sized by the dictionary length.
        questions2int[token] = max(questions2int.values(), default=-1) + 1

In [48]:
# Append the special tokens to the answer vocabulary.
for token in tokens:
    # Skip tokens that are already present: reassigning them on a re-run of
    # this cell would give every token the same id (the dict no longer grows).
    if token not in answers2int:
        # Next free id = largest id in use + 1. The previous `len(...) + 1`
        # skipped one id, leaving the largest id equal to len(answers2int),
        # i.e. out of range for anything sized by the dictionary length.
        answers2int[token] = max(answers2int.values(), default=-1) + 1

In [49]:
# Create an inverse dictionary of answers2int for the decoder

In [51]:
# Inverse lookup (id -> word) of answers2int. If two words ever share an id,
# the one that comes later in answers2int wins.
int2answers = dict(zip(answers2int.values(), answers2int.keys()))

In [52]:
int2answers

{0: 'can',
 1: 'we',
 2: 'make',
 3: 'this',
 4: 'quick',
 5: 'and',
 6: 'andrew',
 7: 'are',
 8: 'having',
 9: 'an',
 10: 'incredibly',
 11: 'public',
 12: 'break',
 13: 'up',
 14: 'on',
 15: 'the',
 16: 'again',
 17: 'well',
 18: 'i',
 19: 'thought',
 20: 'would',
 21: 'start',
 22: 'with',
 23: 'if',
 24: 'that',
 25: 'is',
 26: 'okay',
 27: 'you',
 28: 'not',
 29: 'part',
 30: 'please',
 31: 'asking',
 32: 'me',
 33: 'out',
 34: 'so',
 35: 'cute',
 36: 'what',
 37: 'your',
 38: 'name',
 39: 'no',
 40: "it's",
 41: 'my',
 42: 'fault',
 43: "didn't",
 44: 'have',
 45: 'a',
 46: 'proper',
 47: 'cameron',
 48: 'thing',
 49: 'am',
 50: 'at',
 51: 'mercy',
 52: 'of',
 53: 'particularly',
 54: 'breed',
 55: 'loser',
 56: 'sister',
 57: 'cannot',
 58: 'date',
 59: 'until',
 60: 'she',
 61: 'does',
 62: 'why',
 63: 'mystery',
 64: 'used',
 65: 'to',
 66: 'be',
 67: 'really',
 68: 'popular',
 69: 'when',
 70: 'started',
 71: 'high',
 72: 'school',
 73: 'then',
 74: 'it',
 75: 'was',
 76: 'ju

In [55]:
# Adding <EOS> tag at the end of every answer for decoder to learn where to stop

In [53]:
# Tag the end of every answer with ' <EOS>'. The list is updated in place.
# Answers that already carry the tag are skipped so that re-running this cell
# does not produce '... <EOS> <EOS>'. A freshly cleaned answer cannot end in
# '<EOS>' by itself because clean_text lower-cases and strips '<' and '>'.
for i, answer in enumerate(answers_cleaned):
    if not answer.endswith(' <EOS>'):
        answers_cleaned[i] = answer + ' <EOS>'