# ChatBot Tutorial

* Creation of a chat bot using sequence to sequence models
* Will train a simple chatbot using movie scripts from the Cornell Movie-Dialogs Corpus.

#### Steps

   * Handle loading and preprocessing of Cornell Movie-Dialogs Corpus dataset
   * Implement a sequence-to-sequence model with Luong attention mechanism(s)
   * Jointly train encoder and decoder models using mini-batches
   * Implement greedy-search decoding module
   * Interact with trained chatbot

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [2]:
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math


In [3]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

#### Load and Process Data

DataSet:  
   * 220,579 conversational exchanges between 10,292 pairs of movie characters
   * 9,035 characters from 617 movies
   * 304,713 total utterances

In [4]:
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join('data', corpus_name)

def printLines(file, n = 10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)
        
printLines(os.path.join(corpus, 'movie_lines.txt'))        

b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


In [5]:
# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            #extract different fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]
                lines[lineObj['lineID']] = lineObj
    return lines

In [6]:
# group lines into conversations
def loadConversations(fileName, lines, fields):
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]
            lineIds = eval(convObj["utteranceIDs"])
            convObj["lines"] = []
            for lineId in lineIds:
                convObj["lines"].append(lines[lineId])
            conversations.append(convObj)
        return conversations    

In [7]:
#extract pairs of sentances from conversation (duplicates)
def extractSentancePairs(conversations):
    qa_pairs = []
    #iterate through conversations
    for conversation in conversations:
        #iterate through lines of conversation
        for i in range(len(conversation["lines"]) - 1):
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i + 1]["text"].strip()
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs
        

In [8]:
#path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, 'unicode_escape'))

lines = {}
conversations = []

MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
print("\n Processing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\n Loading Conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"), lines, MOVIE_CONVERSATIONS_FIELDS)

print("\n Writing newly formatted file...")
with open(datafile, 'w', encoding = 'utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter = delimiter, lineterminator = "\n")
    for pair in extractSentancePairs(conversations):
        writer.writerow(pair)

print("\n Printing sample lines... ")
printLines(datafile)


 Processing corpus...

 Loading Conversations...

 Writing newly formatted file...

 Printing sample lines... 
b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can

#### Load and Trim Data

Voc class maps words to indexes

In [9]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

In [10]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [11]:
MAX_LENGTH = 10

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427

def unicodeToASCII(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToASCII(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("\n Reading line... ")
    lines = open(datafile, encoding = "utf-8").read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc,pairs

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)

# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)


Start preparing training data ...

 Reading line... 
Read 221282 sentence pairs
Trimmed to 64271 sentence pairs
Counting words...
Counted words: 18008

pairs:
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [12]:
# Trim out rarely used words to make the set searched over smaller 

In [13]:
Min_Count = 3 #Min numbers of appearences to be kept in set

def trimRareWords(voc,pairs, Min_Count):
    #trim words under min_count in vod
    voc.trim(Min_Count)
    #filter out pairs if they contain trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentance = pair[0]
        output_sentance = pair[1]
        keep_input = True
        keep_output = True
        for word in input_sentance.split(' '):
            #input
            if word not in voc.word2index:
                keep_input = False
                break
        for word in input_sentance.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break
        # keep pairs that dont have trims in input or output
        if keep_input and keep_output:
            keep_pairs.append(pair)
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs),
                                                                len(keep_pairs) / len(pairs)))
    return keep_pairs
pairs = trimRareWords(voc, pairs, Min_Count)

keep_words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 58043, 0.9031 of total


In [14]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

def zeroPadding(l, fillvalue = PAD_token):
    return list(itertools.zip_longest(*l, fillvalue = fillvalue))

def binaryMatrix(l, value = PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths


def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len
    

def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key = lambda x: len(x[0].split(" ")), reverse = True)
    input_batch, output_batch = [],[]
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp,lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)
                                

input_variable: tensor([[  40,   36,  199,    9,   59],
        [4337,   37,  329, 5144,   37],
        [ 276,   67, 1007,  215,   60],
        [  53, 2891,    6,   36,    4],
        [4431,    2,    2,    2,    2],
        [  56,    0,    0,    0,    0],
        [4432,    0,    0,    0,    0],
        [   4,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([9, 5, 5, 5, 5])
target_variable: tensor([[  50,   35,   38,    7,  785],
        [  66,   50,   25,   14,   66],
        [   2,    6,  974,   96,    4],
        [   0,    9,  102, 4374,    4],
        [   0, 4707,  949,    6,    4],
        [   0,   92, 7187,    2,  242],
        [   0,  349,    4,    0,   66],
        [   0,  716,    2,    0,    2],
        [   0,    6,    0,    0,    0],
        [   0,    2,    0,    0,    0]])
mask: tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1

In [15]:
## Sequence to sequence model
## Take in variable length input and return variable length output using FIXED model

In [16]:
# Use encoder to convert all variable length inputs to a fixed size to then make a prediction on

In [17]:
## Encoder: Two RNN's, one fed input in sequential order, one fed input in reverse order

In [18]:
## INPUTS: input_seq : batch of sentences, shape = (max_length, batch_size)
##         input_lengths : length of each sentence
##         hidden: the hidden state

In [19]:
## OUTPUTS: outputs: output features from last layer 
##          hidden: update hidden state

In [20]:
class EncoderRNN(nn.Module):
    def __init__(self,hidden_size, embedding, n_layers = 1, dropout = 0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.embedding = embedding
        self.hidden_size = hidden_size
        
        #Initializing the GRU
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout = (0 if n_layers == 1 else 0), 
                          bidirectional = True)
    
    def forward(self, input_seq, input_lengths, hidden = None):
        #convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        #  pack padded batch for sequence
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # forward pass
        outputs, hidden = self.gru(packed, hidden)
        #unpack
        outputs, _ = torch.nn.utils.rnn.pack_padded_sequence(outputs)
        #Sum bidirectional outputs
        outputs = outputs[:,:,:self.hidden_size] + outputs[:,:,self.hidden_size] 
        return outputs, hidden

In [21]:
## Decoder generates words until it has reached an EOS_token
## “attention mechanism” - Similar to the ideas behind a cache, contains important information in a seperate location

In [22]:
## Attention Class to calculate and apply the attention to our tensors

In [23]:
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "Not an appropriate method")
    
        self.hidden_size = hidden_size
        if self.method == 'general':
            selt.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))
            
    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim = 2)
    
    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim = 2)
    
    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(energy * self.v, dim = 2)
    
    def forward(self, hidden, encoder_weights):
        #calculate energy weights
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)
        
        attn_energies = attn_energies.t()
        
        return F.softmax(attn_energies, dim = 1).unsqueeze(1)
    

In [24]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers = 1, dropout = .1):
        super(LuongAttnDecoderRNN, self).__init__()
        
        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        # Define layers
        self.embedding = embedding
        self.embedding_dropout = embedding.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout = (0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)
        
        def forward(self, input_step, last_hidden, encoder_outputs):
            # run one word at a time to find embedding of word
            embedded = self.embedding(input_step)
            embedded = self.embedding_dropout(embedded)
            #forward through uniderctoral GRU
            rnn_output, hidden = self.gru(embedded, last_hidden)
            # calculate attention weights
            attn_weights = self.attn(rnn_output, encoder_outputs)
            # attn_weights * encoder_outputs = weighted_sums 
            context = attn_weights.bmm(encoder_outputs.transpose(0,1))
            # concatenate wieghted context vector
            rnn_output = rnn_output.squeeze(0)
            context = context.squeeze(1)
            concat_input = torch.cat((rnn_output, context), 1)
            concat_output = torch.tanh(self.concat(concat_input))
            # Predict next input
            output = self.out(concat_output)
            output = F.softmax(output, dim=1)
            
            return output, hidden_state

In [25]:
# Training Mask procedure
# Masked Loss

In [26]:
def maxNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1,1)).squeeze())
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()