In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

# Part1: Data Preprocessing

In [None]:
# Folder that holds movie_lines.txt and movie_conversations.txt.
# Set the CHATBOT_DATA_DIR environment variable to point somewhere else
# without editing the notebook; the previous Colab path stays the default.
directory_path = os.environ.get("CHATBOT_DATA_DIR", '/content/my_folder')
lines_filepath = os.path.join(directory_path, "movie_lines.txt")
conv_filepath = os.path.join(directory_path, "movie_conversations.txt")

In [None]:


# Preview the raw corpus file.
# The file is decoded as iso-8859-1, the same encoding the loading cells use,
# so no bytes are silently dropped, and only the previewed lines are read
# instead of the whole file.
with open(lines_filepath, 'r', encoding='iso-8859-1') as file:
    # Print the first 8 lines
    for raw_line in itertools.islice(file, 8):
        print(raw_line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [None]:
# Parse the lines file into a dict keyed by lineID. Every value is a dict with
# the five fields named in line_fields. The raw line is not stripped, so the
# last field ("text") keeps its trailing newline.
line_fields = ["lineID", "characterID", "movieID", "character", "text"]
lines = {}
with open(lines_filepath, 'r', encoding='iso-8859-1') as f:
  for raw_line in f:
    values = raw_line.split(" +++$+++ ")
    # Positional lookup: a line with too few fields raises IndexError
    lineObj = {field: values[position] for position, field in enumerate(line_fields)}
    lines[lineObj['lineID']] = lineObj

In [None]:
list(lines.items())[0]

('L1045',
 {'lineID': 'L1045',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': 'They do not!\n'})

In [None]:
# Group the parsed lines into conversations, as listed in the conversations file.
# Each conversation dict carries the four fields named in conv_fields plus a
# "lines" list with the matching entries of the `lines` dict, in utterance order.
conv_fields = ["character1ID","character2ID", "movieID", "utteranceIDs"]
conversations = []
with open(conv_filepath, 'r', encoding='iso-8859-1') as f:
  for line in f:
    values = line.split(" +++$+++ ")
    #Extract fields
    convObj = {}
    for i, field in enumerate(conv_fields):
      convObj[field]=values[i]
    # convObj["utteranceIDs"] is text such as "['L194', 'L195', ...]\n".
    # Extract the IDs with a regex instead of eval(), which would execute
    # whatever expression the data file happens to contain.
    lineIds = re.findall(r"L[0-9]+", convObj["utteranceIDs"])
    # Reassemble lines
    convObj["lines"] = [lines[lineId] for lineId in lineIds]
    conversations.append(convObj)

In [None]:
conversations[0]

{'character1ID': 'u0',
 'character2ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L194', 'L195', 'L196', 'L197']\n",
 'lines': [{'lineID': 'L194',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'lineID': 'L195',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'lineID': 'L196',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'lineID': 'L197',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]}

In [None]:
# Build [question, answer] pairs: every line of a conversation is paired with
# the line that directly follows it.
qa_pairs = []
for conversation in conversations:
  utterances = conversation["lines"]
  # Walk over consecutive (line, next line) couples
  for current, following in zip(utterances, utterances[1:]):
    question = current["text"].strip()
    answer = following["text"].strip()
    # Skip the couple when either side is empty after stripping
    if question and answer:
      qa_pairs.append([question, answer])

In [None]:
# Show a small sample only; displaying the whole list floods the notebook output
qa_pairs[:8]

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"],
 ["You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.'],
 ['Cameron.',
  "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does."],
 ["The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
  'Seems like she could get a date easy enough...'],
 [

In [None]:
#define the path to new file
datafile = os.path.join(directory_path,"formatted_movie_lines.txt")
delimiter = '\t'
#decode the delimiter (turns an escaped spelling such as r'\t' into a real tab)
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

#write new csv file: one question<TAB>answer row per pair
print('\nWriting newly formatted file...')
# newline='' lets the csv writer control line endings itself, as the csv module
# documentation requires; without it some platforms emit an extra '\r' per row.
with open(datafile, 'w', encoding='utf-8', newline='') as outputfile:
  writer = csv.writer(outputfile, delimiter=delimiter)
  for pair in qa_pairs:
    writer.writerow(pair)
print("Done writing to file")


Writing newly formatted file...
Done writing to file


In [None]:
#visualise some lines
# Binary mode makes the printed repr show the tab delimiter and the line
# terminator explicitly. Only the first 8 lines are read, and `lines` is not
# reassigned here, so the lineID -> fields dict built earlier is left intact.
datafile = os.path.join(directory_path, "formatted_movie_lines.txt")
with open(datafile, 'rb') as file:
  for raw_line in itertools.islice(file, 8):
    print(raw_line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\n"
b'Why?\tUnsolved myster

Processing the words

In [None]:

PAD_token = 0 #used for padding short sentences
SOS_token = 1 #start-of-sentence token <START>
EOS_token = 2 #End-of-sentence token <END>

class Vocabulary:
  """Word <-> index mapping with per-word occurrence counts.

  Indexes 0, 1 and 2 are reserved for the PAD, SOS and EOS tokens, so the
  first real word receives index 3.
  """

  def __init__(self,name):
    self.name=name
    self.trimmed=False # becomes True once trim() has run; see trim()
    self._reset()

  def _reset(self):
    """Empty all mappings, leaving only the three reserved tokens."""
    self.word2index={}
    self.word2count={}
    self.index2word={PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
    self.num_words=3 #count SOS, EOS, PAD

  def addSentence(self,sentence):
    """Register every word of `sentence`, splitting on single spaces."""
    for word in sentence.split(' '):
      self.addWord(word)

  def addWord(self,word):
    """Give a new word the next free index, or bump the count of a known word."""
    if word not in self.word2index:
      self.word2index[word]=self.num_words
      self.word2count[word]=1
      self.index2word[self.num_words]=word
      self.num_words += 1
    else:
      self.word2count[word] += 1

  # remove words below a certain count threshold
  def trim(self, min_count):
    """Keep only the words seen at least `min_count` times and re-index them.

    Trimming rebuilds the mappings, which resets every surviving count to 1.
    A second trim would therefore discard the whole vocabulary, so only the
    first call does any work; later calls return immediately.
    """
    if self.trimmed:
      return
    self.trimmed = True

    keep_words = [word for word, count in self.word2count.items() if count >= min_count]

    total_words = len(self.word2index)
    # Guard the ratio so trimming an empty vocabulary does not divide by zero
    kept_ratio = len(keep_words)/total_words if total_words else 0.0
    print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total_words, kept_ratio))

    # reinitialize dictionaries, then re-add the survivors in their original order
    self._reset()
    for word in keep_words:
      self.addWord(word)

In [None]:
# Strip accents from a unicode string.
# NFD: canonical decomposition (a base character followed by its combining marks)
# Mn: "Mark, nonspacing" -- the category of the combining accent characters
def unicodeToAscii(s):
  """Return `s` NFD-normalized with every character of category 'Mn' removed."""
  decomposed = unicodedata.normalize('NFD', s)
  kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept_chars)

In [None]:
#test the function
unicodeToAscii("Montréal,Françoise......")

'Montreal,Francoise......'

In [None]:
# Lowercase, trim and strip accents, then keep only letters and . ! ?
def normalizeString(s):
  """Return a cleaned version of `s` in which . ! ? stand as separate tokens."""
  cleaned = unicodeToAscii(s.lower().strip())
  # Put a space in front of every . ! or ? (\1 is the matched punctuation mark)
  cleaned = re.sub(r"([.!?])", r" \1", cleaned)
  # Replace every run of characters other than letters and . ! ? with one space
  cleaned = re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)
  # Collapse whitespace runs and drop leading/trailing spaces
  return re.sub(r"\s+", r" ", cleaned).strip()

In [None]:
# Test the function
normalizeString("aa123aa!s's  dd?")

'aa aa !s s dd ?'

Processing the text

In [None]:
datafile = os.path.join(directory_path, "formatted_movie_lines.txt")
# read the file and split into lines
print("Reading and processing file....Please Wait")
# Context manager so the file handle is closed as soon as the text is read
with open(datafile, encoding='utf-8') as formatted_file:
  lines = formatted_file.read().strip().split('\n')
# split every line into pairs and normalize
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print("Done Reading!")
voc = Vocabulary(directory_path)

Reading and processing file....Please Wait
Done Reading!


In [None]:
# Show a small sample only; displaying the whole list floods the notebook output
pairs[:8]

[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought we d start with pronunciation if that s okay with you .'],
 ['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?'],
 ['you re asking me out . that s so cute . what s your name again ?',
  'forget it .'],
 ['no no it s my fault we didn t have a proper introduction', 'cameron .'],
 ['cameron .',
  'the thing is cameron i m at the mercy of a particularly hideous breed of loser . my sister . i can t date until she does .'],
 ['the thing is cameron i m at the mercy of a particularly hideous breed of loser . my sister . i can t date until she does .',
  'seems like she could get a date easy enough . . .'],
 ['why ?

Filtering the long sentences

In [None]:
# A pair passes only when both of its sentences have fewer than MAX_LENGTH words
MAX_LENGTH = 10 #Maximum sentence length to consider (max words)
def filterPair(p):
  """Return True when p[0] and p[1] each contain fewer than MAX_LENGTH words."""
  # Strict comparison: one position stays free for the EOS token
  if len(p[0].split()) >= MAX_LENGTH:
    return False
  return len(p[1].split()) < MAX_LENGTH

# Keep the pairs accepted by filterPair, in their original order
def filterPairs(pairs):
  """Return a new list with the pairs for which filterPair is True."""
  return list(filter(filterPair, pairs))

Start preparing training data ...


NameError: ignored

In [None]:
pairs[1]

['well i thought we d start with pronunciation if that s okay with you .',
 'not the hacking and gagging and spitting part . please .']

In [None]:
# Drop rows that did not split into at least a question and an answer
pairs = [pair for pair in pairs if len(pair)>1]
# str.format fills the placeholder; passing format(...) as a second print
# argument left a literal "{}" in the message
print("There are {} pairs/conversations in the dataset".format(len(pairs)))
pairs = filterPairs(pairs)
print("After filtering, there are {} pairs/conversatons".format(len(pairs)))

There are {} pairs/conversations in the dataset 221282
After filtering, there are {} pairs/conversatons 64271


Getting rid of rare words

In [None]:
# Register the question and the reply of every pair in the vocabulary.
# addSentence accumulates counts, so re-running this cell counts every word again.
for pair in pairs:
  for sentence in (pair[0], pair[1]):
    voc.addSentence(sentence)
print("Counted words:", voc.num_words)
# Show the first ten pairs as a sanity check
for sample_pair in pairs[:10]:
  print(sample_pair)

Counted words: 18008
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [None]:
MIN_COUNT = 3 # minimum word count threshold for trimming

def trimRareWords(voc,pairs,MIN_COUNT):
  """Trim rare words from `voc`, then drop every pair that uses one of them.

  Args:
    voc: object exposing trim(min_count) and a word2index mapping.
    pairs: list of [input_sentence, output_sentence] items.
    MIN_COUNT: threshold forwarded to voc.trim().

  Returns:
    A new list holding the pairs whose two sentences contain only words that
    are still in voc.word2index. `pairs` itself is not modified.
  """
  # Trim words used under the MIN_COUNT from the voc
  voc.trim(MIN_COUNT)

  def _all_words_known(sentence):
    # True when every space-separated word survived the trim
    return all(word in voc.word2index for word in sentence.split(' '))

  #Only keep pairs that do not contain trimmed word(s) in their input or output sentence
  keep_pairs = [pair for pair in pairs if _all_words_known(pair[0]) and _all_words_known(pair[1])]

  # Guard the ratio: an empty `pairs` would otherwise raise ZeroDivisionError
  kept_ratio = len(keep_pairs)/len(pairs) if pairs else 0.0
  print("trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_ratio))
  return keep_pairs

# Trim voc and pairs
# `pairs` is rebound to the filtered list that trimRareWords returns
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7823 / 18005 = 0.4345
trimmed from 64271 pairs to 53165, 0.8272 of total


# Preparing the data

In [None]:
def indexesFromSentence(voc, sentence):
  """Map each space-separated word of `sentence` to voc.word2index[word] and append EOS_token."""
  token_ids = []
  for word in sentence.split(' '):
    # Direct lookup: a word missing from voc.word2index raises here
    token_ids.append(voc.word2index[word])
  token_ids.append(EOS_token)
  return token_ids

In [None]:
pairs[1][0]

'you have my word . as a gentleman'

In [None]:
#testing the function
indexesFromSentence(voc,pairs[1][0])

[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [None]:
# Take the first ten pairs as test samples: questions in `inp`, replies in `out`
inp = [pair[0] for pair in pairs[:10]]
out = [pair[1] for pair in pairs[:10]]
print(inp)
print(len(inp))
# Index sequence (with trailing EOS) for every sample question
indexes = list(map(lambda sentence: indexesFromSentence(voc, sentence), inp))
indexes

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']
10


[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [58, 2]]

In [None]:
#zip implementation
# zip stops at the end of the shorter input, while zip_longest keeps going and
# fills the missing positions (with None unless a fillvalue is given)
a= ['A', 'B', 'C','D','E']
b= [1,2,3]
print(list(zip(a,b)))
print(list(itertools.zip_longest(a,b)))

[('A', 1), ('B', 2), ('C', 3)]
[('A', 1), ('B', 2), ('C', 3), ('D', None), ('E', None)]


In [None]:
# implementation of the zip_longest function for a 2d array
# Unpacking the rows with * transposes the nested list: output row t holds the
# t-th element of every input row, with 0 filled in where a row is too short
c=[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [58, 2]]
list(itertools.zip_longest(*c,fillvalue=0))

[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [None]:
def zeroPadding(l, fillvalue = 0):
  """Transpose the list of sequences `l`, padding short ones with `fillvalue`.

  Returns a list of tuples: tuple t holds element t of every sequence.
  """
  padded_columns = itertools.zip_longest(*l, fillvalue=fillvalue)
  return [column for column in padded_columns]

In [None]:
# Length of every index sequence; the largest one is displayed
leng = list(map(len, indexes))
max(leng)

10

In [None]:
leng

[3, 9, 3, 5, 6, 10, 2, 8, 5, 2]

In [None]:
#test the function
test_result = zeroPadding(indexes)
print(len(test_result)) #number of rows == length of the longest sequence; each row is one position across all sequences
test_result

10


[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [None]:
# Convert a padded matrix into a 0/1 mask: 0 where the token is PAD_token, 1 elsewhere
def binaryMatrix(l, value=0):
  """Return a list of lists shaped like `l`, holding 0 for PAD_token and 1 otherwise.

  `value` is accepted but not used; the comparison is always against PAD_token.
  """
  return [[0 if token == PAD_token else 1 for token in seq] for seq in l]

In [None]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [None]:
# Build the padded input tensor plus a tensor with the length of every sequence in the batch
def inputVar(l, voc):
  """Return (padVar, lengths) for the list of sentences `l`.

  padVar is the LongTensor made from the zero-padded, transposed index
  sequences; lengths holds each sequence's length as produced by
  indexesFromSentence.
  """
  indexes_batch = []
  sequence_lengths = []
  for sentence in l:
    token_ids = indexesFromSentence(voc, sentence)
    indexes_batch.append(token_ids)
    sequence_lengths.append(len(token_ids))
  lengths = torch.tensor(sequence_lengths)
  padVar = torch.LongTensor(zeroPadding(indexes_batch))
  return padVar, lengths

In [None]:
# returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
  """Return (padVar, mask, max_target_len) for the list of target sentences `l`.

  padVar is the LongTensor made from the zero-padded, transposed index
  sequences; mask has the same layout and is True wherever padVar holds a
  real token (anything other than PAD_token); max_target_len is the length of
  the longest index sequence.
  """
  indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
  max_target_len = max([len(indexes) for indexes in indexes_batch])
  padList = zeroPadding(indexes_batch)
  # Boolean mask: masked_select expects a bool mask; the uint8 masks produced
  # by ByteTensor are deprecated for that purpose
  mask = torch.BoolTensor(binaryMatrix(padList))
  padVar = torch.LongTensor(padList)
  return padVar, mask, max_target_len

In [None]:
#lambda explanation
add = lambda x:x+1

In [None]:
add(2)

3

In [None]:
# returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
  """Turn a list of [question, answer] pairs into padded training tensors.

  Returns (inp, lengths, output, mask, max_target_len): the first two come
  from inputVar on the questions, the last three from outputVar on the
  answers. Note that `pair_batch` is sorted in place.
  """
  # Sort by question word count, longest first. The split must use a single
  # space: splitting on two spaces gave every question a key of 1, so the
  # batch was never actually reordered.
  pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
  input_batch = [pair[0] for pair in pair_batch]
  output_batch = [pair[1] for pair in pair_batch]
  inp, lengths = inputVar(input_batch, voc)
  output, mask, max_target_len = outputVar(output_batch, voc)
  return inp, lengths, output, mask, max_target_len

In [None]:
# Example for validation: build one small random batch and print every piece
small_batch_size = 5
sampled_pairs = []
for _ in range(small_batch_size):
  sampled_pairs.append(random.choice(pairs))
batches = batch2TrainData(voc, sampled_pairs)
input_variable, lengths, target_variable, mask, max_target_len = batches

# sep="\n" puts the tensor on the line after its label
print("input_variable:", input_variable, sep="\n")
print("lengths:", lengths)
print("target_variable:", target_variable, sep="\n")
print("mask:", mask, sep="\n")
print("max_target_len:",max_target_len)

input_variable:
tensor([[  65, 5100, 1006,  842, 2047],
        [  14,    6, 1002,  387,  719],
        [1646,    2,    4,    6,  117],
        [  98,    0,    2,    2,  534],
        [7786,    0,    0,    0,   34],
        [   4,    0,    0,    0, 3947],
        [   2,    0,    0,    0,    4],
        [   0,    0,    0,    0,    2]])
lengths: tensor([7, 3, 4, 4, 8])
target_variable:
tensor([[ 558,  318, 1064,  167,   50],
        [   4,    6,    4,    4,    6],
        [   2,    2,    2,    2,    2]])
mask:
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]], dtype=torch.uint8)
max_target_len: 3


# Defining the model

The steps to be taken are: convert word indexes to embeddings, pack the padded batch of sequences for the RNN module, run the forward pass through the GRU, unpack the padding, sum the bidirectional GRU outputs, and finally return the output and the final hidden state.

We will use the bidirectional variant of the GRU, meaning that there are essentially two independent RNNs: one that is fed the input sequence in normal sequential order, and one that is fed the input sequence in reverse order. The outputs of the two networks are summed at each time step. Using a bidirectional GRU gives us the advantage of encoding both past and future context.

Defining the encoder class

In [None]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder whose two directions are summed.

    Args:
        hidden_size: size of the GRU hidden state; the embedding is expected to
            produce vectors of this same size.
        embedding: embedding module applied to the input word indexes.
        n_layers: number of stacked GRU layers.
        dropout: dropout between GRU layers (forced to 0 when n_layers == 1).
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        # because our input size is a word embedding with the number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: batch of input sentences; shape=(max_length, batch_size)
            input_lengths: sequence length of each sentence in the batch
                (tensor or list)
            hidden: optional initial hidden state, of shape
                (n_layers x num_directions, batch_size, hidden_size)

        Returns:
            outputs: output features from the last GRU layer for each timestep,
                with the two directions summed;
                shape=(max_length, batch_size, hidden_size)
            hidden: hidden state for the last timestep;
                shape=(n_layers x num_directions, batch_size, hidden_size)
        """
        # pack_padded_sequence requires a lengths tensor to live on the CPU,
        # even when the rest of the batch sits on another device
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, enforce_sorted=False)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs (first half = forward, second half = backward)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden



In [None]:
a = torch.randn(5,3,7)
a

tensor([[[-2.2280e-01,  2.0142e-01, -2.4913e-01, -4.4207e-01, -8.8855e-01,
           7.7878e-01,  4.9811e-01],
         [-1.0300e-01,  7.6054e-01, -4.0112e-01, -1.0350e-02,  4.6259e-01,
          -7.2658e-01,  1.2543e+00],
         [ 4.6943e-01, -1.0709e+00,  3.4586e-02,  4.0473e-01, -8.1455e-01,
           9.2665e-02,  3.4458e-01]],

        [[ 2.4751e+00, -1.5479e+00,  7.6795e-01,  2.1837e+00, -9.8268e-01,
          -2.6216e+00,  1.6933e-01],
         [-1.3522e-01,  6.1532e-01, -1.7428e-01,  2.2826e+00,  3.5139e-01,
           3.9133e-01,  5.3095e-01],
         [ 1.3137e+00,  1.1407e+00,  2.4797e-01,  1.0734e-01,  5.5858e-01,
          -1.4304e+00, -9.0088e-01]],

        [[ 2.0475e-01, -1.6376e+00, -3.1823e-01,  9.8716e-01, -2.1994e+00,
           1.0863e+00,  4.8451e-01],
         [-1.0549e-01,  5.9714e-01,  5.6380e-01,  6.9940e-01,  1.7712e+00,
           1.0738e+00,  2.3776e-01],
         [-4.7313e-01, -1.3431e+00,  2.3311e-02,  4.2587e-01, -3.9279e-01,
           5.5582e-01, -5

In [None]:
torch.sum(a,dim=2)

tensor([[-0.3242,  1.2363, -0.5395],
        [ 0.4439,  3.8621,  1.0370],
        [-1.3926,  4.8376, -1.7937],
        [ 0.3302,  2.3359, -3.7479],
        [ 3.1308,  1.6367, -0.6360]])

In [None]:
b= torch.rand(5,7)

In [None]:
b

tensor([[0.3950, 0.9241, 0.4377, 0.8068, 0.8854, 0.1827, 0.3151],
        [0.4332, 0.7074, 0.1659, 0.2989, 0.4722, 0.7453, 0.0484],
        [0.3215, 0.1116, 0.8680, 0.4582, 0.4496, 0.8951, 0.9066],
        [0.7532, 0.2134, 0.3439, 0.6290, 0.0701, 0.2671, 0.0506],
        [0.8557, 0.2988, 0.3618, 0.5568, 0.6090, 0.6883, 0.1624]])

In [None]:
c=F.softmax(b, dim=1)

In [None]:
c

tensor([[0.1161, 0.1970, 0.1211, 0.1752, 0.1895, 0.0939, 0.1072],
        [0.1420, 0.1868, 0.1087, 0.1242, 0.1477, 0.1940, 0.0966],
        [0.1065, 0.0863, 0.1839, 0.1221, 0.1210, 0.1890, 0.1912],
        [0.2107, 0.1228, 0.1399, 0.1861, 0.1064, 0.1296, 0.1044],
        [0.1979, 0.1134, 0.1208, 0.1468, 0.1547, 0.1674, 0.0990]])

In [None]:
c[0].sum()

tensor(1.0000)

implementing the attention mechanism

In [None]:
# Luong attention layer
class Attn(torch.nn.Module):
  """Dot-product attention over the encoder outputs.

  `method` and `hidden_size` are stored on the instance; forward() always
  scores with dot_score regardless of `method`.
  """

  def __init__(self, method, hidden_size):
    super().__init__()
    self.method = method
    self.hidden_size = hidden_size

  def dot_score(self, hidden, encoder_output):
    """Multiply the decoder state with the encoder output element-wise and sum over dim 2."""
    return (hidden * encoder_output).sum(dim=2)

  def forward(self, hidden, encoder_outputs):
    """Return softmax-normalized attention weights.

    hidden: shape (1, batch_size, hidden_size)
    encoder_outputs: shape (max_length, batch_size, hidden_size)
    The product broadcasts to (max_length, batch_size, hidden_size), so the
    scores come out as (max_length, batch_size).
    Result: shape (batch_size, 1, max_length).
    """
    scores = self.dot_score(hidden, encoder_outputs)  # (max_length, batch_size)
    # Swap to (batch_size, max_length) and normalize over the sequence positions
    weights = F.softmax(scores.t(), dim=1)
    # Insert the middle dimension: (batch_size, 1, max_length)
    return weights.unsqueeze(1)

Using the attention mechanism to build our decoder

In [None]:
class LuongAttnDecoderRNN(nn.Module):
  """GRU decoder that attends over the encoder outputs, run one time step at a time."""

  def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
    super().__init__()
    # Keep the constructor arguments on the instance
    self.attn_model= attn_model
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout = dropout

    # Layers (dropout between GRU layers is disabled for a single-layer GRU)
    self.embedding = embedding
    self.embedding_dropout = nn.Dropout(dropout)
    gru_dropout = 0 if n_layers == 1 else dropout
    self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
    self.concat = nn.Linear(hidden_size * 2, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)

    self.attn = Attn(attn_model, hidden_size)

  def forward(self, input_step, last_hidden, encoder_outputs):
    """Decode a single time step for the whole batch.

    input_step: one time step (one word) of the input sequence batch; shape=(1, batch_size)
    last_hidden: previous hidden state of the GRU; shape=(n_layers x num_directions, batch_size, hidden_size)
    encoder_outputs: encoder model's output; shape=(max_length, batch_size, hidden_size)

    Returns (output, hidden):
      output: softmax-normalized scores over the output vocabulary; shape=(batch_size, output_size)
      hidden: new hidden state of the GRU; shape=(n_layers x num_directions, batch_size, hidden_size)
    """
    # Embed the current input word, with dropout applied to the embedding
    step_embedding = self.embedding_dropout(self.embedding(input_step))
    # One step through the unidirectional GRU
    rnn_output, hidden = self.gru(step_embedding, last_hidden)
    # Attention weights computed from the current GRU output
    attn_weights = self.attn(rnn_output, encoder_outputs)
    # Weighted sum of the encoder outputs:
    # (batch_size, 1, max_length) bmm (batch_size, max_length, hidden) = (batch_size, 1, hidden)
    context = torch.bmm(attn_weights, encoder_outputs.transpose(0,1)).squeeze(1)
    # Join the GRU output and the context vector along the feature dimension
    combined = torch.cat((rnn_output.squeeze(0), context), 1)
    attentional_state = torch.tanh(self.concat(combined))
    # Project to the output vocabulary and normalize (Luong eq. 6)
    output = F.softmax(self.out(attentional_state), dim=1)
    return output, hidden

In [None]:
device = 'cpu'

# Training the model

In [None]:
def maskNLLLoss(decoder_out, target, mask):
  """Mean negative log likelihood of the target words, ignoring masked-out positions.

  Args:
    decoder_out: probabilities per word; shape (batch_size, vocab_size)
    target: index of the correct word for each batch element; shape (batch_size,)
    mask: truthy where the target is a real token, falsy where it is padding;
      shape (batch_size,). Bool and 0/1 integer masks are both accepted.

  Returns:
    (loss, nTotal): the mean loss over the unmasked positions as a tensor, and
    the number of unmasked positions as a Python int.
  """
  # masked_select expects a boolean mask; this also covers 0/1 uint8 masks
  mask = mask.bool()
  nTotal = mask.sum()
  # Probability given to each target word. squeeze(1) brings the gathered
  # (batch_size, 1) column back to (batch_size,) so it lines up with the mask;
  # without it the two broadcast to (batch_size, batch_size) and padded
  # positions end up averaged into the loss.
  target_probs = torch.gather(decoder_out, 1, target.view(-1,1)).squeeze(1)
  # Calculate the Negative Log Likelihood Loss
  crossEntropy = -torch.log(target_probs)
  # Keep the unmasked positions only and average them. The result already sits
  # on the device of decoder_out, so no explicit transfer is needed.
  loss = crossEntropy.masked_select(mask).mean()
  return loss, nTotal.item()

# The training steps are as follows:

Forward pass the entire input batch through the encoder; initialize the decoder input as SOS_token and the decoder hidden state as the encoder's final hidden state. Forward the input batch sequence through the decoder one time step at a time. If teacher forcing: set the next decoder input as the current target; else set the next decoder input as the current decoder output. Calculate and accumulate the loss, perform backpropagation, clip gradients, and finally update the encoder and decoder model parameters.

In [None]:
# Visualize what's happening in one iteration. Only run this for visualisation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
# Unpack the last item into max_target_len, the name the prints and the loop
# below read. Unpacking into a differently spelled name made the loop run over
# a stale value left behind by an earlier cell instead of this batch's length.
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable shape:", input_variable.shape)
print("lengths shape:", lengths.shape)
print("target_variable shape:", target_variable.shape)
print("mask shape:", mask.shape)
print("max_lengths_len:", max_target_len)

#Define the parameters
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
# One embedding module, shared by the encoder and the decoder
embedding = nn.Embedding(voc.num_words, hidden_size)

#Define the encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)
# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
# No optimizer.step() here or in the loop below: this cell never calls
# backward(), so there are no gradients for the optimizers to apply.

input_variable = input_variable.to(device)
lengths = lengths.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)

loss = 0
print_losses = []
n_totals = 0

encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
print("Encoder Outputs Shape:", encoder_outputs.shape)
print("Last Encoder Hidden Shape:", encoder_hidden.shape)

decoder_input = torch.LongTensor([[SOS_token for _ in range(small_batch_size)]])
decoder_input = decoder_input.to(device)
print("Initial Decoder Input Shape:", decoder_input.shape)
print(decoder_input)

# Set initial decoder hidden state to the encoder's final hidden state
# (the first decoder.n_layers entries of it)
decoder_hidden = encoder_hidden[:decoder.n_layers]
print("Initial Decoder hidden state shape:", decoder_hidden.shape)
print("\n")
print("-------------------------------------------------------------")
print("Now let's look what's happening in every timestep of the GRU!")
print("--------------------------------------------------------------")
print("\n")

# Assume we are using Teacher forcing
for t in range(max_target_len):
  decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
  print("Decoder Output Shape:", decoder_output.shape)
  print("Decoder Hidden Shape:", decoder_hidden.shape)
  # teacher forcing: next input is current target
  decoder_input = target_variable[t].view(1,-1)
  print("The target variable at the current timestep before reshaping:", target_variable[t])
  print("The target variable at the current timestep shape before reshaping:", target_variable[t].shape)
  print("The Decoder input shape (reshape the target variable):", decoder_input.shape)
  # Calculate and accumulate loss
  print("The mask at the current timestep:", mask[t])
  print("The mask at the current timestep shape:", mask[t].shape)
  mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
  print("Mask Loss:", mask_loss)
  print("Total:", nTotal)
  loss += mask_loss
  # Weight each step's mean loss by its token count so the running average is per token
  print_losses.append(mask_loss.item() * nTotal)
  print(print_losses)
  n_totals += nTotal
  print(n_totals)
  returned_loss = sum(print_losses) / n_totals
  print("Returned Loss: ", returned_loss)
  print("\n")
  print("-----------------------------DONE ONE TIMESTEP-----------------")
  print("\n")


input_variable shape: torch.Size([8, 5])
lengths shape: torch.Size([5])
target_variable shape: torch.Size([7, 5])
mask shape: torch.Size([7, 5])
max_lengths_len: 3
Encoder Outputs Shape: torch.Size([8, 5, 500])
Last Encoder Hidden Shape: torch.Size([4, 5, 500])
Initial Decoder Input Shape: torch.Size([1, 5])
tensor([[1, 1, 1, 1, 1]])
Initial Decoder hidden state shape: torch.Size([2, 5, 500])


-------------------------------------------------------------
Now let's look what's happening in every timestep of the GRU!
--------------------------------------------------------------


Decoder Output Shape: torch.Size([5, 7826])
Decoder Hidden Shape: torch.Size([2, 5, 500])
The target variable at the current timestep before reshaping: tensor([ 77, 660,  96, 218, 144])
The target variable at the current timestep shape before reshaping: torch.Size([5])
The Decoder input shape (reshape the target variable): torch.Size([1, 5])
The mask at the current timestep: tensor([1, 1, 1, 1, 1], dtype=torch

  loss = crossEntropy.masked_select(mask)


In [None]:
def train(input_variable, lengths, target_varible, mask, max_target_len, encoder, decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH, teacher_forcing_ratio=0.5):
  """Run one optimisation step on a single batch and return its average masked loss.

  Args:
    input_variable: batch of input token indexes, passed to the encoder.
    lengths: lengths of the input sequences, passed to the encoder.
    target_varible: batch of target token indexes, indexed by time step along
      its first dimension. (The misspelled name is kept so existing callers
      that pass it by keyword keep working.)
    mask: mask for the target batch, indexed the same way as the target.
    max_target_len: number of decoding time steps to run.
    encoder, decoder: the models being trained.
    embedding: unused here; kept for interface compatibility.
    encoder_optimizer, decoder_optimizer: optimizers stepped at the end.
    batch_size: number of sequences in the batch.
    clip: maximum gradient norm applied to encoder and decoder separately.
    max_length: unused here; kept for interface compatibility.
    teacher_forcing_ratio: probability of feeding the ground-truth token
      (instead of the decoder's own prediction) as the next decoder input
      for this batch.

  Returns:
    Sum of per-step losses weighted by the per-step totals reported by
    maskNLLLoss, divided by the overall total.
  """
  # Zero gradients
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  # Set device options
  input_variable = input_variable.to(device)
  # Lengths stay on the CPU, matching how evaluate() feeds the same encoder.
  # NOTE(review): presumably the encoder packs sequences, which needs CPU lengths.
  lengths = lengths.to("cpu")
  target_variable = target_varible.to(device)
  mask = mask.to(device)

  # Initialize variables
  loss = 0
  print_losses = []
  n_totals = 0

  # Forward pass through encoder
  encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

  # Create initial decoder input (start with SOS tokens for each sentence)
  decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
  decoder_input = decoder_input.to(device)

  # Set initial decoder hidden state to the encoder's final hidden state
  decoder_hidden = encoder_hidden[:decoder.n_layers]

  # Determine if we are using teacher forcing this iteration
  use_teacher_forcing = random.random() < teacher_forcing_ratio

  # Forward batch of sequences through decoder one time step at a time.
  # Both modes share the same loop; they only differ in the next decoder input.
  for t in range(max_target_len):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
    if use_teacher_forcing:
      # Teacher forcing: next input is current target
      decoder_input = target_variable[t].view(1, -1)
    else:
      # No teacher forcing: next input is decoder's own current output
      _, topi = decoder_output.topk(1)
      decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
      decoder_input = decoder_input.to(device)
    # Calculate and accumulate loss
    mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
    loss += mask_loss
    print_losses.append(mask_loss.item() * nTotal)
    n_totals += nTotal

  # Perform backpropagation
  loss.backward()

  # Clip gradients: gradients are modified in place
  _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
  _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

  # Adjust model weights
  encoder_optimizer.step()
  decoder_optimizer.step()

  return sum(print_losses) / n_totals

In [None]:
def trainIters(model_name, voc, pair, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, pairs, n_iteration, batch_size, print_every, save_every, clip, directory_path, loadFilename):
  """Train for n_iteration batches, printing progress and saving checkpoints.

  The parameter names are kept for compatibility, but note how they are used:
    pair: the collection of training pairs that batches are sampled from.
    pairs: root directory under which checkpoints are written.
    directory_path: sub-directory name placed below model_name in the
      checkpoint path.

  Other args:
    model_name: sub-directory name placed directly below the root directory.
    voc: vocabulary object; passed to batch2TrainData and saved in checkpoints.
    encoder, decoder, embedding: modules whose state dicts are checkpointed.
    encoder_optimizer, decoder_optimizer: optimizers, also checkpointed.
    encoder_n_layers, decoder_n_layers: used only to name the checkpoint folder.
    n_iteration: last iteration number to run (inclusive).
    batch_size: number of pairs sampled per batch.
    print_every: print the average loss every this many iterations.
    save_every: write a checkpoint every this many iterations.
    clip: gradient clipping value forwarded to train().
    loadFilename: if truthy, resume from the iteration stored in the
      module-level `checkpoint` object, which must already be loaded.
  """
  # Positional callers pass the training data third and the save directory
  # eleventh, so map the parameters onto names that say what they hold.
  training_pairs = pair
  save_dir = pairs
  corpus_name = directory_path

  # Load batches for each iteration
  training_batches = [batch2TrainData(voc, [random.choice(training_pairs) for _ in range(batch_size)]) for _ in range(n_iteration)]
  # Initializations
  print("Initializing.....")
  start_iteration = 1
  print_loss = 0
  if loadFilename:
    # Relies on `checkpoint` having been loaded at notebook level beforehand.
    start_iteration = checkpoint['iteration'] + 1

  # Training loop
  print("Training....")
  for iteration in range(start_iteration, n_iteration + 1):
    training_batch = training_batches[iteration - 1]
    # Extract fields from batches
    input_variable, lengths, target_variable, mask, max_target_len = training_batch

    # Run a training iteration with batch
    loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
    print_loss += loss

    # Print progress
    if iteration % print_every == 0:
      print_loss_avg = print_loss / print_every
      print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
      print_loss = 0

    # Save checkpoint
    if iteration % save_every == 0:
      # Folder name uses a single dash so it matches the path the notebook
      # builds when it looks for a checkpoint to load. `hidden_size` is read
      # from the notebook's module-level configuration.
      directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
      os.makedirs(directory, exist_ok=True)
      torch.save({'iteration': iteration,
                  'en': encoder.state_dict(),
                  'de': decoder.state_dict(),
                  'en_opt': encoder_optimizer.state_dict(),
                  'de_opt': decoder_optimizer.state_dict(),
                  'loss': loss,
                  'voc_dict': voc.__dict__,
                  'embedding': embedding.state_dict()}, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))



After training a model, we want to be able to talk to the bot ourselves. First, we must define how we want the model to decode the encoded input.

# Greedy decoding

Greedy decoding is the decoding method that we use during training when we are NOT using teacher forcing. In other words, for each time step, we simply choose the word from decoder_output with the highest softmax value. This decoding method is optimal on a single time-step level.

To facilitate the greedy decoding operation, we define a GreedySearchDecoder class. When run, an object of this class takes an input sequence (input_seq) of shape (input_seq length, 1), a scalar input length (input_length) tensor, and a max_length to bound the response sentence length.

In [None]:
class GreedySearchDecoder(nn.Module):
  """Greedy decoder: at every step feed back the highest-scoring token."""

  def __init__(self, encoder, decoder):
    """Store the encoder and decoder modules used by forward()."""
    super(GreedySearchDecoder, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, input_seq, input_length, max_length):
    """Decode max_length tokens for one input sequence.

    Args:
      input_seq: input token indexes, passed to the encoder.
      input_length: lengths tensor, passed to the encoder.
      max_length: number of decoding steps to run.

    Returns:
      (all_tokens, all_scores): 1-D tensors holding, per step, the index of
      the highest-scoring decoder output and that score.
    """
    # Forward input through encoder model
    encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
    # Prepare encoder's final hidden layer to be first hidden input to the decoder
    # (use the wrapped decoder, not whatever global happens to be called `decoder`)
    decoder_hidden = encoder_hidden[:self.decoder.n_layers]
    # Keep every working tensor on the same device as the encoder outputs
    run_device = encoder_outputs.device
    # Initialize decoder input with SOS_token
    decoder_input = torch.ones(1, 1, device=run_device, dtype=torch.long) * SOS_token
    # Initialize tensors to append decoder words to
    all_tokens = torch.zeros([0], device=run_device, dtype=torch.long)
    all_scores = torch.zeros([0], device=run_device)
    # Iteratively decode one word token at a time
    for _ in range(max_length):
      # Forward pass through decoder
      decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
      # Obtain most likely word token and its score
      decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
      # Record token and score
      all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
      all_scores = torch.cat((all_scores, decoder_scores), dim=0)
      # Prepare current token to be next decoder input (add a dimension)
      decoder_input = torch.unsqueeze(decoder_input, 0)
    # Return collections of word token and scores
    return all_tokens, all_scores

In [None]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
  """Decode a reply for one sentence and return it as a list of words.

  Args:
    encoder, decoder: unused here (the searcher is what gets called); kept
      for interface compatibility.
    searcher: callable taking (input_batch, lengths, max_length) and
      returning (tokens, scores).
    voc: vocabulary; used by indexesFromSentence and for index2word lookup.
    sentence: input sentence to respond to.
    max_length: number of decoding steps requested from the searcher.

  Returns:
    List of words looked up in voc.index2word, one per returned token.
  """
  ### Format input sentence as a batch
  # words -> indexes
  indexes_batch = [indexesFromSentence(voc, sentence)]
  # Create lengths tensor
  lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
  # Transpose dimensions of batch to match model's expectations
  input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
  # Use appropriate device
  input_batch = input_batch.to(device)
  lengths = lengths.to("cpu")
  # Decode sentence with searcher; no gradients are needed for inference,
  # so skip building the autograd graph.
  with torch.no_grad():
    tokens, scores = searcher(input_batch, lengths, max_length)
  # indexes -> words
  decoded_words = [voc.index2word[token.item()] for token in tokens]
  return decoded_words

def evaluateInput(encoder, decoder, searcher, voc):
  """Interactive chat loop: read a line, print the bot's reply, repeat.

  Typing 'q' or 'quit' ends the loop. A KeyError raised while handling a
  line is reported as an unknown word and the loop continues.
  """
  quit_commands = ('q', 'quit')
  hidden_tokens = ('EOS', 'PAD')
  while True:
    try:
      # Read the next line from the user
      user_text = input('> ')
      # Stop on a quit command
      if user_text in quit_commands:
        break
      # Normalize, then ask the model for a reply
      normalized = normalizeString(user_text)
      reply = evaluate(encoder, decoder, searcher, voc, normalized)
      # Drop the EOS / PAD markers before printing
      visible = [word for word in reply if word not in hidden_tokens]
      print('Bot:', ' '.join(visible))
    except KeyError:
      print("Error: Encountered unknown word.")

In [None]:
# Model configuration
model_name, attn_model = 'cb_model', 'dot'  # alternatives for attn_model: 'general', 'concat'
encoder_n_layers = decoder_n_layers = 2
hidden_size, dropout, batch_size = 500, 0.1, 64

# Checkpoint selection: loadFilename stays None when starting from scratch;
# checkpoint_iter is the iteration number used to name the checkpoint file.
checkpoint_iter = 4000
loadFilename = None

In [None]:
# Point at the checkpoint for `checkpoint_iter`, but only resume from it when
# the file is really there; otherwise keep None so the run starts from scratch
# instead of failing later when the checkpoint is read.
loadFilename = os.path.join(save_dir, model_name, corpus_name,
                    '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
                    '{}_checkpoint.tar'.format(checkpoint_iter))
if not os.path.isfile(loadFilename):
  loadFilename = None

In [None]:
# Load model if a "loadFilename" is provided
if loadFilename:
  # Read the checkpoint file; map_location moves the stored tensors onto the
  # active device, so a checkpoint saved on GPU also loads on a CPU-only machine.
  checkpoint = torch.load(loadFilename, map_location=device)
  encoder_sd = checkpoint['en']
  decoder_sd = checkpoint['de']
  encoder_optimizer_sd = checkpoint['en_opt']
  decoder_optimizer_sd = checkpoint['de_opt']
  embedding_sd = checkpoint['embedding']
  voc.__dict__ = checkpoint['voc_dict']

print('Building encoder and decoder....')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
  embedding.load_state_dict(embedding_sd)
# Initialize encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
  encoder.load_state_dict(encoder_sd)
  decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models bulit and ready to go')

Building encoder and decoder....
Models bulit and ready to go


In [None]:
# Configure training/optimization
clip = 50.0
# NOTE: this module-level value is not passed to trainIters() in the call below.
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any tensors held in the optimizer state onto the same device as the
# models; unlike an unconditional .cuda(), this also works without a GPU.
for _optimizer in (encoder_optimizer, decoder_optimizer):
    for state in _optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

In [None]:
# Put both networks into evaluation mode and wrap them in the greedy search
# module. Module.eval() returns the module itself, so it can be called inline.
searcher = GreedySearchDecoder(encoder.eval(), decoder.eval())

# To start chatting, uncomment and run the line below
# evaluateInput(encoder, decoder, searcher, voc)