# Project Overview

In this project, I will build a chatbot that can converse with you at the command line. The chatbot will use a Sequence to Sequence text generation architecture with an LSTM as it's memory unit. You will also learn to use pretrained word embeddings to improve the performance of the model. At the conclusion of the project, you will be able to show your chatbot to potential employers.

Additionally, you have the option to use pretrained word embeddings in your model. We have loaded Brown Embeddings from Gensim in the starter code below. You can compare the performance of your model with pre-trained embeddings against a model without the embeddings.

A sequence to sequence model (Seq2Seq) has two components:

An Encoder consisting of an embedding layer and LSTM unit.
A Decoder consisting of an embedding layer, LSTM unit, and linear output unit.
The Seq2Seq model works by accepting an input into the Encoder, passing the hidden state from the Encoder to the Decoder, which the Decoder uses to output a series of token predictions.

In [11]:
import gensim
import nltk
import numpy as np
import pandas as pd
import gzip
import torch
from nltk.corpus import brown
from torchtext.datasets import SQuAD1
import string
import torch.nn as nn
import random 
from sklearn.model_selection import KFold
from datetime import datetime

stemmer = nltk.stem.snowball.SnowballStemmer('english')

nltk.download('brown')
nltk.download('punkt')

# Output, save, and load brown embeddings

model = gensim.models.Word2Vec(brown.sents())
model.save('brown.embedding')

w2v = gensim.models.Word2Vec.load('brown.embedding')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package brown to /Users/ante/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ante/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def loadDF(path):
    
    dataset_train, dataset_dev = SQuAD1(root = path, split = ('train', 'dev'))

    df_train = pd.DataFrame.from_dict(dataset_train)
    df_dev = pd.DataFrame.from_dict(dataset_dev)
    
    df = df_train.append(df_dev)
    
    return df

In [13]:
df = loadDF('.data')

feature = ["Sentence", "Question", "Answer", "?"]
df.columns = feature

df.head()

train-v1.1.json: 30.3MB [00:00, 46.9MB/s]                            
dev-v1.1.json: 4.85MB [00:00, 44.8MB/s]                   


Unnamed: 0,Sentence,Question,Answer,?
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,[Saint Bernadette Soubirous],[515]
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,[a copper statue of Christ],[188]
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,[the Main Building],[279]
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,[a Marian place of prayer and reflection],[381]
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,[a golden statue of the Virgin Mary],[92]


In [14]:
SOS_token = 0
EOS_token = 1

class Chatbot:
    def __init__(self):
        #initialize containers to hold the words and corresponding index
        self.word2index = {"": SOS_token, "": EOS_token}
        self.index2word = {SOS_token: "", EOS_token: ""}
        self.words_count = len(self.word2index)
        
    def add_Sentence(self, sentence):
        #split a sentence into words and add it to the container
        for word in sentence.split(' '):
            self.add_words(word)

    def add_words(self, word):
        #If the word is not in the container, the word will be added to it
        if word not in self.word2index:
            self.word2index[word] = self.words_count
            self.index2word[self.words_count] = word
            self.words_count += 1

In [15]:
#Normalize every sentence
def normalize_sentence(sentence):
    
    sentence = ''.join([s.lower() for s in sentence if s not in string.punctuation])
    sentence = ' '.join(stemmer.stem(w) for w in sentence.split())
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sentence)
    
    return tokens

def read_sentence(df, col1, col2):
    
    df[col1] = df[col1].apply(normalize_sentence)
    sentence1 = df[col1].apply(lambda x: " ".join(x)).to_list()
    df[col2] = df[col2].apply(normalize_sentence)
    sentence2 = df[col2].apply(lambda x: " ".join(x) ).to_list()
    
    return list(sentence1), list(sentence2)

def getMaxLen(sentence):
    
    max_len = 0 
    
    for s in sentence:
        max_len = len(s) if len(s) > max_len else max_len
        
    return max_len

In [16]:
def process_data(df, col1, col2):
    
    sentence1, sentence2 = read_sentence(df, col1, col2)
    
    source = Chatbot()
    target = Chatbot()
    pairs = []
    
    max_lens = getMaxLen(sentence1)
    max_lenc = getMaxLen(sentence2)
    for i in range(len(df)):
        if len(sentence1[i]) <= max_lens and len(sentence2[i]) <= max_lenc:
            full = [sentence1[i], sentence2[i]]
            source.add_Sentence(sentence1[i])
            target.add_Sentence(sentence2[i])
            pairs.append(full)
    
    return source, target, pairs

In [17]:
source, target, pairs = process_data(df, 'Question', 'Answer')

In [18]:
pairs[0]

['to whom did the virgin mari alleg appear in 1858 in lourd franc',
 'saint bernadett soubir']

In [19]:
source, target

(<__main__.Chatbot at 0x1a308a2d68>, <__main__.Chatbot at 0x1a2c2c6940>)

In [20]:
#converting pairs into Tensor.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def indexesFromSentence(chatbot, sentence):
    return [chatbot.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(chatbot, sentence):
    indexes = indexesFromSentence(chatbot, sentence)
    indexes.append(chatbot.word2index[''])
    return torch.Tensor(indexes).long().to(device).view(-1, 1)

def tensorsFromPair(source, target, pairs):
    source_data = []
    target_data = []
    for pair in pairs:
        source_data.append(tensorFromSentence(source, pair[0]))
        target_data.append(tensorFromSentence(target, pair[1]))
    return source_data, target_data

In [21]:
source_data, target_data = tensorsFromPair(source, target, pairs)

In [22]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(self.input_size, self.hidden_size)
        # The LSTM is our last cell because it produces the hidden state
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)

    def forward(self, x, hidden, cell_state):
        
        x = self.embedding(x).view(1, 1, -1)
        x, (hidden, cell_state) = self.lstm(x, (hidden, cell_state))
        
        #outputs are always from the top hidden layer
        return x, hidden, cell_state     

In [23]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # initialize every layer with the appropriate dimension.
        self.embedding = nn.Embedding(output_size, self.hidden_size)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden, cell_state):
        
        # reshape the input to (1, 1, batch_size)
        x = self.embedding(x).view(1, 1, -1)
        x, (hidden, cell_state) = self.lstm(x, (hidden, cell_state))
        x = self.softmax(self.fc(x[0]))
        
        return x, hidden, cell_state

In [24]:
class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Seq2Seq, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        #initialize the encoder and decoder
        self.encoder = Encoder(self.input_size, self.hidden_size)
        self.decoder = Decoder(self.hidden_size, self.output_size)
        
    def forward(self, src, trg, src_len, trg_len, teacher_force=1):
        
        #initialize a variable to hold the predicted outputs
        output = {'decoder_output':[]}
        
        encoder_hidden = torch.zeros([1, 1, self.hidden_size]).to(device) 
        cell_state = torch.zeros([1, 1, self.hidden_size]).to(device)  
        
        for i in range(src_len):
            encoder_output, encoder_hidden, cell_state = self.encoder(src[i], encoder_hidden, cell_state)

        #use the encoder’s hidden layer as the decoder hidden
        decoder_input = torch.Tensor([[0]]).long().to(device) 
        decoder_hidden = encoder_hidden
        
        #predict the output word from the current target word.
        for i in range(trg_len):
            decoder_output, decoder_hidden, cell_state = self.decoder(decoder_input, decoder_hidden, cell_state)
            output['decoder_output'].append(decoder_output)
            
            if self.training: 
                decoder_input = target_tensor[i] if random.random() > teacher_force else decoder_output.argmax(1) 
            else:
                _, top_index = decoder_output.data.topk(1)
                decoder_input = top_index.squeeze().detach()
                
        return output

In [25]:
def train(source_data, target_data, model, epochs, batch_size, print_every, learning_rate):
    
    model.to(device)
    total_training_loss = 0
    total_valid_loss = 0
    loss = 0
    
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    kf = KFold(n_splits=epochs, shuffle=True)

    for e, (train_index, test_index) in enumerate(kf.split(source_data), 1):
        model.train()
        for i in range(0, len(train_index)):

            src = source_data[i]
            trg = target_data[i]

            output = model(src, trg, src.size(0), trg.size(0))

            #calculate the loss from a predicted sentence with the expected result
            current_loss = 0
            for (s, t) in zip(output["decoder_output"], trg): 
                current_loss += criterion(s, t)

            loss += current_loss
            total_training_loss += (current_loss.item() / trg.size(0))

            if i % batch_size == 0 or i == (len(train_index)-1):
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                loss = 0

        model.eval()
        for i in range(0, len(test_index)):
            src = source_data[i]
            trg = target_data[i]

            output = model(src, trg, src.size(0), trg.size(0))

            current_loss = 0
            for (s, t) in zip(output["decoder_output"], trg): 
                current_loss += criterion(s, t)

            total_valid_loss += (current_loss.item() / trg.size(0))

        if e % print_every == 0:
            training_loss_average = total_training_loss / (len(train_index)*print_every)
            validation_loss_average = total_valid_loss / (len(test_index)*print_every)
            print("{}/{} Epoch  -  Training Loss = {:.4f}  -  Validation Loss = {:.4f}".format(e, epochs, training_loss_average, validation_loss_average))
            total_training_loss = 0
            total_valid_loss = 0 

In [27]:
hidden_size = 128

seq2seq = Seq2Seq(source.words_count, hidden_size, target.words_count)

In [29]:
learning_rate = 0.1
batch_size = 128
epochs = 50
train(source_data = source_data,
      target_data = target_data,
      model = seq2seq,
      epochs = epochs,
      batch_size = batch_size,
      print_every = 5,
      learning_rate = learning_rate)

0/50 Epoch  -  Training Loss = 11.2148  -  Validation Loss = 6.5352
5/50 Epoch  -  Training Loss = 9.0778  -  Validation Loss = 6.6216
10/50 Epoch  -  Training Loss = 6.3157  -  Validation Loss = 6.4931
15/50 Epoch  -  Training Loss = 8.9279  -  Validation Loss = 12.6267
20/50 Epoch  -  Training Loss = 7.9184  -  Validation Loss = 6.1722
25/50 Epoch  -  Training Loss = 5.9953  -  Validation Loss = 6.8249
30/50 Epoch  -  Training Loss = 5.4673  -  Validation Loss = 4.4526
35/50 Epoch  -  Training Loss = 5.3805  -  Validation Loss = 3.8604
40/50 Epoch  -  Training Loss = 5.0533  -  Validation Loss = 4.6428
45/50 Epoch  -  Training Loss = 6.4695  -  Validation Loss = 6.4215
50/50 Epoch  -  Training Loss = 6.6867  -  Validation Loss = 6.7893


In [30]:
def evaluate(source, target, pairs, model, target_max_len):

    source_data, target_data = tensorsFromPair(source, target, pairs)
    answer_words = []
    
    output = model(source_data[0], None, source_data[0].size(0), target_max_len)

    for tensor in output['decoder_output']:

        top_v, top_token = tensor.data.topk(1)
        
        if top_token.item() == EOS_token:
            #answer_words.append('<EOS>')
            continue
        else:
            word = target.index2word[top_token.item()]
            answer_words.append(word)
            
    print(' '.join(answer_words), "\n")

In [31]:
torch.save(seq2seq, 'seq2seq.pt')

seq2seq = torch.load('seq2seq.pt')
seq2seq.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(30769, 128)
    (lstm): LSTM(128, 128)
  )
  (decoder): Decoder(
    (embedding): Embedding(46365, 128)
    (lstm): LSTM(128, 128)
    (fc): Linear(in_features=128, out_features=46365, bias=True)
    (softmax): LogSoftmax(dim=1)
  )
)

In [68]:
print("Type 'exit' to finish the chat.\n", "-"*50, '\n')
pair_list = []
while (True):
    pair = input("> ")
    
    if pair.strip() == "exit":
        break  
    pair_item = pair.split(",")
    for item in pair_item:
        pair_list.append(item)  

pair_list = [pair_list]

Type 'exit' to finish the chat.
 -------------------------------------------------- 

> who would she perform with at superbowl 50,coldplay
> exit


In [73]:
evaluate(source, target, pair_list, seq2seq, len(pair_list[0][1]))

june era and nation coldplay of 



In [43]:
pairs

[['to whom did the virgin mari alleg appear in 1858 in lourd franc',
  'saint bernadett soubir'],
 ['what is in front of the notr dame main build', 'a copper statu of christ'],
 ['the basilica of the sacr heart at notr dame is besid to which structur',
  'the main build'],
 ['what is the grotto at notr dame', 'a marian place of prayer and reflect'],
 ['what sit on top of the main build at notr dame',
  'a golden statu of the virgin mari'],
 ['when did the scholast magazin of notr dame begin publish', 'septemb 1876'],
 ['how often is notr dame the juggler publish', 'twice'],
 ['what is the daili student paper at notr dame call', 'the observ'],
 ['how mani student news paper are found at notr dame', 'three'],
 ['in what year did the student paper common sens begin public at notr dame',
  '1987'],
 ['where is the headquart of the congreg of the holi cross', 'rome'],
 ['what is the primari seminari of the congreg of the holi cross',
  'moreau seminari'],
 ['what is the oldest structur at n