# Project Overview

In this project, you will build a chatbot that can converse with you at the command line. The chatbot will use a Sequence-to-Sequence text generation architecture with an LSTM as its memory unit. You will also learn to use pretrained word embeddings to improve the performance of the model. At the conclusion of the project, you will be able to show your chatbot to potential employers.

Additionally, you have the option to use pretrained word embeddings in your model. We have loaded Brown Embeddings from Gensim in the starter code below. You can compare the performance of your model with pre-trained embeddings against a model without the embeddings.

A sequence to sequence model (Seq2Seq) has two components:

- An Encoder consisting of an embedding layer and an LSTM unit.
- A Decoder consisting of an embedding layer, an LSTM unit, and a linear output unit.

The Seq2Seq model works by feeding an input sequence into the Encoder and passing the hidden state from the Encoder to the Decoder, which the Decoder uses to output a series of token predictions.

In [None]:
import gensim
import nltk
import numpy as np
import pandas as pd
import gzip
import torch
from nltk.corpus import brown
from torchtext.datasets import SQuAD1
import string
import torch.nn as nn
import random 
from sklearn.model_selection import KFold
from datetime import datetime

# English Snowball stemmer; normalize_sentence() below stems every word with it.
stemmer = nltk.stem.snowball.SnowballStemmer('english')

# Download the NLTK resources: the Brown corpus (read through brown.sents()
# below) and the 'punkt' data.
nltk.download('brown')
nltk.download('punkt')

# Train Word2Vec embeddings on the Brown corpus sentences, save them to
# 'brown.embedding', then load them back from that file.

# NOTE: the name `model` is rebound to the Seq2Seq network later in this file.
model = gensim.models.Word2Vec(brown.sents())
model.save('brown.embedding')

# NOTE(review): `w2v` is not referenced again in the visible part of the file.
w2v = gensim.models.Word2Vec.load('brown.embedding')

# Pick CUDA when available. NOTE: `device` is reassigned to CPU in a later cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package brown to /Users/ante/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ante/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def loadDF(path):
    """Load the SQuAD1 train and dev splits into a single DataFrame.

    Args:
        path: Root directory handed to torchtext's ``SQuAD1`` dataset.

    Returns:
        A DataFrame containing the rows of the train split followed by the
        rows of the dev split. Each split keeps its own row labels, so index
        labels are not unique across the two splits.
    """
    dataset_train, dataset_dev = SQuAD1(root=path, split=('train', 'dev'))

    df_train = pd.DataFrame.from_dict(dataset_train)
    df_dev = pd.DataFrame.from_dict(dataset_dev)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # pd.concat stacks the frames in the same order and keeps the row labels.
    return pd.concat([df_train, df_dev])

In [None]:
# Load both SQuAD1 splits, using '.data' as the dataset root directory.
df = loadDF('.data')

# Name the four columns. The last one is left as "?"; judging by the preview
# it presumably holds answer start offsets (TODO confirm).
feature = ["Sentence", "Question", "Answer", "?"]
df.columns = feature

# Preview the first rows.
df.head()

Unnamed: 0,Sentence,Question,Answer,?
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,[Saint Bernadette Soubirous],[515]
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,[a copper statue of Christ],[188]
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,[the Main Building],[279]
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,[a Marian place of prayer and reflection],[381]
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,[a golden statue of the Virgin Mary],[92]


In [None]:
# Keep only the question/answer pair; the other two columns are dropped.
df = df[['Question', 'Answer']]

In [None]:
def normalize_sentence(sentence):
    """Lower-case, de-punctuate and stem ``sentence``.

    The input is iterated element by element: every element that is found in
    ``string.punctuation`` is dropped, the remaining elements are lower-cased
    and concatenated, and each whitespace-separated word of the result is
    stemmed with the module-level ``stemmer``.

    NOTE(review): for a ``str`` the elements are single characters. For a
    list of strings each element is a whole string, so the punctuation test
    becomes a substring test on the full string and the strings are joined
    without a separator -- confirm this is intended for list-valued answers.

    Returns:
        The stemmed words joined by single spaces.
    """
    kept = []
    for piece in sentence:
        if piece in string.punctuation:
            continue
        kept.append(piece.lower())

    words = ''.join(kept).split()
    return ' '.join(map(stemmer.stem, words))

In [None]:
# Normalise both columns in place (lower-case, strip punctuation, stem).
df['Question'] = df['Question'].apply(normalize_sentence)
df['Answer'] = df['Answer'].apply(normalize_sentence)

In [None]:
# Positional split: the first 10,000 rows train the model and the next
# 10,000 rows are used as the held-out set (no shuffling before the split).
df_train = df.iloc[:10000, :]
df_test = df.iloc[10000:20000, :]

In [None]:
# Preview the normalised training rows.
df_train.head()

Unnamed: 0,Question,Answer
0,to whom did the virgin mari alleg appear in 18...,saint bernadett soubir
1,what is in front of the notr dame main build,a copper statu of christ
2,the basilica of the sacr heart at notr dame is...,the main build
3,what is the grotto at notr dame,a marian place of prayer and reflect
4,what sit on top of the main build at notr dame,a golden statu of the virgin mari


In [None]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
import spacy

# Load spaCy's small English pipeline.
# NOTE(review): `en_nlp` is not referenced again in the visible part of the file.
en_nlp = spacy.load("en_core_web_sm")


# torchtext tokenizer backed by the same spaCy model; used both to build the
# vocabularies and to encode sentences.
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def build_vocab(df, caption, tokenizer):
    """Build a torchtext vocabulary from one text column of a DataFrame.

    Args:
        df: DataFrame holding the text column.
        caption: Name of the column whose strings are tokenized.
        tokenizer: Callable mapping a string to an iterable of tokens.

    Returns:
        A vocabulary built from the token counts of the column, with the
        special tokens ``<unk>``, ``<pad>``, ``<sos>`` and ``<eos>`` added.
        Tokens that are not in the vocabulary map to the index of ``<unk>``.
    """
    counter = Counter()
    # Only one column is needed, so iterate over it directly instead of
    # materialising a Series per row with iterrows().
    for text in df[caption]:
        counter.update(tokenizer(text))

    v2 = vocab(counter, specials=['<unk>', '<pad>', '<sos>', '<eos>'])
    # Out-of-vocabulary lookups resolve to <unk> instead of raising.
    v2.set_default_index(v2['<unk>'])

    return v2

In [None]:
# Separate vocabularies for questions and answers, built from training rows only.
question_vocab = build_vocab(df_train, "Question", en_tokenizer)
answer_vocab = build_vocab(df_train, "Answer", en_tokenizer)

def data_process(df):
    """Encode every (question, answer) row of ``df`` as a pair of index tensors.

    Relies on the module-level ``en_tokenizer``, ``question_vocab`` and
    ``answer_vocab``. ``df`` must have exactly two columns, question first
    and answer second.

    Returns:
        A list of ``(question_tensor, answer_tensor)`` tuples of dtype
        ``torch.long``, one per row, in row order.
    """
    def encode(text, lookup):
        # Tokenize and map every token to its vocabulary index.
        ids = [lookup[tok] for tok in en_tokenizer(text)]
        return torch.tensor(ids, dtype=torch.long)

    return [
        (encode(q_text, question_vocab), encode(a_text, answer_vocab))
        for q_text, a_text in df.itertuples(index=False, name=None)
    ]

# Encode both splits with the vocabularies built from the training rows.
train_data = data_process(df_train)
val_data = data_process(df_test)

In [None]:
# Inspect the first encoded (question, answer) pair.
train_data[0]

(tensor([ 4,  5,  6,  7,  8,  9, 10, 11]), tensor([4, 5, 6]))

In [None]:
# Index of the start-of-sequence token in the question vocabulary.
question_vocab['<sos>']

2

In [None]:
import torch

# Force CPU execution; this overrides the CUDA-aware `device` chosen earlier.
device = torch.device('cpu')
print(device)
# NOTE(review): 521 looks like a transposition of 512 -- confirm.
BATCH_SIZE = 521
# Special-token indices are read from the question vocabulary and are also
# applied to answer sequences in generate_batch(); both vocabularies are
# built by build_vocab() with the same specials list.
PAD_IDX = question_vocab['<pad>']
SOS_IDX = question_vocab['<sos>']
EOS_IDX = question_vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch, pad_idx=None, sos_idx=None, eos_idx=None):
    """Collate (question, answer) tensor pairs into two padded batches.

    Every sequence is wrapped as ``<sos> ... <eos>``. The sequences of each
    side are then padded to a common length with ``pad_sequence`` using its
    default layout, so the results have shape ``(max_len, batch)``.

    Args:
        data_batch: Iterable of ``(question_tensor, answer_tensor)`` pairs of
            1-D index tensors.
        pad_idx: Padding index; defaults to the module-level ``PAD_IDX``.
        sos_idx: Start-of-sequence index; defaults to ``SOS_IDX``.
        eos_idx: End-of-sequence index; defaults to ``EOS_IDX``.

    Returns:
        Tuple ``(questions, answers)`` of padded index tensors.
    """
    pad_idx = PAD_IDX if pad_idx is None else pad_idx
    sos_idx = SOS_IDX if sos_idx is None else sos_idx
    eos_idx = EOS_IDX if eos_idx is None else eos_idx

    # Build the marker tensors once instead of once per sequence.
    sos = torch.tensor([sos_idx])
    eos = torch.tensor([eos_idx])

    de_batch, en_batch = [], []
    for de_item, en_item in data_batch:
        de_batch.append(torch.cat([sos, de_item, eos], dim=0))
        en_batch.append(torch.cat([sos, en_item, eos], dim=0))

    # Pad once, after the whole batch has been collected. Padding inside the
    # loop re-pads the growing lists on every item and discards every result
    # but the last.
    de = pad_sequence(de_batch, padding_value=pad_idx)
    en = pad_sequence(en_batch, padding_value=pad_idx)
    return de, en

# Batch iterators over the encoded pairs; generate_batch adds <sos>/<eos> and pads.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
# NOTE(review): the validation loader is shuffled as well.
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)

cpu


In [None]:
# Show the padding index.
PAD_IDX

1

In [None]:
# Number of batches per training epoch.
len(train_iter)

20

In [None]:
class Encoder(nn.Module):
    """Seq2Seq encoder: embedding -> dropout -> multi-layer LSTM.

    Args:
        input_size: Number of rows of the embedding table (source vocab size).
        emb_size: Embedding dimension.
        hidden_size: LSTM hidden dimension.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embedded input. It is not
            passed to ``nn.LSTM`` itself.
    """

    def __init__(self, input_size, emb_size, hidden_size, n_layers = 2, dropout = 0.5):
        super().__init__()

        # Keep the constructor arguments available as attributes.
        self.input_size, self.emb_size = input_size, emb_size
        self.hidden_size, self.n_layers = hidden_size, n_layers

        self.embedding = nn.Embedding(input_size, emb_size)
        # The LSTM comes last because its final states are what we return.
        self.lstm = nn.LSTM(emb_size, hidden_size, n_layers)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` states.

        The per-step LSTM outputs are discarded.
        """
        _, (hidden, cell) = self.lstm(self.dropout(self.embedding(src)))
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Seq2Seq decoder that predicts one token per call.

    Pipeline: embedding -> dropout -> multi-layer LSTM -> linear -> log-softmax.

    Args:
        hidden_size: LSTM hidden dimension.
        emb_size: Embedding dimension.
        output_size: Size of the target vocabulary (embedding rows and number
            of output classes).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embedded input.
    """

    def __init__(self, hidden_size, emb_size, output_size, n_layers = 2, dropout = 0.5):
        super().__init__()

        self.hidden_size = hidden_size
        self.emb_size = emb_size
        self.output_size = output_size
        self.n_layers = n_layers

        # Layers, each sized from the constructor arguments.
        self.embedding = nn.Embedding(output_size, emb_size)
        self.lstm = nn.LSTM(emb_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: 1-D tensor of token indices, one per batch element.
            hidden: Previous LSTM hidden state.
            cell: Previous LSTM cell state.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds
            log-probabilities over the target vocabulary for each batch
            element, and ``hidden``/``cell`` are the updated LSTM states.
        """
        # Add a leading length-1 sequence dimension for the LSTM.
        step = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step))

        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))

        # Drop the sequence dimension again before projecting to the vocab.
        logits = self.fc(output.squeeze(0))
        return self.softmax(logits), hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Callable returning ``(hidden, cell)`` for a source batch.
        decoder: Callable taking ``(input, hidden, cell)`` and returning
            ``(prediction, hidden, cell)``; must expose ``output_size``.
        device: Device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder, self.decoder, self.device = encoder, decoder, device

    def forward(self, src, trg, teacher_forcing_ratio = 1):
        """Decode ``trg.shape[0] - 1`` steps and return all step predictions.

        Args:
            src: Source batch passed to the encoder.
            trg: Target batch indexed as ``[time, batch]``; row 0 supplies the
                first decoder input.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape ``(trg_len, batch, vocab)``. Row 0 is never
            written and stays zero.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_size

        # Buffer for the decoder predictions of every time step.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # The encoder's final states seed the decoder.
        hidden, cell = self.encoder(src)

        # Decoding starts from the first target row (the <sos> tokens).
        step_input = trg[0, :]

        for t in range(1, trg_len):
            step_out, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_out

            # One random draw per step decides between teacher forcing and
            # feeding back the highest-scoring predicted token.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = step_out.argmax(1)

        return outputs


In [None]:
# Vocabulary sizes determine the embedding tables and the output layer.
INPUT_DIM = len(question_vocab)
OUTPUT_DIM = len(answer_vocab)
HID_DIM = 128

# 256 is the embedding size for both the encoder and the decoder.
enc = Encoder(INPUT_DIM, 256, HID_DIM)

dec = Decoder(HID_DIM, 256, OUTPUT_DIM)

# NOTE: this rebinds `model`, which previously referred to the Word2Vec model.
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
# Display the module structure.
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(11768, 256)
    (lstm): LSTM(256, 128, num_layers=2)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(10371, 256)
    (lstm): LSTM(256, 128, num_layers=2)
    (fc): Linear(in_features=128, out_features=10371, bias=True)
    (softmax): LogSoftmax(dim=1)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def init_weights(m: nn.Module):
    """Re-initialise all parameters reachable from ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and standard deviation 0.01. Every other
    parameter is set to 0.
    """
    for param_name, tensor in m.named_parameters():
        is_weight = 'weight' in param_name
        if not is_weight:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)

# Module.apply calls init_weights on every submodule and on the model itself.
model.apply(init_weights)

# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr= 0.01)

def count_parameters(model: nn.Module):
    """Return the total number of elements in ``model``'s trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for p in model.parameters():
        if p.requires_grad:
            total += p.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,664,899 trainable parameters


In [None]:
# Look the padding index up again, this time through the string-to-index mapping.
PAD_IDX = question_vocab.get_stoi()['<pad>']
print(PAD_IDX)
# Negative log-likelihood loss (the decoder emits log-probabilities via
# LogSoftmax); targets equal to the padding index are ignored.
criterion = nn.NLLLoss(ignore_index=PAD_IDX)

1


In [None]:
import math
import time

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Network called as ``model(src, trg)``; no teacher-forcing
            argument is passed, so the model's own default applies.
        iterator: Sized iterable of ``(src, trg)`` batches.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking ``(flattened predictions, flattened targets)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.

    Raises:
        ZeroDivisionError: If ``iterator`` is empty.
    """
    model.train()
    epoch_loss = 0

    for src, trg in iterator:

        # Batches are moved to the module-level `device`.
        src, trg = src.to(device), trg.to(device)

        # Clear gradients once per batch, right before the forward pass.
        optimizer.zero_grad()

        output = model(src, trg)

        # Skip time step 0 (the <sos> position) and flatten the time and
        # batch dimensions so the loss sees one row per predicted token.
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    The model is switched to eval mode, gradients are disabled, and the
    model is called with a teacher-forcing ratio of 0. Batches are moved to
    the module-level ``device``.

    Args:
        model: Network called as ``model(src, trg, 0)``.
        iterator: Sized iterable of ``(src, trg)`` batches.
        criterion: Loss taking ``(flattened predictions, flattened targets)``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    total = 0

    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            # Teacher forcing is turned off for evaluation.
            predictions = model(src, trg, 0)

            # Skip time step 0 and flatten time and batch into one axis.
            vocab_dim = predictions.shape[-1]
            flat_pred = predictions[1:].view(-1, vocab_dim)
            flat_trg = trg[1:].view(-1)

            total += criterion(flat_pred, flat_trg).item()

    return total / len(iterator)


In [None]:
N_EPOCHS = 100
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    # Split the elapsed wall-clock time into whole minutes and seconds.
    # This is computed inline because no `epoch_time` helper is defined in
    # this file.
    epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)

    # Checkpoint only when the validation loss improves, so the saved file
    # always holds the best model seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq_model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

Epoch: 01 | Time: 0m 18s
	Train Loss: 4.744
	 Val. Loss: 3.257
Epoch: 02 | Time: 0m 18s
	Train Loss: 4.528
	 Val. Loss: 3.227
Epoch: 03 | Time: 0m 18s
	Train Loss: 4.334
	 Val. Loss: 3.320
Epoch: 04 | Time: 0m 18s
	Train Loss: 4.174
	 Val. Loss: 3.364
Epoch: 05 | Time: 0m 17s
	Train Loss: 4.034
	 Val. Loss: 3.293
Epoch: 06 | Time: 0m 18s
	Train Loss: 3.934
	 Val. Loss: 3.278
Epoch: 07 | Time: 0m 18s
	Train Loss: 3.805
	 Val. Loss: 3.352
Epoch: 08 | Time: 0m 18s
	Train Loss: 3.715
	 Val. Loss: 3.453
Epoch: 09 | Time: 0m 18s
	Train Loss: 3.595
	 Val. Loss: 3.396
Epoch: 10 | Time: 0m 18s
	Train Loss: 3.492
	 Val. Loss: 3.503
Epoch: 11 | Time: 0m 18s
	Train Loss: 3.356
	 Val. Loss: 3.585
Epoch: 12 | Time: 0m 18s
	Train Loss: 3.257
	 Val. Loss: 3.573
Epoch: 13 | Time: 0m 18s
	Train Loss: 3.145
	 Val. Loss: 3.720
Epoch: 14 | Time: 0m 18s
	Train Loss: 3.010
	 Val. Loss: 3.616
Epoch: 15 | Time: 0m 18s
	Train Loss: 2.870
	 Val. Loss: 4.014
Epoch: 16 | Time: 0m 18s
	Train Loss: 2.770
	 Val. Loss

In [None]:
# Restore the checkpoint written by the training loop (lowest validation loss).
model.load_state_dict(torch.load('seq2seq_model.pt'))

<All keys matched successfully>

In [None]:
def chatbot(model, iterator, criterion):
    """Greedy-decode the first batch of ``iterator`` into answer tokens.

    The model is run without teacher forcing. The number of decoding steps
    is determined by the length of the target in the batch. Predictions are
    flattened over time and batch, so the result is only meaningful for
    batches that hold a single example.

    Args:
        model: Network called as ``model(src, trg, 0)``.
        iterator: Iterable of ``(src, trg)`` batches; only the first batch is
            decoded.
        criterion: Unused. Kept so existing calls keep working.

    Returns:
        The predicted tokens (strings from the module-level ``answer_vocab``),
        cut off before the first ``<pad>`` prediction. ``<eos>`` is not
        treated specially and may appear in the result. An empty list is
        returned when ``iterator`` yields no batch.
    """
    model.eval()

    # Resolve the lookups once instead of once per decoded token.
    itos = answer_vocab.get_itos()
    pad_id = answer_vocab['<pad>']

    with torch.no_grad():

        for src, trg in iterator:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, 0)  # turn off teacher forcing
            # Skip time step 0 and flatten time and batch into one axis.
            output = output[1:].view(-1, output.shape[-1])

            result = []
            for token_id in output.argmax(dim=1).tolist():
                if token_id == pad_id:
                    break
                result.append(itos[token_id])
            # Only the first batch is decoded.
            return result

    return []

In [None]:
def generate_test_batch(data_batch, pad_idx=None, sos_idx=None, eos_idx=None):
    """Collate (question, answer) tensor pairs into two padded batches.

    Every sequence is wrapped as ``<sos> ... <eos>``. The sequences of each
    side are then padded to a common length with ``pad_sequence`` using its
    default layout, so the results have shape ``(max_len, batch)``.

    Args:
        data_batch: Iterable of ``(question_tensor, answer_tensor)`` pairs of
            1-D index tensors.
        pad_idx: Padding index; defaults to the module-level ``PAD_IDX``.
        sos_idx: Start-of-sequence index; defaults to ``SOS_IDX``.
        eos_idx: End-of-sequence index; defaults to ``EOS_IDX``.

    Returns:
        Tuple ``(questions, answers)`` of padded index tensors.
    """
    pad_idx = PAD_IDX if pad_idx is None else pad_idx
    sos_idx = SOS_IDX if sos_idx is None else sos_idx
    eos_idx = EOS_IDX if eos_idx is None else eos_idx

    # Build the marker tensors once instead of once per sequence.
    sos = torch.tensor([sos_idx])
    eos = torch.tensor([eos_idx])

    de_batch, en_batch = [], []
    for de_item, en_item in data_batch:
        de_batch.append(torch.cat([sos, de_item, eos], dim=0))
        en_batch.append(torch.cat([sos, en_item, eos], dim=0))

    # Pad once, after the whole batch has been collected, rather than
    # re-padding the growing lists on every item.
    de = pad_sequence(de_batch, padding_value=pad_idx)
    en = pad_sequence(en_batch, padding_value=pad_idx)
    return de, en


In [None]:
# Lift pandas' row display limit.
pd.set_option('display.max_rows', None)
# Draw three random rows. NOTE: they come from df_train, so the demos below
# query the model with questions it was trained on.
dt_test = df_train.sample(n = 3)

# One single-row DataFrame per demo.
dt_test_1 = dt_test[0:1]
dt_test_2 = dt_test[1:2]
dt_test_3 = dt_test[2:3]

In [None]:
# Demo 1: show the sampled row, encode it, and wrap it in a one-example loader.
print(dt_test_1)
test_data = data_process(dt_test_1)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=True, collate_fn=generate_test_batch)

# Fetches one batch and discards it; it does not feed into `result`.
next(iter(test_iter))

# Decode the model's answer for the sampled question.
result = chatbot(model, test_iter, criterion)
print('Answer:', result)

                                               question answers
4148  If someone serves three consecutive terms as m...    four
Answer: ['four', '<eos>']


In [None]:
# Demo 2: show the sampled row, encode it, and wrap it in a one-example loader.
print(dt_test_2)
test_data = data_process(dt_test_2)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=True, collate_fn=generate_test_batch)

# Fetches one batch and discards it; it does not feed into `result`.
next(iter(test_iter))

# Decode the model's answer for the sampled question.
result = chatbot(model, test_iter, criterion)
print('Answer:', result)

                                               question            answers
4528  Over 90% of homes use solar hot water systems ...  Israel and Cyprus
Answer: ['Israel', 'and', 'Cyprus', '<eos>']


In [None]:
# Demo 3: show the sampled row, encode it, and wrap it in a one-example loader.
print(dt_test_3)
test_data = data_process(dt_test_3)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=True, collate_fn=generate_test_batch)

# Fetches one batch and discards it; it does not feed into `result`.
next(iter(test_iter))

# Decode the model's answer for the sampled question.
result = chatbot(model, test_iter, criterion)
print('Answer:', result)

                                             question           answers
941  While in Berlin he saw the operatic work of who?  Gaspare Spontini
Answer: ['Gaspare', 'Spontini', '<eos>']
