# Retrieve Chatbot
## Chatbot using the Poly-encoder Transformer architecture (Humeau et al., 2019) for retrieval

In [1]:
# This notebook is based on :
# https://aritter.github.io/CS-7650/
# This Project was developed at the Georgia Institute of Technology by Ashutosh Baheti (ashutosh.baheti@cc.gatech.edu), 
# borrowing  from the Neural Machine Translation Project (Project 2) 
# of the UC Berkeley NLP course https://cal-cs288.github.io/sp20/

In [2]:
# Imports

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm
import nltk

In [12]:
import pandas as pd
import numpy as np
import sys
from functools import partial
import time

In [3]:
bert_model_name = 'distilbert-base-uncased' 
# Bert Imports
from transformers import DistilBertTokenizer, DistilBertModel
bert_model = DistilBertModel.from_pretrained(bert_model_name)
tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
tokenizer

PreTrainedTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Utils

def make_dir_if_not_exists(directory):
    if not os.path.exists(directory):
        logging.info("Creating new directory: {}".format(directory))
        os.makedirs(directory)

def print_list(l, K=None):
    # If K is given then only print first K
    for i, e in enumerate(l):
        if i == K:
            break
        print(e)
    print()

def remove_multiple_spaces(string):
    return re.sub(r'\s+', ' ', string).strip()

def save_in_pickle(save_object, save_file):
    with open(save_file, "wb") as pickle_out:
        pickle.dump(save_object, pickle_out)

def load_from_pickle(pickle_file):
    with open(pickle_file, "rb") as pickle_in:
        return pickle.load(pickle_in)

def save_in_txt(list_of_strings, save_file):
    with open(save_file, "w") as writer:
        for line in list_of_strings:
            line = line.strip()
            writer.write(f"{line}\n")

def load_from_txt(txt_file):
    with open(txt_file, "r") as reader:
        all_lines = list()
        for line in reader:
            line = line.strip()
            all_lines.append(line)
        return all_lines

In [6]:
# Check CUDA

print(torch.cuda.is_available())
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

True
Using device: cuda


## Load Data

### Cornell Movie Database

In [7]:
# Loading the pre-processed conversational exchanges (source-target pairs) from pickle data files
all_conversations = load_from_pickle("../data/cornell_movie/processed_CMDC.pkl")
# Extract 100 conversations from the end for evaluation and keep the rest for training
eval_conversations = all_conversations[-100:]
all_conversations = all_conversations[:-100]

# Logging data stats
print(f"Number of Training Conversation Pairs = {len(all_conversations)}")
print(f"Number of Evaluation Conversation Pairs = {len(eval_conversations)}")

Number of Training Conversation Pairs = 53065
Number of Evaluation Conversation Pairs = 100


#### Building the vocabulary

In [8]:
pad_word = "<pad>"
bos_word = "<s>"
eos_word = "</s>"
unk_word = "<unk>"
pad_id = 0
bos_id = 1
eos_id = 2
unk_id = 3
    
def normalize_sentence(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

class Vocabulary:
    def __init__(self):
        self.word_to_id = {pad_word: pad_id, bos_word: bos_id, eos_word:eos_id, unk_word: unk_id}
        self.word_count = {}
        self.id_to_word = {pad_id: pad_word, bos_id: bos_word, eos_id: eos_word, unk_id: unk_word}
        self.num_words = 4
    
    def get_ids_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        sent_ids = [bos_id] + [self.word_to_id[word] if word in self.word_to_id \
                               else unk_id for word in sentence.split()] + \
                               [eos_id]
        return sent_ids
    
    def tokenized_sentence(self, sentence):
        sent_ids = self.get_ids_from_sentence(sentence)
        return [self.id_to_word[word_id] for word_id in sent_ids]

    def decode_sentence_from_ids(self, sent_ids):
        words = list()
        for i, word_id in enumerate(sent_ids):
            if word_id in [bos_id, eos_id, pad_id]:
                # Skip these words
                continue
            else:
                words.append(self.id_to_word[word_id])
        return ' '.join(words)

    def add_words_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        for word in sentence.split():
            if word not in self.word_to_id:
                # add this word to the vocabulary
                self.word_to_id[word] = self.num_words
                self.id_to_word[self.num_words] = word
                self.word_count[word] = 1
                self.num_words += 1
            else:
                # update the word count
                self.word_count[word] += 1

vocab = Vocabulary()
for src, tgt in all_conversations:
    vocab.add_words_from_sentence(src)
    vocab.add_words_from_sentence(tgt)
print(f"Total words in the vocabulary = {vocab.num_words}")

Total words in the vocabulary = 7727


## Dataset Preparation

In [9]:
class SingleTurnMovieDialog_dataset(Dataset):
    """Single-Turn version of Cornell Movie Dialog Cropus dataset."""

    def __init__(self, conversations, vocab, device):
        """
        Args:
            conversations: list of tuple (src_string, tgt_string) 
                         - src_string: String of the source sentence
                         - tgt_string: String of the target sentence
            vocab: Vocabulary object that contains the mapping of 
                    words to indices
            device: cpu or cuda
        """
        self.conversations = conversations
        self.vocab = vocab
        self.device = device

        def encode(src, tgt):
            src_ids = self.vocab.get_ids_from_sentence(src)
            tgt_ids = self.vocab.get_ids_from_sentence(tgt)
            return (src_ids, tgt_ids)

        # We will pre-tokenize the conversations and save in id lists for later use
        self.tokenized_conversations = [encode(src, tgt) for src, tgt in self.conversations]
        
    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return {"conv_ids":self.tokenized_conversations[idx], "conv":self.conversations[idx]}

def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (src_seq, tgt_seq).
    We should build a custom collate_fn rather than using default collate_fn,
    because merging sequences (including padding) is not supported in default.
    Seqeuences are padded to the maximum length of mini-batch sequences (dynamic padding).
    Args:
        data: list of dicts {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, trg_str)}.
            - src_ids: list of src piece ids; variable length.
            - tgt_ids: list of tgt piece ids; variable length.
            - src_str: String of src
            - tgt_str: String of tgt
    Returns: dict { "conv_ids":     (src_ids, tgt_ids), 
                    "conv":         (src_str, tgt_str), 
                    "conv_tensors": (src_seqs, tgt_seqs)}
            src_seqs: torch tensor of shape (src_padded_length, batch_size).
            tgt_seqs: torch tensor of shape (tgt_padded_length, batch_size).
            src_padded_length = length of the longest src sequence from src_ids
            tgt_padded_length = length of the longest tgt sequence from tgt_ids
    """
    # Sort conv_ids based on decreasing order of the src_lengths.
    # This is required for efficient GPU computations.
    src_ids = [torch.LongTensor(e["conv_ids"][0]) for e in data]
    tgt_ids = [torch.LongTensor(e["conv_ids"][1]) for e in data]
    src_str = [e["conv"][0] for e in data]
    tgt_str = [e["conv"][1] for e in data]
    data = list(zip(src_ids, tgt_ids, src_str, tgt_str))
    data.sort(key=lambda x: len(x[0]), reverse=True)
    src_ids, tgt_ids, src_str, tgt_str = zip(*data)


    # Pad the src_ids and tgt_ids using token pad_id to create src_seqs and tgt_seqs
    
    # Implementation tip: You can use the nn.utils.rnn.pad_sequence utility
    # function to combine a list of variable-length sequences with padding.
    
    # YOUR CODE HERE
    src_seqs = nn.utils.rnn.pad_sequence(src_ids, padding_value = pad_id,
                                         batch_first = False)
    tgt_seqs = nn.utils.rnn.pad_sequence(tgt_ids, padding_value = pad_id, 
                                         batch_first = False)
    
    src_padded_length = len(src_seqs[0])
    tgt_padded_length = len(tgt_seqs[0])
    return {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, tgt_str), "conv_tensors":(src_seqs.to(device), tgt_seqs.to(device))}

In [10]:
# Create the DataLoader for all_conversations
dataset = SingleTurnMovieDialog_dataset(all_conversations, vocab, device)

batch_size = 5

data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

In [36]:
def transformer_collate_fn(batch, tokenizer):
    bert_vocab = tokenizer.get_vocab()
    bert_pad_token = bert_vocab['[PAD]']
    bert_unk_token = bert_vocab['[UNK]']
    bert_cls_token = bert_vocab['[CLS]']

    inputs, masks_input, outputs, masks_output = [], [], [], []
    for data in batch:
        tokenizer_input = tokenizer([data[0]])
        tokenized_sent = tokenizer_input['input_ids'][0]
        mask_input = tokenizer_input['attention_mask'][0]
        inputs.append(torch.tensor(tokenized_sent))
        tokenizer_output = tokenizer([data[0]])
        tokenized_sent = tokenizer_output['input_ids'][0]
        mask_output = tokenizer_output['attention_mask'][0]
        outputs.append(torch.tensor(tokenized_sent))
        masks_input.append(torch.tensor(mask_input))
        masks_output.append(torch.tensor(mask_output))
    inputs = pad_sequence(inputs, batch_first=True, padding_value=bert_pad_token)
    outputs = pad_sequence(outputs, batch_first=True, padding_value=bert_pad_token)
    masks_input = pad_sequence(masks_input, batch_first=True, padding_value=0.0)
    masks_output = pad_sequence(masks_output, batch_first=True, padding_value=0.0)
    return inputs, masks_input, outputs, masks_output

In [37]:
#create pytorch dataloaders from train_dataset, val_dataset, and test_datset
train_dataloader = DataLoader(all_conversations,batch_size=batch_size,collate_fn=partial(transformer_collate_fn, tokenizer=tokenizer), shuffle = True)

In [38]:
 with tqdm.notebook.tqdm(
                train_dataloader,
                desc="epoch {}".format(1 + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            for i, batch_data in enumerate(batch_iterator, start=1):
                print(batch_data)

epoch 2:   0%|          | 0/10613 [00:00<?, ?batch/s]

(tensor([[  101,  1045,  2123,  1056,  2113,  1012,   102,     0,     0,     0,
             0],
        [  101,  2071,  2057,  2831,  2055,  2008,  1037,  2210,  2101,  1029,
           102],
        [  101,  1045,  2123,  1056,  2113,  1012,   102,     0,     0,     0,
             0],
        [  101,  2505,  3308,  1012,  1012,  1012,  1029,   102,     0,     0,
             0],
        [  101,  6574,   999,  2026,  2104,  3367, 20217,  1012,  1012,  1012,
           102]]), tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), tensor([[  101,  1045,  2123,  1056,  2113,  1012,   102,     0,     0,     0,
             0],
        [  101,  2071,  2057,  2831,  2055,  2008,  1037,  2210,  2101,  1029,
           102],
        [  101,  1045,  2123,  1056,  2113,  1012,   102,     0,     0,     0,
             0],
        [  1

KeyboardInterrupt: 

In [53]:
transformer_collate_fn(all_conversations[0:10],tokenizer)[0][5]

tensor([ 101, 2059, 2008, 1055, 2035, 2017, 2018, 2000, 2360, 1012,  102])

In [55]:
tokenizer.decode(transformer_collate_fn(all_conversations[0:10],tokenizer)[0][5], skip_special_tokens=True)

'then that s all you had to say.'

In [56]:
tokenizer.batch_decode(transformer_collate_fn(all_conversations[0:10],tokenizer)[0], skip_special_tokens=True)

['[CLS] there. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] you have my word. as a gentleman [SEP] [PAD]',
 '[CLS] hi. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] have fun tonight? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] well no... [SEP] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] then that s all you had to say. [SEP]',
 '[CLS] but [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] do you listen to this crap? [SEP] [PAD] [PAD]',
 '[CLS] what good stuff? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] wow [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

## Polyencoder Model

In [21]:
class RetrieverPolyencoder(nn.Module):
    def __init__(self, contextBert, candidateBert, vocab, cand_len = 300, hidden_dim = 300, num_layers = 2, dropout=0.1):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.cand_len = cand_len
        
        # Context layers
        self.contextBert = contextBert
        self.contextDropout = nn.Dropout(dropout)
        self.contextFc = nn.Linear(self.hidden_dim, self.vec_dim)
        
        # Candidates layers
        self.candidatesBert = candidateBert
        self.pos_emb = nn.Embedding(self.cand_len, self.hidden_dim)
        self.candidatesDropout = nn.Dropout(dropout)
        self.candidatesFc = nn.Linear(self.hidden_dim, self.vec_dim)
        
        self.att_dropout = nn.Dropout(dropout)


    def attention(q, k, v, vMask=None):
        w = torch.matmul(q, k.transpose(-1, -2))
        if v_mask is not None:
            w *= vMask.unsqueeze(1)
            w = F.softmax(w, -1)
        w = self.att_dropout(w)
        score = torch.matmul(w, v)
        return score

    def rank(self, context, context_mask, response, response_mask):
        """Run the model on the source and compute the loss on the target.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing subword indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing subword indices for the target sentences.

        Returns:
            A scalar float tensor representing cross-entropy loss on the current batch
            divided by the number of target tokens in the batch.
            Many of the target tokens will be pad tokens. You should mask the loss 
            from these tokens using appropriate mask on the target tokens loss.
        """
        # Response
        response_encoded = self.candidatesBert()
        pos_emb = self.pos_emb(torch.arange(self.max_length).to(self.device))
        response_att = attention(pos_emb, response_encoded, response_encoded, response_mask)
        
        
        # Context
        context_encoded = self.contextBert()
        pos_emb = self.pos_emb(torch.arange(self.max_length).to(self.device))
        context_att = attention(pos_emb, context_encoded, context_encoded, context_mask)
        
        context_emb = self.attention(response_att, context_att, context_att)
        scores = (ctx_emb*cand_emb).sum(-1)
        
        return scores

    
    def compute_loss(self, context, context_mask, response, response_mask):
        """Run the model on the source and compute the loss on the target.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing subword indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing subword indices for the target sentences.

        Returns:
            A scalar float tensor representing cross-entropy loss on the current batch
            divided by the number of target tokens in the batch.
            Many of the target tokens will be pad tokens. You should mask the loss 
            from these tokens using appropriate mask on the target tokens loss.
        """
        # Response
        response_encoded = self.candidatesBert()
        pos_emb = self.pos_emb(torch.arange(self.max_length).to(self.device))
        response_att = attention(pos_emb, response_encoded, response_encoded, response_mask)
        
        
        # Context
        context_encoded = self.contextBert()
        pos_emb = self.pos_emb(torch.arange(self.max_length).to(self.device))
        context_att = attention(pos_emb, context_encoded, context_encoded, context_mask)
        
        response_att = response_att.expand(batch_size, batch_size, response_att.shape[2]) 
        context_emb = self.dot_attention(response_att, context_att, context_att).squeeze() 
        dot_product = (ctx_emb*cand_emb).sum(-1) 
        mask = torch.eye(batch_size)
        loss = F.log_softmax(dot_product, dim=-1) * mask
        loss = (-loss.sum(dim=1)).mean()
        
        return loss

In [22]:
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001):
    """Train the model for given µnumber of epochs and save the trained model in 
    the final model_file.
    """

    decoder_learning_ratio = 5.0
    #encoder_parameter_names = ['word_embedding', 'encoder']
    encoder_parameter_names = ['encode_emb', 'encode_gru', 'l1', 'l2']
                           
    encoder_named_params = list(filter(lambda kv: any(key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    decoder_named_params = list(filter(lambda kv: not any(key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    encoder_params = [e[1] for e in encoder_named_params]
    decoder_params = [e[1] for e in decoder_named_params]
    optimizer = torch.optim.AdamW([{'params': encoder_params},
                {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}], lr=learning_rate)
    
    clip = 50.0
    for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
        # print(f"Total training instances = {len(train_dataset)}")
        # print(f"train_data_loader = {len(train_data_loader)} {1180 > len(train_data_loader)/20}")
        with tqdm.notebook.tqdm(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                source, target = batch_data["conv_tensors"]
                optimizer.zero_grad()
                loss = model.compute_loss(source, target)
                total_loss += loss.item()
                loss.backward()
                # Gradient clipping before taking the step
                _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
    # Save the model after training         
    torch.save(model.state_dict(), model_file)

In [None]:
# You are welcome to adjust these parameters based on your model implementation.
num_epochs = 15
batch_size = 64
learning_rate = 0.001
# Reloading the data_loader to increase batch_size
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

baseline_model = Seq2seqBaseline(vocab).to(device)
train(baseline_model, data_loader, num_epochs, "baseline_model.pt",learning_rate=learning_rate)
# Download the trained model to local for future use
#files.download('baseline_model.pt')