# Retrieve Chatbot
## Chatbot using the Poly-encoder Transformer architecture (Humeau et al., 2019) for retrieval

In [1]:
# This notebook is based on :
# https://aritter.github.io/CS-7650/
# This Project was developed at the Georgia Institute of Technology by Ashutosh Baheti (ashutosh.baheti@cc.gatech.edu), 
# borrowing  from the Neural Machine Translation Project (Project 2) 
# of the UC Berkeley NLP course https://cal-cs288.github.io/sp20/

In [2]:
# Imports

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm
import nltk

import gc
gc.collect()

0

In [3]:
import pandas as pd
import numpy as np
import sys
from functools import partial
import time

In [4]:
bert_model_name = 'distilbert-base-uncased' 
# Bert Imports
from transformers import DistilBertTokenizer, DistilBertModel
#bert_model = DistilBertModel.from_pretrained(bert_model_name)
tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)

In [5]:
# Utils

def make_dir_if_not_exists(directory):
    if not os.path.exists(directory):
        logging.info("Creating new directory: {}".format(directory))
        os.makedirs(directory)

def print_list(l, K=None):
    # If K is given then only print first K
    for i, e in enumerate(l):
        if i == K:
            break
        print(e)
    print()

def remove_multiple_spaces(string):
    return re.sub(r'\s+', ' ', string).strip()

def save_in_pickle(save_object, save_file):
    with open(save_file, "wb") as pickle_out:
        pickle.dump(save_object, pickle_out)

def load_from_pickle(pickle_file):
    with open(pickle_file, "rb") as pickle_in:
        return pickle.load(pickle_in)

def save_in_txt(list_of_strings, save_file):
    with open(save_file, "w") as writer:
        for line in list_of_strings:
            line = line.strip()
            writer.write(f"{line}\n")

def load_from_txt(txt_file):
    with open(txt_file, "r") as reader:
        all_lines = list()
        for line in reader:
            line = line.strip()
            all_lines.append(line)
        return all_lines

In [6]:
# Check CUDA

print(torch.cuda.is_available())
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

True
Using device: cuda


## Load Data

### Cornell Movie Database

In [7]:
# Loading the pre-processed conversational exchanges (source-target pairs) from pickle data files
all_conversations = load_from_pickle("../data/cornell_movie/processed_CMDC.pkl")
# Extract 100 conversations from the end for evaluation and keep the rest for training
eval_conversations = all_conversations[-100:]
all_conversations = all_conversations[:-100]

# Logging data stats
print(f"Number of Training Conversation Pairs = {len(all_conversations)}")
print(f"Number of Evaluation Conversation Pairs = {len(eval_conversations)}")

Number of Training Conversation Pairs = 53065
Number of Evaluation Conversation Pairs = 100


#### Building the vocabulary

In [8]:
pad_word = "<pad>"
bos_word = "<s>"
eos_word = "</s>"
unk_word = "<unk>"
pad_id = 0
bos_id = 1
eos_id = 2
unk_id = 3
    
def normalize_sentence(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

class Vocabulary:
    def __init__(self):
        self.word_to_id = {pad_word: pad_id, bos_word: bos_id, eos_word:eos_id, unk_word: unk_id}
        self.word_count = {}
        self.id_to_word = {pad_id: pad_word, bos_id: bos_word, eos_id: eos_word, unk_id: unk_word}
        self.num_words = 4
    
    def get_ids_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        sent_ids = [bos_id] + [self.word_to_id[word] if word in self.word_to_id \
                               else unk_id for word in sentence.split()] + \
                               [eos_id]
        return sent_ids
    
    def tokenized_sentence(self, sentence):
        sent_ids = self.get_ids_from_sentence(sentence)
        return [self.id_to_word[word_id] for word_id in sent_ids]

    def decode_sentence_from_ids(self, sent_ids):
        words = list()
        for i, word_id in enumerate(sent_ids):
            if word_id in [bos_id, eos_id, pad_id]:
                # Skip these words
                continue
            else:
                words.append(self.id_to_word[word_id])
        return ' '.join(words)

    def add_words_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        for word in sentence.split():
            if word not in self.word_to_id:
                # add this word to the vocabulary
                self.word_to_id[word] = self.num_words
                self.id_to_word[self.num_words] = word
                self.word_count[word] = 1
                self.num_words += 1
            else:
                # update the word count
                self.word_count[word] += 1

vocab = Vocabulary()
for src, tgt in all_conversations:
    vocab.add_words_from_sentence(src)
    vocab.add_words_from_sentence(tgt)
print(f"Total words in the vocabulary = {vocab.num_words}")

Total words in the vocabulary = 7727


## Dataset Preparation

In [9]:
def transformer_collate_fn(batch, tokenizer):
    bert_vocab = tokenizer.get_vocab()
    bert_pad_token = bert_vocab['[PAD]']
    bert_unk_token = bert_vocab['[UNK]']
    bert_cls_token = bert_vocab['[CLS]']

    inputs, masks_input, outputs, masks_output = [], [], [], []
    for data in batch:
        tokenizer_input = tokenizer([data[0]])
        tokenized_sent = tokenizer_input['input_ids'][0]
        mask_input = tokenizer_input['attention_mask'][0]
        inputs.append(torch.tensor(tokenized_sent))
        tokenizer_output = tokenizer([data[1]])
        tokenized_sent = tokenizer_output['input_ids'][0]
        mask_output = tokenizer_output['attention_mask'][0]
        outputs.append(torch.tensor(tokenized_sent))
        masks_input.append(torch.tensor(mask_input))
        masks_output.append(torch.tensor(mask_output))
    inputs = pad_sequence(inputs, batch_first=True, padding_value=bert_pad_token)
    outputs = pad_sequence(outputs, batch_first=True, padding_value=bert_pad_token)
    masks_input = pad_sequence(masks_input, batch_first=True, padding_value=0.0)
    masks_output = pad_sequence(masks_output, batch_first=True, padding_value=0.0)
    return inputs, masks_input, outputs, masks_output

In [10]:
#create pytorch dataloaders from train_dataset, val_dataset, and test_datset
batch_size=5
train_dataloader = DataLoader(all_conversations,batch_size=batch_size,collate_fn=partial(transformer_collate_fn, tokenizer=tokenizer), shuffle = True)

In [11]:
tokenizer.batch_decode(transformer_collate_fn(all_conversations[0:10],tokenizer)[0], skip_special_tokens=True)

['there.',
 'you have my word. as a gentleman',
 'hi.',
 'have fun tonight?',
 'well no...',
 'then that s all you had to say.',
 'but',
 'do you listen to this crap?',
 'what good stuff?',
 'wow']

In [12]:
tokenizer.batch_decode(transformer_collate_fn(all_conversations[0:10],tokenizer)[2], skip_special_tokens=True)

['where?',
 'you re sweet.',
 'looks like things worked out tonight huh?',
 'tons',
 'then that s all you had to say.',
 'but',
 'you always been this selfish?',
 'what crap?',
 'the real you.',
 'let s go.']

## Polyencoder Model

In [13]:
#torch.cuda.empty_cache()
#bert1 = DistilBertModel.from_pretrained(bert_model_name)
#bert2 = DistilBertModel.from_pretrained(bert_model_name)

bert = DistilBertModel.from_pretrained(bert_model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
#Double Bert
class RetrieverPolyencoder(nn.Module):
    def __init__(self, contextBert, candidateBert, vocab, max_len = 300, hidden_dim = 768, out_dim = 64, num_layers = 2, dropout=0.1, device=device):
        super().__init__()

        self.device = device
        self.hidden_dim = hidden_dim
        self.max_len = max_len
        self.out_dim = out_dim
        
        # Context layers
        self.contextBert = contextBert
        self.contextDropout = nn.Dropout(dropout)
        
        # Candidates layers
        self.candidatesBert = candidateBert
        self.pos_emb = nn.Embedding(self.max_len, self.hidden_dim)
        self.candidatesDropout = nn.Dropout(dropout)
        
        self.att_dropout = nn.Dropout(dropout)


    def attention(self, q, k, v, vMask=None):
        w = torch.matmul(q, k.transpose(-1, -2))
        if vMask is not None:
            w *= vMask.unsqueeze(1)
            w = F.softmax(w, -1)
        w = self.att_dropout(w)
        score = torch.matmul(w, v)
        return score

    def score(self, context, context_mask, responses, responses_mask):
        """Run the model on the source and compute the loss on the target.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing subword indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing subword indices for the target sentences.

        Returns:
            A scalar float tensor representing cross-entropy loss on the current batch
            divided by the number of target tokens in the batch.
            Many of the target tokens will be pad tokens. You should mask the loss 
            from these tokens using appropriate mask on the target tokens loss.
        """
        batch_size, nb_cand, seq_len = responses.shape
        # Context
        context_encoded = self.contextBert(context,context_mask)[0]
        pos_emb = self.pos_emb(torch.arange(self.max_len).to(self.device))
        context_att = self.attention(pos_emb, context_encoded, context_encoded, context_mask)

        # Response
        responses_encoded = self.candidatesBert(responses.view(-1,responses.shape[2]), responses_mask.view(-1,responses.shape[2]))[0][:,0,:]
        responses_encoded = responses_encoded.view(batch_size,nb_cand,-1)
        
        context_emb = self.attention(responses_encoded, context_att, context_att).squeeze() 
        dot_product = (context_emb*responses_encoded).sum(-1)
        
        return dot_product

    
    def compute_loss(self, context, context_mask, response, response_mask):
        """Run the model on the source and compute the loss on the target.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing subword indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing subword indices for the target sentences.

        Returns:
            A scalar float tensor representing cross-entropy loss on the current batch
            divided by the number of target tokens in the batch.
            Many of the target tokens will be pad tokens. You should mask the loss 
            from these tokens using appropriate mask on the target tokens loss.
        """
        batch_size = context.shape[0]
        
        # Context
        context_encoded = self.contextBert(context,context_mask)[0]
        pos_emb = self.pos_emb(torch.arange(self.max_len).to(self.device))
        context_att = self.attention(pos_emb, context_encoded, context_encoded, context_mask)

        # Response
        response_encoded = self.candidatesBert(response, response_mask)[0][:,0,:]
        
        response_encoded = response_encoded.unsqueeze(0).expand(batch_size, batch_size, response_encoded.shape[1]) 
        context_emb = self.attention(response_encoded, context_att, context_att).squeeze() 
        dot_product = (context_emb*response_encoded).sum(-1)
        mask = torch.eye(batch_size).to(self.device)
        loss = F.log_softmax(dot_product, dim=-1) * mask
        loss = (-loss.sum(dim=1)).mean()
        return loss

In [15]:
#Simple Bert
class RetrieverPolyencoder_single(nn.Module):
    def __init__(self, Bert, max_len = 300, hidden_dim = 768, out_dim = 64, num_layers = 2, dropout=0.1, device=device):
        super().__init__()

        self.device = device
        self.hidden_dim = hidden_dim
        self.max_len = max_len
        self.out_dim = out_dim
        self.bert = Bert

        # Context layers
        self.contextDropout = nn.Dropout(dropout)
        
        # Candidates layers
        self.pos_emb = nn.Embedding(self.max_len, self.hidden_dim)
        self.candidatesDropout = nn.Dropout(dropout)
        
        self.att_dropout = nn.Dropout(dropout)


    def attention(self, q, k, v, vMask=None):
        w = torch.matmul(q, k.transpose(-1, -2))
        if vMask is not None:
            w *= vMask.unsqueeze(1)
            w = F.softmax(w, -1)
        w = self.att_dropout(w)
        score = torch.matmul(w, v)
        return score

    def score(self, context, context_mask, responses, responses_mask):
        """Run the model on the source and compute the loss on the target.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing subword indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing subword indices for the target sentences.

        Returns:
            A scalar float tensor representing cross-entropy loss on the current batch
            divided by the number of target tokens in the batch.
            Many of the target tokens will be pad tokens. You should mask the loss 
            from these tokens using appropriate mask on the target tokens loss.
        """
        batch_size, nb_cand, seq_len = responses.shape
        # Context
        context_encoded = self.bert(context,context_mask)[0]
        pos_emb = self.pos_emb(torch.arange(self.max_len).to(self.device))
        context_att = self.attention(pos_emb, context_encoded, context_encoded, context_mask)

        # Response
        responses_encoded = self.bert(responses.view(-1,responses.shape[2]), responses_mask.view(-1,responses.shape[2]))[0][:,0,:]
        responses_encoded = responses_encoded.view(batch_size,nb_cand,-1)
        
        context_emb = self.attention(responses_encoded, context_att, context_att).squeeze() 
        dot_product = (context_emb*responses_encoded).sum(-1)
        
        return dot_product

    
    def compute_loss(self, context, context_mask, response, response_mask):
        """Run the model on the source and compute the loss on the target.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing subword indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing subword indices for the target sentences.

        Returns:
            A scalar float tensor representing cross-entropy loss on the current batch
            divided by the number of target tokens in the batch.
            Many of the target tokens will be pad tokens. You should mask the loss 
            from these tokens using appropriate mask on the target tokens loss.
        """
        batch_size = context.shape[0]
        
        # Context
        context_encoded = self.bert(context,context_mask)[0]
        pos_emb = self.pos_emb(torch.arange(self.max_len).to(self.device))
        context_att = self.attention(pos_emb, context_encoded, context_encoded, context_mask)

        # Response
        response_encoded = self.bert(response, response_mask)[0][:,0,:]
        
        response_encoded = response_encoded.unsqueeze(0).expand(batch_size, batch_size, response_encoded.shape[1]) 
        context_emb = self.attention(response_encoded, context_att, context_att).squeeze() 
        dot_product = (context_emb*response_encoded).sum(-1)
        mask = torch.eye(batch_size).to(self.device)
        loss = F.log_softmax(dot_product, dim=-1) * mask
        loss = (-loss.sum(dim=1)).mean()
        return loss

In [16]:
#Bi-encoder
class RetrieverBiencoder(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        
    def score(self, context, context_mask, responses, responses_mask):

        context_vec = self.bert(context, context_mask)[0][:,0,:]  # [bs,dim]

        batch_size, res_length = response.shape

        responses_vec = self.bert(responses_input_ids, responses_input_masks)[0][:,0,:]  # [bs,dim]
        responses_vec = responses_vec.view(batch_size, 1, -1)

        responses_vec = responses_vec.squeeze(1)        
        context_vec = context_vec.unsqueeze(1)
        dot_product = torch.matmul(context_vec, responses_vec.permute(0, 2, 1)).squeeze()
        return dot_product
    
    def compute_loss(self, context, context_mask, response, response_mask):

        context_vec = self.bert(context, context_mask)[0][:,0,:]  # [bs,dim]

        batch_size, res_length = response.shape

        responses_vec = self.bert(response, response_mask)[0][:,0,:]  # [bs,dim]
        responses_vec = responses_vec.view(batch_size, -1)

        dot_product = torch.matmul(context_vec, responses_vec.t())  # [bs, bs]
        mask = torch.eye(context.size(0)).to(context_mask.device)
        loss = F.log_softmax(dot_product, dim=-1) * mask
        loss = (-loss.sum(dim=1)).mean()
        return loss


In [19]:
loss_rec = []

def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001):
    """Train the model for given µnumber of epochs and save the trained model in 
    the final model_file.
    """

    decoder_learning_ratio = 5.0
    #encoder_parameter_names = ['word_embedding', 'encoder']
    encoder_parameter_names = ['encode_emb', 'encode_gru', 'l1', 'l2']
                           
    encoder_named_params = list(filter(lambda kv: any(key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    decoder_named_params = list(filter(lambda kv: not any(key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    encoder_params = [e[1] for e in encoder_named_params]
    decoder_params = [e[1] for e in decoder_named_params]
    optimizer = torch.optim.AdamW([{'params': encoder_params},
                {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}], lr=learning_rate)
    
    clip = 50.0
    for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
        # print(f"Total training instances = {len(train_dataset)}")
        # print(f"train_data_loader = {len(train_data_loader)} {1180 > len(train_data_loader)/20}")
        with tqdm.notebook.tqdm(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                source, source_mask, target, target_mask = batch_data
                print(source.shape)
                print(source_mask.shape)
                print(target.shape)
                print(target_mask.shape)
                optimizer.zero_grad()
                loss = model.compute_loss(source.cuda(), source_mask.cuda(), target.cuda(), target_mask.cuda())
                total_loss += loss.item()
                loss.backward()
                # Gradient clipping before taking the step
                _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
            loss_rec.append(total_loss)
    # Save the model after training         
    torch.save(model.state_dict(), model_file)

In [20]:
# You are welcome to adjust these parameters based on your model implementation.
num_epochs = 4
batch_size = 2
learning_rate = 0.001
# Reloading the data_loader to increase batch_size
train_dataloader = DataLoader(all_conversations,batch_size=batch_size,collate_fn=partial(transformer_collate_fn, tokenizer=tokenizer), shuffle = True)

baseline_model = RetrieverPolyencoder_single(bert).to(device)
train(baseline_model, train_dataloader, num_epochs, "/models/baseline_model.pt",learning_rate=learning_rate)
# Download the trained model to local for future use
#files.download('baseline_model.pt')

training:   0%|          | 0/4 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/26533 [00:00<?, ?batch/s]

torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 7])
torch.Size([2, 7])
torch.Size([2, 8])
torch.Size([2, 8])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([2, 5])
torch.Size([2, 5])
torch.Size([2, 13])
torch.Size([2, 13])
torch.Size([2, 8])
torch.Size([2, 8])
torch.Size([2, 5])
torch.Size([2, 5])
torch.Size([2, 11])
torch.Size([2, 11])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 9])
torch.Size([2, 9])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 8])
torch.Size([2, 8])
torch.Size([2, 11])
torch.Size([2, 11])
torch.Size([2, 9])
torch.Size([2, 9])
torch.Size([2, 9])
torch.Size([2, 9])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 11])
torch.Size([2, 11])
torch.Size([2, 8])
torch.Size([2, 8])
torch.Size([2, 8])
torch.Size([2, 8])
torch.Size([2, 8])
torch.Size([2, 8])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 5])
torch.Siz

KeyboardInterrupt: 

In [None]:
loss_rec

In [None]:
baseline_model = RetrieverPolyencoder(bert1,bert2,vocab).to(device)
baseline_model.load_state_dict(torch.load("baseline_model3.pt", map_location=device))

In [None]:
vals = transformer_collate_fn(all_conversations[0:100],tokenizer)

In [None]:
i=3

In [None]:
scores = baseline_model.score(vals[0][i].unsqueeze(0).cuda(),vals[1][i].unsqueeze(0).cuda(),vals[2].unsqueeze(0).cuda(),vals[3].unsqueeze(0).cuda()).detach().cpu().numpy()

In [None]:
all_conversations[i][0]

In [None]:
all_conversations[np.argmax(scores)][1]

In [None]:
max_v = 100
vals = transformer_collate_fn(all_conversations[0:max_v],tokenizer)
correct = 0
for i in range(max_v):
    scores = baseline_model.score(vals[0][i].unsqueeze(0).cuda(),vals[1][i].unsqueeze(0).cuda(),vals[2].unsqueeze(0).cuda(),vals[3].unsqueeze(0).cuda()).detach().cpu().numpy()
    if np.argmax(scores)==i:
        correct+=1
    print(all_conversations[i][0])
    print(all_conversations[np.argmax(scores)][1]+"\n")

In [None]:
print(correct/max_v)