<a href="https://colab.research.google.com/github/SyaoranClone/Persona-Chatbot/blob/master/Persona_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
#install Apex
%%writefile setup.sh

git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --no-cache-dir ./

Overwriting setup.sh


In [None]:
!sh setup.sh

In [None]:
import json
import time
import os
import random
import warnings
from itertools import chain
from argparse import ArgumentParser
from collections import defaultdict
from tqdm import tqdm, trange
from tqdm import tnrange, notebook

import torch
import numpy as np
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, DistributedSampler, SequentialSampler
from transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                 GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)
from transformers import get_linear_schedule_with_warmup

In [None]:
!pip install spacy ftfy==4.4.3

In [None]:
!python -m spacy download en

In [None]:
torch.cuda.device_count()

1

In [None]:
# Special tokens and model input names, following the Hugging Face ConvAI tutorial.
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>']}
# Order matters: the tensors of each dataset are built in this order, and
# train()/evaluate() unpack every batch in exactly the same order.
MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
# Inputs that are sequences and therefore need padding to a common length.
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

In [None]:
#define hyperparameters
class args:
  model_checkpoint = 'gpt2'
  device  = 'cuda' if torch.cuda.is_available() else 'cpu'
  lr  = 6.25e-5
  num_candidates = 2 #4
  personality_permutations = 1 #2
  num_history  = 2 #Number of previous exchanges to keep in history
  fp16_training = "O1" #Set to O0, O1, O2 or O3 for fp16 training
  train_batch_size = 2
  valid_batch_size = 2
  num_epochs= 2 #3
  no_sample=False
  lm_coef = 2.0
  mc_coef = 1.0
  max_norm= 1.0
  top_p = 0.9 # Nucleus filtering (top-p) before sampling (<=0.0: no filtering)
  top_k = 0 # Filter top-k tokens before sampling (<=0: no filtering)
  temperature = 0.7 # Sampling softmax temperature
  max_len = 20 #Maximum length of the output utterances
  min_len = 1
  num_gpu = torch.cuda.device_count() #1
  gradient_accumulation_steps= 4
  local_rank= -1 # for distributed training
  url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
  local_path= "/content/drive/My Drive/Colab Notebooks/Dataset/personachat_self_original.json"
  dataset_cache_path = "/content/drive/My Drive/Colab Notebooks/Dataset/dataset_cache/persona_cache.bin_GPT2Tokenizer"
  saved_dir = "/content/drive/My Drive/Colab Notebooks/Trained Models/persona_chatbot/"

args = args

In [None]:
class FacebookPersonaDataset():
  """
    Build model-ready tensors from the PersonaChat JSON file.

    Context segments are concatenated in a single sequence
    [[bos+persona], [history], [reply+eos]], tokenized, padded and
    converted to tensors.
  """
  def __init__(self,url,cache_path,tokenizer = ''):
    """
      Args:
        url: path of the raw JSON dataset (it is opened with ``open``).
        cache_path: file holding the tokenized dataset; a falsy value disables caching.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
    """
    self.file_path = url
    self.tokenizer = tokenizer
    self.cache_path = cache_path

  def load_dataset(self):
    """
      Return the tokenized dataset, reading it from the cache when available.

      On a cache miss the raw JSON is tokenized recursively (every string is
      replaced by its list of token ids) and, if a cache path is configured,
      the result is written to it.
    """
    if self.cache_path and os.path.isfile(self.cache_path):
      # NOTE: torch.load unpickles the file; only point cache_path at trusted files.
      dataset = torch.load(self.cache_path)
      print("dataset loaded")
      return dataset

    with open(self.file_path, "r", encoding="utf-8") as f:
      dataset = json.load(f)

    def tokenize(obj):
      if isinstance(obj,str):
        return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(obj))
      if isinstance(obj,dict):
        return {n: tokenize(o) for n, o in obj.items()}
      return [tokenize(o) for o in obj]

    dataset = tokenize(dataset)
    # Only write the cache when one is configured, creating its directory if needed.
    if self.cache_path:
      cache_dir = os.path.dirname(self.cache_path)
      if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
      torch.save(dataset,self.cache_path)
    return dataset

  def _pad_dataset(self,dataset, padding=0):
    """ Pad the dataset. This could be optimized by defining a Dataset class and padding at the batch level, but this is simpler.

      Every input listed in PADDED_INPUTS is right-padded to the length of the
      longest ``input_ids`` entry; ``lm_labels`` is padded with -100 (ignored
      by the loss), the other inputs with ``padding``.
    """
    max_l = max((len(x) for x in dataset["input_ids"]), default=0)
    for name in PADDED_INPUTS:
      pad_value = -100 if name == "lm_labels" else padding
      dataset[name] = [x + [pad_value] * (max_l - len(x)) for x in dataset[name]]
    return dataset

  def build_input(self,persona,history,reply,lm_labels = False,with_eos = True):
    """ Build a sequence of input from 3 segments: persona, history and last reply.

      Args:
        persona: list of tokenized persona sentences.
        history: list of tokenized previous utterances.
        reply: tokenized candidate reply.
        lm_labels: when True, the reply tokens become language-model targets.
        with_eos: append the <eos> token to the reply.

      Returns:
        dict with ``input_ids``, ``token_type_ids``, ``mc_token_ids`` and ``lm_labels``.
    """
    bos, eos, speaker1, speaker2 = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    # [[bos + persona], history..., [reply (+ eos)]]
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
    # Prefix every utterance with its speaker token; the parity is chosen so
    # that the final reply is always attributed to speaker2.
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    # Segment ids alternate between the two speaker tokens, one per segment.
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    # Index of the last token, used by the multiple-choice head.
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    # Labels set to -100 are ignored (masked) when computing the LM loss.
    instance["lm_labels"] = [-100] * len(instance["input_ids"])
    if lm_labels:
      # Only the reply (minus its leading speaker token) contributes to the LM loss.
      instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
    return instance

  def get_data_loaders(self):
    """
      Build the padded train and validation ``TensorDataset`` objects.

      Despite its name this returns datasets, not ``DataLoader`` instances.
      Tensors are stored in MODEL_INPUTS order; all of them except
      ``mc_labels`` are reshaped to (-1, n_candidates, ...).
    """
    personachat_dataset = self.load_dataset()
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for name,dataset in personachat_dataset.items():
      num_candidates = len(dataset[0]["utterances"][0]["candidates"]) # num_candidates are same for all dialogues
      if args.num_candidates > 0 and name == 'train':
        num_candidates = min(num_candidates, args.num_candidates)
      # Recorded once per split so it is set even if a dialogue has no utterances.
      datasets[name]["n_candidates"] = num_candidates
      for dialoge in dataset:
        persona = dialoge["personality"].copy()
        for _ in range(args.personality_permutations):
          for utterance in dialoge["utterances"]:
            history = utterance["history"][-(2*args.num_history+1):]
            for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
              # The last candidate is the ground truth response.
              lm_labels = j == num_candidates-1
              instance = self.build_input(persona,history,candidate,lm_labels)
              for input_name,data in instance.items():
                datasets[name][input_name].append(data)
            # The correct candidate is always the last one.
            datasets[name]["mc_labels"].append(num_candidates - 1)
          persona = [persona[-1]] + persona[:-1]  # rotate the persona sentences

    # pad input and convert to tensor
    print("pad input and convert to tensor")
    tensor_datasets = {"train": [], "valid": []}
    pad_id = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])
    for dataset_name, dataset in datasets.items():
      dataset = self._pad_dataset(dataset, padding=pad_id)
      for input_name in MODEL_INPUTS:
        tensor = torch.tensor(dataset[input_name])
        if input_name != "mc_labels":
          tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
        tensor_datasets[dataset_name].append(tensor)

    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    return train_dataset, valid_dataset

In [None]:
# Wrap the local PersonaChat JSON and its token cache.
# NOTE(review): `tokenizer` is only assigned in a later cell of this notebook
# (the model/tokenizer loading cell), so that cell has to be run first.
persona_dataset = FacebookPersonaDataset(args.local_path,args.dataset_cache_path,tokenizer)

In [None]:
# Build the padded train/validation TensorDatasets (the method returns
# datasets, not DataLoader objects, despite its name).
train_dataset, valid_dataset = persona_dataset.get_data_loaders() 

dataset loaded
pad input and convert to tensor


In [None]:
train_dataset.tensors[0].shape

torch.Size([131438, 2, 280])

In [None]:
def add_special_token(model,tokenizer):
  """
    Register the dialogue special tokens on the tokenizer and, when any token
    was actually added, grow the model's embedding matrix accordingly.
  """
  vocab_size_before = len(tokenizer.encoder)
  added = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
  if added <= 0:
    # Nothing new was registered, so the embeddings already have the right size.
    return
  model.resize_token_embeddings(new_num_tokens=vocab_size_before + added)

In [None]:
# Pick the model and tokenizer classes that match the configured checkpoint.
model_class_name = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
# The non-GPT-2 fallback must be a tokenizer class, not the model class.
tokenizer_class_name = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer
model = model_class_name.from_pretrained(args.model_checkpoint)
tokenizer = tokenizer_class_name.from_pretrained(args.model_checkpoint)
model.to(args.device)

In [None]:
# Register <bos>/<eos>/<pad>/<speaker1>/<speaker2> on the tokenizer and resize
# the model embeddings if new tokens were added.
add_special_token(model,tokenizer)

In [None]:
def train(train_dataset,valid_dataset,model,tokenizer):
  """
    Fine-tune the double-heads model and evaluate it after every epoch.

    Args:
      train_dataset: TensorDataset whose tensors follow MODEL_INPUTS order.
      valid_dataset: TensorDataset passed to ``evaluate`` after each epoch.
      model: double-heads model returning (lm_loss, mc_loss, ...) when given labels.
      tokenizer: tokenizer saved next to the model by ``evaluate``.

    Returns:
      (mean training loss per batch, metrics dict returned by the last evaluation).
  """
  train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
  train_loader = DataLoader(train_dataset,batch_size=args.train_batch_size,sampler=train_sampler)
  optimizer = AdamW(model.parameters(),lr = args.lr,correct_bias=True)

  if args.fp16_training:
    from apex import amp
    # Allow Amp to perform casts as required by the opt_level
    model,optimizer = amp.initialize(model,optimizer,opt_level=args.fp16_training)
  # Linearly decrease the learning rate from lr to zero (stepped once per batch).
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = args.num_epochs * len(train_loader))
  tr_loss = 0.0
  global_step = 0
  metrics = {
      "nll" : 1000.0,
      "accuracy": 0.0
  }
  model.zero_grad() # Reset gradient tensor
  if args.num_gpu > 1:
    model = torch.nn.DataParallel(model)

  if args.local_rank != -1:
    model = DistributedDataParallel(model,device_ids = [args.local_rank],output_device=args.local_rank,find_unused_parameters=True)
  for epoch in range(args.num_epochs):
    if args.local_rank != -1:
      # Re-seed the distributed sampler so every epoch uses a different shuffle.
      train_sampler.set_epoch(epoch)
    for step,batch in enumerate(notebook.tqdm(train_loader,disable= args.local_rank not in [-1,0])):
      # evaluate() switches the model to eval mode, so re-enable training mode here.
      model.train()
      batch = tuple(input_tensor.to(args.device) for input_tensor in batch )
      input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

      # mc_token_ids tells the multiple-choice head which position to classify
      # from (the last real token of each candidate). evaluate() passes it too;
      # omitting it here would make training and evaluation disagree.
      (lm_loss),(mc_loss), *_ = model(input_ids,token_type_ids=token_type_ids,mc_token_ids=mc_token_ids,
                                      mc_labels=mc_labels,lm_labels=lm_labels)

      loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef)/args.gradient_accumulation_steps # Normalize our loss (if averaged)

      if args.num_gpu > 1:
        # mean() to average on multi-gpu parallel training
        loss = loss.mean()

      if step % 100 == 0:
        print("Loss for step {} is {}".format(step, loss))

      if args.fp16_training:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
          scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
      else:
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(),args.max_norm)

      tr_loss += loss.item()
      global_step += 1
      if (step+1) % args.gradient_accumulation_steps == 0: # Wait for several backward steps
        optimizer.step()
        model.zero_grad()
      # Update the learning rate.
      scheduler.step()
    # Evaluate once per epoch, on the main process only.
    if args.local_rank in [-1,0]:
      metrics = evaluate(model, valid_dataset, metrics, tokenizer)

  # Guard against an empty training set (no batch was processed).
  return tr_loss/max(global_step, 1),metrics


In [None]:
def evaluate(model,valid_dataset,metrics,tokenizer):
  """
    Evaluate the model on the validation set and checkpoint it on improvement.

    Args:
      model: double-heads model returning (lm_logits, mc_logits, ...).
      valid_dataset: TensorDataset whose tensors follow MODEL_INPUTS order.
      metrics: dict with the best 'nll' and 'accuracy' seen so far.
      tokenizer: tokenizer saved alongside the model.

    Returns:
      ``metrics``, updated in place when both accuracy and nll improved. In that
      case the model and tokenizer are also saved to ``args.saved_dir``.
  """
  # The distributed sampler must iterate the validation data, not the training data.
  valid_sample = SequentialSampler(valid_dataset) if args.local_rank == -1 else DistributedSampler(valid_dataset)
  eval_dataloader = DataLoader(valid_dataset,sampler=valid_sample,batch_size=args.valid_batch_size)

  print(' * Running Evaluation')
  print(' * Num of examples: {}'.format(len(valid_dataset)))
  print(' * Batch size: {}'.format(args.valid_batch_size))
  nlls = []
  n_correct, n_total = 0, 0
  loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)
  model.eval()
  for batch in notebook.tqdm(eval_dataloader,desc="Evaluating"):
    with torch.no_grad():
      batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
      input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
      lm_logits, mc_logits, *_ = model(
          input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids)

      # Shift so that the logits at position t are scored against the token at t+1.
      lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
      lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)

      nlls.append(loss_fn(lm_logits_flat_shifted, lm_labels_flat_shifted).item())
      # Count correct candidate choices so accuracy is a fraction in [0, 1]
      # rather than the number of correct answers per batch.
      predictions = torch.max(mc_logits, 1)[1]
      n_correct += (predictions == mc_labels).sum().item()
      n_total += mc_labels.numel()

  if not nlls or n_total == 0:
    print(' * Validation set is empty, metrics left unchanged')
    return metrics

  nlls_mean = float(np.mean(nlls))
  accs_mean = n_correct / n_total

  if accs_mean>metrics['accuracy'] and nlls_mean<metrics['nll']:
    print(" * New high accuracy and nll! {} {} ".format(accs_mean, nlls_mean))
    metrics.update({'nll': nlls_mean, 'accuracy': accs_mean})
    # Save the model whenever both metrics improved.
    model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
    model_to_save.save_pretrained(args.saved_dir)
    tokenizer.save_pretrained(args.saved_dir)

  return metrics

In [None]:
# Run fine-tuning; returns the mean training loss and the metrics dict
# produced by the evaluation passes.
total_loss, metric = train(train_dataset,valid_dataset,model,tokenizer)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, max=65719.0), HTML(value='')))

Loss for step 0 is 46.49711608886719




Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 512.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 256.0
Loss for step 100 is 3.585773229598999
Loss for step 200 is 2.053138017654419
Loss for step 300 is 2.286947250366211
Loss for step 400 is 1.7346086502075195
Loss for step 500 is 1.2778892517089844
Loss for step 600 is 2.897172689437866
Loss for step 700 is 1.8886394500732422
Loss for step 800 is 2.0708539485931396
Loss for step 900 is 1.5509870052337646
Loss for step 1000 is 1.4844870567321777
Loss for step 1100 is 1.9230643510818481
Loss for step 1200 is 1.9749897718429565
Loss for step 1300 is 2.3276443481445312
Loss for step 1400 is 2.0347633361816406
Loss for step 1500 is 2.391751289367676
Loss for step 1600 is 1.1113433837890625
Loss for step 1700 is 1.644362449645996
Loss for step 1800 is 1.6592828035354614
Loss for step 1900 is 1.5419082641601562
L

In [None]:
#Interact
def top_filtering(logits, top_k = 0, top_p = 0.9,threshold = -float('Inf'),filter_value=-float('Inf')):
  """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

        The input tensor is modified in place and also returned.

        Args:
            logits: 1-D logits distribution of shape (vocabulary size)
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
            top_p >0.0: keep the smallest set of top tokens whose cumulative probability
                reaches top_p (nucleus filtering); the first token crossing top_p is kept.
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            threshold: a minimal threshold to keep logits
            filter_value: value written over every filtered-out logit
        Returns:
            The filtered logits tensor (same object as ``logits``).
  """
  assert logits.dim() == 1 #batch_size = 1
  top_k = min(top_k,logits.size(-1))
  if top_k > 0:
    # Remove all tokens with a logit lower than the k-th largest one.
    kth_largest = torch.topk(logits,top_k)[0][...,-1,None]
    logits[logits < kth_largest] = filter_value

  if top_p > 0.0:
    # Compute cumulative probabilities of sorted tokens
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    # Remove tokens with cumulative probability above the threshold
    sorted_indices_to_remove = cumulative_probabilities > top_p
    # Shift the mask to the right to keep also the first token above the threshold
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = False

    # Back to unsorted indices and set them to the filter value
    indices_to_remove = sorted_indices[sorted_indices_to_remove]
    logits[indices_to_remove] = filter_value

  logits[logits < threshold] = filter_value

  return logits

In [None]:
def sample_sequence(personality,history,tokenizer,model,persona_dataset,current_output = None):
  """
    Generate a response token by token from the persona and previous utterances.

    Args:
      personality: list of tokenized persona sentences.
      history: list of tokenized previous utterances.
      tokenizer: tokenizer used to resolve the special token ids.
      model: language model called as ``model(input_ids, token_type_ids=...)``.
      persona_dataset: object whose ``build_input`` assembles the model input.
      current_output: optional list of already generated token ids to continue from.

    Returns:
      List of generated token ids (at most ``args.max_len`` new tokens); generation
      stops early when a special token is produced.
  """
  special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
  if current_output is None:
    current_output = []
  for i in range(args.max_len):
    instance = persona_dataset.build_input(personality,history,current_output,with_eos=False)
    input_ids = torch.tensor(instance["input_ids"],device= args.device).unsqueeze(0)
    token_type_ids = torch.tensor(instance["token_type_ids"], device= args.device).unsqueeze(0)

    logits = model(input_ids,token_type_ids = token_type_ids)
    if isinstance(logits, tuple):  # for gpt2 and maybe others
      logits = logits[0]
    # Keep the logits of the last position of the single batch item.
    logits = logits[0,-1,:]/args.temperature
    logits = top_filtering(logits,top_k = args.top_k,top_p = args.top_p)
    probs = F.softmax(logits, -1)

    prev = torch.topk(probs,1)[1] if args.no_sample else torch.multinomial(probs,1)

    if i < args.min_len and prev.item() in special_tokens_ids:
      # Resample until a regular token comes out so that the reply has at
      # least args.min_len tokens.
      while prev.item() in special_tokens_ids:
        if probs.max().item() == 1:
          warnings.warn("Warning: model generating special token with probability 1.")
          break  # avoid infinitely looping over special token
        prev = torch.multinomial(probs, num_samples=1)

    if prev.item() in special_tokens_ids:
      break
    current_output.append(prev.item())
  return current_output

In [None]:
# Gather the persona of every dialogue across all splits and pick one at random.
dataset = persona_dataset.load_dataset()
personalities = [dialog["personality"] for dialog in chain.from_iterable(dataset.values())]
personality = random.choice(personalities)

In [None]:
def predict(model,tokenizer,personality,persona_dataset):
  """
    Run an interactive chat loop on stdin/stdout with the given persona.

    The loop never returns on its own; interrupt it (e.g. KeyboardInterrupt)
    to stop the conversation.

    Args:
      model: model used by ``sample_sequence`` to generate replies.
      tokenizer: tokenizer used to encode the prompts and decode the replies.
      personality: list of tokenized persona sentences.
      persona_dataset: object whose ``build_input`` assembles the model input.
  """
  # Interpolate the decoded persona (passing it as a second print argument
  # would leave a literal "%s" in the output).
  print("Selected personality: %s" % tokenizer.decode(list(chain(*personality))))
  history = []
  while True:
    raw_text = input(">>> ")
    while not raw_text:
      print('Prompt should not be empty!')
      raw_text = input(">>> ")
    history.append(tokenizer.encode(raw_text))
    with torch.no_grad():
      out_ids = sample_sequence(personality, history, tokenizer, model, persona_dataset)
    history.append(out_ids)
    # Keep only the most recent exchanges as context.
    history = history[-(2*args.num_history+1):]
    out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
    print(out_text)

In [None]:
#reload checkpoints and evaluate on test dataset
# Load with the same architecture family that was fine-tuned and saved:
# the classes must match the checkpoint configured in args.model_checkpoint.
if "gpt2" in args.model_checkpoint:
  model = GPT2DoubleHeadsModel.from_pretrained(args.saved_dir)
  tokenizer = GPT2Tokenizer.from_pretrained(args.saved_dir)
else:
  model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.saved_dir)
  tokenizer = OpenAIGPTTokenizer.from_pretrained(args.saved_dir, do_lower_case=True)
model.to(args.device)

In [None]:
personality

In [None]:
tokenizer.decode([547, 1362, 544, 2846, 239])

In [None]:
personality.append([1128, 17624, 1532, 10591,239])

In [None]:
tokenizer.decode(chain(*personality))

In [None]:
# Re-collect every persona across all splits and draw a new random one.
personalities = [dialog["personality"] for dialog in chain.from_iterable(dataset.values())]
personality = random.choice(personalities)

In [None]:
# Start the interactive chat; the loop runs until it is interrupted.
predict(model,tokenizer,personality,persona_dataset)

Selected personality: %s i like to place blame on other people even when i know it is my fault. i'm from san fransico. i like to smell my own farts. my beer gut is so huge i'ven t seen my feet in two years. i'm always the one who buys the beers.
>>> what is your name ?
my name is george, i love to drive my truck.
>>> what is your name ?
my name is mia, i am a beer truck driver
>>> what is your name ?
my name is jenny, what is yours?
>>> what is your name ?
i am named joe, what is your name?
>>> what is your name ?
i like to drive, you?
>>> what is your name ?
i like to drink beer, do you?
>>> what is your name ?
mine is joe.
>>> what is your name ?
what do you like to do?
>>> what is your name ?
what do you do for work?
>>> what is your name ?
i'm a police officer.
>>> what is your name ?
what do you do?
>>> what is your name ?
what is your job?
>>> what is your name ?
i'm a cop.
>>> what is your name ?
what do you do for a living?
>>> what is your name ?
i'm in business
>>> what is yo

KeyboardInterrupt: ignored

In [None]:
x = (([1,2,3],[[3,4,3],[5,8,5],[6,6,11]]),([3,4,3],[5,5,5]))

In [None]:
torch.cuda.empty_cache()

In [None]:
torch.sum((torch.max(torch.tensor(x[0][1]), 1)[1] == torch.tensor(x[1][1])).int()).detach().cpu().numpy().mean()

0.0