<a href="https://colab.research.google.com/github/SyaoranClone/Persona-Chatbot/blob/master/Persona_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
#install Apex
%%writefile setup.sh

git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --no-cache-dir ./
!sh setup.sh

Writing setup.sh


In [2]:
import json
import time
import os
from itertools import chain
from argparse import ArgumentParser
from collections import defaultdict

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
from transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                 GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)

In [3]:
#According to Huggingface Convai tutorial
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>']}
MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

In [4]:
#define hyperparameters
params = {}
params["model_checkpoint"]  = 'gpt2'
params["device"]  = 'cuda' if torch.cuda.is_available() else 'cpu'
params["lr"]  = 6.25e-5
params["num_candidates"]  = 2
params["num_history"]  = 2 #Number of previous exchanges to keep in history
params["fp16_training"] = "O1" #Set to O0, O1, O2 or O3 for fp16 training
params["url"]  = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
params["local_path"] = "/content/drive/My Drive/Colab Notebooks/Dataset/personachat_self_original.json"
params["dataset_cache_path"] = "/content/drive/My Drive/Colab Notebooks/Dataset/dataset_cache/persona_cache.bin_GPT2Tokenizer"

In [5]:
class FacebookPersonaDataset():
  """
    Concatenate context segments in a single sequence [[bos+persona], [history], [reply+eos]]
    Tokenize and convert them to tensor
    
  """
  def __init__(self,url,cache_path,tokenizer = ''):
    self.file_path = url
    self.tokenizer = tokenizer
    self.cache_path = cache_path

  def _load_dataset(self):
    #self.cache_path = self.cache_path + '_' + type(self.tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
    if self.cache_path and os.path.isfile(self.cache_path):
      #load tokenized dataset
      dataset = torch.load(self.cache_path)
      print("dataset loaded")
    else:
      with open(self.file_path, "r", encoding="utf-8") as f:
        dataset = json.loads(f.read())
      
      def tokenize(obj):
        if isinstance(obj,str):
          return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(obj))
        if isinstance(obj,dict):
          return dict((n, tokenize(o)) for n, o in obj.items())
        return list(tokenize(o) for o in obj)
      dataset = tokenize(dataset)
      torch.save(dataset,self.cache_path)
    return dataset

  def _pad_dataset(self,dataset, padding=0):
    """ Pad the dataset. This could be optimized by defining a Dataset class and padding at the batch level, but this is simpler. """
    max_l = max(len(x) for x in dataset["input_ids"])
    for name in PADDED_INPUTS:
        dataset[name] = [x + [padding if name != "lm_labels" else -100] * (max_l - len(x)) for x in dataset[name]]
    return dataset

  def _build_input(self,persona,history,reply,lm_labels = False,with_eos = True):
    """ Build a sequence of input from 3 segments: persona, history and last reply. """
    bos, eos, speaker1, speaker2 = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    #bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"
    #sequence = [[bos] + list(persona) + ["his"] + history + ["rep"] + [reply]]
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])] #add bos, eos
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])] #add speaker1, speaker2
    #after concat: [[bos+persona], [history], [reply+eos]]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    #segment
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    #position
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    #language model labels is used to calculate lm_loss
    instance["lm_labels"] = [-100] * len(instance["input_ids"]) #labels set to -100 are ignored (masked)
    if lm_labels:
        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
    return instance

  def get_data_loaders(self):
    personachat_dataset = self._load_dataset()
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for name,dataset in personachat_dataset.items():
      num_candidates = len(dataset[0]["utterances"][0]["candidates"]) #num_candidates are same for all dialoges
      if params["num_candidates"] > 0 and name == 'train':
        num_candidates =  min(num_candidates ,params["num_candidates"])
      for dialoge in dataset:
        persona = dialoge["personality"].copy()
        for utterance in dialoge["utterances"]:
          history = utterance["history"][-(2*params["num_history"]+1):]
          for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
            #The last candidate is a ground truth reponse.
            lm_labels = bool(j==num_candidates-1)
            instance = self._build_input(persona,history,candidate,lm_labels)
            for input_name,data in instance.items():
              datasets[name][input_name].append(data) #datasets['train']['input_ids'] of [[c1 in u1],[c2 in u1],..,[c1 in u 7][c2 in u7]]
          datasets[name]["mc_labels"].append(num_candidates - 1) #
          datasets[name]["n_candidates"] = num_candidates

    #pad input and convert to tensor
    print("pad input and convert to tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
      dataset =  self._pad_dataset(dataset, padding=self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
      for input_name in MODEL_INPUTS:
        tensor =  torch.tensor(dataset[input_name])
        if input_name != "mc_labels":
          tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
        tensor_datasets[dataset_name].append(tensor)

    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    return train_dataset, valid_dataset 

In [7]:
persona_dataset = FacebookPersonaDataset(params["local_path"],params["dataset_cache_path"],tokenizer)

In [13]:
#datasets = persona_dataset._load_dataset()

dataset loaded


In [8]:
train_dataset, valid_dataset = persona_dataset.get_data_loaders() 

dataset loaded
pad input and convert to tensor


In [13]:
train_dataset.tensors[0].shape

torch.Size([131438, 2, 280])

In [None]:
def add_special_token(model,tokenizer):
  origin_num_tokens = len(tokenizer.encoder)
  num_special_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
  if num_special_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=origin_num_tokens + num_special_tokens)

In [6]:
model = GPT2DoubleHeadsModel.from_pretrained(params["model_checkpoint"])
tokenizer = GPT2Tokenizer.from_pretrained(params["model_checkpoint"])
model.to(params["device"])

Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'multiple_choice_head.summary.weight', 'lm_head.weight', 'multiple_choice_head.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2DoubleHeadsModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [None]:
add_special_token(model,tokenizer)
optimizer = AdamW(model.parameters(), lr=params["lr"] , correct_bias=True)

In [None]:
if params["fp16_training"]:
  from apex import amp
  # Allow Amp to perform casts as required by the opt_level 
  model,optimizer = amp.initialize(model,optimizer,opt_level=params["fp16_training"])

In [None]:
def get_batch()

In [None]:
def train():
  model.train()
  total_loss = 0
  start_time = time.time()
  
  