<a href="https://colab.research.google.com/github/SyaoranClone/Persona-Chatbot/blob/master/Persona_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
%%bash
# Install NVIDIA Apex (its `amp` module is imported further down when fp16 training is on).
# The cell magic has to be the very first line of the cell, so comments live below it.
# Clone only when the checkout is missing so that re-running the cell does not fail.
[ -d apex ] || git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --no-cache-dir ./

Writing setup.sh


In [1]:
import json
import time
from argparse import ArgumentParser
from collections import defaultdict
from itertools import chain

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
# from transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
#                                  GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)

In [None]:
# Special tokens used to mark sequence boundaries, the two speakers and padding.
SPECIAL_TOKENS = [
    "<bos>",
    "<eos>",
    "<speaker1>",
    "<speaker2>",
    "<pad>",
]

# The same tokens, keyed by the attribute name under which each one is
# registered on the tokenizer.
ATTR_TO_SPECIAL_TOKEN = dict(
    bos_token='<bos>',
    eos_token='<eos>',
    pad_token='<pad>',
    additional_special_tokens=['<speaker1>', '<speaker2>'],
)

In [None]:
#define hyperparameters
params = {}
params["model_checkpoint"]  = 'gpt2'
params["device"]  = 'cuda' if torch.cuda.is_available() else 'cpu'
params["lr"]  = 6.25e-5
params["num_candidates"]  = 2
params["num_history"]  = 2 #Number of previous exchanges to keep in history
params["fp16_training"] = "O1" #Set to O0, O1, O2 or O3 for fp16 training
params["url"]  = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
params["local_path"] = "/content/drive/My Drive/Colab Notebooks/Dataset/personachat_self_original.json"

In [None]:
class DataLoaders():
  """Reads a PersonaChat-style JSON file and assembles per-candidate model inputs.

  The JSON file is expected to map split names ("train" / "valid") to lists of
  dialogues, each with a "personality" list and an "utterances" list whose items
  carry "history" and "candidates" lists (this is the structure the code indexes).
  """

  # Label stored at every position that must not contribute to the LM loss.
  # NOTE(review): torch.nn.CrossEntropyLoss ignores -100 by default; confirm that -1
  # is what the loss of the installed transformers version actually ignores.
  IGNORE_INDEX = -1

  def __init__(self,url,tokenizer = ''):
    """
    Args:
      url: path of the JSON file; despite the name it is opened with `open()`,
        so it has to be a local path.
      tokenizer: object exposing `tokenize()` and `convert_tokens_to_ids()`.
        Optional for `_load_dataset`, required by `get_data_loaders`.
    """
    self.file_path = url
    self.tokenizer = tokenizer

  def _load_dataset(self):
    """Parse the JSON file at `self.file_path` and return it untouched (raw strings)."""
    with open(self.file_path, "r", encoding="utf-8") as f:
      return json.load(f)

  def _tokenize(self,obj):
    """Recursively replace every string inside `obj` by its list of token ids.

    Dicts keep their keys, every other container comes back as a list.
    """
    if isinstance(obj,str):
      return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(obj))
    if isinstance(obj,dict):
      return {key: self._tokenize(value) for key, value in obj.items()}
    return [self._tokenize(item) for item in obj]

  def _build_input(self,persona,history,reply,tokenizer = None,lm_labels = False,with_eos = True):
    """ Build a sequence of input from 3 segments: persona, history and last reply.

    Args:
      persona: list of token-id lists, one per persona sentence.
      history: list of token-id lists, one per previous utterance.
      reply: token-id list of the candidate reply.
      tokenizer: used to look up the special-token ids; defaults to `self.tokenizer`.
      lm_labels: when True the reply tokens become language-modelling targets,
        otherwise every label is `IGNORE_INDEX`.
      with_eos: append the <eos> id to the reply.

    Returns:
      dict with "input_ids", "token_type_ids", "mc_token_ids" (index of the last
      token) and "lm_labels".
    """
    if tokenizer is None:
      tokenizer = self.tokenizer
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    # persona (prefixed with <bos>) + history + reply (optionally closed with <eos>)
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
    # prefix every non-persona segment with a speaker token chosen by its parity
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    # segment ids: one speaker id per token, alternating from segment to segment
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    # position of the last token
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    # language-model labels are used to calculate lm_loss
    ignore = self.IGNORE_INDEX
    instance["lm_labels"] = [ignore] * len(instance["input_ids"])
    if lm_labels:
      # mask everything before the reply and the reply's speaker token
      instance["lm_labels"] = ([ignore] * sum(len(s) for s in sequence[:-1])) + [ignore] + sequence[-1][1:]
    return instance

  def get_data_loaders(self):
    """Tokenize the dataset and build one model input per kept candidate reply.

    For every utterance the last `params["num_candidates"]` candidates are kept;
    the final one is the only one that receives language-modelling labels and its
    index is recorded under "mc_labels".

    Returns:
      {"train": ..., "valid": ...}, each a dict of lists keyed by input name.
      Nothing is padded or converted to tensors here.

    Raises:
      ValueError: if the instance was created without a tokenizer.
    """
    if not self.tokenizer:
      raise ValueError("DataLoaders needs a tokenizer to build model inputs")
    personachat_dataset = self._tokenize(self._load_dataset())
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for name,dataset in personachat_dataset.items():
      for dialoge in dataset:
        persona = dialoge["personality"].copy()
        for utterance in dialoge["utterances"]:
          history = utterance["history"][-(2*params["num_history"]+1):]
          candidates = utterance["candidates"][-params["num_candidates"]:]
          if not candidates:
            continue  # nothing to rank for this utterance
          last = len(candidates) - 1
          for j, candidate in enumerate(candidates):
            # only the last kept candidate gets language-modelling labels
            instance = self._build_input(persona,history,candidate,self.tokenizer,lm_labels = (j == last))
            for input_name,data in instance.items():
              datasets[name][input_name].append(data)
          datasets[name]["mc_labels"].append(last)
    return datasets
          

In [None]:
# Point the loader at the local copy of the dataset; no tokenizer is attached here.
dataloaders = DataLoaders(url=params["local_path"])

In [None]:
# Parse the JSON file into memory. `dataloaders` was created without a tokenizer and
# _load_dataset does no tokenization, so despite its name `token_data` holds raw strings.
token_data = dataloaders._load_dataset()

In [None]:
# Scratch walk over every split / dialogue / utterance / candidate of the raw dataset.
# Nothing is collected: when the loops finish, the loop variables (persona, history,
# candidate, lm_labels, ...) simply keep the values of the final iteration.
for name,dataset in token_data.items():
  for dialoge in dataset:
    # copy so later changes cannot touch the dataset's own list
    persona = dialoge["personality"].copy()
    for utterance in dialoge["utterances"]:
      # keep only the trailing 2*num_history+1 history entries
      history = utterance["history"][-(2*params["num_history"]+1):]
      # keep only the trailing num_candidates candidates
      for j, candidate in enumerate(utterance["candidates"][-params["num_candidates"]:]):
        # True only when j is the index num_candidates-1
        lm_labels = bool(j==params["num_candidates"]-1)
        

In [None]:
# NOTE(review): `arr` is not assigned in any cell visible above this one; on a fresh
# kernel this presumably raises NameError — confirm where it comes from or drop the cell.
arr

['what kind ? holiday parties ? work parties ?',
 'i love dogs want a husky but cant have one yet']

In [None]:
# Hand-written example of the segment layout: the persona sentences first (opened by
# <bos>), then one speaker-tagged segment per turn, with the last one closed by <eos>.
sequence = [
    "<bos> i like playing football . i am from NYC .".split(),
    "<speaker1> hello how are you ?".split(),
    "<speaker2> i am fine thanks .".split(),
    "<speaker1> great to hear <eos>".split(),
]

In [None]:
# Plain-string stand-ins for the two speaker ids, used by the toy cells below.
speaker1, speaker2 = 'speaker1', 'speaker2'

In [None]:
# Scratch check of the LM-label layout on the toy `sequence`: one mask value per token of
# every segment before the last, one more for the last segment's speaker token, then the
# remaining tokens of the last segment.
# NOTE(review): the mask mixes -1 and -100 — confirm which value is intended.
[-1] * sum(map(len, sequence[:-1])) + [-100] + sequence[-1][1:]

23

In [None]:
# Segment ids for the toy `sequence`: every token of an even-indexed segment is tagged
# with speaker1 and every token of an odd-indexed segment with speaker2.
seg = list(chain.from_iterable(
    [speaker2 if idx % 2 else speaker1] * len(segment)
    for idx, segment in enumerate(sequence)
))

In [None]:
def add_special_token(model,tokenizer):
  """Register the special tokens on the tokenizer and grow the model's embeddings to match.

  Args:
    model: object exposing `resize_token_embeddings(new_num_tokens=...)`.
    tokenizer: object exposing `add_special_tokens(mapping)`, which returns the number
      of tokens actually added, and `__len__`.

  The embedding matrix is only resized when at least one token was added, so calling
  this a second time is a no-op.
  """
  num_special_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
  if num_special_tokens > 0:
    # len(tokenizer) is the full vocabulary size including every added token, so this
    # does not depend on the GPT-2-specific `encoder` attribute and also accounts for
    # tokens that were added to the tokenizer before this call.
    model.resize_token_embeddings(new_num_tokens=len(tokenizer))

In [None]:
# Load model and tokenizer from the checkpoint named in params["model_checkpoint"], then
# move the model to the configured device.
# NOTE(review): GPT2DoubleHeadsModel and GPT2Tokenizer only appear in the commented-out
# transformers import of the imports cell; that import has to be restored for this cell
# to run on a fresh kernel.
model = GPT2DoubleHeadsModel.from_pretrained(params["model_checkpoint"])
tokenizer = GPT2Tokenizer.from_pretrained(params["model_checkpoint"])
# last expression of the cell: whatever model.to(...) returns is echoed as the cell output
model.to(params["device"])

In [None]:
# Register the special tokens first, then build the optimizer over the model's parameters.
# NOTE(review): AdamW is only named in the commented-out transformers import of the
# imports cell — confirm it is in scope before running.
add_special_token(model=model, tokenizer=tokenizer)
optimizer = AdamW(
    model.parameters(),
    correct_bias=True,
    lr=params["lr"],
)

In [None]:
# Optional mixed-precision setup: any truthy params["fp16_training"] value enables it and
# is handed to Apex AMP as the opt_level.
# NOTE(review): params["device"] may be 'cpu' while fp16_training is still set; Apex AMP
# presumably needs the model on a CUDA device — confirm and guard if so.
if params["fp16_training"]:
  # imported here so apex is only required when fp16 training is switched on
  from apex import amp
  # Allow Amp to perform casts as required by the opt_level; the returned model and
  # optimizer replace the ones created above.
  model,optimizer = amp.initialize(model,optimizer,opt_level=params["fp16_training"])

In [None]:
def get_batch():
  """Placeholder for the batch-fetching helper.

  The original cell was only a bare `def` line (a SyntaxError); until the helper is
  written, calling it fails loudly instead of breaking the notebook at parse time.

  Raises:
    NotImplementedError: always.
  """
  raise NotImplementedError("get_batch is not implemented yet")

In [None]:
def train():
  """Beginning of the training routine; the body looks unfinished in this view.

  So far it only switches the global `model` to training mode and initialises a running
  loss total and a start timestamp.
  """
  model.train()
  total_loss = 0
  start_time = time.time()
  
  