In [0]:
!pip install torch
!pip install pytorch-ignite
!pip install pytorch-transformers>=1.2
!pip install tensorboardX==1.8
!pip install tensorflow  # for tensorboardX

Collecting pytorch-ignite
[?25l  Downloading https://files.pythonhosted.org/packages/8f/31/efcc2b587419b1f54c5c6ef51996f91bb5d8f760537d17de674c89e06048/pytorch_ignite-0.2.1-py2.py3-none-any.whl (84kB)
[K     |████████████████████████████████| 92kB 3.4MB/s 
Installing collected packages: pytorch-ignite
Successfully installed pytorch-ignite-0.2.1
Collecting tensorboardX==1.8
[?25l  Downloading https://files.pythonhosted.org/packages/c3/12/dcaf67e1312475b26db9e45e7bb6f32b540671a9ee120b3a72d9e09bc517/tensorboardX-1.8-py2.py3-none-any.whl (216kB)
[K     |████████████████████████████████| 225kB 3.4MB/s 
Installing collected packages: tensorboardX
Successfully installed tensorboardX-1.8


In [0]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [0]:
import os
import math
import logging
from pprint import pformat
from collections import defaultdict
from argparse import ArgumentParser
from itertools import chain

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.parallel import DistributedDataParallel

from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import ProgressBar, PiecewiseLinear

from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
from pytorch_transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                  GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)

import json
from datetime import datetime
import tempfile
import socket
import tarfile
from pytorch_transformers import cached_path

In [0]:
HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/gpt_personachat_cache.tar.gz"

In [0]:
parser = ArgumentParser()
parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="-->Path or url of the dataset cache")
parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="-->Path, url or short name of the model")
parser.add_argument("--mc_coef", type=float, default=1.0, help="-->Multiple-choice loss coef")
parser.add_argument("--max_norm", type=float, default=1.0, help="-->gradient norm (clipping)")
parser.add_argument("--n_epochs", type=int, default=1, help="-->Number of training epochs")
parser.add_argument("--train_batch_size", type=int, default=4, help="-->training batch size ")
parser.add_argument("--valid_batch_size", type=int, default=4, help="-->validation batch size")
parser.add_argument("--num_candidates", type=int, default=2, help="-->no of candidates for training")
parser.add_argument("--max_history", type=int, default=2, help="-->previous dialogue no to keep in history")
parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="-->Gradients accumulating on several steps")
parser.add_argument("--lr", type=float, default=6.25e-5, help="-->Learning rate")
parser.add_argument("--lm_coef", type=float, default=1.0, help="-->LM loss coef")
parser.add_argument("--eval_before_start", action='store_true', help="-->If true start with a first evaluation before training")
parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="-->selection of device for training -GPU or CPU-")
parser.add_argument("--fp16", type=str, default="", help="-->Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
parser.add_argument("--personality_permutations", type=int, default=1, help="-->setting permutation no of personality Sentences")

parser.add_argument("--local_rank", type=int, default=-1, help="-->Local rank for distributed training (-1: not distributed)")
# args = parser.parse_args()
args = parser.parse_args(args=[])

In [0]:
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ('<speaker1>', '<speaker2>')}
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]
def addingSpecialTokens_(model, tokenizer):

    prevTokens = len(tokenizer.encoder)
    newTokensLen = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) 
    # making sure new tokens are being are being added
    if newTokensLen > 0: 
        model.resize_token_embeddings(new_num_tokens= newTokensLen + prevTokens)

In [0]:

if "gpt2" in args.model_checkpoint:
    tokenizer_class = GPT2Tokenizer
else:
    tokenizer_class = OpenAIGPTTokenizer

tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

if "gpt2" in args.model_checkpoint:
    model_class = GPT2DoubleHeadsModel
else:
    model_class = OpenAIGPTDoubleHeadsModel
model = model_class.from_pretrained(args.model_checkpoint)
model.to(args.device)
addingSpecialTokens_(model, tokenizer) # adding tokens
optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)





100%|██████████| 815973/815973 [00:00<00:00, 5447817.52B/s]
100%|██████████| 458495/458495 [00:00<00:00, 3810642.50B/s]
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.
100%|██████████| 273/273 [00:00<00:00, 56551.02B/s]
100%|██████████| 478750579/478750579 [00:11<00:00, 41328402.92B/s]


In [0]:


#                                                           Code for reading file on colab

# !pip install -U -q PyDrive
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials
# # Authenticate and create the PyDrive client.
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

In [0]:

from google.colab import drive
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/fyp1")
# with open('dataset.json') as json_file:
#     itemData = json.load(json_file)
# with open('dataset.json', "r", encoding="utf-8") as f:
#     dataset = json.loads(f.read())

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
with open('small_dataset.json', "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())

In [0]:
def tokenize(obj): # three types of inputs are acceptable string dictonary and list
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return dict((n, tokenize(o)) for n, o in obj.items())
    return list(tokenize(o) for o in obj)


In [0]:
dataset_cache=args.dataset_cache
dataset_cache = dataset_cache + '_' + type(tokenizer).__name__
dataset = tokenize(dataset)
torch.save(dataset, dataset_cache)

In [0]:
def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    ##Build a sequence of input from 3 segments: persona, history and last reply.
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    instance["lm_labels"] = [-1] * len(instance["input_ids"])
    if lm_labels:
        instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:]
    return instance

In [0]:
def pad_dataset(dataset, padding=0):
    #Pad the dataset. This could be optimized by defining a Dataset class and padding at the batch level, but this is simpler. 
    k=0
    for x in dataset["input_ids"]:
        if len(x)>k:
            k=len(x)
    # max_l = max(len(x) for x in dataset["input_ids"])

    for name in PADDED_INPUTS:
        dataset[name] = [x + [padding if name != "lm_labels" else -1] * (k - len(x)) for x in dataset[name]]
    return dataset

In [0]:
def average_distributed_scalar(scalar, args):
    """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """
    if args.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()

In [0]:
def get_data_loaders(dataset_, args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    originalDataset = dataset_

  
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in originalDataset.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2*args.max_history+1):]
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        lm_labels = bool(j == num_candidates-1)
                        instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                        for input_name, input_array in instance.items():
                            datasets[dataset_name][input_name].append(input_array)
                    datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
                    datasets[dataset_name]["n_candidates"] = num_candidates
                persona = [persona[-1]] + persona[:-1]  # permuted personalities

   
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)


    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler =  None
    valid_sampler =  None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(True))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False)

    # logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    # logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler


In [0]:

train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(dataset, args, tokenizer)

In [0]:
def update(engine, batch):
    model.train()
    batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
    input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
    (lm_loss), (mc_loss), *_ = model(
        input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
        mc_labels=mc_labels, lm_labels=lm_labels
    )
    loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
    if args.fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
    else:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
    if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()
trainer = Engine(update)


In [0]:
# Evaluation function and evaluator (evaluator output is the input of the metrics)
def inference(engine, batch):
    model.eval()
    with torch.no_grad():
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
        # if we dont send labels to model, it doesnt return losses
        lm_logits, mc_logits, *_ = model(
            input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
        )
        lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
        lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
        return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
evaluator = Engine(inference)

In [0]:
# Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
if args.n_epochs < 1:
    trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
if args.eval_before_start:
    trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))



# Linearly decrease the learning rate from lr to zero
scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

# Prepare metrics - note how we compute distributed metrics
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
            "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
for name, metric in metrics.items():
    metric.attach(evaluator, name)

In [0]:
def make_logdir(model_name: str):
    """Create unique path to save results and checkpoints, e.g. runs/Sep22_19-45-59_gpu-7_gpt2"""
    # Code copied from ignite repo
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    logdir = os.path.join(
        'runs', current_time + '_' + socket.gethostname() + '_' + model_name)
    return logdir

In [0]:
# On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
if args.local_rank in [-1, 0]:
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])
    evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    log_dir = make_logdir(args.model_checkpoint)
    tb_logger = TensorboardLogger(log_dir)

    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
    tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

    checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

    torch.save(args, log_dir + '/model_training_args.bin')
    getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
    tokenizer.save_pretrained(log_dir)

# Run the training
trainer.run(train_loader, max_epochs=2)

# On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
if args.local_rank in [-1, 0] and args.n_epochs > 0:
    os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
    tb_logger.close()



HBox(children=(IntProgress(value=0, max=91), HTML(value='')))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-22-dfd6071daa25>", line 21, in <module>
    trainer.run(train_loader, max_epochs=2)
  File "/usr/local/lib/python3.6/dist-packages/ignite/engine/engine.py", line 446, in run
    self._handle_exception(e)
  File "/usr/local/lib/python3.6/dist-packages/ignite/engine/engine.py", line 410, in _handle_exception
    raise e
  File "/usr/local/lib/python3.6/dist-packages/ignite/engine/engine.py", line 437, in run
    self._fire_event(Events.EPOCH_COMPLETED)
  File "/usr/local/lib/python3.6/dist-packages/ignite/engine/engine.py", line 345, in _fire_event
    func(self, *(event_args + args), **kwargs)
  File "<ipython-input-20-e30318fd2e18>", line 1, in <lambda>
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
  File "/usr/local/lib/python3

KeyboardInterrupt: ignored