In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
cd "/content/gdrive/My Drive/Project/drqa/scripts"

/content/gdrive/My Drive/Project/drqa/scripts


In [0]:
!ls

convert       drqa	    models	     Untitled1.ipynb
data.msgpack  meta.msgpack  Untitled0.ipynb


In [0]:
import re
import os
import sys
import math
import random
import string
import logging
import argparse
from shutil import copyfile
from datetime import datetime
from collections import Counter
import torch
import msgpack
from drqa.model import DocReaderModel
from drqa.utils import str2bool

In [0]:
print(torch.cuda.is_available())

True


In [0]:
#Creating a class of arguments
class arguments:
  def __init__(self):
  #System
    self.log_per_updates=3 #log model loss per x updates (mini-batches).
    self.data_file='data.msgpack' #data messagepack from preprocessing data
    self.model_dir='models' #folder name where models have to be loaded
    self.save_last_only=True #Only save the final models
    self.save_dawn_logs=True #append dawnbench log entries prefixed with dawn_entry:
    self.seed=1013 #random seed for data shuffling, dropout, etc.
    self.cuda=True #GPU usage

  #Training
    self.epochs=40 #Number of epochs
    self.batch_size=32 #Batch size
    self.resume='best_model.pt' #Best model name if already present
    self.resume_options=True #use previous model options, ignore the cli and defaults
    self.reduce_lr=0 #reduce initial (resumed) learning rate by this factor
    self.optimizer='adamax' #supported optimizer: adamax, sgd
    self.grad_clipping=10 
    self.weight_decay=0
    self.learning_rate=0.1 #only applied to SGD
    self.momentum=0 #only applied to SGD
    self.tune_partial=0 #finetune top-x embeddings
    self.fix_embeddings=True #if true, `tune_partial` will be ignored
    self.rnn_padding=True #perform rnn padding (much slower but more accurate)

  #model
    self.question_merge='self_attn'
    self.doc_layers=3
    self.question_layers=3
    self.hidden_size=128
    self.num_features=4
    self.pos=True #use pos tags as a feature
    self.ner=True #use named entity tags as a feature
    self.use_qemb=True 
    self.concat_rnn_layers=True
    self.dropout_emb=0.4
    self.dropout_rnn=0.4
    self.dropout_rnn_output=True
    self.max_len=15
    self.rnn_type='lstm' #supported types: rnn, gru, lstm

In [0]:
args=arguments()

In [0]:
# set model dir
model_dir = args.model_dir
os.makedirs(model_dir, exist_ok=True)
args.model_dir = os.path.abspath(model_dir)

If model is already there we'll use that model for restarting our training

In [0]:
if args.resume == 'best_model.pt' and not os.path.exists(os.path.join(args.model_dir, args.resume)):
        # means we're starting fresh
        args.resume = ''

In [0]:
print(args.resume)

best_model.pt


In [0]:
# set random seed
random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [0]:
# setup logger
class ProgressHandler(logging.Handler):
        def __init__(self, level=logging.NOTSET):
            super().__init__(level)

        def emit(self, record):
            log_entry = self.format(record)
            if record.message.startswith('> '):
                sys.stdout.write('{}\r'.format(log_entry.rstrip()))
                sys.stdout.flush()
            else:
                sys.stdout.write('{}\n'.format(log_entry))

In [0]:
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
fh = logging.FileHandler(os.path.join(args.model_dir, 'log.txt'))
fh.setLevel(logging.INFO)
ch = ProgressHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
log.addHandler(fh)
log.addHandler(ch)

In [0]:
def lr_decay(optimizer, lr_decay):
    for param_group in optimizer.param_groups:
        param_group['lr'] *= lr_decay
    return optimizer

In [0]:
class BatchGen:
    pos_size = None
    ner_size = None

    def __init__(self, data, batch_size, gpu, evaluation=False):
        """
        input:
            data - list of lists
            batch_size - int
        """
        self.batch_size = batch_size
        self.eval = evaluation
        self.gpu = gpu

        # sort by len
        data = sorted(data, key=lambda x: len(x[1]))
        # chunk into batches
        data = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]

        # shuffle
        if not evaluation:
            random.shuffle(data)

        self.data = data

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        for batch in self.data:
            batch_size = len(batch)
            batch = list(zip(*batch))
            if self.eval:
                assert len(batch) == 8
            else:
                assert len(batch) == 10

            context_len = max(len(x) for x in batch[1])
            context_id = torch.LongTensor(batch_size, context_len).fill_(0)
            for i, doc in enumerate(batch[1]):
                context_id[i, :len(doc)] = torch.LongTensor(doc)

            feature_len = len(batch[2][0][0])

            context_feature = torch.Tensor(batch_size, context_len, feature_len).fill_(0)
            for i, doc in enumerate(batch[2]):
                for j, feature in enumerate(doc):
                    context_feature[i, j, :] = torch.Tensor(feature)

            context_tag = torch.Tensor(batch_size, context_len, self.pos_size).fill_(0)
            for i, doc in enumerate(batch[3]):
                for j, tag in enumerate(doc):
                    context_tag[i, j, tag] = 1

            context_ent = torch.Tensor(batch_size, context_len, self.ner_size).fill_(0)
            for i, doc in enumerate(batch[4]):
                for j, ent in enumerate(doc):
                    context_ent[i, j, ent] = 1

            question_len = max(len(x) for x in batch[5])
            question_id = torch.LongTensor(batch_size, question_len).fill_(0)
            for i, doc in enumerate(batch[5]):
                question_id[i, :len(doc)] = torch.LongTensor(doc)

            context_mask = torch.eq(context_id, 0)
            question_mask = torch.eq(question_id, 0)
            text = list(batch[6])
            span = list(batch[7])
            if not self.eval:
                y_s = torch.LongTensor(batch[8])
                y_e = torch.LongTensor(batch[9])
            if self.gpu:
                context_id = context_id.pin_memory()
                context_feature = context_feature.pin_memory()
                context_tag = context_tag.pin_memory()
                context_ent = context_ent.pin_memory()
                context_mask = context_mask.pin_memory()
                question_id = question_id.pin_memory()
                question_mask = question_mask.pin_memory()
            if self.eval:
                yield (context_id, context_feature, context_tag, context_ent, context_mask,
                       question_id, question_mask, text, span)
            else:
                yield (context_id, context_feature, context_tag, context_ent, context_mask,
                       question_id, question_mask, y_s, y_e, text, span)


In [0]:
def load_data(opt):
    with open('meta.msgpack', 'rb') as f:
        meta = msgpack.load(f)
    embedding = torch.Tensor(meta['embedding'])
    opt['pretrained_words'] = True
    opt['vocab_size'] = embedding.size(0) #Embedding rows length which is vocab length
    opt['embedding_dim'] = embedding.size(1) #Embedding columns which is word vector dimesions 
    opt['pos_size'] = len(meta['vocab_tag']) #Vocab POS tags length
    opt['ner_size'] = len(meta['vocab_ent']) #Vocab NER tags length
    BatchGen.pos_size = opt['pos_size'] #We are assigning the Batch size pos size with number of pos tags
    BatchGen.ner_size = opt['ner_size'] #We are assigning the Batch size NER size with number of NER tags
    with open(opt['data_file'], 'rb') as f:
        data = msgpack.load(f)
    train = data['train'] #Assinging complete train data 
    data['dev'].sort(key=lambda x: len(x[1]))
    dev = [x[:-1] for x in data['dev']] #Assigning all id, context_id, context_features, tag_id, ent_id, question_id, context, context_token_span
    dev_y = [x[-1] for x in data['dev']] #Assigning answers to the dev_y
    return train, dev, dev_y, embedding, opt

In [0]:
train, dev, dev_y, embedding, opt = load_data(vars(args))


In [0]:
log.info(opt)
log.info('[Data loaded.]')

04/18/2020 02:41:22 {'log_per_updates': 3, 'data_file': 'data.msgpack', 'model_dir': '/content/gdrive/My Drive/Project/drqa/scripts/models', 'save_last_only': True, 'save_dawn_logs': True, 'seed': 1013, 'cuda': True, 'epochs': 40, 'batch_size': 32, 'resume': 'best_model.pt', 'resume_options': True, 'reduce_lr': 0, 'optimizer': 'adamax', 'grad_clipping': 10, 'weight_decay': 0, 'learning_rate': 0.1, 'momentum': 0, 'tune_partial': 0, 'fix_embeddings': True, 'rnn_padding': True, 'question_merge': 'self_attn', 'doc_layers': 3, 'question_layers': 3, 'hidden_size': 128, 'num_features': 4, 'pos': True, 'ner': True, 'use_qemb': True, 'concat_rnn_layers': True, 'dropout_emb': 0.4, 'dropout_rnn': 0.4, 'dropout_rnn_output': True, 'max_len': 15, 'rnn_type': 'lstm', 'pretrained_words': True, 'vocab_size': 91590, 'embedding_dim': 300, 'pos_size': 50, 'ner_size': 19}
04/18/2020 02:41:24 [Data loaded.]


In [0]:
if args.save_dawn_logs:
        dawn_start = datetime.now()
        log.info('dawn_entry: epoch\tf1Score\thours')

04/18/2020 02:41:24 dawn_entry: epoch	f1Score	hours


In [0]:
if args.resume:
        log.info('[loading previous model...]')
        checkpoint = torch.load(os.path.join(args.model_dir, args.resume))
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = DocReaderModel(opt, embedding, state_dict)
        epoch_0 = checkpoint['epoch'] + 1
        # synchronize random seed
        random.setstate(checkpoint['random_state'])
        torch.random.set_rng_state(checkpoint['torch_state'])
        if args.cuda:
            torch.cuda.set_rng_state(checkpoint['torch_cuda_state'])
        if args.reduce_lr:
            lr_decay(model.optimizer, lr_decay=args.reduce_lr)
            log.info('[learning rate reduced by {}]'.format(args.reduce_lr))
        batches = BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
        predictions = []
        for i, batch in enumerate(batches):
            predictions.extend(model.predict(batch))
            log.debug('> evaluating [{}/{}]'.format(i, len(batches)))
        em, f1 = score(predictions, dev_y)
        log.info("[dev EM: {} F1: {}]".format(em, f1))
        if math.fabs(em - checkpoint['em']) > 1e-3 or math.fabs(f1 - checkpoint['f1']) > 1e-3:
            log.info('Inconsistent: recorded EM: {} F1: {}'.format(checkpoint['em'], checkpoint['f1']))
            log.error('Error loading model: current code is inconsistent with code used to train the previous model.')
            exit(1)
        best_val_score = checkpoint['best_eval']
else:  
        model = DocReaderModel(opt, embedding)
        epoch_0 = 1
        best_val_score = 0.0

#Calls Doc Reader Model that handles the intializing, underlying network architecture, saving, updating ad predicting examples.
#First it calls AverageMeter() -->beta=0.99, moment=0, value=0, t=0
#It assigns RnnDocReader network to network variable
#In RnnDocReader we are using lstm. 


In [0]:
def _normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def _exact_match(pred, answers):
    if pred is None or answers is None:
        return False
    pred = _normalize_answer(pred)
    for a in answers:
        if pred == _normalize_answer(a):
            return True
    return False


def _f1_score(pred, answers):
    def _score(g_tokens, a_tokens):
        common = Counter(g_tokens) & Counter(a_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1. * num_same / len(g_tokens)
        recall = 1. * num_same / len(a_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    if pred is None or answers is None:
        return 0
    g_tokens = _normalize_answer(pred).split()
    scores = [_score(g_tokens, _normalize_answer(a).split()) for a in answers]
    return max(scores)


def score(pred, truth):
    assert len(pred) == len(truth)
    f1 = em = total = 0
    for p, t in zip(pred, truth):
        total += 1
        em += _exact_match(p, t)
        f1 += _f1_score(p, t)
    em = 100. * em / total
    f1 = 100. * f1 / total
    return em, f1

In [0]:
 for epoch in range(epoch_0, epoch_0 + args.epochs):
        log.warning('Epoch {}'.format(epoch))
        # train
        batches = BatchGen(train, batch_size=args.batch_size, gpu=args.cuda)
        start = datetime.now()
        for i, batch in enumerate(batches):
            model.update(batch) #Here we are updatign model with train data batch
            if i % args.log_per_updates == 0:
                log.info('> epoch [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(
                    epoch, model.updates, model.train_loss.value,
                    str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
        log.debug('\n')
        # evaluation=True there is no shuffing of data during batch generation
        batches = BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
        predictions = []
        for i, batch in enumerate(batches):
            predictions.extend(model.predict(batch)) #Here we are predicting the answers for dev st using dev batch gneration
            log.debug('> evaluating [{}/{}]'.format(i, len(batches)))
        em, f1 = score(predictions, dev_y)
        log.warning("dev EM: {} F1: {}".format(em, f1))
        if args.save_dawn_logs:
            time_diff = datetime.now() - dawn_start
            log.warning("dawn_entry: {}\t{}\t{}".format(epoch, f1/100.0, float(time_diff.total_seconds() / 3600.0)))
        # save
        if not args.save_last_only or epoch == epoch_0 + args.epochs - 1:
            model_file = os.path.join(args.model_dir, 'checkpoint_epoch_{}.pt'.format(epoch))
            model.save(model_file, epoch, [em, f1, best_val_score])
            if f1 > best_val_score:
                best_val_score = f1
                copyfile(
                    model_file,
                    os.path.join(args.model_dir, 'best_model.pt'))
                log.info('[new best model saved.]')


04/18/2020 08:34:54 Epoch 1
04/18/2020 08:34:54 Epoch 1
04/18/2020 08:34:54 Epoch 1
04/18/2020 08:42:49 

04/18/2020 08:42:49 

04/18/2020 08:42:49 

04/18/2020 08:43:40 dev EM: 53.4720908230842 F1: 64.43750696880365
04/18/2020 08:43:40 dev EM: 53.4720908230842 F1: 64.43750696880365
04/18/2020 08:43:40 dev EM: 53.4720908230842 F1: 64.43750696880365
04/18/2020 08:43:40 dawn_entry: 1	0.6443750696880365	0.2294558113888889
04/18/2020 08:43:40 dawn_entry: 1	0.6443750696880365	0.2294558113888889
04/18/2020 08:43:40 dawn_entry: 1	0.6443750696880365	0.2294558113888889
04/18/2020 08:43:40 Epoch 2
04/18/2020 08:43:40 Epoch 2
04/18/2020 08:43:40 Epoch 2
04/18/2020 08:51:37 

04/18/2020 08:51:37 

04/18/2020 08:51:37 

04/18/2020 08:52:28 dev EM: 58.7038789025544 F1: 69.0823891660754
04/18/2020 08:52:28 dev EM: 58.7038789025544 F1: 69.0823891660754
04/18/2020 08:52:28 dev EM: 58.7038789025544 F1: 69.0823891660754
04/18/2020 08:52:28 dawn_entry: 2	0.690823891660754	0.3760885711111111
04/18/2020 08: