In [1]:
!pip install transformers

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForQuestionAnswering,
    BertTokenizer,
    DistilBertConfig,
    DistilBertForQuestionAnswering,
    DistilBertTokenizer,
    get_linear_schedule_with_warmup,
    squad_convert_examples_to_features,
)

from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)

from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor

import tensorflow_datasets as tfds
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor, squad_convert_example_to_features
from transformers import (
    BertTokenizer
)
import gc
gc.collect()

import glob
import logging
import os
import random
import timeit
from torch.utils.tensorboard import SummaryWriter

def to_list(tensor):
    """Return the contents of ``tensor`` as a plain (possibly nested) Python list.

    The tensor is first detached from the autograd graph and moved to host
    memory, so the call works for tensors that require grad or live on a GPU.
    """
    host_copy = tensor.detach().cpu()
    return host_copy.tolist()
  
def load_and_cache_examples(
    tokenizer,
    evaluate=False,
    output_examples=False,
    *,
    data_dir=None,
    model_name_or_path="bert-base-cased",
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    train_file=None,
    predict_file=None,
    version_2_with_negative=False,
    overwrite_cache=False,
    threads=1,
    local_rank=-1,
):
    """Load SQuAD features, building and caching them on first use.

    The features are read from ``cached_<split>_<model>_<max_seq_length>``
    inside ``data_dir`` (or the current directory when ``data_dir`` is not
    set). If that file is missing, or ``overwrite_cache`` is true, the examples
    are loaded (from ``tensorflow_datasets`` or from the given SQuAD files),
    converted with ``squad_convert_examples_to_features`` and written back to
    the cache file.

    Args:
        tokenizer: Tokenizer handed to ``squad_convert_examples_to_features``.
        evaluate: Load the dev split (``True``) or the train split (``False``).
        output_examples: Also return the examples and features.
        data_dir: Directory holding the SQuAD files and the cache file.
        model_name_or_path: Only its last path component is used, as part of
            the cache file name.
        max_seq_length: Maximum sequence length passed to the converter.
        doc_stride: Document stride passed to the converter.
        max_query_length: Maximum question length passed to the converter.
        train_file: Training file name, forwarded to the processor.
        predict_file: Evaluation file name, forwarded to the processor.
        version_2_with_negative: Use ``SquadV2Processor`` when reading files.
        overwrite_cache: Rebuild the features even if a cache file exists.
        threads: Worker count passed to the converter.
        local_rank: Distributed rank; ``-1`` means no distributed training.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If ``tensorflow_datasets`` is needed but not installed.
    """
    # Resolved locally so the function does not rely on a module-level
    # ``logger`` having been created by the calling script.
    log = logging.getLogger(__name__)

    if local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    input_dir = data_dir if data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, model_name_or_path.split("/"))).pop(),
            str(max_seq_length),
        ),
    )

    if os.path.exists(cached_features_file) and not overwrite_cache:
        # NOTE(security): torch.load unpickles the file; only point this at
        # cache files this code wrote itself.
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]
    else:
        needs_tfds = not data_dir and (
            (evaluate and not predict_file) or (not evaluate and not train_file)
        )
        if needs_tfds:
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if version_2_with_negative:
                log.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(data_dir, filename=predict_file)
            else:
                examples = processor.get_train_examples(data_dir, filename=train_file)

        # Collect before and after the conversion step, as the original did.
        gc.collect()
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=threads,
        )
        gc.collect()

        if local_rank in [-1, 0]:
            torch.save(
                {"features": features, "dataset": dataset, "examples": examples},
                cached_features_file,
            )

    if local_rank == 0 and not evaluate:
        # Release the other processes now that the cache has been written.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset

def Train(
    train_dataset,
    model,
    tokenizer,
    *,
    train_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    gradient_accumulation_steps=1,
    num_train_epochs=3.0,
    max_grad_norm=1.0,
    max_steps=-1,
    logging_step=500,
    save_steps=500,
    evaluate_during_training=False,
    output_dir="/Output",
    local_rank=-1,
):
    """Fine-tune ``model`` on ``train_dataset`` and write periodic checkpoints.

    Each batch is expected to provide, in order: input ids, attention mask,
    token type ids, start positions and end positions (the indices read
    below). The model is assumed to already live on the device selected here
    (CUDA when available, otherwise CPU) -- TODO confirm with callers.

    Args:
        train_dataset: Dataset handed to a ``DataLoader`` with random sampling.
        model: Model called as ``model(**inputs)``; its first output is the loss.
        tokenizer: Saved next to every checkpoint.
        train_batch_size: Batch size of the training ``DataLoader``.
        learning_rate: Initial AdamW learning rate.
        weight_decay: Weight decay for all parameters except biases and
            ``LayerNorm.weight``.
        adam_epsilon: AdamW epsilon.
        warmup_steps: Number of linear warm-up steps for the scheduler.
        gradient_accumulation_steps: Batches accumulated per optimizer step.
        num_train_epochs: Number of epochs (truncated to an integer).
        max_grad_norm: Gradient-norm clipping threshold.
        max_steps: If positive, stop after this many optimizer steps.
        logging_step: TensorBoard logging interval in optimizer steps.
        save_steps: Checkpoint interval in optimizer steps.
        evaluate_during_training: Also run ``evaluate`` at every logging step.
        output_dir: Directory that receives the ``checkpoint-<step>`` folders.
        local_rank: Distributed rank; ``-1`` means no distributed training.

    Returns:
        ``(global_step, tr_loss / global_step)``. ``global_step`` starts at 1,
        so it is one more than the number of optimizer steps performed.
    """
    is_main_process = local_rank in [-1, 0]
    # The device does not change during training, so look it up once.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)

    num_epochs = int(num_train_epochs)
    updates_per_epoch = len(train_dataloader) // gradient_accumulation_steps
    if max_steps > 0:
        t_total = max_steps
        # Make sure enough epochs are scheduled to reach max_steps.
        num_epochs = max_steps // max(1, updates_per_epoch) + 1
    else:
        t_total = updates_per_epoch * num_epochs

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

    # Stored with every checkpoint so a run can be reproduced later.
    training_args = {
        "train_batch_size": train_batch_size,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "adam_epsilon": adam_epsilon,
        "warmup_steps": warmup_steps,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "num_train_epochs": num_train_epochs,
        "max_grad_norm": max_grad_norm,
        "max_steps": max_steps,
        "logging_step": logging_step,
        "save_steps": save_steps,
        "evaluate_during_training": evaluate_during_training,
        "output_dir": output_dir,
        "local_rank": local_rank,
    }

    global_step = 1
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()

    tb_writer = SummaryWriter() if is_main_process else None
    try:
        train_iterator = trange(num_epochs, desc="Epoch", disable=not is_main_process)
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not is_main_process)
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "start_positions": batch[3],
                    "end_positions": batch[4],
                }

                loss = model(**inputs)[0]
                if gradient_accumulation_steps > 1:
                    # Keep the accumulated gradient on the scale of one batch.
                    loss = loss / gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()

                if (step + 1) % gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                    if is_main_process and logging_step > 0 and global_step % logging_step == 0:
                        # Evaluation is optional; lr and loss are always logged.
                        if local_rank == -1 and evaluate_during_training:
                            results = evaluate(model, tokenizer)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) / logging_step, global_step)
                        logging_loss = tr_loss

                    if is_main_process and save_steps > 0 and global_step % save_steps == 0:
                        # No space after the dash, so the step suffix parses cleanly.
                        checkpoint_dir = os.path.join(output_dir, "checkpoint-{}".format(global_step))
                        os.makedirs(checkpoint_dir, exist_ok=True)

                        # Take care of distributed/parallel training.
                        model_to_save = model.module if hasattr(model, "module") else model
                        model_to_save.save_pretrained(checkpoint_dir)
                        tokenizer.save_pretrained(checkpoint_dir)

                        torch.save(training_args, os.path.join(checkpoint_dir, "training_args.bin"))
                        torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(checkpoint_dir, "scheduler.pt"))

                if max_steps > 0 and global_step > max_steps:
                    epoch_iterator.close()
                    break
            if max_steps > 0 and global_step > max_steps:
                train_iterator.close()
                break
    finally:
        # Close the TensorBoard writer even if training is interrupted.
        if tb_writer is not None:
            tb_writer.close()

    return global_step, tr_loss / global_step

def evaluate(
    model,
    tokenizer,
    prefix="",
    *,
    output_dir="/Output",
    eval_batch_size=8,
    n_best_size=20,
    max_answer_length=30,
    do_lower_case=False,
    verbose_logging=True,
    version_2_with_negative=False,
    null_score_diff_threshold=0.0,
    local_rank=-1,
):
    """Run ``model`` over the SQuAD dev set and return the evaluation metrics.

    The dev features come from ``load_and_cache_examples``. One
    ``SquadResult`` is built per feature from that feature's own row of the
    batched model output, and the predictions are written under
    ``output_dir``.

    Args:
        model: Question-answering model called with input ids, attention mask
            and token type ids.
        tokenizer: Tokenizer used for feature conversion and for mapping
            predictions back to text.
        prefix: Suffix for the prediction file names (typically a step number).
        output_dir: Directory receiving the prediction files; created if missing.
        eval_batch_size: Batch size of the evaluation ``DataLoader``.
        n_best_size: Forwarded to ``compute_predictions_logits``.
        max_answer_length: Forwarded to ``compute_predictions_logits``.
        do_lower_case: Forwarded to ``compute_predictions_logits``. Defaults to
            ``False`` because this file works with the cased BERT checkpoint.
        verbose_logging: Forwarded to ``compute_predictions_logits``.
        version_2_with_negative: Forwarded to ``compute_predictions_logits``;
            also enables the null-odds output file.
        null_score_diff_threshold: Forwarded to ``compute_predictions_logits``.
        local_rank: Distributed rank; ``-1`` means no distributed training.

    Returns:
        The value returned by ``squad_evaluate(examples, predictions)``.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dataset, examples, features = load_and_cache_examples(
        tokenizer, evaluate=True, output_examples=True
    )

    if local_rank in [-1, 0]:
        os.makedirs(output_dir, exist_ok=True)

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            # batch[3] is read as the per-row index into ``features``.
            feature_indices = batch[3]
            outputs = model(**inputs)

        # Dict-style model outputs iterate over their keys, so convert them to
        # a tuple of tensors first.
        if hasattr(outputs, "to_tuple"):
            outputs = outputs.to_tuple()

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Take row ``i`` of every output tensor: each result must contain
            # only the logits of its own feature, not of the whole batch.
            output = [to_list(tensor[i]) for tensor in outputs]

            if len(output) >= 5:
                # Models that also return top-k indices and a CLS logit.
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    eval_time = timeit.default_timer() - start_time
    logging.getLogger(__name__).info(
        "  Evaluation done in total %f secs (%f sec per example)",
        eval_time,
        eval_time / max(1, len(dataset)),
    )

    # Compute predictions
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
    if version_2_with_negative:
        output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        verbose_logging,
        version_2_with_negative,
        null_score_diff_threshold,
        tokenizer,
    )

    return squad_evaluate(examples, predictions)

def Save_Model(
    model,
    tokenizer,
    *,
    output_dir="/Output",
    do_lower_case=False,
    local_rank=-1,
    training_args=None,
):
    """Save the fine-tuned model and tokenizer, then reload them from disk.

    Only the main process (``local_rank == -1`` or distributed rank 0) writes
    anything; other processes get their arguments back unchanged.

    Args:
        model: Trained model; a wrapper exposing ``.module`` is unwrapped
            before saving.
        tokenizer: Tokenizer saved next to the model.
        output_dir: Target directory; created if missing.
        do_lower_case: Passed to ``BertTokenizer.from_pretrained`` on reload.
            Defaults to ``False`` because this file works with the cased BERT
            checkpoint.
        local_rank: Distributed rank; ``-1`` means no distributed training.
        training_args: Optional mapping of extra settings to store in
            ``training_args.bin``.

    Returns:
        ``(model, tokenizer)`` as reloaded from ``output_dir``, with the model
        moved to CUDA when available, otherwise CPU.
    """
    if local_rank != -1 and torch.distributed.get_rank() != 0:
        return model, tokenizer

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Create output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`.
    # Take care of distributed/parallel training.
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Good practice: save your training arguments together with the trained model.
    # torch.save needs the object as well as the destination path.
    saved_args = {"output_dir": output_dir, "do_lower_case": do_lower_case}
    if training_args:
        saved_args.update(training_args)
    torch.save(saved_args, os.path.join(output_dir, "training_args.bin"))

    # Load the fine-tuned model and vocabulary back and hand them to the caller.
    reloaded_model = BertForQuestionAnswering.from_pretrained(output_dir)
    reloaded_tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=do_lower_case)
    reloaded_model.to(device)
    return reloaded_model, reloaded_tokenizer

def _checkpoint_step(checkpoint):
    """Return the step suffix of a ``checkpoint-<step>`` directory, else ``""``."""
    name = os.path.basename(os.path.normpath(checkpoint))
    head, dash, step = name.rpartition("-")
    if dash and head == "checkpoint":
        # strip() tolerates directory names written as "checkpoint- <step>".
        return step.strip()
    return ""


def Evaluate_loop(
    model=None,
    tokenizer=None,
    *,
    output_dir="/Output",
    model_name_or_path="bert-base-cased",
    do_eval=True,
    do_train=True,
    eval_all_checkpoints=True,
    local_rank=-1,
):
    """Evaluate every saved checkpoint and merge the metrics into one dict.

    Args:
        model: Unused. Accepted so call sites that pass the in-memory model
            keep working; every checkpoint is reloaded from disk.
        tokenizer: Tokenizer used for evaluation. When ``None``, the tokenizer
            stored with each checkpoint is loaded instead.
        output_dir: Directory searched (recursively) for saved weights.
        model_name_or_path: Checkpoint evaluated when ``do_train`` is false.
        do_eval: Skip evaluation entirely when false.
        do_train: When true, evaluate what training wrote to ``output_dir``.
        eval_all_checkpoints: Evaluate every directory below ``output_dir``
            that contains a weights file, not just ``output_dir`` itself.
        local_rank: Distributed rank; only ``-1`` and ``0`` evaluate.

    Returns:
        Dict of metrics. When more than one checkpoint is evaluated, the keys
        of ``checkpoint-<step>`` directories are suffixed with ``_<step>``.
    """
    results = {}
    if not do_eval or local_rank not in [-1, 0]:
        return results

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    if do_train:
        checkpoints = [output_dir]
        if eval_all_checkpoints:
            weight_files = sorted(glob.glob(output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            checkpoints = [os.path.dirname(path) for path in weight_files]
            # Reduce model loading logs
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)
    else:
        logging.getLogger(__name__).info("Loading checkpoint %s for evaluation", model_name_or_path)
        checkpoints = [model_name_or_path]

    for checkpoint in checkpoints:
        # The step is taken from the directory name only, so the root output
        # directory gets an empty prefix instead of its whole path.
        global_step = _checkpoint_step(checkpoint) if len(checkpoints) > 1 else ""

        # Reload the model
        checkpoint_model = BertForQuestionAnswering.from_pretrained(checkpoint)
        checkpoint_model.to(device)
        checkpoint_tokenizer = (
            tokenizer if tokenizer is not None else BertTokenizer.from_pretrained(checkpoint)
        )

        # Evaluate
        result = evaluate(checkpoint_model, checkpoint_tokenizer, prefix=global_step)

        suffix = "_{}".format(global_step) if global_step else ""
        results.update({key + suffix: value for key, value in result.items()})
    return results

 



In [None]:
if __name__ == '__main__':
    # Script entry point: fine-tune bert-base-cased on SQuAD, save the result
    # and evaluate every checkpoint that was written.

    # Without a configured handler, the logger.info call below prints nothing.
    logging.basicConfig(level=logging.INFO)
    # `logger` and `tokenizer` are kept as module-level names because functions
    # in this file may look them up as globals.
    logger = logging.getLogger(__name__)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    config = BertConfig.from_pretrained("bert-base-cased")
    model = BertForQuestionAnswering.from_pretrained("bert-base-cased", config=config)
    model.to(device)

    train_dataset = load_and_cache_examples(tokenizer, evaluate=False, output_examples=False)

    global_step, tr_loss = Train(train_dataset, model, tokenizer)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    Save_Model(model, tokenizer)

    # Called without arguments: the checkpoints are reloaded from disk, so the
    # in-memory model is not needed here.
    Results = Evaluate_loop()

    print(Results)