In [None]:
# Environment setup: select TensorFlow 2.x via the notebook magic, install the
# `tokenizers` package, then fail fast unless a GPU is attached to the runtime.
%tensorflow_version 2.x
!pip install tokenizers
import tensorflow as tf
# The cell only proceeds when TensorFlow reports exactly '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:

# Mount Google Drive at /content/gdrive. Later cells read model weights and
# evaluation sets from, and write predictions to, paths under this mount point.

from google.colab import drive
drive.mount('/content/gdrive')


In [None]:
import json
import os
import re
import string
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer

In [None]:
# ============================================= PREPARING DATASET ======================================================
class Sample:
    """One question/context pair together with the model inputs derived from it.

    Attributes set by ``__init__``:
        question, context, id: stored exactly as passed in.
        start_char_idx: character offset of the answer in the context, or None.
        answer_text: answer string, or None when no answer is supplied.
        all_answers: list of acceptable answer strings, or None.
        skip: set to True by ``preprocess`` when the answer cannot be aligned.
        start_token_idx, end_token_idx: answer token span, -1 until computed.
        over_length: set to True by ``preprocess`` when inputs were truncated.

    Attributes set by ``preprocess`` (only when the sample is not skipped):
        input_word_ids, input_type_ids, input_mask: lists of length
            ``max_seq_length``.
        context_token_to_char: ``(start, end)`` character offsets of every
            context token.
    """

    def __init__(self, question, context, id, start_char_idx=None, answer_text=None, all_answers=None):
        self.question = question
        self.context = context
        self.id = id
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False
        self.start_token_idx = -1
        self.end_token_idx = -1
        self.over_length = False

    def preprocess(self):
        """Tokenize the sample and build fixed-length model inputs.

        Reads the module-level ``tokenizer`` and ``max_seq_length``. When an
        answer is attached, the answer's character span is mapped onto context
        tokens; the sample is flagged ``skip`` (and no inputs are stored) if
        the span runs past the end of the context or touches no token.
        """
        # Collapse every run of whitespace into a single space before tokenizing.
        # NOTE(review): the token offsets therefore refer to this normalised
        # string, while self.context keeps the original text - confirm callers
        # that slice self.context with these offsets only see contexts without
        # repeated whitespace.
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        tokenized_context = tokenizer.encode(context)
        tokenized_question = tokenizer.encode(question)

        if self.answer_text is not None:
            answer = " ".join(str(self.answer_text).split())
            end_char_idx = self.start_char_idx + len(answer)
            # end_char_idx is exclusive, so an answer finishing on the last
            # character of the context has end_char_idx == len(context) and is
            # still valid. Only a span that runs past the end is rejected.
            if end_char_idx > len(context):
                self.skip = True
                return
            # Mark every character that belongs to the answer.
            is_char_in_ans = [0] * len(context)
            for idx in range(self.start_char_idx, end_char_idx):
                is_char_in_ans[idx] = 1
            # A context token is part of the answer when its character span
            # overlaps at least one marked character.
            ans_token_idx = []
            for idx, (start, end) in enumerate(tokenized_context.offsets):
                if sum(is_char_in_ans[start:end]) > 0:
                    ans_token_idx.append(idx)
            if len(ans_token_idx) == 0:
                self.skip = True
                return
            self.start_token_idx = ans_token_idx[0]
            self.end_token_idx = ans_token_idx[-1]

        # Context ids followed by the question ids without their first entry
        # (presumably the leading special token - verify against the tokenizer).
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # Pad with zeros, or truncate, to exactly max_seq_length entries.
        padding_length = max_seq_length - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:
            # Remember that this sample did not fit so callers can filter it out.
            self.over_length = True
            input_ids = input_ids[:max_seq_length]
            attention_mask = attention_mask[:max_seq_length]
            token_type_ids = token_type_ids[:max_seq_length]

        self.input_word_ids = input_ids
        self.input_type_ids = token_type_ids
        self.input_mask = attention_mask
        self.context_token_to_char = tokenized_context.offsets


def create_coqa_examples(raw_data, ExcludeAnswers = False):
    """Turn a CoQA-style dict into a list of preprocessed ``Sample`` objects.

    Every entry of ``raw_data["data"]`` contributes one sample per question
    turn. The sample id is ``"<story id>_<turn id>"``. Unless ``ExcludeAnswers``
    is truthy, the matching answer's ``span_start`` / ``span_text`` are attached
    and ``[span_text, input_text]`` is stored as the accepted answers.
    ``preprocess()`` is called on every sample before it is returned.
    """
    samples = []
    for story in raw_data["data"]:
        passage = story["story"]
        story_id = story["id"]
        questions = story["questions"]
        # Questions and answers are paired by position, so they must line up.
        assert len(questions) == len(story["answers"])
        for turn, question in enumerate(questions):
            if ExcludeAnswers:
                sample_id = str(story_id) + "_" + str(question["turn_id"])
                sample = Sample(question["input_text"], passage, sample_id)
            else:
                answer = story["answers"][turn]
                sample_id = str(story_id) + "_" + str(question["turn_id"])
                assert answer["turn_id"] == question["turn_id"]
                accepted = [answer["span_text"], answer["input_text"]]
                sample = Sample(question["input_text"], passage, sample_id, answer["span_start"], answer["span_text"], accepted)
            sample.preprocess()
            samples.append(sample)
    return samples


def create_squad_examples(raw_data, ExcludeAnswers = False):
    """Turn a SQuAD-style dict into a list of preprocessed ``Sample`` objects.

    One sample is produced per entry of every paragraph's ``qas`` list. When
    the entry has an ``"answers"`` key and ``ExcludeAnswers`` is falsy, the
    first answer supplies the answer text and start offset and all answer
    texts are kept as accepted answers; entries flagged ``is_impossible`` get
    an empty answer at offset 0 instead. Otherwise the sample carries no
    answer. ``preprocess()`` is called on every sample before it is returned.
    """
    examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]
            for qa in paragraph["qas"]:
                question_text = qa["question"]
                qa_id = qa["id"]
                if ExcludeAnswers or "answers" not in qa:
                    example = Sample(question_text, passage, qa_id)
                else:
                    if "is_impossible" in qa and qa["is_impossible"] == True:
                        # Unanswerable question: empty answer anchored at offset 0.
                        gold_text, accepted, gold_start = "", [""], 0
                    else:
                        gold_text = qa["answers"][0]["text"]
                        accepted = [entry["text"] for entry in qa["answers"]]
                        gold_start = qa["answers"][0]["answer_start"]
                    example = Sample(question_text, passage, qa_id, gold_start, gold_text, accepted)
                example.preprocess()
                examples.append(example)
    return examples

def create_inputs_targets(squad_examples):
    """Stack the per-sample fields of all non-skipped samples into arrays.

    Returns ``(x, y)`` where ``x`` is ``[input_word_ids, input_mask,
    input_type_ids]`` and ``y`` is ``[start_token_idx, end_token_idx]``, each a
    NumPy array with one row/entry per sample whose ``skip`` flag is False.
    """
    field_names = (
        "input_word_ids",
        "input_type_ids",
        "input_mask",
        "start_token_idx",
        "end_token_idx",
    )
    # Skipped samples never had their inputs built, so they are left out.
    usable = [example for example in squad_examples if example.skip == False]
    columns = {
        name: np.array([getattr(example, name) for example in usable])
        for name in field_names
    }
    x = [columns["input_word_ids"], columns["input_mask"], columns["input_type_ids"]]
    y = [columns["start_token_idx"], columns["end_token_idx"]]
    return x, y

In [None]:
# =================================================== TRAINING =========================================================


class ValidationCallback(keras.callbacks.Callback):
    """Keras callback that prints an exact-match score at the end of each epoch.

    The model is run on ``x_eval``; for every prediction the arg-max start/end
    tokens are mapped back to a character span of the sample's context, and the
    normalised span is compared with the sample's normalised accepted answers.

    NOTE(review): ``on_epoch_end`` reads a module-level ``eval_squad_examples``
    list, which must be defined before training starts - confirm where it is
    created, since it is not passed in through the constructor.
    """

    # Built once instead of per character / per call.
    _PUNCTUATION = frozenset(string.punctuation)
    _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)

    def __init__(self, x_eval, y_eval):
        """Store the evaluation inputs and targets used by ``on_epoch_end``."""
        # Let the Keras base class initialise its own state first.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval

    def normalize_text(self, text):
        """Lower-case ``text``, drop ASCII punctuation and articles, squeeze spaces."""
        text = text.lower()
        text = "".join(ch for ch in text if ch not in self._PUNCTUATION)
        text = self._ARTICLES.sub(" ", text)
        return " ".join(text.split())

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation set and print the exact-match score."""
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        # Only non-skipped samples were turned into model inputs, so only they
        # line up with the prediction rows.
        eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            # A start outside the context tokens cannot be mapped to text.
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                # End falls outside the context tokens: take the rest of the context.
                pred_ans = squad_eg.context[pred_char_start:]
            normalized_pred_ans = self.normalize_text(pred_ans)
            normalized_true_ans = [self.normalize_text(_) for _ in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        total = len(self.y_eval[0])
        # An empty evaluation set reports 0.00 instead of raising ZeroDivisionError.
        acc = count / total if total else 0.0
        print(f"\nepoch={epoch + 1}, exact match score={acc:.2f}")

In [None]:

# Notebook configuration: the fixed model input length and the choice of
# fine-tuned weights / evaluation set used by the cells below.
max_seq_length = 512

Possiblilities = ["Squad1.1", "Squad2.0", "CoQA"]

UsedModel = "Squad2.0"
UsedEvalSet = "Squad2.0"

# Both selections must be one of the supported names.
if not (UsedModel in Possiblilities and UsedEvalSet in Possiblilities):
    raise NameError("Selected model or set not valid")


In [None]:
# Three int32 inputs of length max_seq_length, passed to the hub layer in the
# order [word ids, mask, type ids].
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True, name="bert_en_uncased_L-12_H-768_A-12")
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])

# Build the tokenizer from the vocabulary and casing setting shipped with the
# hub module, so it always matches the loaded BERT variant.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=bool(do_lower_case))

# Two bias-free heads on the sequence output, flattened and soft-maxed, give
# the start and end probabilities over input positions.
start_logits = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
start_logits = layers.Flatten()(start_logits)
end_logits = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
end_logits = layers.Flatten()(end_logits)
start_probs = layers.Activation(keras.activations.softmax)(start_logits)
end_probs = layers.Activation(keras.activations.softmax)(end_logits)
model = keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=[start_probs, end_probs])
# The outputs are already probabilities, hence from_logits=False.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
model.compile(optimizer=optimizer, loss=[loss, loss])
model.summary()

# Restore the fine-tuned weights selected by UsedModel.
print("Loading: ", UsedModel)
if (UsedModel == "Squad1.1"):
  model.load_weights("/content/gdrive/My Drive/TrainingResults/SQUAD1.1.h5")
elif (UsedModel == "Squad2.0"):
  model.load_weights("/content/gdrive/My Drive/TrainingResults/SQUAD2.0.h5")
elif (UsedModel == "CoQA"):
  model.load_weights("/content/gdrive/My Drive/TrainingResults/CoQA.h5")


In [None]:
def samples_to_predictions(samples):
  """Map every sample id to its predicted answer text.

  The prediction is the empty string when the sample was skipped, when its
  ``start_char_idx`` is -1 (question predicted as unanswerable), or when it has
  no ``answer_text``; otherwise it is ``answer_text``.

  Returns:
    dict of ``{sample.id: answer string}``.
  """
  results = {}
  for sample in samples:
    # A single chained condition: a skipped sample must stay empty and not be
    # overwritten by whatever answer_text it happens to carry.
    if sample.skip or sample.start_char_idx == -1 or sample.answer_text is None:
      results[sample.id] = ""
    else:
      results[sample.id] = sample.answer_text
  return results

In [None]:
# ==================================================== SANITY CHECK =========================================================
# Hand-written SQuAD-style payload: a single "Project Apollo" paragraph with
# nine questions (Q1-Q9) and no "answers" entries, so create_squad_examples
# builds answer-less samples from it for a quick qualitative check of the model.
data = {"data":
    [
        {"title": "Project Apollo",
         "paragraphs": [
             {
                 "context": "The Apollo program, also known as Project Apollo, was the third United States human "
                            "spaceflight program carried out by the National Aeronautics and Space Administration ("
                            "NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First "
                            "conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to "
                            "follow the one-man Project Mercury which put the first Americans in space, Apollo was "
                            "later dedicated to President John F. Kennedy's national goal of landing a man on the "
                            "Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in "
                            "a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project "
                            "Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, "
                            "and was supported by the two man Gemini program which ran concurrently with it from 1962 "
                            "to 1966. Gemini missions developed some of the space travel techniques that were "
                            "necessary for the success of the Apollo missions. Apollo used Saturn family rockets as "
                            "launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications "
                            "Program, which consisted of Skylab, a space station that supported three manned missions "
                            "in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the "
                            "Soviet Union in 1975.",
                 "qas": [
                     {"question": "What project put the first Americans into space?",
                      "id": "Q1"
                      },
                     {"question": "What program was created to carry out these projects and missions?",
                      "id": "Q2"
                      },
                     {"question": "What year did the first manned Apollo flight occur?",
                      "id": "Q3"
                      },
                     {"question": "What President is credited with the original notion of putting Americans in space?",
                      "id": "Q4"
                      },
                     {"question": "Who did the U.S. collaborate with on an Earth orbit mission in 1975?",
                      "id": "Q5"
                      },
                     {"question": "How long did Project Apollo run?",
                      "id": "Q6"
                      },
                     {"question": "What program helped develop space travel techniques that Project Apollo used?",
                      "id": "Q7"
                      },
                     {"question": "What space station supported three manned missions in 1973-1974?",
                      "id": "Q8"
                      },
                      {"question": "Is this question unanswerable?",
                      "id": "Q9"
                      }
                 ]}]}]}

# Run the model on the hand-written questions and print each predicted answer.
test_samples = create_squad_examples(data)
x_test, _ = create_inputs_targets(test_samples)
pred_start, pred_end = model.predict(x_test)
# create_inputs_targets leaves out samples flagged `skip`, so the prediction
# rows line up with the non-skipped samples only.
scored_samples = [sample for sample in test_samples if sample.skip == False]
for test_sample, start, end in zip(scored_samples, pred_start, pred_end):
  offsets = test_sample.context_token_to_char
  start = np.argmax(start)
  end = np.argmax(end)
  pred_ans = None
  if start >= len(offsets):
    # Start lies outside the context tokens: treated as "no answer".
    pred_ans = "Unaswerable"
    pred_char_start = -1
  else:
    pred_char_start = offsets[start][0]
    if end < len(offsets):
      pred_ans = test_sample.context[pred_char_start:offsets[end][1]]
    else:
      # End lies outside the context tokens: fall back to a span ending two
      # tokens after the start, clamped so it cannot index past `offsets`.
      fallback_end = min(start + 2, len(offsets) - 1)
      pred_ans = test_sample.context[pred_char_start:offsets[fallback_end][1]]
  print("Q: " + test_sample.question)
  print("A: " + pred_ans)

In [None]:
# ==================================================== PREDICTIONS =========================================================








print("Loading eval set: ", UsedEvalSet)
if (UsedEvalSet == "CoQA"):
  target_eval_set_path = "/content/gdrive/My Drive/TrainingResults/TrimmedDevSets/CoQA-dev_Trimmed.json"
elif (UsedEvalSet == "Squad1.1"):
  target_eval_set_path = "/content/gdrive/My Drive/TrainingResults/TrimmedDevSets/Squad1.1-dev_Trimmed.json"
elif (UsedEvalSet == "Squad2.0"):
  target_eval_set_path = "/content/gdrive/My Drive/TrainingResults/TrimmedDevSets/Squad2.0-dev_Trimmed.json"
with open(target_eval_set_path) as f:
  EvalData = json.load(f)
# The eval file is parsed as SQuAD-style JSON for every eval set, with answers
# excluded so that nothing but the model output ends up in the predictions.
AllSamples = create_squad_examples(EvalData, True)


test_samples = AllSamples
print(test_samples[0].context)
print(test_samples[0].question)

x_test, _ = create_inputs_targets(test_samples)
pred_start, pred_end = model.predict(x_test)
# create_inputs_targets leaves out samples flagged `skip`, so the prediction
# rows line up with the non-skipped samples only.
scored_samples = [sample for sample in test_samples if sample.skip == False]
for test_sample, start, end in zip(scored_samples, pred_start, pred_end):
  offsets = test_sample.context_token_to_char
  start = np.argmax(start)
  end = np.argmax(end)
  pred_ans = None
  if start >= len(offsets):
    # Start lies outside the context tokens: recorded as unanswerable (-1).
    pred_ans = "Unaswerable"
    pred_char_start = -1
  else:
    pred_char_start = offsets[start][0]
    if end < len(offsets):
      pred_ans = test_sample.context[pred_char_start:offsets[end][1]]
    else:
      # End lies outside the context tokens: fall back to a span ending two
      # tokens after the start, clamped so it cannot index past `offsets`.
      fallback_end = min(start + 2, len(offsets) - 1)
      pred_ans = test_sample.context[pred_char_start:offsets[fallback_end][1]]
  # Store the prediction on the sample itself for the export cells.
  test_sample.answer_text = pred_ans
  test_sample.start_char_idx = pred_char_start



In [None]:
# Write the {sample id: predicted answer} mapping to Drive; the file name
# encodes which weights and which evaluation set produced it.
predictions_path = ('/content/gdrive/My Drive/TrainingResults/PredictionsV4/'
                    + UsedModel + '---' + UsedEvalSet + '-eval-set_Trimmed.json')
with open(predictions_path, 'w') as fp:
    predictions = samples_to_predictions(test_samples)
    json.dump(predictions, fp)

In [None]:
def samples_to_squad_json(samples):
  """Serialise samples into a SQuAD-style ``{"data": [...]}`` dictionary.

  Samples sharing the same ``context`` are grouped into one paragraph, in
  first-seen order; every paragraph is wrapped in its own article with an
  empty title. A sample whose ``start_char_idx`` is -1 is written with
  ``is_impossible`` True and no answers. Any other sample gets its
  ``answer_text`` / ``start_char_idx`` as first answer, followed by every
  entry of ``all_answers`` when that list holds more than one item.

  Returns:
    dict that can be passed straight to ``json.dump``.
  """
  qas_by_context = {}
  for sample in samples:
    qa_pair = {"id": sample.id, "question": sample.question, "answers": []}
    if sample.start_char_idx == -1:
      qa_pair["is_impossible"] = True
    else:
      qa_pair["answers"].append({"text": sample.answer_text, "answer_start": sample.start_char_idx})
      # all_answers may be None (samples built without answers); treat that
      # as "no additional answers" instead of failing on len(None).
      extra_answers = sample.all_answers or []
      if len(extra_answers) > 1:
        for ans in extra_answers:
          # The locations of the additional answers are not known; 0 is a placeholder.
          qa_pair["answers"].append({"text": ans, "answer_start": 0})
      qa_pair["is_impossible"] = False
    qas_by_context.setdefault(sample.context, []).append(qa_pair)

  return {
      "data": [
          {"title": "", "paragraphs": [{"context": context, "qas": qas}]}
          for context, qas in qas_by_context.items()
      ]
  }



In [None]:
# Deliberate stop: this raise ends a top-to-bottom run here, so the cells below
# (which rewrite the trimmed dev sets on Drive) only run when executed manually.
raise NameError("Stopped execution")

In [None]:
def remove_seqlen_exceeding_samples(samples, max_len = 512):
  """Return, in order, the samples whose ``over_length`` flag is False.

  ``max_len`` is accepted but not consulted: the decision relies solely on the
  flag already stored on each sample.
  """
  return [sample for sample in samples if sample.over_length == False]

In [None]:
## Create dev sets where no context exceeds the seqlen

eval_path = keras.utils.get_file("evalSQUAD1.1.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json")

# Load and trim the dev set before opening the output file: opening with 'w'
# truncates it, so a failure while reading or preprocessing must not be able to
# wipe an existing trimmed set.
with open(eval_path) as f:
  raw_eval_data = json.load(f)
trimmed_dev_set = samples_to_squad_json(remove_seqlen_exceeding_samples(create_squad_examples(raw_eval_data)))

with open('/content/gdrive/My Drive/TrainingResults/TrimmedDevSets/Squad1.1-dev_Trimmed.json', 'w') as fp:
  json.dump(trimmed_dev_set, fp)



In [None]:
eval_path = keras.utils.get_file("evalSQUAD2.0.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json")

# Load and trim the dev set before opening the output file: opening with 'w'
# truncates it, so a failure while reading or preprocessing must not be able to
# wipe an existing trimmed set.
with open(eval_path) as f:
  raw_eval_data = json.load(f)
trimmed_dev_set = samples_to_squad_json(remove_seqlen_exceeding_samples(create_squad_examples(raw_eval_data)))

with open('/content/gdrive/My Drive/TrainingResults/TrimmedDevSets/Squad2.0-dev_Trimmed.json', 'w') as fp:
  json.dump(trimmed_dev_set, fp)

In [None]:
eval_path = keras.utils.get_file("evalCoQA.json", "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json")

# Load and trim the dev set before opening the output file: opening with 'w'
# truncates it, so a failure while reading or preprocessing must not be able to
# wipe an existing trimmed set. The CoQA samples are written in SQuAD layout.
with open(eval_path) as f:
  raw_eval_data = json.load(f)
trimmed_dev_set = samples_to_squad_json(remove_seqlen_exceeding_samples(create_coqa_examples(raw_eval_data)))

with open('/content/gdrive/My Drive/TrainingResults/TrimmedDevSets/CoQA-dev_Trimmed.json', 'w') as fp:
  json.dump(trimmed_dev_set, fp)