<a href="https://colab.research.google.com/github/GiacomoDamicantonio/SQUAD/blob/main/SQUAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# `os` has to be imported before the install command below can use it; with the
# import placed after the call, a fresh kernel fails with NameError.
import os

# Install the pinned transformers release before it is imported further down.
os.system("pip install -q \"transformers==4.3\"")

import warnings
# Silence library warnings before the heavy imports run.
warnings.filterwarnings('ignore')
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 7.1MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 5.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 57.4MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.or

In [2]:
class SquadExample:
    """One SQuAD question/answer pair plus the model features derived from it.

    ``preprocess`` relies on two module-level names that are defined in a later
    cell: ``tokenizer`` (called with ``return_offsets_mapping=True``) and
    ``max_len`` (the fixed length every feature list is padded to).

    After a successful ``preprocess`` the instance carries ``input_ids``,
    ``token_type_ids``, ``attention_mask``, ``start_token_idx``,
    ``end_token_idx`` and ``context_token_to_char``. When the example cannot be
    used, ``skip`` is set to True and those attributes are not created.
    """

    def __init__(self, question, context, start_char_idx, answer_text, all_answers, qid):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.qid = qid
        # Set to True by preprocess() when the example cannot be turned into features.
        self.skip = False

    def preprocess(self):
        """Tokenize the example and locate the answer span in token space.

        Sets ``self.skip = True`` and returns early when:
          * the answer span would run past the end of the context,
          * no context token overlaps the answer characters, or
          * context + question need more than ``max_len`` tokens.
        """
        context = self.context
        question = self.question
        answer_text = self.answer_text
        start_char_idx = self.start_char_idx

        # Clean context, answer and question (collapse every whitespace run to one space).
        # NOTE(review): start_char_idx is used as given against the normalized
        # context; if normalization removed characters before the answer the
        # offset would be shifted - confirm against the dataset.
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        # Find end character index of answer in context. The index is exclusive,
        # so an answer that ends on the last context character has
        # end_char_idx == len(context) and is still valid; only a span that
        # runs past the context is rejected.
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx > len(context):
            self.skip = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context
        tokenized_context = tokenizer(context, return_offsets_mapping=True)

        # Find tokens that were created from answer characters: a token counts
        # as soon as one of the characters it covers is marked.
        offsets = tokenized_context.offset_mapping
        ans_token_idx = []
        for idx, (start, end) in enumerate(offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end token index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer(question, return_offsets_mapping=True)

        # Create inputs: context tokens followed by the question tokens minus
        # the question's first token (presumably its leading special token).
        # Context positions get token type 0, question positions token type 1.
        input_ids = tokenized_context.input_ids + tokenized_question.input_ids[1:]
        token_type_ids = [0] * len(tokenized_context.input_ids) + [1] * len(
            tokenized_question.input_ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad with zeros up to max_len and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = offsets

def create_squad_examples(raw_data, errors):
    """Build a preprocessed ``SquadExample`` for every question not listed in ``errors``.

    Args:
        raw_data: Parsed SQuAD-style JSON. Read as ``raw_data["data"]`` ->
            ``"paragraphs"`` -> ``"context"`` / ``"qas"``, where each qa has
            ``"id"``, ``"question"`` and a non-empty ``"answers"`` list whose
            entries have ``"text"`` and ``"answer_start"``.
        errors: Iterable of question ids to leave out. Every excluded qa dict
            is printed.

    Returns:
        List of ``SquadExample`` objects on which ``preprocess()`` has already
        been called. Examples whose ``skip`` flag ended up True are still
        included in the list.
    """
    # Membership is checked once per question, so hash the ids a single time
    # instead of scanning the incoming list on every lookup.
    error_ids = frozenset(errors)

    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                if qa["id"] in error_ids:
                    print(qa)
                    continue
                question = qa["question"]
                answers = qa["answers"]
                # The first listed answer drives the training target; all
                # answer texts are kept alongside it.
                squad_eg = SquadExample(
                    question,
                    context,
                    answers[0]["answer_start"],
                    answers[0]["text"],
                    [answer["text"] for answer in answers],
                    qa["id"],
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples

def create_eval_examples(raw_data, errors, num_articles=5):
    """Build preprocessed ``SquadExample`` objects from the leading articles of ``raw_data``.

    Args:
        raw_data: Parsed SQuAD-style JSON. Read as ``raw_data["data"]`` ->
            ``"paragraphs"`` -> ``"context"`` / ``"qas"``, where each qa has
            ``"id"``, ``"question"`` and a non-empty ``"answers"`` list whose
            entries have ``"text"`` and ``"answer_start"``.
        errors: Iterable of question ids to leave out (silently).
        num_articles: How many entries of ``raw_data["data"]`` to use, counted
            from the start. Defaults to 5, the value that used to be
            hard-coded; ``None`` uses every article.

    Returns:
        List of ``SquadExample`` objects on which ``preprocess()`` has already
        been called. Examples whose ``skip`` flag ended up True are still
        included in the list.
    """
    # Hash the ids once; the lookup below runs for every question.
    error_ids = frozenset(errors)

    squad_examples = []
    for item in raw_data["data"][:num_articles]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                if qa["id"] in error_ids:
                    continue
                question = qa["question"]
                answers = qa["answers"]
                # The first listed answer supplies the target span.
                squad_eg = SquadExample(
                    question,
                    context,
                    answers[0]["answer_start"],
                    answers[0]["text"],
                    [answer["text"] for answer in answers],
                    qa["id"],
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)

    return squad_examples

def create_inputs_targets(squad_examples):
    """Stack the features of all non-skipped examples into NumPy arrays.

    Args:
        squad_examples: Iterable of preprocessed examples. Items whose ``skip``
            flag is set are ignored.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_ids, token_type_ids, attention_mask]``
        and ``y`` is ``[start_token_idx, end_token_idx]``; every entry is an
        array with one row/value per kept example.
    """
    feature_names = (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_token_idx",
        "end_token_idx",
    )

    # Keep only the examples that preprocessing accepted.
    kept = [example for example in squad_examples if example.skip == False]

    # One array per feature, rows in the same order as the kept examples.
    arrays = {
        name: np.array([getattr(example, name) for example in kept])
        for name in feature_names
    }

    x = [arrays[name] for name in feature_names[:3]]
    y = [arrays[name] for name in feature_names[3:]]
    return x, y

In [None]:
# Fixed sequence length: SquadExample.preprocess pads every example to this
# many tokens and skips the ones that would need more.
max_len = 512

# Load the pretrained tokenizer for the chosen checkpoint (nothing is saved here).
# NOTE(review): preprocess() asks for offset mappings, which transformers only
# provides for its fast tokenizers - so this must not be switched to a slow one.
model_name = 'roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

path = '/content/drive/MyDrive/SQUAD'

# The SQuAD JSON contains non-ASCII text, so read it as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open(path+'/training_set.json', encoding='utf-8') as f:
    raw_train_data = json.load(f)

# One question id per line. Strip each line and drop empty ones so a trailing
# newline or Windows line endings cannot leave ids that never match.
with open(path+'/error IDs.txt', 'r', encoding='utf-8') as filename:
  errors = [line.strip() for line in filename if line.strip()]

train_squad_examples = create_squad_examples(raw_train_data, errors)
x_train, y_train = create_inputs_targets(train_squad_examples)
# The printed count includes examples that preprocessing flagged as skipped.
print(f"{len(train_squad_examples)} training points created.")

# NOTE(review): the validation examples are built from the first articles of
# the same training file, so they overlap with the training examples above.
eval_squad_examples = create_eval_examples(raw_train_data, errors)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")

In [None]:
def create_model():
    """Build the span-prediction model: a pretrained encoder plus two per-token heads.

    Reads the module-level ``model_name`` (checkpoint to load) and ``max_len``
    (input length).

    Returns:
        A ``keras.Model`` taking ``[input_ids, token_type_ids, attention_mask]``
        (each an int32 input of length ``max_len``) and producing
        ``[start_probs, end_probs]``, the softmax outputs of the layers named
        ``'start_pred'`` and ``'end_pred'``.
    """
    ## Pretrained encoder, loaded from the checkpoint named by `model_name`
    encoder = TFAutoModel.from_pretrained(model_name)

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # NOTE(review): SquadExample marks question tokens with token_type_ids = 1;
    # confirm the chosen encoder has an embedding for a second token type
    # (RoBERTa checkpoints typically declare only one).
    embedding = encoder(
        input_ids = input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    ).last_hidden_state

    # One bias-free scalar score per token, flattened to one score vector per example.
    start_logits = layers.Dense(1, use_bias=False, name='start_logit')(embedding)
    start_logits = layers.Flatten(name = 'flatten_start')(start_logits)

    end_logits = layers.Dense(1, use_bias=False, name = 'end_logit')(embedding)
    end_logits = layers.Flatten(name = 'flatten_end')(end_logits)

    # Softmax over the flattened scores. The layer names matter: the training
    # cell monitors 'val_start_pred_accuracy' and 'val_end_pred_accuracy'.
    start_probs = layers.Activation(keras.activations.softmax, name = 'start_pred')(start_logits)
    end_probs = layers.Activation(keras.activations.softmax, name = 'end_pred')(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    return model

def accuracy(y_true, y_pred):
    """Share of entries whose arg-max prediction equals the integer label.

    Computed as one minus (number of non-zero label/prediction differences
    divided by the length of the prediction vector).
    """
    predicted_idx = tf.argmax(y_pred, axis=-1)
    label_idx = tf.squeeze(tf.cast(y_true, tf.int64))
    # A non-zero difference marks a wrong prediction.
    num_wrong = tf.math.count_nonzero(label_idx - predicted_idx)
    num_total = tf.cast(len(predicted_idx), tf.int64)
    return 1 - num_wrong / num_total

In [None]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Directory and file the checkpoint callback writes the weights to.
saveDir = os.path.join(os.getcwd(), 'saved_models')
os.makedirs(saveDir, exist_ok=True)
chkpt = saveDir + '/' + model_name + '.hdf5'

# Create distribution strategy
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.TPUStrategy(tpu)

# Create and compile the model inside the strategy scope, so the model, the
# optimizer and the metrics are all set up under the same distribution strategy.
with strategy.scope():
    model = create_model()
    # The model already applies softmax, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    model.compile(optimizer=optimizer, loss=[loss, loss], metrics=[accuracy])

# Stop when either head's validation accuracy has not improved for 2 epochs.
# The monitored names come from the 'start_pred' / 'end_pred' output layers.
ES_start = EarlyStopping(monitor='val_start_pred_accuracy', patience=2,verbose=1, mode='auto', restore_best_weights = True)
ES_end = EarlyStopping(monitor='val_end_pred_accuracy', patience=2,verbose=1, mode='auto', restore_best_weights = True)

# save_best_only=False: the weights file is rewritten after every epoch.
cp_cb = ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, 
                        save_best_only=False, mode='auto', 
                        save_weights_only=True)

callbacks = [ES_end, ES_start, cp_cb]

model.summary()

model.fit(
    x_train,
    y_train,
    epochs=200,  # Upper bound only; the early-stopping callbacks end training sooner
    verbose=1,
    batch_size=128,
    validation_data=(x_eval,y_eval),
    callbacks = callbacks
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=487203636.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilroberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model_14"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_45 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_46 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model_3 (TFRobertaMo TFBaseModelOutputWit 82118400    input_45[0][0]                   
                                                                 input_46[0][0]                   
__________________________________________________________________________________________________
start_logit (Dense)             (None, 512, 1)       768         tf_roberta_model_3[0][0]  

<tensorflow.python.keras.callbacks.History at 0x7efaa21098d0>

In [None]:
# Download the SQuAD v1.1 dev set and store it locally as test_set.json.
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O test_set.json

def create_test_examples(raw_data, errors):
    """Build preprocessed test examples and collect the ids that had to be skipped.

    Args:
        raw_data: Parsed SQuAD-style JSON. Read as ``raw_data["data"]`` ->
            ``"paragraphs"`` -> ``"context"`` / ``"qas"``, where each qa has
            ``"id"``, ``"question"`` and a non-empty ``"answers"`` list whose
            entries have ``"text"`` and ``"answer_start"``.
        errors: Iterable of question ids to leave out entirely; they appear in
            neither returned list.

    Returns:
        ``(squad_examples, skipped)``: the ``SquadExample`` objects whose
        preprocessing succeeded, and the question ids of those whose ``skip``
        flag was set.
    """
    # Hash the ids once; the lookup below runs for every question.
    error_ids = frozenset(errors)

    squad_examples = []
    skipped = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                if qa["id"] in error_ids:
                    continue
                question = qa["question"]
                answers = qa["answers"]
                # The first listed answer supplies the target span.
                squad_eg = SquadExample(
                    question,
                    context,
                    answers[0]["answer_start"],
                    answers[0]["text"],
                    [answer["text"] for answer in answers],
                    qa["id"],
                )
                squad_eg.preprocess()
                if squad_eg.skip:
                    # Remember the id so a placeholder answer can be emitted later.
                    skipped.append(squad_eg.qid)
                else:
                    squad_examples.append(squad_eg)
    return squad_examples, skipped

def create_inputs_targets(squad_examples):
    """Convert preprocessed examples into the array lists the model consumes.

    This repeats the function of the same name defined earlier in the notebook
    and replaces it once this cell has run; the behaviour is the same.

    Args:
        squad_examples: Iterable of preprocessed examples; those with the
            ``skip`` flag set contribute nothing.

    Returns:
        ``(x, y)`` with ``x = [input_ids, token_type_ids, attention_mask]`` and
        ``y = [start_token_idx, end_token_idx]``, each entry a NumPy array.
    """
    columns = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }

    for example in squad_examples:
        if example.skip == False:
            # Append this example's value to every feature column.
            for name, values in columns.items():
                values.append(getattr(example, name))

    (
        input_ids,
        token_type_ids,
        attention_mask,
        start_token_idx,
        end_token_idx,
    ) = (np.array(values) for values in columns.values())

    return [input_ids, token_type_ids, attention_mask], [start_token_idx, end_token_idx]

# The SQuAD JSON contains non-ASCII text, so read it as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open('test_set.json', encoding='utf-8') as f:
    raw_test_data = json.load(f)

# `skipped` holds the ids of questions that preprocessing could not turn into
# features; they are not part of test_squad_examples.
test_squad_examples, skipped = create_test_examples(raw_test_data, errors)
x_test, y_test = create_inputs_targets(test_squad_examples)
print(f"{len(test_squad_examples)} evaluation points created.")

--2021-05-25 15:00:59--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4854279 (4.6M) [application/json]
Saving to: ‘test_set.json’


2021-05-25 15:00:59 (25.8 MB/s) - ‘test_set.json’ saved [4854279/4854279]

10462 evaluation points created.


In [None]:
# Run the trained model on the test inputs. The model has two outputs
# (start and end probabilities), which the next cell reads as
# predictions[0] / predictions[1].
predictions = model.predict(x_test, verbose=1)



In [None]:
num_samples = len(predictions[0])

# predictions holds [start_probs, end_probs]. Take the arg-max of each output
# separately so the per-sample axis survives even with a single test example.
start = np.argmax(predictions[0], axis=-1)
end = np.argmax(predictions[1], axis=-1)

# Questions that preprocessing skipped have no model prediction; they get the
# fixed placeholder answer "42".
answers = {qid: "42" for qid in skipped}

for ans_idx in range(num_samples):
    example = test_squad_examples[ans_idx]
    if example.skip:
        answers[example.qid] = "42"
    elif end[ans_idx] == 0:
        # An end index of 0 is written out as an empty answer.
        answers[example.qid] = ""
    else:
        # Decode the predicted token span (end index inclusive) back to text.
        span_ids = example.input_ids[start[ans_idx] : end[ans_idx] + 1]
        predicted_ans = tokenizer.decode(span_ids).replace("\n", " ")
        # Double quotes are still removed so the answer text matches what the
        # hand-written serialisation used to emit.
        answers[example.qid] = predicted_ans.replace('"', "")

# Serialise with json.dump so escaping and separators are always valid JSON
# (no trailing comma, backslashes and control characters escaped).
with open("dev_predictions.txt", "w") as out:
    json.dump(answers, out)

# Score the predictions file against the test set with evaluate.py and capture
# the script's output lines.
# NOTE(review): evaluate.py is not created or downloaded anywhere in the
# visible notebook - it has to be present in the working directory already.
evaluation = !python3 evaluate.py test_set.json dev_predictions.txt
print(evaluation)

['{', '  "exact": 78.18353831598864,', '  "f1": 84.96487470436928,', '  "total": 10570,', '  "HasAns_exact": 78.18353831598864,', '  "HasAns_f1": 84.96487470436928,', '  "HasAns_total": 10570', '}']
