# The dataset

### The dataset is called SQUAD.

### each datapoint is a question, a context that may contain the answer to the question, the start index of the answer, the answer


In [82]:
import transformers
from datasets import load_dataset

print(transformers.__version__) # "transformers_version": "4.15.0",
print(tf.__version__) # TensorFlow version 2.7.0

4.46.0
2.17.0


In [2]:
dataset = load_dataset('squad')

In [3]:
# squad_v2 = False
# model_checkpoint = "distilbert-base-uncased"
# batch_size = 16

In [4]:
for q, a in zip(dataset["train"]["question"][:5], dataset["train"]["answers"]):
    print(f"{q} => {a} \n")

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? => {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]} 

What is in front of the Notre Dame Main Building? => {'text': ['a copper statue of Christ'], 'answer_start': [188]} 

The Basilica of the Sacred heart at Notre Dame is beside to which structure? => {'text': ['the Main Building'], 'answer_start': [279]} 

What is the Grotto at Notre Dame? => {'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]} 

What sits on top of the Main Building at Notre Dame? => {'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]} 



# this function adds the end of the answer to the dataset, and does some sanity checks


In [5]:
def compute_end_index(answers, contexts):
    """Add end index to answers"""
    # print(contexts)
    fixed_answers = []
    for answer, context in zip(answers, contexts):
        gold_text = answer["text"][0]
        # print(gold_text)
        answer["text"] = gold_text
        # print(answer["text"])

        start_idx = answer["answer_start"][0]
        answer["answer_start"] = start_idx

        # Make sure the starting index is valid and there is an answer
        assert start_idx >= 0 and len(gold_text.strip()) > 0
        end_idx = start_idx + len(gold_text)
        answer["answer_end"] = end_idx

        # Make sure the corresponding context matches the actual answer
        assert context[start_idx:end_idx] == gold_text

        fixed_answers.append(answer)
    return fixed_answers, contexts

In [6]:
for answer in dataset["train"]["answers"]:
    print(answer)
    break

{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


In [7]:
train_questions = dataset["train"]["question"]
print("Training data corrections")
train_answers, train_contexts = compute_end_index(
    dataset["train"]["answers"], dataset["train"]["context"]
)

test_questions = dataset["validation"]["question"]
print("\nValidation data correction")
test_answers, test_contexts = compute_end_index(
    dataset["validation"]["answers"], dataset["validation"]["context"]
)

Training data corrections

Validation data correction


# Implementing BERT


In [49]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained(
    "bert-base-uncased"
)  # using the pretrained tokenizer (there is no distinction between uppercase and lowercase)

tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# Testing the tokenizer


In [75]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

"""
text - A single or batch of text sequences to be encoded by the tokenizer
text_pair - An optinal single or batch of text sequences to be encoded by the tokenizer. useful for multipair(question and context)
"""

context = "This is the context"
question = "This is the question"
token_ids = tokenizer(
    text=context, text_pair=question,
    padding=False, return_tensors='tf'
)


# tokenizer.convert_ids_to_tokens(token_ids["input_ids"].numpy()[0])

In [10]:
print(tokenizer.convert_ids_to_tokens(token_ids["input_ids"].numpy()[0]))

['[CLS]', 'this', 'is', 'the', 'context', '[SEP]', 'this', 'is', 'the', 'question', '[SEP]']


# Encoding the train and test dataset


In [11]:
# Encoding train
train_encodings = tokenizer(
    train_questions, train_contexts, truncation=True, padding=True, return_tensors="tf"
)

# Encoding test
test_encodings = tokenizer(
    test_questions, test_contexts, truncation=True, padding=True, return_tensors="tf"
)

print("train_encodings.shape: {}".format(train_encodings["input_ids"].shape))

train_encodings.shape: (87599, 512)


In [77]:
# tokenizer.convert_ids_to_tokens(train_encodings[0])


In [17]:
# convert the character-based indices to token-based indecis


def replace_char_with_token_indices(encodings, answers):
    """
    This function takes a set of BatchEncodings called encodings generated by the tokenizer and a set of
    answers( a list of dictionaries). Then it updates the provided encodings with two new keys:
    start_position and end_positions. These keys respectively hold the token-based indices denoting the
    start and end of the answer. if the asnwer is not found, set the start and end indices to the last token.
    To convert the existing character-based indeces to token-based indices, call char_to_token() provided by
    the BatchEncodings class.
    """

    start_positions = []
    end_positions = []
    n_updates = 0

    # Go through all the answers
    for i in range(len(answers)):
        # Get the token position for both start and end char positions
        start_positions.append(encodings.char_to_token(i, answers[i]["answer_start"]))
        end_positions.append(encodings.char_to_token(i, answers[i]["answer_end"] - 1))

        if start_positions[-1] is None or end_positions[-1] is None:
            n_updates += 1

        # if start position is None, the answer passage has been truncated
        # https://huggingface.co/transformers/custom_datasets.html#qa-squad

        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length - 1

        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length - 1

    print("{}/{} had answers truncated".format(n_updates, len(answers)))
    encodings.update(
        {"start_positions": start_positions, "end_positions": end_positions}
    )

In [80]:
replace_char_with_token_indices(train_encodings, train_answers)
replace_char_with_token_indices(test_encodings, test_answers)

81196/87599 had answers truncated
9786/10570 had answers truncated


In [81]:
from transformers import BertConfig, TFBertForQuestionAnswering

config = BertConfig.from_pretrained("bert-base-uncased", return_dict=False)
print(config)

model = TFBertForQuestionAnswering.from_pretrained("bert-base-uncased", config=config)

RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

## Defining a Tensorflow dataset

data will
consist of two tuples: one containing inputs and the other containing the targets.

### The input tuple contains:

- Input token IDs – A batch of padded token IDs of size [batch size, sequence length]
- Attention mask – A batch of attention masks of size [batch size, sequence length]

### The output tuple contains:

- Start index of the answer – A batch of start indices of the answer
- End index of the answer – A batch of end indices of the answer


In [19]:
def data_gen(input_ids, attension_mask, start_positions, end_positions):
    """generator for data

    Since the data is already processed, it's a matter of reorganizing the already existing data to return
    """

    for inps, attn, start_pos, end_pos in zip(
        input_ids, attension_mask, start_positions, end_positions
    ):
        yield (inps, attn), (start_pos, end_pos)

In [20]:
from functools import partial
import tensorflow as tf

# Define the generator as a callable
train_data_gen = partial(
    data_gen,
    input_ids=train_encodings["input_ids"],
    attention_mask=train_encodings["attention_mask"],
    start_positions=train_encodings["start_positions"],
    end_positions=train_encodings["end_positions"],
)

# Define the dataset
train_dataset = tf.data.Dataset.from_generator(
    train_data_gen, output_types=(("int32", "int32"), ("int32", "int32"))
)

# shuffle the data
train_dataset = train_dataset.shuffle(1000)
print("\t Done")

	 Done


### split our dataset into two: a training set and a validation dataset. We will use the first

10,000 samples as the validation set. The rest of the data is used as the training set. Both datasets
will be batched using a batch size of 4


In [38]:
# Valid set is taken as the first 10000 samples in the shuffled set
valid_dataset = train_dataset.take(10000)
valid_dataset = valid_dataset.batch(4)


train_dataset = train_dataset.skip(10000)
train_dataset = train_dataset.batch(4)

In [53]:
for item in train_dataset.take(0):
    print(item.shape)


### Creating Test dataset


In [157]:
print("Creating test data")

# Generator callable
test_data_gen = partial(
    data_gen,
    input_ids=test_encodings["input_ids"],
    attention_mask=test_encodings["attention_mask"],
    start_positions=test_encodings["start_positions"],
    end_positions=test_encodings["end_positions"],
)


test_dataset = tf.data.Dataset.from_generator(
    test_data_gen, output_types=(("int32", "int32"), ("int32", "int32"))
)

test_dataset = test_dataset.batch(8)

Creating test data


In [158]:
from transformers import BertConfig, TFBertForQuestionAnswering

config = BertConfig.from_pretrained("bert-base-uncased", return_dict=False)
print(config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "return_dict": false,
  "transformers_version": "4.45.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [159]:
model = TFBertForQuestionAnswering.from_pretrained("bert-base-uncased", config=config)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [164]:
def tf_wrap_model(model):
    """Wraps the huggingface's model with in the Keras Functional API"""
    # Define inputs
    input_ids = tf.keras.layers.Input(
        [
            None,
        ],
        dtype=tf.int32,
        name="input_ids",
    )
    attention_mask = tf.keras.layers.Input(
        [
            None,
        ],
        dtype=tf.int32,
        name="attention_mask",
    )
    # Define the output (TFQuestionAnsweringModelOutput)
    out = model(input_ids)

    # print(dir(out))
    # Get the correct attributes in the produced object to generate an
    # output tuple
    wrap_model = tf.keras.models.Model(
        [input_ids, attention_mask], outputs=(out.start_logits, out.end_logits)
    )
    return wrap_model

In [165]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
acc = tf.keras.metrics.SparseCategoricalAccuracy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model_v2 = tf_wrap_model(model)
model_v2.compile(optimizer=optimizer, loss=loss, metrics=[acc])

AttributeError: 'tuple' object has no attribute 'start_logits'

In [142]:
# model = TFBertForQuestionAnswering.from_pretrained("ydshieh/bert-base-cased-squad2")


# question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

# inputs = tokenizer(question, text, return_tensors="tf")
# outputs = model(**inputs)

# answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
# answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

# predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
# tokenizer.decode(predict_answer_tokens)
# # "a nice puppet"

# print(outputs.start_logits)
# print(dir(outputs))

### Losses


AttributeError: 'tuple' object has no attribute 'start_logits'

In [92]:
model_v2.fit(train_dataset, validation_data=valid_dataset, epochs=3)