# Analysis of SQuAD 2.0 dataset

In [1]:
from question_answering.paths import extractive_qa_paths
from question_answering.utils import core_qa_utils
from transformers import AutoTokenizer

  _torch_pytree._register_pytree_node(


In [2]:
raw_train_dataset, raw_test_dataset = core_qa_utils.load_datasets_from_json(
    dataset_path=extractive_qa_paths.squad2_dataset_dir,
    filenames=["original_train.json", "original_test.json"],
)

In [3]:
train_dataset = raw_train_dataset.select(range(115000))
val_dataset = raw_train_dataset.select(range(115000, 130319))
test_dataset = raw_test_dataset

## Maximum number of tokens in any sample across dataset

In [4]:
def tokenize_sample(sample, tokenizer, max_tokens=None, padding=False):
    question = sample["question"].strip()
    context = sample["context"].strip()

    return tokenizer(question, context, max_length=max_tokens, padding=padding)

#### BERT uncased

In [5]:
bert_uncased_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [6]:
tokenized_train_dataset = train_dataset.map(
    lambda row: tokenize_sample(row, bert_uncased_tokenizer)
)
tokenized_val_dataset = val_dataset.map(
    lambda row: tokenize_sample(row, bert_uncased_tokenizer)
)
tokenized_test_dataset = test_dataset.map(
    lambda row: tokenize_sample(row, bert_uncased_tokenizer)
)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/10319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [7]:
max_bert_uncased_train_tokens = len(max(tokenized_train_dataset["input_ids"], key=len))
max_bert_uncased_val_tokens = len(max(tokenized_val_dataset["input_ids"], key=len))
max_bert_uncased_test_tokens = len(max(tokenized_test_dataset["input_ids"], key=len))
max_bert_uncased_tokens = max(
    max_bert_uncased_train_tokens,
    max_bert_uncased_val_tokens,
    max_bert_uncased_test_tokens,
)

print(
    f"Max number of tokens in tokenized train dataset: {max_bert_uncased_train_tokens}"
)
print(f"Max number of tokens in tokenized val dataset: {max_bert_uncased_val_tokens}")
print(f"Max number of tokens in tokenized test dataset: {max_bert_uncased_test_tokens}")
print(f"Max number of tokens overall: {max_bert_uncased_tokens}")

Max number of tokens in tokenized train dataset: 870
Max number of tokens in tokenized val dataset: 866
Max number of tokens in tokenized test dataset: 819
Max number of tokens overall: 870


#### BERT cased

In [8]:
bert_cased_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [9]:
tokenized_train_dataset = train_dataset.map(
    lambda row: tokenize_sample(row, bert_cased_tokenizer)
)
tokenized_val_dataset = val_dataset.map(
    lambda row: tokenize_sample(row, bert_cased_tokenizer)
)
tokenized_test_dataset = test_dataset.map(
    lambda row: tokenize_sample(row, bert_cased_tokenizer)
)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/10319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [10]:
max_bert_cased_train_tokens = len(max(tokenized_train_dataset["input_ids"], key=len))
max_bert_cased_val_tokens = len(max(tokenized_val_dataset["input_ids"], key=len))
max_bert_cased_test_tokens = len(max(tokenized_test_dataset["input_ids"], key=len))
max_bert_cased_tokens = max(
    max_bert_cased_train_tokens, max_bert_cased_val_tokens, max_bert_cased_test_tokens
)

print(f"Max number of tokens in tokenized train dataset: {max_bert_cased_train_tokens}")
print(f"Max number of tokens in tokenized val dataset: {max_bert_cased_val_tokens}")
print(f"Max number of tokens in tokenized test dataset: {max_bert_cased_test_tokens}")
print(f"Max number of tokens overall: {max_bert_cased_tokens}")

Max number of tokens in tokenized train dataset: 882
Max number of tokens in tokenized val dataset: 878
Max number of tokens in tokenized test dataset: 833
Max number of tokens overall: 882


#### ALBERT cased

In [11]:
albert_cased_tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")

In [12]:
tokenized_train_dataset = train_dataset.map(
    lambda row: tokenize_sample(row, albert_cased_tokenizer)
)
tokenized_val_dataset = val_dataset.map(
    lambda row: tokenize_sample(row, albert_cased_tokenizer)
)
tokenized_test_dataset = test_dataset.map(
    lambda row: tokenize_sample(row, albert_cased_tokenizer)
)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (602 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/10319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [13]:
max_albert_cased_train_tokens = len(max(tokenized_train_dataset["input_ids"], key=len))
max_albert_cased_val_tokens = len(max(tokenized_val_dataset["input_ids"], key=len))
max_albert_cased_test_tokens = len(max(tokenized_test_dataset["input_ids"], key=len))
max_albert_cased_tokens = max(
    max_albert_cased_train_tokens,
    max_albert_cased_val_tokens,
    max_albert_cased_test_tokens,
)

print(
    f"Max number of tokens in tokenized train dataset: {max_albert_cased_train_tokens}"
)
print(f"Max number of tokens in tokenized val dataset: {max_albert_cased_val_tokens}")
print(f"Max number of tokens in tokenized test dataset: {max_albert_cased_test_tokens}")
print(f"Max number of tokens overall: {max_albert_cased_tokens}")

Max number of tokens in tokenized train dataset: 978
Max number of tokens in tokenized val dataset: 974
Max number of tokens in tokenized test dataset: 824
Max number of tokens overall: 978


## Data samples structure

#### Train sample with answer

In [27]:
sample_with_answer = val_dataset[0]

In [30]:
sample_with_answer

{'id': '57324e38e17f3d1400422826',
 'title': 'Dwight_D._Eisenhower',
 'context': 'Following the German unconditional surrender, Eisenhower was appointed Military Governor of the U.S. Occupation Zone, based at the IG Farben Building in Frankfurt am Main. He had no responsibility for the other three zones, controlled by Britain, France and the Soviet Union, except for the city of Berlin, which was managed by the Four-Power Authorities through the Allied Kommandatura as the governing body. Upon discovery of the Nazi concentration camps, he ordered camera crews to document evidence of the atrocities in them for use in the Nuremberg Trials. He reclassified German prisoners of war (POWs) in U.S. custody as Disarmed Enemy Forces (DEFs), who were no longer subject to the Geneva Convention. Eisenhower followed the orders laid down by the Joint Chiefs of Staff (JCS) in directive JCS 1067, but softened them by bringing in 400,000 tons of food for civilians and allowing more fraternization. In res

#### Train sample without answer

In [31]:
sample_without_answer = val_dataset[5]

In [32]:
sample_without_answer

{'id': '5a711df70efcfe001a8afd4f',
 'title': 'Pesticide',
 'context': 'Some evidence shows that alternatives to pesticides can be equally effective as the use of chemicals. For example, Sweden has halved its use of pesticides with hardly any reduction in crops.[unreliable source?] In Indonesia, farmers have reduced pesticide use on rice fields by 65% and experienced a 15% crop increase.[unreliable source?] A study of Maize fields in northern Florida found that the application of composted yard waste with high carbon to nitrogen ratio to agricultural fields was highly effective at reducing the population of plant-parasitic nematodes and increasing crop yield, with yield increases ranging from 10% to 212%; the observed effects were long-term, often not appearing until the third season of the study.',
 'question': 'What happened after Indonesia cut its pesticide use in half?',
 'answers': {'text': [], 'answer_start': []},
 'answer_text': [],
 'answer_start': []}

#### Test sample with answer

In [67]:
test_sample_with_answer = test_dataset[1]

In [68]:
test_sample_with_answer

{'id': '5ad26a5fd7d075001a42931b',
 'title': 'Force',
 'context': 'Pushing against an object on a frictional surface can result in a situation where the object does not move because the applied force is opposed by static friction, generated between the object and the table surface. For a situation with no movement, the static friction force exactly balances the applied force resulting in no acceleration. The static friction increases or decreases in response to the applied force up to an upper limit determined by the characteristics of the contact between the surface and the object.',
 'question': 'What increases or decreases in response to static friction?',
 'answers': {'text': [], 'answer_start': []},
 'answer_text': [],
 'answer_start': []}

#### Test sample without answer

In [69]:
test_sample_without_answer = test_dataset[0]

In [70]:
test_sample_without_answer

{'id': '5733ea04d058e614000b6595',
 'title': 'French_and_Indian_War',
 'context': "In the spring of 1753, Paul Marin de la Malgue was given command of a 2,000-man force of Troupes de la Marine and Indians. His orders were to protect the King's land in the Ohio Valley from the British. Marin followed the route that Céloron had mapped out four years earlier, but where Céloron had limited the record of French claims to the burial of lead plates, Marin constructed and garrisoned forts. He first constructed Fort Presque Isle (near present-day Erie, Pennsylvania) on Lake Erie's south shore. He had a road built to the headwaters of LeBoeuf Creek. Marin constructed a second fort at Fort Le Boeuf (present-day Waterford, Pennsylvania), designed to guard the headwaters of LeBoeuf Creek. As he moved south, he drove off or captured British traders, alarming both the British and the Iroquois. Tanaghrisson, a chief of the Mingo, who were remnants of Iroquois and other tribes who had been driven west 