# Analysis of SQuAD 1.0 dataset

In [2]:
from question_answering.paths import extractive_qa_paths
from question_answering.utils import core_qa_utils
from transformers import AutoTokenizer

  _torch_pytree._register_pytree_node(


In [3]:
raw_train_dataset, raw_test_dataset = core_qa_utils.load_datasets_from_json(
    dataset_path=extractive_qa_paths.squad1_dataset_dir,
    filenames=["original_train.json", "original_test.json"],
)

In [4]:
train_dataset = raw_train_dataset.select(range(80000))
val_dataset = raw_train_dataset.select(range(80000, 87599))
test_dataset = raw_test_dataset

## Maximum number of tokens in any sample across dataset

In [5]:
def tokenize_sample(sample, tokenizer, max_tokens=None, padding=False):
    question = sample["question"].strip()
    context = sample["context"].strip()

    return tokenizer(question, context, max_length=max_tokens, padding=padding)

#### BERT uncased

In [8]:
bert_uncased_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
tokenized_train_dataset = train_dataset.map(
    lambda row: tokenize_sample(row, bert_uncased_tokenizer)
)
tokenized_val_dataset = val_dataset.map(
    lambda row: tokenize_sample(row, bert_uncased_tokenizer)
)
tokenized_test_dataset = test_dataset.map(
    lambda row: tokenize_sample(row, bert_uncased_tokenizer)
)

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

In [16]:
max_bert_uncased_train_tokens = len(max(tokenized_train_dataset["input_ids"], key=len))
max_bert_uncased_val_tokens = len(max(tokenized_val_dataset["input_ids"], key=len))
max_bert_uncased_test_tokens = len(max(tokenized_test_dataset["input_ids"], key=len))
max_bert_uncased_tokens = max(
    max_bert_uncased_train_tokens,
    max_bert_uncased_val_tokens,
    max_bert_uncased_test_tokens,
)

print(
    f"Max number of tokens in tokenized train dataset: {max_bert_uncased_train_tokens}"
)
print(f"Max number of tokens in tokenized val dataset: {max_bert_uncased_val_tokens}")
print(f"Max number of tokens in tokenized test dataset: {max_bert_uncased_test_tokens}")
print(f"Max number of tokens overall: {max_bert_uncased_tokens}")

Max number of tokens in tokenized train dataset: 870
Max number of tokens in tokenized val dataset: 594
Max number of tokens in tokenized test dataset: 819
Max number of tokens overall: 870


#### BERT cased

In [19]:
bert_cased_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [20]:
tokenized_train_dataset = train_dataset.map(
    lambda row: tokenize_sample(row, bert_cased_tokenizer)
)
tokenized_val_dataset = val_dataset.map(
    lambda row: tokenize_sample(row, bert_cased_tokenizer)
)
tokenized_test_dataset = test_dataset.map(
    lambda row: tokenize_sample(row, bert_cased_tokenizer)
)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors


In [21]:
max_bert_cased_train_tokens = len(max(tokenized_train_dataset["input_ids"], key=len))
max_bert_cased_val_tokens = len(max(tokenized_val_dataset["input_ids"], key=len))
max_bert_cased_test_tokens = len(max(tokenized_test_dataset["input_ids"], key=len))
max_bert_cased_tokens = max(
    max_bert_cased_train_tokens, max_bert_cased_val_tokens, max_bert_cased_test_tokens
)

print(f"Max number of tokens in tokenized train dataset: {max_bert_cased_train_tokens}")
print(f"Max number of tokens in tokenized val dataset: {max_bert_cased_val_tokens}")
print(f"Max number of tokens in tokenized test dataset: {max_bert_cased_test_tokens}")
print(f"Max number of tokens overall: {max_bert_cased_tokens}")

Max number of tokens in tokenized train dataset: 882
Max number of tokens in tokenized val dataset: 614
Max number of tokens in tokenized test dataset: 833
Max number of tokens overall: 882


#### ALBERT cased

In [22]:
albert_cased_tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [23]:
tokenized_train_dataset = train_dataset.map(
    lambda row: tokenize_sample(row, albert_cased_tokenizer)
)
tokenized_val_dataset = val_dataset.map(
    lambda row: tokenize_sample(row, albert_cased_tokenizer)
)
tokenized_test_dataset = test_dataset.map(
    lambda row: tokenize_sample(row, albert_cased_tokenizer)
)

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/7599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [24]:
max_albert_cased_train_tokens = len(max(tokenized_train_dataset["input_ids"], key=len))
max_albert_cased_val_tokens = len(max(tokenized_val_dataset["input_ids"], key=len))
max_albert_cased_test_tokens = len(max(tokenized_test_dataset["input_ids"], key=len))
max_albert_cased_tokens = max(
    max_albert_cased_train_tokens,
    max_albert_cased_val_tokens,
    max_albert_cased_test_tokens,
)

print(
    f"Max number of tokens in tokenized train dataset: {max_albert_cased_train_tokens}"
)
print(f"Max number of tokens in tokenized val dataset: {max_albert_cased_val_tokens}")
print(f"Max number of tokens in tokenized test dataset: {max_albert_cased_test_tokens}")
print(f"Max number of tokens overall: {max_albert_cased_tokens}")

Max number of tokens in tokenized train dataset: 978
Max number of tokens in tokenized val dataset: 611
Max number of tokens in tokenized test dataset: 824
Max number of tokens overall: 978


## Data samples structure

#### Train sample

In [5]:
train_sample = val_dataset[0]

In [7]:
train_sample

{'id': '57301a88b2c2fd1400568878',
 'title': 'Liberia',
 'context': 'The Americo-Liberian settlers did not identify with the indigenous peoples they encountered, especially those in communities of the more isolated "bush." They knew nothing of their cultures, languages or animist religion. Encounters with tribal Africans in the bush often developed as violent confrontations. The colonial settlements were raided by the Kru and Grebo people from their inland chiefdoms. Because of feeling set apart and superior by their culture and education to the indigenous peoples, the Americo-Liberians developed as a small elite that held on to political power. It excluded the indigenous tribesmen from birthright citizenship in their own lands until 1904, in a repetition of the United States\' treatment of Native Americans. Because of the cultural gap between the groups and assumption of superiority of western culture, the Americo-Liberians envisioned creating a western-style state to which the tribes

#### Test sample

In [8]:
train_sample = test_dataset[0]

In [9]:
train_sample

{'id': '56be4db0acb8001400a502ec',
 'title': 'Super_Bowl_50',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'question': 'Which NFL team represented the AFC at Super Bowl 50?',
 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],


## Tokens histograms

In [32]:
train_input_ids = tokenized_train_dataset["input_ids"]
below_maximum = []
for sample_input_ids in train_input_ids:
    if len(sample_input_ids) <= albert_cased_tokenizer.model_max_length:
        below_maximum.append(sample_input_ids)

len(below_maximum)

79882

In [33]:
len(train_input_ids)

80000

In [None]:
import matplotlib.pyplot as plt


def plot_token_distribution(
    tokenized_bert_uncased_dataset,
    tokenized_bert_cased_dataset,
    tokenized_albert_cased_dataset,
):
    b

    plt.subp

#### BERT uncased