# Preprocessing of the SQuAD dataset
First we make sure the necessary subfolders are in place:


In [None]:
# Create the necessary subfolders below $PROJECT_ROOT/data/SQuAD:
#   source_documents/ - where the downloaded SQuAD json files are expected
#   human/            - where the sampled questions.json is written
import os

root = os.environ['PROJECT_ROOT']
for subfolder in ("source_documents", "human"):
    # exist_ok=True makes this cell safe to re-run.
    os.makedirs(os.path.join(root, "data", "SQuAD", subfolder), exist_ok=True)



When you download the SQuAD-2.0 dataset from [GitHub](https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/), you get two files:
- `dev-v2.0.json`
- `train-v2.0.json`

These files should be in the directory `data/SQuAD/source_documents`.

This notebook extracts `questions.json` and `corpus.json` from these files in the structure expected by the rest of the code. In this case we sample the questions from the dev set, but we include documents from both dev and train splits in the corpus.
Adjust your sampling strategy by changing the arguments in `configs/preprocessing/squad.yaml`.

In [None]:
import json
import os
import yaml
import warnings
import random
from collections import defaultdict

In [None]:
# Resolve the project paths and load the preprocessing configuration.
root = os.environ['PROJECT_ROOT']
datapath = os.path.join(root, "data", "SQuAD")

# Base name (without extension) of the yaml file in configs/preprocessing.
config_name = "squad"
config_path = os.path.join(root, "configs", "preprocessing", f"{config_name}.yaml")

# Read with an explicit encoding, like the other files in this notebook, so the
# result does not depend on the platform's default encoding.
with open(config_path, "r", encoding="utf8") as f:
    # safe_load returns None for an empty file; fall back to an empty dict so
    # the `config.get(...)` lookups further down use their defaults.
    config = yaml.safe_load(f) or {}

In [None]:
# Some checks
if "questions_splits" not in config:
    warnings.warn("questions_splits argument not in config file. Using the default of only the dev split.")
questions_splits = config.get("questions_splits", ['dev'])
corpus_splits = config.get("corpus_splits", ['dev', 'train'])

# Every split we sample questions from must also contribute its documents to
# the corpus. Compare as sets: `<=` on two lists is a lexicographic comparison,
# not a subset test (e.g. ['train'] <= ['dev', 'train'] is False).
missing_splits = set(questions_splits) - set(corpus_splits)
assert not missing_splits, f"One of the question splits {questions_splits} is not included in the corpus splits {corpus_splits}!"

First we construct and save the corpus

In [None]:
# Maps a human-readable article title to the full text of that article.
corpus = {}

for split in corpus_splits:
    split_path = os.path.join(datapath, "source_documents", f"{split}-v2.0.json")
    with open(split_path, "r", encoding="utf8") as f:
        data = json.load(f)['data']

    # One corpus entry per article: underscores in the title become spaces and
    # the contexts of all its paragraphs are concatenated into one text.
    for article in data:
        readable_title = article['title'].replace("_", " ")
        contexts = [par['context'] for par in article['paragraphs']]
        corpus[readable_title] = "\n ".join(contexts)

corpus_path = os.path.join(datapath, "corpus.json")
with open(corpus_path, "w", encoding="utf8") as f:
    json.dump(corpus, f, indent=3)


Next we sample questions. 

In [None]:
# Fixed seed so that re-running the notebook samples the same documents and questions.
random.seed(0)
questions = defaultdict(list)

# Sampling settings. Both are optional: a missing (or null) value means
# "use everything available" instead of failing on arithmetic with None.
requested_docs = config.get("n_source_docs_per_split")
n_questions_per_doc = config.get("n_questions_per_doc")

for split in questions_splits:
    with open(os.path.join(datapath, "source_documents", f"{split}-v2.0.json"), "r", encoding="utf8") as f:
        data = json.load(f)['data']

    # Never ask for more documents than the split contains.
    n_source_docs = len(data) if requested_docs is None else min(requested_docs, len(data))
    if n_questions_per_doc is None:
        print(f"Sampling all answerable questions from {n_source_docs} documents of the {split} split...")
    else:
        print(f"Sampling {n_questions_per_doc * n_source_docs} questions from the {split} split...")

    sampled_docs = random.sample(data, n_source_docs)

    for doc in sampled_docs:
        # Same title normalisation as used for the corpus keys: underscores become spaces.
        title = doc['title'].replace("_", " ")
        # Only keep questions that are not flagged as impossible; the reference
        # answer below is read from their 'answers' list.
        available_qas = [qa for par in doc['paragraphs'] for qa in par['qas'] if not qa['is_impossible']]

        if n_questions_per_doc is None:
            sampled_qas = available_qas
        elif len(available_qas) < n_questions_per_doc:
            # Exit strategy: if there are fewer questions than required, use all available questions
            print(f"document '{title}' only contains {len(available_qas)} questions, while {n_questions_per_doc} were requested. Using all available questions.")
            sampled_qas = available_qas
        else:
            sampled_qas = random.sample(available_qas, n_questions_per_doc)

        # Store the selected unique questions, with the first listed answer as reference
        questions[title] = [
            {"question": qa['question'], "reference": qa['answers'][0]['text']}
            for qa in sampled_qas
        ]

with open(os.path.join(datapath, "human", "questions.json"), "w", encoding="utf8") as f:
    json.dump(questions, f, indent=3)

print("Saved questions.")
