## Formatting validation datasets into a SQuAD structure similar to HuggingFace Datasets
### from the dataset format generated by https://github.com/amazon-research/qa-dataset-converter

In [1]:
import pandas as pd
import json
from tqdm import tqdm

In [2]:
# Name of the dataset to convert to the SQuAD structure. It is used to build
# both the input path (datasets/<dataset_name>.json) and the output path
# (datasets/<dataset_name>_formatted.json).
dataset_name = 'triviaqa_dev'

In [3]:
# Load the validation dataset from its JSON file.
# The context manager closes the file handle even if parsing fails, and the
# explicit encoding keeps the read independent of the platform's default locale.
with open(f"datasets/{dataset_name}.json", encoding="utf-8") as json1_file:
    json_dataset = json.load(json1_file)

In [4]:
def format_json(json_dataset):
    """Convert a converter-generated dataset into a SQuAD-style DataFrame.

    Restructures the JSON produced by the qa-dataset-converter (link in the
    top cell) into a layout similar to the HuggingFace Datasets SQuAD format.
    Only answerable samples are kept.

    Note that only the first paragraph of each entry, and only the first
    question of that paragraph, is read; any further paragraphs or questions
    in an entry are ignored.

    Args:
        json_dataset (dict): parsed JSON dataset. ``json_dataset['data']`` is
            a list of entries; each entry must provide
            ``paragraphs[0]['context']`` and ``paragraphs[0]['qas'][0]`` with
            the keys ``question``, ``id`` and ``answers`` (and optionally
            ``is_impossible``).

    Returns:
        pandas.DataFrame: one row per kept question with the columns ``id``,
        ``context``, ``question`` and ``answers``. Each ``answers`` value is a
        dict ``{'text': [...], 'answer_start': [...]}`` listing every answer
        of that question.
    """
    questions = []
    answers = []
    contexts = []
    ids = []

    for entry in tqdm(json_dataset['data']):
        paragraph = entry['paragraphs'][0]
        qa = paragraph['qas'][0]

        # Entries without the flag (SQuAD v1.1-style data) are treated as
        # answerable instead of raising KeyError.
        if qa.get('is_impossible', False):
            continue

        questions.append(qa['question'])
        contexts.append(paragraph['context'])
        ids.append(qa['id'])
        answers.append({
            'text': [ans['text'] for ans in qa['answers']],
            'answer_start': [ans['answer_start'] for ans in qa['answers']],
        })

    return pd.DataFrame({
        'id': ids,
        'context': contexts,
        'question': questions,
        'answers': answers,
    })

In [5]:
# Run the conversion and write the result into the datasets/ directory as
# JSON in the 'records' orientation.
format_json(json_dataset).to_json(
    path_or_buf=f"datasets/{dataset_name}_formatted.json",
    orient='records',
)

100%|██████████| 14229/14229 [00:00<00:00, 124286.22it/s]
