### Notebook to convert the SQuAD dataset from JSON to CSV

Dataset version: 1.1

Code updated from [here](https://www.kaggle.com/code/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe)

#### Import libraries

In [1]:
import numpy as np
import pandas as pd
import json

#### Converter functions for training and validation datasets.

In [23]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Convert a SQuAD JSON training file into a flat pandas DataFrame.

    One output row is produced per (question, answer) pair. A question that
    has no answers still yields one row whose answer fields are missing.

    Args:
        input_file_path: path to the squad json file (read as UTF-8).
        record_path: path to deepest level in json file default value is
                    ['data','paragraphs','qas','answers']
        verbose: 0 to suppress output, default is 1

    Returns:
        DataFrame with the columns 'index' (the question id), 'question',
        'context', the fields of each answer record (e.g. 'answer_start'
        and 'text') and 'c_id', an integer code numbering the distinct
        contexts in order of first appearance.
    """
    if verbose:
        print("Reading the json file")
    # The context manager guarantees the handle is closed; the explicit
    # encoding keeps non-ASCII text intact regardless of the platform default.
    with open(input_file_path, encoding='utf-8') as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")

    # parsing the different levels of the json file
    answers_df = pd.json_normalize(file, record_path)
    questions_df = pd.json_normalize(file, record_path[:-1])
    paragraphs_df = pd.json_normalize(file, record_path[:-2])

    # Repeat every context once per question of its paragraph, and every
    # question id once per answer, so each level lines up with the one below.
    context_repeats = np.repeat(paragraphs_df['context'].values, paragraphs_df['qas'].str.len())
    question_id_repeats = np.repeat(questions_df['id'].values, questions_df['answers'].str.len())

    questions_df['context'] = context_repeats
    answers_df['q_idx'] = question_id_repeats

    # A left merge on the question id (rather than an index-aligned concat)
    # also works when a question carries several answers, where the repeated
    # ids would otherwise form a non-unique index that cannot be aligned.
    result_df = (
        questions_df[['id', 'question', 'context']]
        .merge(answers_df, how='left', left_on='id', right_on='q_idx')
        .drop(columns='q_idx')
        .rename(columns={'id': 'index'})  # keep the established column name
    )

    result_df['c_id'] = result_df['context'].factorize()[0]

    if verbose:
        print("shape of the dataframe is {}".format(result_df.shape))
        print("Done")

    return result_df


def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Convert a SQuAD JSON dev/validation file into a flat pandas DataFrame.

    Only the first answer listed for each question is kept, giving exactly
    one row per question.

    Args:
        input_file_path: path to the squad json file (read as UTF-8).
        record_path: path to deepest level in json file default value is
                    ['data','paragraphs','qas','answers']
        verbose: 0 to suppress output, default is 1

    Returns:
        DataFrame with the columns 'id', 'question', 'context', 'text',
        'answer_start' and 'c_id'. 'text' and 'answer_start' are None for a
        question whose answer list is empty; 'c_id' is an integer code
        numbering the distinct contexts in order of first appearance.
    """
    if verbose:
        print("Reading the json file")
    # The context manager guarantees the handle is closed; the explicit
    # encoding keeps non-ASCII text intact regardless of the platform default.
    with open(input_file_path, encoding='utf-8') as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")

    # parsing the different levels of the json file
    questions_df = pd.json_normalize(file, record_path[:-1])
    paragraphs_df = pd.json_normalize(file, record_path[:-2])

    # Extract text and answer_start from the first answer in each list
    questions_df['text'] = questions_df['answers'].apply(
        lambda answers: answers[0]['text'] if len(answers) > 0 else None)
    questions_df['answer_start'] = questions_df['answers'].apply(
        lambda answers: answers[0]['answer_start'] if len(answers) > 0 else None)

    # Repeat every context once per question of its paragraph so the
    # contexts line up row-for-row with the questions.
    context_repeats = np.repeat(paragraphs_df['context'].values, paragraphs_df['qas'].str.len())
    questions_df['context'] = context_repeats

    # Create the final dataframe with the same structure as train. The
    # explicit copy makes the result independent of questions_df, so adding
    # 'c_id' below does not write into a slice (no SettingWithCopyWarning).
    result_df = questions_df[['id', 'question', 'context', 'text', 'answer_start']].copy()
    result_df['c_id'] = result_df['context'].factorize()[0]

    if verbose:
        print("shape of the dataframe is {}".format(result_df.shape))
        print("Done")

    return result_df

In [None]:
# Previous dev converter code (kept all answers in a list)
def squad_json_to_dataframe_dev_keep_all_answers(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Convert a SQuAD JSON dev/validation file into a pandas DataFrame,
    keeping every answer of a question.

    Each question gives exactly one row; its answers stay together as the
    original list of answer records in the 'answers' column.

    Args:
        input_file_path: path to the squad json file (read as UTF-8).
        record_path: path to deepest level in json file default value is
                    ['data','paragraphs','qas','answers']
        verbose: 0 to suppress output, default is 1

    Returns:
        DataFrame with the columns 'id', 'question', 'context', 'answers'
        and 'c_id', an integer code numbering the distinct contexts in
        order of first appearance.
    """
    if verbose:
        print("Reading the json file")
    # The context manager guarantees the handle is closed; the explicit
    # encoding keeps non-ASCII text intact regardless of the platform default.
    with open(input_file_path, encoding='utf-8') as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")

    # Only the question and paragraph levels are needed: the answers are
    # kept as lists, so the answer level is never flattened here.
    questions_df = pd.json_normalize(file, record_path[:-1])
    paragraphs_df = pd.json_normalize(file, record_path[:-2])

    # Repeat every context once per question of its paragraph so the
    # contexts line up row-for-row with the questions.
    context_repeats = np.repeat(paragraphs_df['context'].values, paragraphs_df['qas'].str.len())
    questions_df['context'] = context_repeats

    # Independent copy, so adding 'c_id' does not write into a slice.
    result_df = questions_df[['id', 'question', 'context', 'answers']].copy()
    result_df['c_id'] = result_df['context'].factorize()[0]

    if verbose:
        print("shape of the dataframe is {}".format(result_df.shape))
        print("Done")

    return result_df

#### Run functions

In [None]:
# Training split: parse the v1.1 training JSON into a flat DataFrame.
input_pth = 'DM_Dataset/v1.1/train-v1.1.json'
record_path = ['data', 'paragraphs', 'qas', 'answers']
train = squad_json_to_dataframe_train(
    input_file_path=input_pth,
    record_path=record_path,
)

Reading the json file
processing...
shape of the dataframe is (87599, 6)
Done


In [None]:
# Preview the first rows of the training frame ...
display(train.head())

# ... then save it as CSV, leaving out the DataFrame's row index.
train.to_csv('DM_Dataset/v1.1/train.csv', index=False)

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


In [24]:
# Dev/validation split: parse the v1.1 dev JSON, keeping the first answer per question.
input_pth = 'DM_Dataset/v1.1/dev-v1.1.json'
record_path = ['data', 'paragraphs', 'qas', 'answers']
dev = squad_json_to_dataframe_dev(
    input_file_path=input_pth,
    record_path=record_path,
)

Reading the json file
processing...
shape of the dataframe is (10570, 6)
Done


In [28]:
# Preview the first rows of the dev frame ...
display(dev.head())

# ... then save it as CSV, leaving out the DataFrame's row index.
dev.to_csv('DM_Dataset/v1.1/validation.csv', index=False)


Unnamed: 0,id,question,context,text,answer_start,c_id
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,Denver Broncos,177,0
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,Carolina Panthers,249,0
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"Santa Clara, California",403,0
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,Denver Broncos,177,0
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,gold,488,0
