# Create key-value ground truth from the custom Genetic Reports 0.1 dataset



## Setup


In [1]:
import json
import os

def _collect_question_answers(form):
    """Group the text of linked answers under the text of their question.

    Args:
        form: list of annotation items (dicts). Items labelled 'question' or
            'answer' must carry 'id' and 'text'; items with any other label
            are ignored.

    Returns:
        dict mapping question text -> list of answer texts, in the order the
        links are encountered. Questions that share the same text are merged
        into a single key. Questions with no linked answer are omitted.
    """
    # Index both kinds by id first, because a question may appear in the list
    # before the answers it links to.
    questions = {}
    answers = {}
    for item in form:
        if item['label'] == 'question':
            questions[item['id']] = item['text']
        elif item['label'] == 'answer':
            answers[item['id']] = item['text']

    results = {}
    for item in form:
        if item['label'] != 'question':
            continue
        question_text = questions[item['id']]
        # 'linking' may be missing or null on fields that were never linked;
        # treat both as "no links" instead of raising.
        for link in item.get('linking') or []:
            # The second element of each link pair is looked up as an answer id;
            # pairs pointing at anything that is not an answer are skipped.
            answer_id = link[1]
            if answer_id in answers:
                results.setdefault(question_text, []).append(answers[answer_id])
    return results


def process_json_file(input_filepath, output_filepath):
    """Convert one annotation JSON file into a question -> answers JSON file.

    Reads `input_filepath` (UTF-8 JSON with a top-level 'form' list), pairs
    each question with the answers it links to, and writes the resulting
    ``{question_text: [answer_text, ...]}`` mapping to `output_filepath` as
    indented UTF-8 JSON with non-ASCII characters kept as-is.

    Args:
        input_filepath: path of the annotation JSON file to read.
        output_filepath: path of the JSON file to write (overwritten if present).

    Raises:
        KeyError: if the document has no 'form' key, or an item lacks 'label',
            or a question/answer item lacks 'id' or 'text'.
    """
    with open(input_filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    output_data = _collect_question_answers(data['form'])

    with open(output_filepath, 'w', encoding='utf-8') as file:
        json.dump(output_data, file, ensure_ascii=False, indent=4)


## Load dataset

Upload the zipped result of the cloud provider's algorithm (`spark_generic_reports_v6.zip`) to the working directory, then extract it.

In [4]:
!unzip spark_generic_reports_v6.zip

Archive:  spark_generic_reports_v6.zip
   creating: spark_generic_reports_v6/
   creating: spark_generic_reports_v6/annotations/
  inflating: spark_generic_reports_v6/annotations/Caris-Molecular-Intelligence_MI-Profile_Breast_NOS_WEBchanged_0.json  
  inflating: spark_generic_reports_v6/annotations/Caris-Molecular-Intelligence_MI-Profile_Breast_NOS_WEB_0.json  
  inflating: spark_generic_reports_v6/annotations/Caris-Molecular-Intelligence_MI-Profile_Breast_NOS_WEB_1.json  
  inflating: spark_generic_reports_v6/annotations/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_10.json  
  inflating: spark_generic_reports_v6/annotations/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_11.json  
  inflating: spark_generic_reports_v6/annotations/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_12.json  
  inflating: spark_generic_reports_v6/annotations/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_13.json  
  inflating: spark_generic_reports_v6/annotations/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-u

## Execution

In [5]:
# Input folder: annotation JSON files extracted from the archive.
funsd_path = 'spark_generic_reports_v6/annotations'
# Output folder: one converted JSON file per annotation file.
output_path = 'spark_generic_reports_v6_gt'

# Make sure the output folder exists; does nothing if it is already there.
os.makedirs(output_path, exist_ok=True)

In [6]:
# Convert every .json file found in the input folder; other entries are skipped.
# Each output file keeps the name of the annotation file it came from.
for filename in os.listdir(funsd_path):
    if not filename.endswith('.json'):
        continue
    process_json_file(
        os.path.join(funsd_path, filename),
        os.path.join(output_path, filename),
    )

print("Processing completed.")

Processing completed.


## Download result

In [7]:
# Recursively pack the converted output folder into a single archive for download.
!zip -r spark_generic_reports_v6_gt.zip spark_generic_reports_v6_gt

  adding: spark_generic_reports_v6_gt/ (stored 0%)
  adding: spark_generic_reports_v6_gt/oncoextra-tnbc-ntrk-wm-sample-report_4.json (deflated 58%)
  adding: spark_generic_reports_v6_gt/Tempus-Onco_Clinical-Report-Sample_9.json (deflated 51%)
  adding: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1_15.json (deflated 41%)
  adding: spark_generic_reports_v6_gt/Tempus-Onco_Clinical-Report-Sample_6.json (deflated 55%)
  adding: spark_generic_reports_v6_gt/Tempus-Onco_Clinical-Report-Sample_4.json (deflated 52%)
  adding: spark_generic_reports_v6_gt/oncoextra-tnbc-ntrk-wm-sample-report_1.json (deflated 55%)
  adding: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_8.json (deflated 54%)
  adding: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_14.json (deflated 44%)
  adding: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1_16.json (deflated 37%)
  adding: spark_generic_reports_v6_gt/CarisReport_202