In [33]:
import ijson
import json
import os
import glob
import pandas as pd
from collections import Counter
from genson import SchemaBuilder
import heapq

In [2]:
# Root folder of the local OpenFDA download. Every path below is derived from
# it, so the notebook can run on another machine by setting the OPENFDA_DIR
# environment variable instead of editing each path by hand.
OPENFDA_DIR = os.environ.get(
    'OPENFDA_DIR',
    r'C:\Users\macie\OneDrive\Documents\Edukacja\YEAR 3\SM2\BEP\OpenFDA',
)

# Single partition of an older download (the file sits in a folder of the same name).
drug_events_path = os.path.join(
    OPENFDA_DIR, 'drug-event-0001-of-0005.json', 'drug-event-0001-of-0005.json'
)
# Small hand-made sample file.
sample = os.path.join(OPENFDA_DIR, 'Open_FDA_sample.json')

# Quarterly data folder and its first three partitions.
# NOTE(review): the original comment said "Q1 2025" but the folder is named
# Data_Q1_2024 - confirm which quarter this download actually is.
folder_path = os.path.join(OPENFDA_DIR, 'Data_Q1_2024')
first_of_3 = os.path.join(folder_path, 'drug-event-0001-of-0030.json')
second_of_3 = os.path.join(folder_path, 'drug-event-0002-of-0030.json')
third_of_3 = os.path.join(folder_path, 'drug-event-0003-of-0030.json')

## Generator for iterating over JSON reports

In [7]:
# generator for iterating over JSON reports

def iterate_reports_ijson(path):
    """Stream reports one at a time from the top-level 'results' array.

    `path` may be a single JSON file or a directory. For a directory, every
    `*.json` file directly inside it is read in sorted filename order.
    Files are parsed incrementally with ijson, so a whole file is never
    loaded into memory at once.
    """
    if os.path.isdir(path):
        sources = sorted(glob.glob(os.path.join(path, '*.json')))
    else:
        sources = [path]

    for source in sources:
        # ijson needs a binary handle; the file stays open while its reports are consumed.
        with open(source, 'rb') as handle:
            yield from ijson.items(handle, 'results.item')

## Helper: key extraction from each report

In [8]:
def extract_keys(d, prefix=''):
    """Return the dotted paths of every key in a nested dict/list structure.

    Dict keys are emitted as `prefix + key`, followed by the paths found in
    their values. Lists add no path segment: their elements are scanned with
    the same prefix. The result is a list and may contain duplicates (one
    entry per occurrence). Any other value contributes nothing.
    """
    if isinstance(d, dict):
        collected = []
        for name, child in d.items():
            path = prefix + name
            collected.append(path)
            collected += extract_keys(child, path + '.')
        return collected
    if isinstance(d, list):
        return [key for element in d for key in extract_keys(element, prefix)]
    return []

## Inspection functions

In [9]:
# List all unique keys in the JSON files
def list_all_keys(file_path):
    """Print the number of distinct key paths in the dataset, then each path.

    `file_path` is passed to `iterate_reports_ijson`. Key paths come from
    `extract_keys` and are printed in sorted order, one per line.
    """
    seen = {
        key
        for report in iterate_reports_ijson(file_path)
        for key in extract_keys(report)
    }

    print(f"Total unique keys: {len(seen)}")
    for name in sorted(seen):
        print(name)


def just_key_number(file_path):
    """Print and return the number of distinct key paths in the dataset.

    Same scan as `list_all_keys`, but only the count is reported.
    """
    seen = {
        key
        for report in iterate_reports_ijson(file_path)
        for key in extract_keys(report)
    }
    total = len(seen)

    print(f"Total unique keys: {total}")
    return total

def find_largest_report(file_path):
    """Return `(report, key_count)` for the report with the most key occurrences.

    Size is `len(extract_keys(report))`, so repeated keys (e.g. one per list
    element) each count. On ties the earliest report wins. If no report has
    at least one key, the result is `(None, 0)`.
    """
    best = (None, 0)
    for candidate in iterate_reports_ijson(file_path):
        size = len(extract_keys(candidate))
        # Strictly greater: an equally large later report does not replace the first.
        if size > best[1]:
            best = (candidate, size)
    return best


def count_key_frequency(file_path):
    """Print how many times each key path occurs across all reports.

    Every occurrence returned by `extract_keys` is counted (a key repeated in
    several list elements of one report counts several times). Lines are
    printed from most to least frequent.
    """
    tally = Counter(
        key
        for report in iterate_reports_ijson(file_path)
        for key in extract_keys(report)
    )

    print("Most common keys:")
    for name, occurrences in tally.most_common():
        print(f"{name}: {occurrences}")



def infer_type(value):
    """Return a short type label for `value`.

    Any dict (including subclasses) is 'dict', any list is 'list'; everything
    else is labelled with the name of its concrete class.
    """
    for container, label in ((dict, 'dict'), (list, 'list')):
        if isinstance(value, container):
            return label
    return type(value).__name__

def infer_key_types(file_path):
    """Print, for every key path in the dataset, the value types seen under it.

    Key paths use the same dotted convention as `extract_keys`: lists add no
    path segment, their elements are inspected under the parent's path.

    The structure is walked once and the type is recorded where each value is
    visited. The previous implementation re-resolved each dotted path with
    `temp[k]`, which raised TypeError as soon as the path crossed a list, so
    every key nested under a list (e.g. everything below `patient.drug`) was
    silently missing from the report.

    Args:
        file_path: JSON file or directory, passed to `iterate_reports_ijson`.

    Prints one `key: type1, type2` line per key, sorted by key. Type names are
    sorted too so the output is identical from run to run.
    """
    type_mapping = {}

    def record(node, prefix=''):
        # Mirror extract_keys' traversal, but note each value's type on the way.
        if isinstance(node, dict):
            for name, child in node.items():
                path = prefix + name
                type_mapping.setdefault(path, set()).add(infer_type(child))
                record(child, path + '.')
        elif isinstance(node, list):
            for element in node:
                record(element, prefix)

    for report in iterate_reports_ijson(file_path):
        record(report)

    print("Key type mapping:")
    for key, types in sorted(type_mapping.items()):
        print(f"{key}: {', '.join(sorted(types))}")

def generate_json_schema(file_path):
    """Infer a JSON Schema from all reports and print it.

    Each report from `iterate_reports_ijson(file_path)` is fed to a single
    genson `SchemaBuilder`; the merged schema is printed as JSON indented by
    four spaces. Nothing is returned.
    """
    schema_builder = SchemaBuilder()
    for record in iterate_reports_ijson(file_path):
        schema_builder.add_object(record)

    print(json.dumps(schema_builder.to_schema(), indent=4))

In [11]:
def extract_report_by_id(path, report_id):
    """
    Return the first report whose "safetyreportid" equals `report_id`.

    `path` (a single file or a directory) is streamed through
    `iterate_reports_ijson`; the scan stops at the first match. Returns None
    when no report matches.
    """
    matches = (
        candidate
        for candidate in iterate_reports_ijson(path)
        if candidate.get("safetyreportid") == report_id
    )
    return next(matches, None)

In [20]:
# Pull one report out of the dataset and save it as its own JSON file.
# Point data_path at either a single .json file or the folder containing
# multiple parts; with a folder, the files are scanned in order until the ID
# is found.
data_path = folder_path  # or sample, or first_of_3, etc.

# ID of the report to extract; compared as a string against "safetyreportid".
want_id = "23541411"

single = extract_report_by_id(data_path, want_id)
if single is None:
    print(f"No report found with id {want_id}")
else:
    # Write the report to the current working directory as sample_<id>.json.
    out_file = f"sample_{want_id}.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(single, f, indent=2)
    print(f"Wrote {out_file}")

Wrote sample_23541411.json


In [21]:
# Path to a single-report JSON file named like the output of the extraction cell.
# NOTE(review): absolute, machine-specific path; it only matches the file written
# above if the notebook's working directory is this `notebooks` folder - confirm.
longest_report = r"C:\Users\macie\OneDrive\Documents\Edukacja\YEAR 3\SM2\BEP\OpenFDA\notebooks\sample_23541411.json"

def szybko(file_path):
    """
    Print and return all unique key paths of a single JSON document.

    Unlike the dataset helpers, this loads the whole file with `json.load` and
    does not expect the OpenFDA `results` array wrapper; it is meant for the
    small single-report files written by this notebook.
    ("szybko" is Polish for "quickly".)

    Args:
        file_path: path to a JSON file. It is read as UTF-8, the encoding this
            notebook uses when writing its sample files, instead of the
            platform default (which is not UTF-8 on every system).

    Returns:
        set of dotted key paths as produced by `extract_keys`.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    unique_keys = set(extract_keys(data))

    print(f"Total unique keys: {len(unique_keys)}")
    for key in sorted(unique_keys):
        print(key)

    return unique_keys

# List the keys of the extracted report. szybko() prints the keys and also
# returns them; as the last expression of the cell the returned set is echoed
# as well, which is why the keys appear twice in the output below.
szybko(longest_report)

Total unique keys: 82
companynumb
duplicate
fulfillexpeditecriteria
occurcountry
patient
patient.drug
patient.drug.actiondrug
patient.drug.activesubstance
patient.drug.activesubstance.activesubstancename
patient.drug.drugadditional
patient.drug.drugadministrationroute
patient.drug.drugbatchnumb
patient.drug.drugcharacterization
patient.drug.drugdosageform
patient.drug.drugdosagetext
patient.drug.drugindication
patient.drug.drugintervaldosagedefinition
patient.drug.drugintervaldosageunitnumb
patient.drug.drugseparatedosagenumb
patient.drug.drugstructuredosagenumb
patient.drug.drugstructuredosageunit
patient.drug.drugtreatmentduration
patient.drug.drugtreatmentdurationunit
patient.drug.medicinalproduct
patient.drug.openfda
patient.drug.openfda.application_number
patient.drug.openfda.brand_name
patient.drug.openfda.generic_name
patient.drug.openfda.manufacturer_name
patient.drug.openfda.nui
patient.drug.openfda.package_ndc
patient.drug.openfda.pharm_class_cs
patient.drug.openfda.pharm_cla

{'companynumb',
 'duplicate',
 'fulfillexpeditecriteria',
 'occurcountry',
 'patient',
 'patient.drug',
 'patient.drug.actiondrug',
 'patient.drug.activesubstance',
 'patient.drug.activesubstance.activesubstancename',
 'patient.drug.drugadditional',
 'patient.drug.drugadministrationroute',
 'patient.drug.drugbatchnumb',
 'patient.drug.drugcharacterization',
 'patient.drug.drugdosageform',
 'patient.drug.drugdosagetext',
 'patient.drug.drugindication',
 'patient.drug.drugintervaldosagedefinition',
 'patient.drug.drugintervaldosageunitnumb',
 'patient.drug.drugseparatedosagenumb',
 'patient.drug.drugstructuredosagenumb',
 'patient.drug.drugstructuredosageunit',
 'patient.drug.drugtreatmentduration',
 'patient.drug.drugtreatmentdurationunit',
 'patient.drug.medicinalproduct',
 'patient.drug.openfda',
 'patient.drug.openfda.application_number',
 'patient.drug.openfda.brand_name',
 'patient.drug.openfda.generic_name',
 'patient.drug.openfda.manufacturer_name',
 'patient.drug.openfda.nui',
 

In [None]:
# Compare the key sets of two individually extracted reports.
first_keys = szybko(longest_report)
# NOTE(review): this second file is not produced by any cell visible here;
# presumably it was extracted the same way with a different want_id - confirm.
secod_path = r"C:\Users\macie\OneDrive\Documents\Edukacja\YEAR 3\SM2\BEP\OpenFDA\notebooks\sample_23546882.json"
second_keys = szybko(secod_path)

print(len(first_keys))
print(len(second_keys))

# Union of both key sets; a later cell reads `iiii` to see which dataset keys
# these two reports still miss.
iiii = set(first_keys).union(second_keys)
print(len(iiii))

In [25]:
def extract_key_set(report):
    """Return the set of distinct dotted key paths found in `report`.

    Dicts contribute `parent.key` paths (just `key` at the top level); lists
    add no path segment, their elements are scanned under the parent's path.
    """
    found = set()
    pending = [(report, '')]  # (node, path of the enclosing key)

    while pending:
        node, path = pending.pop()
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = f"{path}.{name}" if path else name
                found.add(child_path)
                pending.append((child, child_path))
        elif isinstance(node, list):
            pending.extend((element, path) for element in node)

    return found

def build_representative_sample(path, all_keys, output_folder="representative_sample"):
    """
    Greedily pick reports that together cover all the given key paths.

    Reports are streamed in dataset order; a report is kept (and written to
    `output_folder` as `sample_<safetyreportid>.json`) whenever it contributes
    at least one target key that no earlier kept report had. This is a
    first-come greedy cover, so the selection is small but not guaranteed to
    be minimal.

    Changes from the previous version:
    - `all_keys` may be any iterable; it is converted to a set, so the stop
      condition also works for lists.
    - Only keys in `all_keys` count as "new", and the scan stops once every
      target key is covered, even if reports carry other keys. The old
      equality test could never become true in that case and scanned the
      whole dataset.
    - The final message reports incomplete coverage instead of always
      claiming that all keys were covered.

    Args:
        path: JSON file or directory, passed to `iterate_reports_ijson`.
        all_keys: iterable of dotted key paths (as from `extract_key_set`).
        output_folder: directory for the saved reports; created if missing.

    Returns:
        list of the selected reports' safetyreportid values, in selection order.
    """
    os.makedirs(output_folder, exist_ok=True)
    target_keys = set(all_keys)
    covered_keys = set()
    selected_ids = []

    # Nothing to cover -> do not even start streaming the dataset.
    reports = iterate_reports_ijson(path) if target_keys else ()

    for report in reports:
        new_keys = (extract_key_set(report) & target_keys) - covered_keys
        if new_keys:
            report_id = report.get("safetyreportid")
            # Reports without an id cannot be named on disk, so they are skipped.
            if report_id:
                selected_ids.append(report_id)
                covered_keys.update(new_keys)
                with open(os.path.join(output_folder, f"sample_{report_id}.json"), "w", encoding="utf-8") as f:
                    json.dump(report, f, indent=2)
                print(f"✔️ Saved report {report_id} (added {len(new_keys)} new keys)")
        if covered_keys >= target_keys:
            break

    missing_keys = target_keys - covered_keys
    if missing_keys:
        print(
            f"\n⚠️ Selected {len(selected_ids)} reports covering {len(covered_keys)} "
            f"of {len(target_keys)} keys; {len(missing_keys)} keys were not found."
        )
    else:
        print(f"\n🎯 Done! Selected {len(selected_ids)} reports to cover all {len(target_keys)} keys.")
    return selected_ids

In [26]:
# Collect every distinct key path that occurs anywhere under folder_path.
# This streams through all reports of all files in the folder, so it is the
# expensive step of the notebook.
all_keys = set()
for report in iterate_reports_ijson(folder_path):
    all_keys.update(extract_key_set(report))

In [29]:
# Keys present somewhere in the dataset but in neither of the two extracted
# reports (`iiii` is the union of their key sets from the earlier cell).
print(set(all_keys).difference(iiii))

{'patient.drug.drugcumulativedosageunit', 'authoritynumb', 'patient.drug.drugcumulativedosagenumb', 'primarysource.literaturereference', 'patient.drug.drugrecurreadministration', 'patient.drug.drugenddate', 'patient.drug.drugenddateformat', 'patient.drug.drugstartdateformat', 'patient.drug.drugstartdate'}


In [30]:
# Select and save reports that together cover the keys in all_keys; files go to
# ./representative_sample and the chosen report ids are kept in selected_ids.
selected_ids = build_representative_sample(data_path, all_keys, output_folder="representative_sample")


✔️ Saved report 19520083 (added 71 new keys)
✔️ Saved report 19529532 (added 8 new keys)
✔️ Saved report 19652409 (added 5 new keys)
✔️ Saved report 19854733 (added 4 new keys)
✔️ Saved report 20199698 (added 1 new keys)
✔️ Saved report 17998181 (added 2 new keys)
✔️ Saved report 23361967 (added 2 new keys)

🎯 Done! Selected 7 reports to cover all 93 keys.


In [31]:
def combine_sample_reports(input_folder="representative_sample", output_file="OpenFDA_sample_combined.json"):
    """
    Combine all single-report JSON files from a folder into one file
    formatted like the original OpenFDA data (`meta` header + `results` list).

    Files are read in sorted filename order so the output is reproducible
    (`os.listdir` returns entries in arbitrary order). If `output_file` itself
    lives inside `input_folder`, it is skipped, so re-running the function
    does not ingest a previously written combined file as if it were a report.

    Args:
        input_folder: directory containing one report per `*.json` file.
        output_file: path of the combined JSON file to write (UTF-8).

    The `meta` block is a fixed template; only `meta.results.total` is
    computed, as the number of combined reports. Nothing is returned.
    """
    combined = {
        "meta": {
            "disclaimer": "This is a sample dataset derived from OpenFDA data.",
            "terms": "https://open.fda.gov/terms/",
            "license": "https://open.fda.gov/license/",
            "last_updated": "2025-04-23",
            "results": {
                "skip": 0,
                "limit": 12000,
                "total": 0  # will be filled below
            }
        },
        "results": []
    }

    output_abs = os.path.abspath(output_file)

    for fname in sorted(os.listdir(input_folder)):
        if not fname.endswith(".json"):
            continue
        report_path = os.path.join(input_folder, fname)
        # Never treat our own (earlier) output as an input report.
        if os.path.abspath(report_path) == output_abs:
            continue
        with open(report_path, "r", encoding="utf-8") as f:
            combined["results"].append(json.load(f))

    combined["meta"]["results"]["total"] = len(combined["results"])

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(combined, out, indent=2)

    print(f"✅ Combined {len(combined['results'])} reports into '{output_file}'")

In [39]:
# Merge every report saved in ./representative_sample into one OpenFDA-style file.
combine_sample_reports("representative_sample", "OpenFDA_sample_combined.json")

✅ Combined 7 reports into 'OpenFDA_sample_combined.json'


In [None]:
def max_nesting_depth(obj, level=0):
    """Return the deepest nesting level reached inside `obj`.

    Each dict or list adds one level for its children. Scalars and empty
    containers report the level they sit at, so `{}` is 0 and `{"a": 1}` is 1.
    `level` is the depth assigned to `obj` itself.
    """
    if isinstance(obj, dict):
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        return level

    deepest = level
    for child in children:
        deepest = max(deepest, max_nesting_depth(child, level + 1))
    return deepest

def find_and_save_deepest_reports(path, output_folder="deepest_reports", top_n=3):
    """
    Find and save the top N reports with the deepest nesting structure.

    Reports are streamed and only the current best `top_n` are kept in memory.
    The previous version pushed every report onto a heap, holding the whole
    dataset in memory. It also compared the report dicts whenever depth and id
    tied (e.g. two reports without "safetyreportid"), raising TypeError.

    Ordering: deepest first; ties are broken by report id (as text), then by
    order of appearance in the dataset.

    Args:
        path: JSON file or directory, passed to `iterate_reports_ijson`.
        output_folder: directory for the `deep_<id>.json` files; created if missing.
        top_n: number of reports to save.

    Each saved report is written as UTF-8 JSON and announced with a printed
    line. Reports without an id are saved as `deep_unknown.json`, so several
    such reports overwrite one another. Nothing is returned.
    """
    os.makedirs(output_folder, exist_ok=True)

    def scored_reports():
        # Lazily pair every report with its depth and id.
        for report in iterate_reports_ijson(path):
            yield max_nesting_depth(report), report.get("safetyreportid", "unknown"), report

    # nsmallest with a key never compares the report dicts themselves and keeps
    # at most top_n candidates while consuming the stream.
    deepest = heapq.nsmallest(
        top_n,
        scored_reports(),
        key=lambda entry: (-entry[0], str(entry[1])),
    )

    for depth, report_id, report in deepest:
        filename = os.path.join(output_folder, f"deep_{report_id}.json")
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"📁 Saved report {report_id} with depth {depth} → {filename}")
