## 2. Collect all unique value-carrying fields across the dataset

In this step, we extract only the paths in each report that correspond to **actual values** (strings, numbers, non-empty lists),  
excluding structural container fields (like `patient`, `primarysource`, etc.). A list is counted as a single field; its elements are not expanded into separate paths.

This gives us a more precise count of "fields that need to be represented in the SQL schema."


In [None]:
def extract_value_key_set(report):
    """
    Collect the dotted paths of every value-carrying field in a report.

    Nested dicts are treated as containers and descended into; they never
    appear in the result themselves. A key whose value is a scalar (including
    None) or a non-empty list is recorded as a leaf path. Lists found under a
    dict key are NOT expanded, so fields inside list elements are not reported.

    If the top-level object is itself a list, each element is processed with
    an empty prefix (nested top-level lists are flattened the same way).

    Returns a set of field paths.
    """
    paths = set()
    # Explicit worklist of (node, path prefix) pairs instead of recursion.
    pending = [(report, '')]

    while pending:
        node, base = pending.pop()

        if isinstance(node, list):
            # Only reachable for a top-level list (or lists nested directly
            # inside one): elements share the parent's prefix.
            pending.extend((element, base) for element in node)
            continue

        if not isinstance(node, dict):
            # Bare scalars outside a dict have no key, hence no path.
            continue

        for name, child in node.items():
            path = f"{base}.{name}" if base else name
            if isinstance(child, dict):
                pending.append((child, path))
            elif not isinstance(child, list) or child:
                # Scalars always count; lists count only when non-empty.
                paths.add(path)

    return paths


In [None]:
# Union of every value-carrying path seen in any report, plus the size and ID
# of the report with the most such paths.
# NOTE(review): `tqdm`, `iterate_reports_ijson` and `data_path` are not defined
# in this cell — presumably provided by earlier cells; confirm before running.
value_fields_set = set()
max_keys, max_report_id = 0, None

report_stream = iterate_reports_ijson(data_path)
for report in tqdm(report_stream, desc="Extracting value field paths"):
    keys = extract_value_key_set(report)
    value_fields_set |= keys

    # Strict '>' keeps the first report encountered when several tie.
    if len(keys) > max_keys:
        max_keys, max_report_id = len(keys), report.get("safetyreportid")

print(f"✅ Unique value-carrying fields in dataset: {len(value_fields_set)}")
print(f"📈 Max number of value fields in a single report: {max_keys} (report ID: {max_report_id})")
