In [None]:
import pandas as pd
import numpy as np

# File locations and the dictionary form (instrument) being audited.
EXPORT_CSV = "redcap_export.csv"
DICT_CSV = "BCNBDataDictionary_DataDictionary.csv"
FORM_NAME = "donor_family_medical_history"
OUTPUT_CSV = "family_history_issues.csv"

# Read the record export and trim stray whitespace from its headers so the
# column names can be compared against the field names in the dictionary.
df = pd.read_csv(EXPORT_CSV, encoding="utf-8")
df.columns = [col.strip() for col in df.columns]

# Read the data dictionary unchanged.
dd = pd.read_csv(DICT_CSV, encoding="utf-8")

# Every field the dictionary assigns to the family-history form.
on_fh_form = dd["Form Name"] == FORM_NAME
fh_vars = dd.loc[on_fh_form, "Variable / Field Name"].tolist()
if len(fh_vars) == 0:
    raise ValueError(f"No fields found in dict for form '{FORM_NAME}'")

# Split the dictionary fields, in dictionary order, into those that appear as
# export columns and those that do not.
present = []
missing_from_export = []
for field_name in fh_vars:
    if field_name in df.columns:
        present.append(field_name)
    else:
        missing_from_export.append(field_name)


# Treat an unchecked checkbox option (0 / "0") as missing. This has to happen
# BEFORE the BRCA options are rolled up below: otherwise an unchecked 0 is
# "not null" and every record looks as if it answered the BRCA question.
checkbox_cols = [c for c in df.columns if "___" in c]
if checkbox_cols:
    df[checkbox_cols] = df[checkbox_cols].replace({0: np.nan, "0": np.nan})

# Roll all brca_results___* option columns up into one summary column.
# The summary is True when at least one option is ticked and NaN (not False)
# otherwise: a plain boolean is never null, so it would stop isnull() below
# from ever recognising a record as completely blank.
brca_opts = [c for c in df.columns if c.startswith("brca_results___")]
if brca_opts:
    brca_checked = df[brca_opts].notna().any(axis=1)
    df["brca_results_any"] = brca_checked.astype(object).where(brca_checked)
    if "brca_results_any" not in present:
        present.append("brca_results_any")

# Nothing to audit if no family-history column made it into the export.
if not present:
    raise RuntimeError(
        "No family-history fields found in your export! "
    )

# Report how many dictionary fields were found in the export and which ones
# are absent.
print(f"Found {len(present)} FH fields in export; "
      f"{len(missing_from_export)} missing ({missing_from_export})")

# Records with no family-history data at all: every audited column is null.
mask_all_blank = df[present].isnull().all(axis=1)

# Fields on the family-history form that the dictionary marks as required.
is_fh_form = dd["Form Name"] == FORM_NAME
is_required = dd["Required Field?"] == "y"
req_fields = dd.loc[is_fh_form & is_required, "Variable / Field Name"].tolist()

# Only required fields that exist as export columns can be checked; if the
# dictionary lists some but the export has none of them, stop.
req_present = [name for name in req_fields if name in df.columns]
if req_fields and not req_present:
    raise RuntimeError("According to the dictionary there are required FH fields, "
                       "but none were in your export.")

# Flag each row that is null in at least one checkable required field; with
# nothing to check, the scalar False leaves the combined mask unchanged.
mask_missing_req = df[req_present].isnull().any(axis=1) if req_present else False

# A record is a problem if it is entirely blank OR lacks a required field.
flagged_rows = mask_all_blank | mask_missing_req
report_cols = ["record_id", *present]
issues = df.loc[flagged_rows, report_cols]
n_flagged = len(issues)
print(f"Found {n_flagged} records missing/incomplete family history")

# Write the flagged records (record id plus the audited columns) to disk.
issues.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"Written problem records to {OUTPUT_CSV}")
