# In [None]:
import pandas as pd
import yaml

# Load configuration.
# NOTE: this runs at import time, so importing this module requires the YAML file to exist
# at CONFIG_PATH (resolved relative to the current working directory).
CONFIG_PATH = "./corebehrt/configs/select_cohort.yaml"

# Decode explicitly as UTF-8 instead of relying on the platform's locale encoding,
# so non-ASCII characters in configured paths are read the same way on every machine.
with open(CONFIG_PATH, "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Paths from YAML (a missing "paths" section or key raises KeyError here)
ehr_cohort_path = config["paths"]["ehr_cohort"]
mammogram_scores_path = config["paths"]["mammogram_scores"]
filtered_exposure_path = config["paths"]["exposure"]  # This file will be used as exposure in cohort selection


def filter_and_match_mammogram_exposure(ehr_path=None, scores_path=None, output_path=None):
    """
    Write the earliest mammogram record for every 'Accession_ID' present in the EHR cohort.

    Mammogram rows whose 'Accession_ID' does not occur in the EHR cohort are dropped.
    For each remaining ID only the row with the earliest 'Bestillingstidspunkt' is kept,
    and the result is saved as a CSV (all mammogram columns are preserved).

    Args:
        ehr_path: CSV containing an 'Accession_ID' column. Defaults to the module-level
            ``ehr_cohort_path`` read from the YAML config.
        scores_path: CSV containing 'Accession_ID' and 'Bestillingstidspunkt' columns.
            Defaults to the module-level ``mammogram_scores_path``.
        output_path: Destination of the filtered CSV. Defaults to the module-level
            ``filtered_exposure_path``.

    Raises:
        ValueError: If no mammogram row matches any ID in the EHR cohort.
    """
    # Fall back to the paths loaded from the YAML config when none are given explicitly.
    if ehr_path is None:
        ehr_path = ehr_cohort_path
    if scores_path is None:
        scores_path = mammogram_scores_path
    if output_path is None:
        output_path = filtered_exposure_path

    # Parse IDs as strings while reading rather than casting afterwards: a column with
    # any blank cell is inferred as float, so a later astype(str) would turn 101 into
    # "101.0" (never matching "101"), and numeric inference also strips leading zeros.
    id_dtype = {"Accession_ID": str}

    print("Loading EHR cohort...")
    ehr_cohort = pd.read_csv(ehr_path, usecols=["Accession_ID"], dtype=id_dtype)
    # Missing IDs carry no identity; drop them so they cannot match missing IDs below.
    cohort_ids = ehr_cohort["Accession_ID"].dropna()

    print("Loading mammogram scores...")
    mammogram_scores = pd.read_csv(scores_path, dtype=id_dtype)

    # Filter mammogram data to include only IDs in the EHR cohort.
    # .copy() detaches the result from mammogram_scores so the column assignment below
    # acts on an independent frame (avoids pandas' SettingWithCopyWarning).
    mammogram_ids = mammogram_scores["Accession_ID"]
    in_cohort = mammogram_ids.notna() & mammogram_ids.isin(cohort_ids)
    matched_mammograms = mammogram_scores[in_cohort].copy()

    if matched_mammograms.empty:
        raise ValueError("No matching patients found between EHR cohort and mammogram records!")

    print(f"Matched {len(matched_mammograms)} mammogram records with EHR cohort.")

    # Keep the first mammogram event per ID: sort chronologically within each ID,
    # then retain only the first row of every ID group.
    matched_mammograms["Bestillingstidspunkt"] = pd.to_datetime(matched_mammograms["Bestillingstidspunkt"])
    first_mammogram = matched_mammograms.sort_values(
        by=["Accession_ID", "Bestillingstidspunkt"]
    ).drop_duplicates(subset="Accession_ID", keep="first")

    print(f"Final cohort with mammograms: {len(first_mammogram)} patients.")

    # Save cleaned exposure file
    first_mammogram.to_csv(output_path, index=False)
    print(f"Filtered mammogram exposure file saved to {output_path}")


# Run the exposure filtering only when this file is executed as a script;
# the function is called without arguments.
if __name__ == "__main__":
    filter_and_match_mammogram_exposure()