In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
%load_ext google.cloud.bigquery
from google.cloud import bigquery

# Project that hosts the cohort tables built and queried in this notebook.
project_id = "som-nero-phi-jonc101"

# BigQuery client; no project or credentials are passed explicitly here,
# so whatever defaults the library resolves are used.
client = bigquery.Client()



In [2]:

# Fully-qualified id of the labelled base cohort table that the cohort query reads from.
# NOTE(review): "exlucsion" is a typo for "exclusion"; the name is kept as-is
# because the cohort query interpolates this exact variable.
table_id_base_cohort_with_labels_exlucsion = (
    f"{project_id}.blood_culture_stewardship_sandy_2024.enriched_label_filtered"
)

In [4]:
# Build the pediatric blood-culture cohort: attach gender, a range-checked BMI,
# and the age at order time (in years and in days) from the demographic table,
# then keep only orders placed at age <= 18.
#
# Age in years is computed as day difference / 365.0, so it is approximate
# (leap days are not accounted for).
#
# NOTE(review): rows without a demographic match, or without a birth date, get
# a NULL age and are removed by the final WHERE clause, so the LEFT JOIN
# effectively behaves like an inner join for the returned rows.
query = f"""
WITH demos AS (
    SELECT
        c.*,
        demo.gender,

        -- BMI range check: values below 0 or above 200 are treated as invalid and set to NULL
        CASE
            WHEN demo.bmi IS NULL THEN NULL
            WHEN demo.bmi < 0 THEN NULL          -- too low → set null
            WHEN demo.bmi > 200 THEN NULL          -- too high → set null
            ELSE demo.bmi                         -- keep valid BMI
        END AS bmi,

        CASE
            WHEN demo.birth_date_jittered_utc IS NOT NULL
            THEN DATE_DIFF(DATE(c.blood_culture_order_datetime_utc), DATE(demo.birth_date_jittered_utc), DAY) / 365.0
            ELSE NULL
        END AS age,

        CASE
            WHEN demo.birth_date_jittered_utc IS NOT NULL
            THEN DATE_DIFF(DATE(c.blood_culture_order_datetime_utc), DATE(demo.birth_date_jittered_utc), DAY)
            ELSE NULL
        END AS age_days

    FROM `{table_id_base_cohort_with_labels_exlucsion}` c
    LEFT JOIN `{project_id}.shc_core_2024.demographic` demo
    ON c.anon_id = demo.anon_id
)

SELECT *
FROM demos
WHERE age <= 18
"""

# Run the query and materialize the result as a pandas DataFrame.
PEDs_ED_BCx_order_cohort_strict_bmi_age_gender = client.query(query).to_dataframe()

In [None]:
# Display the cohort DataFrame produced by the cohort query.
PEDs_ED_BCx_order_cohort_strict_bmi_age_gender

In [None]:
# print("this is enriched label for pediatric only and prevalence at set level")
# PEDs_ED_BCx_order_cohort_strict_bmi_age_gender["set_level_label"].value_counts(normalize=True)

# print("this is enriched label for pediatric only and prevalence at encounter level")

# PEDs_ED_BCx_order_cohort_strict_bmi_age_gender.drop_duplicates(subset=["anon_id", "pat_enc_csn_id_coded"])["final_label"].value_counts(normalize=True)

In [5]:
# Order rows chronologically within each patient encounter, then keep only the
# last (most recent) blood-culture order per (anon_id, pat_enc_csn_id_coded).
encounter_keys = ['anon_id', 'pat_enc_csn_id_coded']

df_sorted = PEDs_ED_BCx_order_cohort_strict_bmi_age_gender.sort_values(
    by=encounter_keys + ['blood_culture_order_datetime_utc']
)

# tail(1) on the sorted groups selects the final row of every encounter;
# the index is then renumbered from zero.
latest_order_per_encounter = df_sorted.groupby(encounter_keys, as_index=False).tail(1)
PEDs_ED_BCx_order_cohort_strict_bmi_age_gender_latest = latest_order_per_encounter.reset_index(drop=True)


In [None]:

# table_id_analyse_cohort_withfinal_label_peds = f"{project_id}.blood_culture_stewardship_sandy_2024.enriched_label_filtered_peds_only_analysis_cohort"

# # Upload the DataFrame to BigQuery
# PEDs_ED_BCx_order_cohort_strict_bmi_age_gender_latest.to_gbq(
#     destination_table=table_id_analyse_cohort_withfinal_label_peds,
#     project_id=project_id,
#     if_exists='replace'  # This will replace the table if it exists
# )

# print(f"Uploaded PEDs_ED_BCx_order_cohort_strict_bmi_age_gender_latest to {table_id_analyse_cohort_withfinal_label_peds}")

In [None]:
# Restrict the per-encounter cohort to orders with order_year >= 2023.
# NOTE(review): the next cell reassigns test_cohort with a 2021 cutoff, so when
# the notebook is run top to bottom this 2023 filter has no lasting effect.
test_cohort = PEDs_ED_BCx_order_cohort_strict_bmi_age_gender_latest[PEDs_ED_BCx_order_cohort_strict_bmi_age_gender_latest["order_year"] >= 2023]

In [8]:
# Keep encounters whose order_year is 2021 or later (the same cutoff the MRN
# mapping query applies to the uploaded cohort table).
ordered_2021_or_later = PEDs_ED_BCx_order_cohort_strict_bmi_age_gender_latest["order_year"] >= 2021
test_cohort = PEDs_ED_BCx_order_cohort_strict_bmi_age_gender_latest[ordered_2021_or_later]

In [9]:
# Row counts per final_label value in the test cohort.
test_cohort["final_label"].value_counts()

final_label
negative         5895
positive          197
contamination      22
Name: count, dtype: int64

In [10]:
# Draw a fixed-seed sample from test_cohort using the string column
# 'final_label': 60 rows labelled "positive" and 40 rows with any other label.
# NOTE(review): despite the name, balanced_300 holds 60 + 40 = 100 rows here;
# a later cell reassigns it with a 180 + 120 draw.
positive_mask = test_cohort['final_label'] == "positive"

positive_sample = test_cohort[positive_mask].sample(n=60, random_state=42)
negative_sample = test_cohort[~positive_mask].sample(n=40, random_state=42)

# Concatenate both draws and shuffle the combined rows reproducibly.
combined_sample = pd.concat([positive_sample, negative_sample])
balanced_300 = combined_sample.sample(frac=1, random_state=42)

In [12]:
# Tables used by the MRN lookup.
# NOTE(review): the cohort table below is the destination of the commented-out
# upload cell earlier in this notebook, so it must already exist in BigQuery
# for this query to succeed on a fresh run.
mrn_map_table = "som-nero-phi-jonc101-secure.starr_map.shc_map_2025-07-17"
peds_cohort_table = "som-nero-phi-jonc101.blood_culture_stewardship_sandy_2024.enriched_label_filtered_peds_only_analysis_cohort"

# Fetch mapping rows for cohort patients ordered in 2021 or later, limited to
# anon_ids that appear exactly once in the mapping table.
query = f"""
WITH unique_mrn AS (
    SELECT anon_id
    FROM `{mrn_map_table}`
    GROUP BY anon_id
    HAVING COUNT(*) = 1
)
SELECT DISTINCT mrn.*
FROM `{mrn_map_table}` AS mrn
JOIN unique_mrn u ON mrn.anon_id = u.anon_id
JOIN `{peds_cohort_table}` AS cohort
  ON mrn.anon_id = cohort.anon_id
WHERE cohort.order_year >= 2021
"""
mrn_query_job = client.query(query)
mrn_mapping_test_cohort = mrn_query_job.to_dataframe()

In [None]:
# Display the mapping rows returned by the MRN query.
# NOTE(review): presumably contains real MRNs — clear this output before sharing the notebook.
mrn_mapping_test_cohort

In [13]:
# Attach the mapping columns to the test cohort; the inner join drops cohort
# rows whose anon_id has no row in the mapping result.
test_cohort_with_mrn = pd.merge(
    test_cohort,
    mrn_mapping_test_cohort,
    on="anon_id",
    how="inner",
)

In [14]:
# Coerce jitter to a number (values that cannot be parsed become NaN) and
# store it on the frame as jitter_days.
jitter_as_number = pd.to_numeric(test_cohort_with_mrn["jitter"], errors="coerce")
test_cohort_with_mrn["jitter_days"] = jitter_as_number

# Subtract the per-row jitter, interpreted in days, from the order timestamp;
# negative jitter values therefore move the timestamp forward.
jitter_offset = pd.to_timedelta(test_cohort_with_mrn["jitter_days"], unit="D")
test_cohort_with_mrn["candidate_epic_time"] = (
    test_cohort_with_mrn["blood_culture_order_datetime"] - jitter_offset
)
# Disabled alternative kept for reference: additionally subtract pd.Timedelta(hours=8).

In [16]:
# Columns retained for the exported sample, in output order.
columns_to_keep = [
    "anon_id",
    "mrn",
    "pat_enc_csn_id_coded",
    "blood_culture_order_datetime",
    "blood_culture_order_datetime_utc",
    "jitter_days",
    "candidate_epic_time",
    "final_label",
]

In [17]:
# Narrow the merged frame to the selected columns, keeping every row.
test_cohort_with_mrn_columns_to_keep = test_cohort_with_mrn.loc[:, columns_to_keep]

In [20]:
# Draw a fixed-seed sample of 300 rows from the MRN-annotated cohort:
# 180 rows labelled "positive" and 120 rows with any other final_label.
labelled_rows = test_cohort_with_mrn_columns_to_keep
is_positive = labelled_rows['final_label'] == "positive"

positive_sample = labelled_rows[is_positive].sample(n=180, random_state=42)
negative_sample = labelled_rows[~is_positive].sample(n=120, random_state=42)

# Concatenate both draws and shuffle the combined rows reproducibly.
combined_sample = pd.concat([positive_sample, negative_sample])
balanced_300 = combined_sample.sample(frac=1, random_state=42)

In [21]:
# Display the shuffled sample.
# NOTE(review): this frame carries the mrn column next to anon_id — clear the
# rendered output before sharing or committing the notebook.
balanced_300

Unnamed: 0,anon_id,mrn,pat_enc_csn_id_coded,blood_culture_order_datetime,blood_culture_order_datetime_utc,jitter_days,candidate_epic_time,final_label
4260,JC6161804,50781749,131327323718,2022-02-26 18:05:00,2022-02-27 02:05:00+00:00,11,2022-02-15 18:05:00,negative
4721,JC6175540,51390995,131362990021,2023-10-02 12:42:00,2023-10-02 19:42:00+00:00,5,2023-09-27 12:42:00,negative
3596,JC3752765,5002042025,131395453416,2025-02-06 01:38:00,2025-02-06 09:38:00+00:00,20,2025-01-17 01:38:00,positive
3105,JC3570866,52264835,131374671772,2024-03-23 19:04:00,2024-03-24 02:04:00+00:00,-3,2024-03-26 19:04:00,positive
1025,JC2354733,47053236,131365275173,2023-10-16 00:41:00,2023-10-16 07:41:00+00:00,-17,2023-11-02 00:41:00,negative
...,...,...,...,...,...,...,...,...
4956,JC6284897,83931675,131333618821,2022-05-08 13:14:00,2022-05-08 20:14:00+00:00,-24,2022-06-01 13:14:00,negative
72,JC1264851,22625990,131314235222,2021-07-21 19:05:00,2021-07-22 02:05:00+00:00,7,2021-07-14 19:05:00,positive
670,JC2268178,42561092,131355492124,2023-06-29 20:52:00,2023-06-30 03:52:00+00:00,30,2023-05-30 20:52:00,positive
1477,JC2404151,48828305,131347961390,2023-02-22 12:09:00,2023-02-22 20:09:00+00:00,26,2023-01-27 12:09:00,negative


In [22]:
# Write the shuffled sample to disk without the DataFrame index.
# NOTE(review): the exported file includes the mrn column; handle it accordingly.
balanced_300_csv_path = "../data/peds_mrn_mapping_balanced_300.csv"
balanced_300.to_csv(balanced_300_csv_path, index=False)

In [None]:
# import pandas as pd
# import numpy as np
# pd.set_option('display.max_columns', None)
# pd.read_csv("../data/balanced_100.csv")

