In [3]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
%load_ext google.cloud.bigquery
from google.cloud import bigquery

client=bigquery.Client()
project_id = "som-nero-phi-jonc101"

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


# Base Cohort 2024 LPCH

In [4]:
query = f"""
WITH base_shc AS (
    SELECT DISTINCT
        op.anon_id, 
        op.pat_enc_csn_id_coded,
        op.order_proc_id_coded, 
        op.order_time_jittered_utc,
        EXTRACT(YEAR FROM op.order_time_jittered_utc) AS order_year,
        op.ordering_mode,
        op.department_id
    FROM `som-nero-phi-jonc101.shc_core_2024.order_proc` op
    INNER JOIN `som-nero-phi-jonc101.shc_core_2024.lab_result` lr
      ON op.order_proc_id_coded = lr.order_id_coded
    WHERE op.order_type LIKE "Microbiology%"
      AND op.description LIKE "%BLOOD%"
      AND NOT op.order_status LIKE ANY ('Discontinued', 'Canceled')
), 
ED_orders_shc AS (
    SELECT 
        b.anon_id,
        b.pat_enc_csn_id_coded,
        b.order_proc_id_coded,
        b.order_time_jittered_utc AS blood_culture_order_datetime_utc,
        b.order_year,
        b.ordering_mode,
        b.department_id,
        department_name
    FROM base_shc b 
    INNER JOIN `som-nero-phi-jonc101.shc_core_2024.dep_map` USING (department_id)
    WHERE LOWER(department_name) LIKE '%emergency%'
), 
ED_arrival_orders_shc AS (
    SELECT 
        b.anon_id,
        b.pat_enc_csn_id_coded,
        b.order_proc_id_coded,
        b.blood_culture_order_datetime_utc,
        b.order_year,
        ad.effective_time_jittered_utc AS ed_arrival_datetime_utc
    FROM ED_orders_shc b
    INNER JOIN `som-nero-phi-jonc101.shc_core_2024.adt` ad
      USING (anon_id, pat_enc_csn_id_coded, department_id)
    WHERE event_type = 'Admission'
      AND b.blood_culture_order_datetime_utc >= ad.effective_time_jittered_utc
), 
ED_orders_shc_peds AS (
    SELECT 
        b.anon_id,
        b.pat_enc_csn_id_coded,
        b.order_proc_id_coded,
        b.blood_culture_order_datetime_utc,
        b.order_year,
        DATE_DIFF(DATE(b.blood_culture_order_datetime_utc), DATE(d.birth_date_jittered_utc), DAY) AS age_days,
        DATE_DIFF(DATE(b.blood_culture_order_datetime_utc), DATE(d.birth_date_jittered_utc), DAY) / 365.0 AS age_years,
        d.gender,
        d.canonical_race AS race,
        d.bmi,
        "shc" AS source
    FROM ED_arrival_orders_shc b
    INNER JOIN `som-nero-phi-jonc101.shc_core_2024.demographic` d
      USING (anon_id)
    WHERE DATE_DIFF(
            DATE(b.blood_culture_order_datetime_utc),
            DATE(d.birth_date_jittered_utc),
            DAY
          ) / 365.0 <= 18
), 

-- ===========================
-- LPCH side
-- ===========================
base_lpch AS (
    SELECT DISTINCT
        op.anon_id, 
        op.pat_enc_csn_id_coded,
        op.order_proc_id_coded, 
        op.order_time_jittered_utc,
        EXTRACT(YEAR FROM op.order_time_jittered_utc) AS order_year,
        op.ordering_mode,
        op.department_id
    FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_order_proc` op
    INNER JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result` lr
      ON op.order_proc_id_coded = lr.order_id_coded
    WHERE op.order_type LIKE "Microbiology%"
      AND op.description LIKE "%BLOOD%"
      AND NOT op.order_status LIKE ANY ('Discontinued', 'Canceled')
), 
ED_orders_lpch AS (
    SELECT 
        b.anon_id,
        b.pat_enc_csn_id_coded,
        b.order_proc_id_coded,
        b.order_time_jittered_utc AS blood_culture_order_datetime_utc,
        b.order_year,
        b.ordering_mode,
        b.department_id,
        department_name
    FROM base_lpch b 
    INNER JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_dep_map` USING (department_id)
    WHERE LOWER(department_name) LIKE '%emergency%'
),

lpch_demo_marked AS (
    SELECT
        anon_id,
        birth_date_jittered AS birth_date_jittered_utc,
        gender,
        canonical_race,
        bmi,
        COUNT(DISTINCT birth_date_jittered) OVER (PARTITION BY anon_id) AS birth_cnt
    FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_demographic`
),

lpch_demo_dedup AS (
    SELECT DISTINCT
        anon_id,
        birth_date_jittered_utc,
        gender,
        canonical_race,
        bmi
    FROM lpch_demo_marked
    WHERE birth_cnt = 1
),

ED_orders_lpch_peds AS (
    SELECT 
        b.anon_id,
        b.pat_enc_csn_id_coded,
        b.order_proc_id_coded,
        b.blood_culture_order_datetime_utc,
        b.order_year,
        DATE_DIFF(DATE(b.blood_culture_order_datetime_utc), DATE(d.birth_date_jittered_utc), DAY) AS age_days,
        DATE_DIFF(DATE(b.blood_culture_order_datetime_utc), DATE(d.birth_date_jittered_utc), DAY) / 365.0 AS age_years,
        d.gender,
        d.canonical_race AS race,
        d.bmi,
        "lpch" AS source
    FROM ED_orders_lpch b
    INNER JOIN lpch_demo_dedup d
      USING (anon_id)
    WHERE DATE_DIFF(
            DATE(b.blood_culture_order_datetime_utc),
            DATE(d.birth_date_jittered_utc),
            DAY
          ) / 365.0 <= 18
)

SELECT * FROM ED_orders_shc_peds
UNION ALL
SELECT * FROM ED_orders_lpch_peds;

"""
base_cohort = client.query(query).to_dataframe()

# query = f"""
# WITH base_shc AS (
#     SELECT DISTINCT
#         op.anon_id, 
#         op.pat_enc_csn_id_coded,
#         op.order_proc_id_coded, 
#         op.order_time_jittered_utc,
#         EXTRACT(YEAR FROM op.order_time_jittered_utc) AS order_year,
#         op.ordering_mode,
#         op.department_id
#     FROM `som-nero-phi-jonc101.shc_core_2024.order_proc` op
#     INNER JOIN `som-nero-phi-jonc101.shc_core_2024.lab_result` lr
#       ON op.order_proc_id_coded = lr.order_id_coded
#     WHERE op.order_type LIKE "Microbiology%"
#       AND op.description LIKE "%BLOOD%"
#       AND NOT op.order_status LIKE ANY ('Discontinued', 'Canceled')
# ), 
# ED_orders_shc AS (
#     SELECT 
#         b.anon_id,
#         b.pat_enc_csn_id_coded,
#         b.order_proc_id_coded,
#         b.order_time_jittered_utc AS blood_culture_order_datetime_utc,
#         b.order_year,
#         b.ordering_mode,
#         b.department_id,
#         department_name
#     FROM base_shc b 
#     INNER JOIN `som-nero-phi-jonc101.shc_core_2024.dep_map` USING (department_id)
#     WHERE LOWER(department_name) LIKE '%emergency%'
# ), 
# ED_arrival_orders_shc AS (
#     SELECT 
#         b.anon_id,
#         b.pat_enc_csn_id_coded,
#         b.order_proc_id_coded,
#         b.blood_culture_order_datetime_utc,
#         b.order_year,
#         ad.effective_time_jittered_utc AS ed_arrival_datetime_utc
#     FROM ED_orders_shc b
#     INNER JOIN `som-nero-phi-jonc101.shc_core_2024.adt` ad
#       USING (anon_id, pat_enc_csn_id_coded, department_id)
#     WHERE event_type = 'Admission'
#       AND b.blood_culture_order_datetime_utc >= ad.effective_time_jittered_utc
# ), 
# ED_orders_shc_peds AS (
#     SELECT 
#         b.anon_id,
#         b.pat_enc_csn_id_coded,
#         b.order_proc_id_coded,
#         b.blood_culture_order_datetime_utc,
#         b.order_year,
#         DATE_DIFF(DATE(b.blood_culture_order_datetime_utc), DATE(d.birth_date_jittered_utc), DAY) AS age_days,
#         DATE_DIFF(DATE(b.blood_culture_order_datetime_utc), DATE(d.birth_date_jittered_utc), DAY) / 365.0 AS age_years,
#         d.gender,
#         d.canonical_race AS race,
#         d.bmi,
#         "shc" AS source
#     FROM ED_arrival_orders_shc b
#     INNER JOIN `som-nero-phi-jonc101.shc_core_2024.demographic` d
#       USING (anon_id)
#     WHERE DATE_DIFF(
#             DATE(b.blood_culture_order_datetime_utc),
#             DATE(d.birth_date_jittered_utc),
#             DAY
#           ) / 365.0 <= 18
# ),

# -- ===========================
# -- LPCH side
# -- ===========================
# base_lpch AS (
#     SELECT DISTINCT
#         op.anon_id, 
#         op.pat_enc_csn_id_coded,
#         op.order_proc_id_coded, 
#         op.order_time_jittered_utc,
#         EXTRACT(YEAR FROM op.order_time_jittered_utc) AS order_year,
#         op.ordering_mode,
#         op.department_id
#     FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_order_proc` op
#     INNER JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result` lr
#       ON op.order_proc_id_coded = lr.order_id_coded
#     WHERE op.order_type LIKE "Microbiology%"
#       AND op.description LIKE "%BLOOD%"
#       AND NOT op.order_status LIKE ANY ('Discontinued', 'Canceled')
# ), 
# ED_orders_lpch AS (
#     SELECT 
#         b.anon_id,
#         b.pat_enc_csn_id_coded,
#         b.order_proc_id_coded,
#         b.order_time_jittered_utc AS blood_culture_order_datetime_utc,
#         b.order_year,
#         b.ordering_mode,
#         b.department_id,
#         department_name
#     FROM base_lpch b 
#     INNER JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_dep_map` USING (department_id)
#     WHERE LOWER(department_name) LIKE '%emergency%'
# ),
# ED_arrival_orders_lpch AS (
#     SELECT 
#         b.anon_id,
#         b.pat_enc_csn_id_coded,
#         b.order_proc_id_coded,
#         b.blood_culture_order_datetime_utc,
#         b.order_year,
#         ad.effective_time_jittered_utc AS ed_arrival_datetime_utc
#     FROM ED_orders_lpch b
#     INNER JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_adt` ad
#       USING (anon_id, pat_enc_csn_id_coded, department_id)
#     WHERE event_type = 'Admission'
#       AND b.blood_culture_order_datetime_utc >= ad.effective_time_jittered_utc
# ), 

# lpch_demo_marked AS (
#     SELECT
#         anon_id,
#         birth_date_jittered AS birth_date_jittered_utc,
#         gender,
#         canonical_race,
#         bmi,
#         COUNT(DISTINCT birth_date_jittered) OVER (PARTITION BY anon_id) AS birth_cnt
#     FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_demographic`
# ),

# lpch_demo_dedup AS (
#     SELECT DISTINCT
#         anon_id,
#         birth_date_jittered_utc,
#         gender,
#         canonical_race,
#         bmi
#     FROM lpch_demo_marked
#     WHERE birth_cnt = 1
# ),

# ED_orders_lpch_peds AS (
#     SELECT 
#         b.anon_id,
#         b.pat_enc_csn_id_coded,
#         b.order_proc_id_coded,
#         b.blood_culture_order_datetime_utc,
#         b.order_year,
#         DATE_DIFF(DATE(b.blood_culture_order_datetime_utc), DATE(d.birth_date_jittered_utc), DAY) AS age_days,
#         DATE_DIFF(DATE(b.blood_culture_order_datetime_utc), DATE(d.birth_date_jittered_utc), DAY) / 365.0 AS age_years,
#         d.gender,
#         d.canonical_race AS race,
#         d.bmi,
#         "lpch" AS source
#     FROM  ED_arrival_orders_lpch b
#     INNER JOIN lpch_demo_dedup d
#       USING (anon_id)
#     WHERE DATE_DIFF(
#             DATE(b.blood_culture_order_datetime_utc),
#             DATE(d.birth_date_jittered_utc),
#             DAY
#           ) / 365.0 <= 18
# )

# SELECT * FROM ED_orders_shc_peds
# UNION ALL
# SELECT * FROM ED_orders_lpch_peds;
# """

# base_cohort = client.query(query).to_dataframe()
# base_cohort_filtered = client.query(query).to_dataframe()


In [7]:
base_cohort["source"].value_counts()

source
shc     16180
lpch    13826
Name: count, dtype: int64

In [5]:
base_cohort["order_year"].value_counts().sort_index()

order_year
2011     291
2012     420
2013    1096
2014    1847
2015    2264
2016    2237
2017    2109
2018    2123
2019    2574
2020    2160
2021    2334
2022    2816
2023    3406
2024    3270
2025    1059
Name: count, dtype: Int64

In [15]:
# # Upload refactor_base_cohort to BigQuery as a temporary table
table_id_base_cohort_temp = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.base_cohort_temp_peds"

# Upload the DataFrame to BigQuery
base_cohort.to_gbq(
    destination_table=table_id_base_cohort_temp,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded base_cohort to {table_id_base_cohort_temp}")

  base_cohort.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 1923.11it/s]

Uploaded base_cohort to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.base_cohort_temp_peds





In [12]:
base_cohort

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,blood_culture_order_datetime_utc,order_year,age_days,age_years,gender,race,bmi,source
0,JC2224489,131014741687,387953113,2011-09-13 01:21:00+00:00,2011,745,2.041096,Male,Other,34.75,shc
1,JC2242861,131016265231,392476924,2011-11-06 22:28:00+00:00,2011,191,0.523288,Male,White,15.41,shc
2,JC2242861,131016265231,392476925,2011-11-06 22:28:00+00:00,2011,191,0.523288,Male,White,15.41,shc
3,JC1313184,131014865534,388391071,2011-08-08 02:55:00+00:00,2011,388,1.063014,Male,Asian,21.91,shc
4,JC2190435,131013743171,383951393,2011-06-28 10:27:00+00:00,2011,6516,17.852055,Female,Other,40.03,shc
...,...,...,...,...,...,...,...,...,...,...,...
30001,JC3851622,131397751107,1031162508,2025-01-19 06:51:00+00:00,2025,1247,3.416438,Female,White,17.78,shc
30002,JC3860753,131396836148,1028039533,2025-01-21 17:49:00+00:00,2025,911,2.495890,Male,Other,16.80,shc
30003,JC3580491,131397740026,1031113818,2025-02-20 01:18:00+00:00,2025,477,1.306849,Male,Asian,18.81,shc
30004,JC3580491,131397740026,1031078259,2025-02-19 22:14:00+00:00,2025,476,1.304110,Male,Asian,18.81,shc


# Label

In [None]:
# # need to plug in for adults labelling process
# query = f"""
# WITH culture AS (
#   SELECT DISTINCT
#     c.anon_id,
#     c.pat_enc_csn_id_coded,
#     c.order_proc_id_coded,
#     lr.ord_value,
#     COALESCE(lr.extended_value_comment, lr.extended_comp_comment) AS comment
#   FROM blood_culture_stewardship_peds_sandy_2024.base_cohort_temp_peds c
#   JOIN `som-nero-phi-jonc101.shc_core_2024.lab_result` lr
#     ON c.anon_id = lr.anon_id
#    AND c.order_proc_id_coded = lr.order_id_coded
#    AND c.pat_enc_csn_id_coded = lr.pat_enc_csn_id_coded
#    AND c.blood_culture_order_datetime_utc = lr.order_time_jittered_utc
# ),

# -- only orders that actually had an organism in sensitivity
# culture_with_sens AS (
#   SELECT
#     cu.anon_id,
#     cu.pat_enc_csn_id_coded,
#     cu.order_proc_id_coded,
#     cu.ord_value,
#     cu.comment
#   FROM culture cu
#   JOIN (
#     SELECT DISTINCT anon_id, order_proc_id_coded
#     FROM `som-nero-phi-jonc101.shc_core_2024.culture_sensitivity`
#     WHERE organism IS NOT NULL
#   ) cs
#   USING (anon_id, order_proc_id_coded)
# ),

# -- flag each row
# row_flags AS (
#   SELECT
#     anon_id,
#     pat_enc_csn_id_coded,
#     order_proc_id_coded,
#     ord_value,
#     comment,

#     -- base contaminant / no-growth signals
#     LOWER(ord_value) LIKE '%no%grow%'         AS is_no_growth,
#     LOWER(ord_value) LIKE '%not%detect%'      AS is_not_detected,
#     LOWER(ord_value) LIKE '%negative%'        AS is_negative_text,
#     REGEXP_CONTAINS(
#       UPPER(comment),
#       r'NO GROWTH|COAG\s*NEG\s*STAPH|GRAM\+\s*RODS|GRAM\s*POS\s*RODS|CONTAMIN'
#     ) AS is_comment_contam,

#     -- the two single-bottle CoNS patterns
#     REGEXP_CONTAINS(
#       LOWER(ord_value),
#       r'^aerobic bottle:\s*coag negative staphylococcus'
#     ) AS is_aerobic_cons,

#     REGEXP_CONTAINS(
#       LOWER(ord_value),
#       r'^anaerobic bottle:\s*coag negative staphylococcus'
#     ) AS is_anaerobic_cons,

#     -- your special case: "Both Aerobic bottles: Coag Negative Staphylococcus" -> POSITIVE
#     REGEXP_CONTAINS(
#       LOWER(ord_value),
#       r'^both aerobic bottles:\s*coag negative staphylococcus'
#     ) AS is_both_aerobic_bottles_cons

#   FROM culture_with_sens
# ),

# -- now collapse to order-level
# order_flags AS (
#   SELECT
#     anon_id,
#     pat_enc_csn_id_coded,
#     order_proc_id_coded,

#     -- did we see that special "both aerobic bottles ..." row?
#     MAX(CASE WHEN is_both_aerobic_bottles_cons THEN 1 ELSE 0 END) AS has_both_aerobic_bottles_cons,

#     -- did we see aerobic CoNS?
#     MAX(CASE WHEN is_aerobic_cons THEN 1 ELSE 0 END) AS has_aerobic_cons,

#     -- did we see anaerobic CoNS?
#     MAX(CASE WHEN is_anaerobic_cons THEN 1 ELSE 0 END) AS has_anaerobic_cons,

#     -- did we see ANY row that is NOT obviously no-growth/negative/comment-contam
#     -- and also not one of the single-bottle CoNS?
#     MAX(
#       CASE
#         WHEN is_both_aerobic_bottles_cons THEN 1            -- this one counts as positive
#         WHEN is_aerobic_cons OR is_anaerobic_cons THEN 0    -- we'll handle with bottle-pair logic
#         WHEN is_no_growth OR is_not_detected OR is_negative_text OR is_comment_contam THEN 0
#         ELSE 1
#       END
#     ) AS has_other_positive_like
#   FROM row_flags
#   GROUP BY 1,2,3
# ),

# -- final decision per order
# labeled_orders AS (
#   SELECT
#     anon_id,
#     pat_enc_csn_id_coded,
#     order_proc_id_coded,
#     CASE
#       -- 1) special text: "Both Aerobic bottles: Coag Negative Staphylococcus"
#       WHEN has_both_aerobic_bottles_cons = 1 THEN 1

#       -- 2) we saw BOTH aerobic and anaerobic single-bottle CoNS -> call it positive
#       WHEN has_aerobic_cons = 1 AND has_anaerobic_cons = 1 THEN 1

#       -- 3) any other clearly positive-like row we didn't filter away
#       WHEN has_other_positive_like = 1 THEN 1

#       -- 4) otherwise, it's negative (e.g. only aerobic CoNS, or only anaerobic CoNS, or just no growth)
#       ELSE 0
#     END AS positive_blood_culture
#   FROM order_flags
# )

# SELECT
#   b.*,
#   IFNULL(l.positive_blood_culture, 0) AS positive_blood_culture
# FROM blood_culture_stewardship_peds_sandy_2024.base_cohort_temp_peds b
# LEFT JOIN labeled_orders l
#   USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded);

# """
# base_cohort_with_labels = client.query(query).to_dataframe()
# print(f"Created base_cohort_with_labels with {len(base_cohort_with_labels)} rows")


Created base_cohort_with_labels with 31158 rows


In [11]:
table_id_base_cohort_temp = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.base_cohort_temp_peds"

In [13]:
query = f"""
WITH all_labs AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_id_coded,
    order_time_jittered_utc,
    ord_value,
    extended_value_comment,
    extended_comp_comment
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_base_cohort_temp})
  UNION ALL
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_id_coded,
    order_time_jittered_utc,
    ord_value,
    extended_value_comment,
    extended_comp_comment
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_base_cohort_temp})
),
all_sensitivity AS (
  SELECT
    anon_id,
    order_proc_id_coded,
    organism
  FROM `som-nero-phi-jonc101.shc_core_2024.culture_sensitivity`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_base_cohort_temp})
  UNION ALL
  SELECT
    anon_id,
    order_proc_id_coded,
    organism
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_culture_sensitivity`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_base_cohort_temp})
),

-- match the base cohort orders to all labs (SHC+LPCH)
culture AS (
  SELECT DISTINCT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    lr.ord_value,
    COALESCE(lr.extended_value_comment, lr.extended_comp_comment) AS comment
  FROM {table_id_base_cohort_temp} c
  JOIN all_labs lr
    ON c.anon_id = lr.anon_id
   AND c.order_proc_id_coded = lr.order_id_coded
   AND c.pat_enc_csn_id_coded = lr.pat_enc_csn_id_coded
   AND c.blood_culture_order_datetime_utc = lr.order_time_jittered_utc
),

-- only orders that actually had an organism in sensitivity (from either source)
culture_with_sens AS (
  SELECT
    cu.anon_id,
    cu.pat_enc_csn_id_coded,
    cu.order_proc_id_coded,
    cu.ord_value,
    cu.comment
  FROM culture cu
  JOIN (
    SELECT DISTINCT anon_id, order_proc_id_coded
    FROM all_sensitivity
    WHERE organism IS NOT NULL
  ) cs
  USING (anon_id, order_proc_id_coded)
),

-- flag each row like the adult logic
row_flags AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    ord_value,
    comment,

    -- base contaminant / no-growth signals
    LOWER(ord_value) LIKE '%no%grow%'    AS is_no_growth,
    LOWER(ord_value) LIKE '%not%detect%' AS is_not_detected,
    LOWER(ord_value) LIKE '%negative%'   AS is_negative_text,
    REGEXP_CONTAINS(
      UPPER(comment),
      r'NO GROWTH|COAG\\s*NEG\\s*STAPH|GRAM\\+\\s*RODS|GRAM\\s*POS\\s*RODS|CONTAMIN'
    ) AS is_comment_contam,

    -- single-bottle CoNS patterns
    REGEXP_CONTAINS(
      LOWER(ord_value),
      r'^aerobic bottle:\\s*coag negative staphylococcus'
    ) AS is_aerobic_cons,

    REGEXP_CONTAINS(
      LOWER(ord_value),
      r'^anaerobic bottle:\\s*coag negative staphylococcus'
    ) AS is_anaerobic_cons,

    -- special case: "Both Aerobic bottles: Coag Negative Staphylococcus" -> POSITIVE
    REGEXP_CONTAINS(
      LOWER(ord_value),
      r'^both aerobic bottles:\\s*coag negative staphylococcus'
    ) AS is_both_aerobic_bottles_cons

  FROM culture_with_sens
),

-- collapse to order-level like adult
order_flags AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,

    MAX(CASE WHEN is_both_aerobic_bottles_cons THEN 1 ELSE 0 END) AS has_both_aerobic_bottles_cons,
    MAX(CASE WHEN is_aerobic_cons THEN 1 ELSE 0 END) AS has_aerobic_cons,
    MAX(CASE WHEN is_anaerobic_cons THEN 1 ELSE 0 END) AS has_anaerobic_cons,

    -- any other clearly positive-like row (not a no-growth / negative / comment-contam,
    -- and not a single-bottle CoNS, unless it's the special both-aerobic one)
    MAX(
      CASE
        WHEN is_both_aerobic_bottles_cons THEN 1
        WHEN is_aerobic_cons OR is_anaerobic_cons THEN 0
        WHEN is_no_growth OR is_not_detected OR is_negative_text OR is_comment_contam THEN 0
        ELSE 1
      END
    ) AS has_other_positive_like

  FROM row_flags
  GROUP BY 1,2,3
),

-- final decision per order, same priority as adult
labeled_orders AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    CASE
      WHEN has_both_aerobic_bottles_cons = 1 THEN 1
      WHEN has_aerobic_cons = 1 AND has_anaerobic_cons = 1 THEN 1
      WHEN has_other_positive_like = 1 THEN 1
      ELSE 0
    END AS positive_blood_culture
  FROM order_flags
)

SELECT
  b.*,
  IFNULL(l.positive_blood_culture, 0) AS positive_blood_culture
FROM {table_id_base_cohort_temp} b
LEFT JOIN labeled_orders l
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded);
"""

base_cohort_with_labels = client.query(query).to_dataframe()
print(f"Created base_cohort_with_labels with {len(base_cohort_with_labels)} rows")


Created base_cohort_with_labels with 30006 rows


In [14]:
base_cohort_with_labels["positive_blood_culture"].mean()

np.float64(0.046990601879624075)

# exclusion 2: no positive blood culture within preceding 14 days


In [117]:
import numpy as np
import pandas as pd

df = base_cohort_with_labels.copy()
tcol = 'blood_culture_order_datetime_utc'

# 1) Normalize timestamps (UTC -> naive) and sort once
df[tcol] = pd.to_datetime(df[tcol], utc=True, errors='coerce')
df = df.dropna(subset=[tcol])
df[tcol] = df[tcol].dt.tz_convert('UTC').dt.tz_localize(None)
df = df.sort_values(['anon_id', tcol], kind='mergesort')

# 2) Work in int64 nanoseconds for speed
times_i8 = df[tcol].view('i8')

# Keep time for positives; sentinel for non-positives
NEG_INF = np.iinfo('int64').min
pos_i8 = np.where(df['positive_blood_culture'].eq(1), times_i8, NEG_INF).astype('int64')
pos_i8 = pd.Series(pos_i8, index=df.index)

# 3) Per-patient cumulative max of positive times, then shift by 1 row
#    to get the *strictly prior* positive time at each row.
cum_max = pos_i8.groupby(df['anon_id'], sort=False).cummax()
prev_pos_i8 = cum_max.groupby(df['anon_id'], sort=False).shift(1).fillna(NEG_INF).astype('int64')

# 4) Compare against the current time
FOURTEEN_D_NS = np.int64(14 * 24 * 60 * 60 * 1_000_000_000)
has_prev = prev_pos_i8.values != NEG_INF
delta_ns = times_i8 - prev_pos_i8.values

drop_mask = has_prev & (delta_ns > 0) & (delta_ns <= FOURTEEN_D_NS)

# 5) Keep rows without a prior positive in the preceding 14 days
base_cohort_with_labels_exclusion_2 = df.loc[~drop_mask, [
    'anon_id','pat_enc_csn_id_coded','order_proc_id_coded',
    'blood_culture_order_datetime_utc','order_year', 'age_days', 'age_years', 'gender', 'race', 'bmi', 'source',
    'positive_blood_culture', 
]].reset_index(drop=True)


  times_i8 = df[tcol].view('i8')


In [118]:
# Apply to your kept dataframe (e.g., `result` from earlier)
base_cohort_with_labels_exclusion_2_overwrite = base_cohort_with_labels_exclusion_2.copy()

# Ensure 0/1 integers
# df2['positive_blood_culture'] = df2['positive_blood_culture'].fillna(0).astype(int)

# Overwrite: if any positive within the same (anon_id, encounter), set all to 1
base_cohort_with_labels_exclusion_2_overwrite['positive_blood_culture'] = (
    base_cohort_with_labels_exclusion_2_overwrite.groupby(['anon_id', 'pat_enc_csn_id_coded'])['positive_blood_culture']
       .transform('max')
    .astype(int)
)

In [119]:
print("after exclusion 2 and overwrite, the prevalence of positive blood culture is", round(base_cohort_with_labels_exclusion_2_overwrite["positive_blood_culture"].mean() * 100, 2), "%")
print("the number of valid orders is", len(base_cohort_with_labels_exclusion_2_overwrite))

after exclusion 2 and overwrite, the prevalence of positive blood culture is 2.76 %
the number of valid orders is 27515


In [30]:
encounter_level = base_cohort_with_labels_exclusion_2_overwrite[["anon_id", "pat_enc_csn_id_coded", "positive_blood_culture"]].drop_duplicates(subset=["anon_id", "pat_enc_csn_id_coded"])
print("after exclusion 2 and overwrite, the prevalence of positive blood culture at encounter level is", round(encounter_level["positive_blood_culture"].mean() * 100, 2), "%")
print("the number of valid encounter is", len(encounter_level))

after exclusion 2 and overwrite, the prevalence of positive blood culture at encounter level is 2.61 %
the number of valid encounter is 25929


In [122]:
# Upload refactor_base_cohort to BigQuery as a temporary table
table_id_base_cohort_with_labels_exlucsion_n_overwrite = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.base_cohort_with_labels_exlucsion_n_overwrite_peds"

# Upload the DataFrame to BigQuery
base_cohort_with_labels_exclusion_2_overwrite.to_gbq(
    destination_table=table_id_base_cohort_with_labels_exlucsion_n_overwrite,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded base_cohort_with_labels_exclusion_2_overwrite to {table_id_base_cohort_with_labels_exlucsion_n_overwrite}")

  base_cohort_with_labels_exclusion_2_overwrite.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 7410.43it/s]

Uploaded base_cohort_with_labels_exclusion_2_overwrite to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.base_cohort_with_labels_exlucsion_n_overwrite_peds





In [120]:
final_base_bmi_age_race_gender = base_cohort_with_labels_exclusion_2_overwrite.copy()

In [121]:
final_base_bmi_age_race_gender

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,blood_culture_order_datetime_utc,order_year,age_days,age_years,gender,race,bmi,source,positive_blood_culture
0,JC1000964,131181803241,489700509,2016-03-09 05:24:00,2016,4254,11.654795,Male,White,16.11,shc,0
1,JC1000964,313661322,717728953,2016-03-09 07:02:00,2016,4254,11.654795,Male,White,16.11,lpch,0
2,JC1001312,131092528115,467723412,2015-06-19 17:18:00,2015,3983,10.912329,Male,Other,27.27,shc,0
3,JC1001312,311762468,711125650,2015-06-19 18:32:00,2015,3983,10.912329,Male,Other,27.27,lpch,0
4,JC1001474,131270266094,615238072,2019-05-21 14:13:00,2019,5432,14.882192,Female,Other,22.32,shc,0
...,...,...,...,...,...,...,...,...,...,...,...,...
27510,JC998040,311007275,708372050,2015-02-10 08:03:00,2015,4314,11.819178,Female,Other,30.55,lpch,0
27511,JC998572,131264900613,598880066,2019-03-08 16:04:00,2019,5350,14.657534,Female,Asian,10.95,shc,0
27512,JC998572,321846584,745779961,2019-03-08 16:31:00,2019,5350,14.657534,Female,Asian,10.95,lpch,0
27513,JC999859,131038386282,443087974,2014-08-04 20:48:00,2014,6409,17.558904,Female,Asian,21.94,shc,0


# Demo
age, sex, ADI Score, bmi

### adi calculation
- 9 digit exact match
- 5 digit imputed with CA state avg match
- null value imputed with CA state avg match

In [123]:
from google.cloud import bigquery

PROJECT_ID = "som-nero-phi-jonc101"
client = bigquery.Client(project=PROJECT_ID)

# Base cohort table (already defined in your session)
TABLE_BASE = table_id_base_cohort_with_labels_exlucsion_n_overwrite

# Reference tables
TABLE_ZIP = f"`{PROJECT_ID}.shc_core_2024.zip`"
TABLE_ADI = f"`{PROJECT_ID}.mapdata.ADI_data_CA`"

query = f"""
WITH
-- 1) Cohort + cleaned ZIP (remove dashes)
cohort_zip AS (
  SELECT
    mc.*,
    REPLACE(z.zip, '-', '') AS zip_clean
  FROM {TABLE_BASE} mc
  LEFT JOIN {TABLE_ZIP} z
    ON mc.anon_id = z.anon_id
),

-- 2) Keep only numeric ADI scores (drop flags like 'P','U','NA', etc.)
adi_numeric AS (
  SELECT
    zip,
    CAST(adi_score AS FLOAT64) AS adi_num
  FROM {TABLE_ADI}
  WHERE SAFE_CAST(adi_score AS FLOAT64) IS NOT NULL
),

-- 3) ZIP5 average from ZIP+4 rows (your colleague's method)
zip5_avg AS (
  SELECT LEFT(zip, 5) AS zip5, AVG(adi_num) AS avg_adi_zip5
  FROM adi_numeric
  WHERE LENGTH(zip) = 9
  GROUP BY zip5
),

-- 4) Statewide fallback computed from ZIP5 means
state_avg AS (
  SELECT AVG(avg_adi_zip5) AS avg_adi_state
  FROM zip5_avg
),

-- 5) Score with priority: exact ZIP -> ZIP5 avg -> STATE avg
scored AS (
  SELECT
    cz.*,
    COALESCE(
      adi_exact.adi_num,        -- exact ZIP match
      z5.avg_adi_zip5,          -- ZIP5 mean
      sa.avg_adi_state          -- statewide mean of ZIP5 means
    ) AS adi_score,
    CASE
      WHEN adi_exact.adi_num IS NOT NULL THEN 0
      ELSE 1
    END AS adi_imputed_flag
  FROM cohort_zip cz
  LEFT JOIN adi_numeric adi_exact
    ON cz.zip_clean = adi_exact.zip
  LEFT JOIN zip5_avg z5
    ON LEFT(cz.zip_clean, 5) = z5.zip5
  CROSS JOIN state_avg sa
)

-- Final output
SELECT *
FROM scored
"""

job = client.query(query, location="US")
adi_score_df = job.to_dataframe()


In [124]:
final_base_bmi_age_race_gender_adi = pd.merge(final_base_bmi_age_race_gender, adi_score_df[["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "blood_culture_order_datetime_utc","zip_clean", "adi_score", "adi_imputed_flag"]], on=["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "blood_culture_order_datetime_utc"], how="left")

In [126]:
# Upload refactor_base_cohort to BigQuery as a temporary table
table_id_final_base_bmi_age_race_gender_adi_temp = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.final_base_bmi_age_race_gender_adi_temp_peds"

# Upload the DataFrame to BigQuery
final_base_bmi_age_race_gender_adi.to_gbq(
    destination_table=table_id_final_base_bmi_age_race_gender_adi_temp,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_base_bmi_age_race_gender_adi to {table_id_final_base_bmi_age_race_gender_adi_temp}")

  final_base_bmi_age_race_gender_adi.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 2169.84it/s]

Uploaded final_base_bmi_age_race_gender_adi to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.final_base_bmi_age_race_gender_adi_temp_peds





In [194]:
final_base_bmi_age_race_gender_adi[final_base_bmi_age_race_gender_adi["age_days"] <= 90]["order_year"].value_counts()

order_year
2023    405
2024    372
2022    355
2019    331
2021    318
2018    300
2020    270
2015    236
2017    215
2016    211
2014    169
2025    101
2013     89
2012     25
2011     23
Name: count, dtype: Int64

# Vitals

## temp, resp, heart rate,  blood pressure, SPO2
### temp: 26.5 - 45 degree C
### resp ranges 0-90(peds), adult: 0-60 
### hr: 250 max
### bp: 250 (sys) /150(dia)
### O2: N/A

In [129]:
# Read the CSV directly
csv_url = "/Users/sandychen/Downloads/_Temp_ and _Temp (in Celsius)_updated.csv"
df = pd.read_csv(csv_url)
df.rename(columns={'Unnamed: 1': 'label'}, inplace=True)
valid_template = df[df["label"] == "Yes"]

In [190]:
df

Unnamed: 0,template,label
0,OR Moderate Sedation,Yes
1,Disposition Vitals,Yes
2,ECT VS,Yes
3,HBO Treatment Record,Maybe
4,Infusion Center Assessment,Yes
...,...,...
128,Advisory Flowsheet Action Data,Maybe
129,Prediction of Alcohol Withdrawal Severity Scal...,Yes
130,PACU TCAR CAS CEA Assessment,Yes
131,(Retired) Nursing Care Path,No


In [130]:
# Conservative physiologic range (°C). Adjust if needed.
TEMP_C_MIN = 26.7
TEMP_C_MAX = 45.0

tmpl_list = valid_template["template"].tolist()
if tmpl_list:
    tmpl_in_shc = "AND f_shc.template IN (" + ", ".join([f"'{t}'" for t in tmpl_list]) + ")"
    tmpl_in_lpch = "AND f_lpch.template IN (" + ", ".join([f"'{t}'" for t in tmpl_list]) + ")"
else:
    tmpl_in_shc = ""
    tmpl_in_lpch = ""

query = f"""
-- Temperature (°C) statistics per blood culture order in [-48h, 0h]
WITH flowsheet_window AS (
  -- 1) SHC flowsheet rows
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    f_shc.recorded_time_jittered_utc AS ts,
    TRIM(f_shc.row_disp_name) AS row_name,
    CAST(f_shc.meas_value AS STRING) AS meas_value_str,
    CAST(f_shc.units AS STRING) AS units,
    CAST(f_shc.template AS STRING) AS template
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.shc_core_2024.flowsheet` AS f_shc
    ON c.anon_id = f_shc.anon_id
   AND TIMESTAMP_DIFF(
         f_shc.recorded_time_jittered_utc,
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
    {tmpl_in_shc}

  UNION ALL

  -- 2) LPCH flowsheet rows
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    -- lpch recorded_time might not be TIMESTAMP -> cast
    CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP) AS ts,
    TRIM(f_lpch.row_disp_name) AS row_name,
    CAST(f_lpch.meas_value AS STRING) AS meas_value_str,
    CAST(f_lpch.units AS STRING) AS units,
    CAST(f_lpch.template AS STRING) AS template
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_flowsheet` AS f_lpch
    ON c.anon_id = f_lpch.anon_id
   AND TIMESTAMP_DIFF(
         CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP),
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
    {tmpl_in_lpch}
),

labeled AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ts, row_name, meas_value_str, units, template,
    CASE
      WHEN row_name IN (
        'ECMO Venous Blood Temperature', 'Rectal Temperature', 'Patient Core Temperature',
        'ECMO Arterial Blood Temperature', 'Temperature', 'Temperature (C)',
        'Temperature (Blood - PA line)', 'Temp', 'Temp (in Celsius)'
      ) THEN 'temp'
      ELSE NULL
    END AS vital
  FROM flowsheet_window
  WHERE row_name IS NOT NULL
),

parsed AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ts, vital, row_name, meas_value_str, units, template,
    COALESCE(
      SAFE_CAST(meas_value_str AS FLOAT64),
      SAFE_CAST(REGEXP_EXTRACT(meas_value_str, r'-?\\d+(?:\\.\\d+)?') AS FLOAT64)
    ) AS value_num,
    CASE
      WHEN REGEXP_CONTAINS(LOWER(units), r'(?:°|deg|fahrenheit|\\bf\\b)') THEN 'F'
      WHEN REGEXP_CONTAINS(LOWER(units), r'(?:°|deg|celsius|\\bc\\b)') THEN 'C'
      ELSE NULL
    END AS unit_hint
  FROM labeled
  WHERE vital = 'temp'
),

canon AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ts, template,
    CASE
      WHEN COALESCE(
             unit_hint,
             CASE WHEN value_num BETWEEN 80 AND 113 THEN 'F'
                  WHEN value_num BETWEEN 26.7 AND 45 THEN 'C' END
           ) = 'F' THEN (value_num - 32) * 5/9
      ELSE value_num
    END AS temp_c
  FROM parsed
),

canon_imputed AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ts, template,
    temp_c,
    CASE
      WHEN temp_c IS NULL THEN NULL
      WHEN temp_c < {TEMP_C_MIN} OR temp_c > {TEMP_C_MAX} THEN 37.0
      ELSE temp_c
    END AS temp_c_imp,
    CASE
      WHEN temp_c IS NULL THEN FALSE
      WHEN temp_c < {TEMP_C_MIN} OR temp_c > {TEMP_C_MAX} THEN TRUE
      ELSE FALSE
    END AS was_imputed
  FROM canon
),

temp_agg AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ARRAY_AGG(DISTINCT template IGNORE NULLS) AS template_list,
    COUNT(*) AS temp_obs_n,
    COUNTIF(temp_c IS NOT NULL) AS temp_raw_nonnull_n,
    COUNTIF(was_imputed) AS temp_imputed_n,
    MIN(temp_c_imp) AS temp_min_c,
    AVG(temp_c_imp) AS temp_avg_c,
    MAX(temp_c_imp) AS temp_max_c,
    APPROX_QUANTILES(temp_c_imp, 100)[OFFSET(50)] AS temp_median_c
  FROM canon_imputed
  GROUP BY 1,2,3,4
),

temp_hist AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    CAST(ROUND(temp_c_imp, 1) AS FLOAT64) AS val, COUNT(*) AS cnt
  FROM canon_imputed
  WHERE temp_c_imp IS NOT NULL
  GROUP BY 1,2,3,4,5
),

temp_mode AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    val AS temp_mode_c
  FROM (
    SELECT t.*,
           ROW_NUMBER() OVER (
             PARTITION BY anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc
             ORDER BY cnt DESC, val ASC
           ) AS rn
    FROM temp_hist t
  )
  WHERE rn = 1
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.template_list,
  a.temp_obs_n,
  a.temp_raw_nonnull_n,
  a.temp_imputed_n,
  a.temp_min_c,
  a.temp_avg_c,
  a.temp_max_c,
  a.temp_median_c,
  m.temp_mode_c
FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS b
LEFT JOIN temp_agg a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
LEFT JOIN temp_mode m
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_temp_only = client.query(query).to_dataframe()


In [131]:
final_base_temp_only

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,blood_culture_order_datetime_utc,template_list,temp_obs_n,temp_raw_nonnull_n,temp_imputed_n,temp_min_c,temp_avg_c,temp_max_c,temp_median_c,temp_mode_c
0,JC1000964,131181803241,489700509,2016-03-09 05:24:00,[Vitals],4,4,0,36.611111,36.972222,37.388889,36.722222,36.6
1,JC1000964,313661322,717728953,2016-03-09 07:02:00,[Vitals],4,4,0,36.611111,36.972222,37.388889,36.722222,36.6
2,JC1001312,131092528115,467723412,2015-06-19 17:18:00,[Vitals],2,2,0,36.111111,36.416667,36.722222,36.111111,36.1
3,JC1001312,311762468,711125650,2015-06-19 18:32:00,[Vitals],2,2,0,36.111111,36.416667,36.722222,36.111111,36.1
4,JC1001474,131270266094,615238072,2019-05-21 14:13:00,[Vitals],1,1,0,38.111111,38.111111,38.111111,38.111111,38.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27510,JC998040,311007275,708372050,2015-02-10 08:03:00,[Vitals],1,1,0,36.722222,36.722222,36.722222,36.722222,36.7
27511,JC998572,131264900613,598880066,2019-03-08 16:04:00,[Vitals],1,1,0,36.944444,36.944444,36.944444,36.944444,36.9
27512,JC998572,321846584,745779961,2019-03-08 16:31:00,[Vitals],1,1,0,36.944444,36.944444,36.944444,36.944444,36.9
27513,JC999859,131038386282,443087974,2014-08-04 20:48:00,[Vitals],1,1,0,37.111111,37.111111,37.111111,37.111111,37.1


In [132]:
# Upload the DataFrame to BigQuery
table_id_final_base_temperature_only= f"{project_id}.blood_culture_stewardship_peds_sandy_2024.table_id_final_base_temperature_only_peds"

final_base_temp_only.to_gbq(
    destination_table=table_id_final_base_temperature_only,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_base_temp_only to {table_id_final_base_temperature_only}")

  final_base_temp_only.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 3412.78it/s]

Uploaded final_base_temp_only to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.table_id_final_base_temperature_only_peds





# Resp

In [133]:
RESP_MIN = 0
RESP_MAX = 90

query_resp = f"""
WITH flowsheet_window AS (
  -- SHC flowsheet
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    f_shc.recorded_time_jittered_utc AS ts,
    TRIM(f_shc.row_disp_name) AS row_name,
    CAST(f_shc.meas_value AS STRING) AS meas_value_str,
    CAST(f_shc.units AS STRING) AS units
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.shc_core_2024.flowsheet` AS f_shc
    ON c.anon_id = f_shc.anon_id
   AND TIMESTAMP_DIFF(
         f_shc.recorded_time_jittered_utc,
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0

  UNION ALL

  -- LPCH flowsheet
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP) AS ts,
    TRIM(f_lpch.row_disp_name) AS row_name,
    CAST(f_lpch.meas_value AS STRING) AS meas_value_str,
    CAST(f_lpch.units AS STRING) AS units
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_flowsheet` AS f_lpch
    ON c.anon_id = f_lpch.anon_id
   AND TIMESTAMP_DIFF(
         CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP),
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
),

labeled AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ts,
    row_name,
    meas_value_str,
    units,
    CASE
      -- adult labels
      WHEN UPPER(TRIM(row_name)) IN ('RESP', 'RESP RATE')
           OR TRIM(row_name) = '(Retired) Resp Rate Total'
      -- (optional) add common peds-style spelling:
      -- OR UPPER(TRIM(row_name)) = 'RESPIRATORY RATE'
      THEN 'resp'
      ELSE NULL
    END AS vital
  FROM flowsheet_window
  WHERE row_name IS NOT NULL
),

parsed AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ts,
    COALESCE(
      SAFE_CAST(meas_value_str AS FLOAT64),
      SAFE_CAST(REGEXP_EXTRACT(meas_value_str, r'-?\\d+(?:\\.\\d+)?') AS FLOAT64)
    ) AS resp_val
  FROM labeled
  WHERE vital = 'resp'
),

canon AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ts,
    CASE
      WHEN resp_val IS NULL THEN NULL
      WHEN resp_val < {RESP_MIN} OR resp_val > {RESP_MAX} THEN NULL
      ELSE resp_val
    END AS resp_ok
  FROM parsed
),

resp_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    COUNT(*) AS resp_obs_n,
    COUNTIF(resp_ok IS NOT NULL) AS resp_nonnull_n,
    MIN(resp_ok) AS resp_min,
    AVG(resp_ok) AS resp_avg,
    MAX(resp_ok) AS resp_max,
    APPROX_QUANTILES(resp_ok, 100)[OFFSET(50)] AS resp_median
  FROM canon
  GROUP BY 1,2,3,4
),

resp_hist AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    CAST(ROUND(resp_ok, 0) AS FLOAT64) AS val,
    COUNT(*) AS cnt
  FROM canon
  WHERE resp_ok IS NOT NULL
  GROUP BY 1,2,3,4,5
),

resp_mode AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    val AS resp_mode
  FROM (
    SELECT
      t.*,
      ROW_NUMBER() OVER (
        PARTITION BY anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc
        ORDER BY cnt DESC, val ASC
      ) AS rn
    FROM resp_hist t
  )
  WHERE rn = 1
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.resp_obs_n,
  a.resp_nonnull_n,
  a.resp_min,
  a.resp_avg,
  a.resp_max,
  a.resp_median,
  m.resp_mode
FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS b
LEFT JOIN resp_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
LEFT JOIN resp_mode AS m
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_resp_only = client.query(query_resp).to_dataframe()


In [134]:
# Upload the DataFrame to BigQuery
table_id_final_base_resp_only= f"{project_id}.blood_culture_stewardship_peds_sandy_2024.table_id_final_base_resp_only_peds"

final_base_resp_only.to_gbq(
    destination_table=table_id_final_base_resp_only,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_base_resp_only to {table_id_final_base_resp_only}")

  final_base_resp_only.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 4539.29it/s]

Uploaded final_base_resp_only to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.table_id_final_base_resp_only_peds





#### heart rate

In [135]:
HR_MIN = 0
HR_MAX = 250

# if you have a valid_template df like before, you can uncomment this part
# tmpl_list = valid_template["template"].tolist()
# if tmpl_list:
#     tmpl_in_shc = "AND f_shc.template IN (" + ", ".join([f"'{t}'" for t in tmpl_list]) + ")"
#     tmpl_in_lpch = "AND f_lpch.template IN (" + ", ".join([f"'{t}'" for t in tmpl_list]) + ")"
# else:
#     tmpl_in_shc = ""
#     tmpl_in_lpch = ""

# for now, no template filter
tmpl_in_shc = ""
tmpl_in_lpch = ""

query_hr = f"""
WITH flowsheet_window AS (
  -- SHC HR rows
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    f_shc.recorded_time_jittered_utc AS ts,
    TRIM(f_shc.row_disp_name) AS row_name,
    CAST(f_shc.meas_value AS STRING) AS meas_value_str,
    CAST(f_shc.units AS STRING) AS units,
    CAST(f_shc.template AS STRING) AS template
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.shc_core_2024.flowsheet` AS f_shc
    ON c.anon_id = f_shc.anon_id
   AND TIMESTAMP_DIFF(
         f_shc.recorded_time_jittered_utc,
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
    {tmpl_in_shc}

  UNION ALL

  -- LPCH HR rows
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP) AS ts,
    TRIM(f_lpch.row_disp_name) AS row_name,
    CAST(f_lpch.meas_value AS STRING) AS meas_value_str,
    CAST(f_lpch.units AS STRING) AS units,
    CAST(f_lpch.template AS STRING) AS template
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_flowsheet` AS f_lpch
    ON c.anon_id = f_lpch.anon_id
   AND TIMESTAMP_DIFF(
         CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP),
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
    {tmpl_in_lpch}
),

labeled AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ts, row_name, meas_value_str, units, template,
    CASE
      -- common adult/peds ways HR is stored
      WHEN UPPER(TRIM(row_name)) IN ('HEART RATE', 'PULSE', 'HR')
           OR TRIM(row_name) = 'Pulse Rate'
      THEN 'heart_rate'
      ELSE NULL
    END AS vital
  FROM flowsheet_window
  WHERE row_name IS NOT NULL
),

parsed AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ts, template,
    COALESCE(
      SAFE_CAST(meas_value_str AS FLOAT64),
      SAFE_CAST(REGEXP_EXTRACT(meas_value_str, r'-?\\d+(?:\\.\\d+)?') AS FLOAT64)
    ) AS hr_val
  FROM labeled
  WHERE vital = 'heart_rate'
),

canon AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ts, template,
    CASE
      WHEN hr_val IS NULL THEN NULL
      WHEN hr_val < {HR_MIN} OR hr_val > {HR_MAX} THEN NULL
      ELSE hr_val
    END AS hr_ok
  FROM parsed
),

hr_agg AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ARRAY_AGG(DISTINCT template IGNORE NULLS) AS template_list,
    COUNT(*) AS hr_obs_n,
    COUNTIF(hr_ok IS NOT NULL) AS hr_nonnull_n,
    MIN(hr_ok) AS hr_min,
    AVG(hr_ok) AS hr_avg,
    MAX(hr_ok) AS hr_max,
    APPROX_QUANTILES(hr_ok, 100)[OFFSET(50)] AS hr_median
  FROM canon
  GROUP BY 1,2,3,4
),

hr_hist AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    CAST(ROUND(hr_ok, 0) AS FLOAT64) AS val,
    COUNT(*) AS cnt
  FROM canon
  WHERE hr_ok IS NOT NULL
  GROUP BY 1,2,3,4,5
),

hr_mode AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    val AS hr_mode
  FROM (
    SELECT
      t.*,
      ROW_NUMBER() OVER (
        PARTITION BY anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc
        ORDER BY cnt DESC, val ASC
      ) AS rn
    FROM hr_hist t
  )
  WHERE rn = 1
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.template_list,
  a.hr_obs_n,
  a.hr_nonnull_n,
  a.hr_min,
  a.hr_avg,
  a.hr_max,
  a.hr_median,
  m.hr_mode
FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS b
LEFT JOIN hr_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
LEFT JOIN hr_mode AS m
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_hr_only = client.query(query_hr).to_dataframe()


In [136]:
final_base_hr_only.isnull().mean()*100

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
template_list                       0.000000
hr_obs_n                            1.650009
hr_nonnull_n                        1.650009
hr_min                              1.951663
hr_avg                              1.951663
hr_max                              1.951663
hr_median                           1.951663
hr_mode                             1.951663
dtype: float64

In [137]:
# Upload the DataFrame to BigQuery
table_id_final_base_hr_only= f"{project_id}.blood_culture_stewardship_peds_sandy_2024.table_id_final_base_hr_only_peds"

final_base_hr_only.to_gbq(
    destination_table=table_id_final_base_hr_only,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_base_hr_only to {table_id_final_base_hr_only}")

  final_base_hr_only.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 5377.31it/s]

Uploaded final_base_hr_only to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.table_id_final_base_hr_only_peds





# bp

In [138]:
BP_SYS_MAX = 250
BP_DIA_MAX = 150

# if you want to filter by template, you can fill these later
tmpl_in_shc = ""
tmpl_in_lpch = ""

query_bp = f"""
WITH flowsheet_window AS (
  -- SHC BP rows
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    f_shc.recorded_time_jittered_utc AS ts,
    TRIM(f_shc.row_disp_name) AS row_name,
    CAST(f_shc.meas_value AS STRING) AS meas_value_str,
    CAST(f_shc.units AS STRING) AS units,
    CAST(f_shc.template AS STRING) AS template
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.shc_core_2024.flowsheet` AS f_shc
    ON c.anon_id = f_shc.anon_id
   AND TIMESTAMP_DIFF(
         f_shc.recorded_time_jittered_utc,
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
    {tmpl_in_shc}

  UNION ALL

  -- LPCH BP rows
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP) AS ts,
    TRIM(f_lpch.row_disp_name) AS row_name,
    CAST(f_lpch.meas_value AS STRING) AS meas_value_str,
    CAST(f_lpch.units AS STRING) AS units,
    CAST(f_lpch.template AS STRING) AS template
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_flowsheet` AS f_lpch
    ON c.anon_id = f_lpch.anon_id
   AND TIMESTAMP_DIFF(
         CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP),
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
    {tmpl_in_lpch}
),

labeled AS (
  SELECT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc,
    ts, row_name, meas_value_str, units, template,
    CASE
      -- you can add more variants like 'Non-Invasive BP', 'NBP', 'BP (noninvasive)'
      WHEN UPPER(TRIM(row_name)) IN ('BP', 'NIBP') THEN 'blood_pressure'
      ELSE NULL
    END AS vital
  FROM flowsheet_window
  WHERE row_name IS NOT NULL
),

-- Parse systolic/diastolic from strings like "120/70"
parsed AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ts,
    template,
    SAFE_CAST(SPLIT(REGEXP_REPLACE(meas_value_str, r'[^0-9/]', ''), '/')[SAFE_OFFSET(0)] AS FLOAT64) AS sys_val,
    SAFE_CAST(SPLIT(REGEXP_REPLACE(meas_value_str, r'[^0-9/]', ''), '/')[SAFE_OFFSET(1)] AS FLOAT64) AS dia_val
  FROM labeled
  WHERE vital = 'blood_pressure'
),

-- Apply upper cutoffs
canon AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ts,
    template,
    CASE
      WHEN sys_val IS NULL THEN NULL
      WHEN sys_val <= {BP_SYS_MAX} THEN sys_val
      ELSE NULL
    END AS bp_sys_ok,
    CASE
      WHEN dia_val IS NULL THEN NULL
      WHEN dia_val <= {BP_DIA_MAX} THEN dia_val
      ELSE NULL
    END AS bp_dia_ok
  FROM parsed
),

bp_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ARRAY_AGG(DISTINCT template IGNORE NULLS) AS template_list,
    COUNT(*) AS bp_obs_n,
    COUNTIF(bp_sys_ok IS NOT NULL) AS sys_nonnull_n,
    COUNTIF(bp_dia_ok IS NOT NULL) AS dia_nonnull_n,
    MIN(bp_sys_ok) AS sysbp_min,
    AVG(bp_sys_ok) AS sysbp_avg,
    MAX(bp_sys_ok) AS sysbp_max,
    APPROX_QUANTILES(bp_sys_ok, 100)[OFFSET(50)] AS sysbp_median,
    MIN(bp_dia_ok) AS diabp_min,
    AVG(bp_dia_ok) AS diabp_avg,
    MAX(bp_dia_ok) AS diabp_max,
    APPROX_QUANTILES(bp_dia_ok, 100)[OFFSET(50)] AS diabp_median
  FROM canon
  GROUP BY 1,2,3,4
),

-- Modes for sys
bp_sys_hist AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    CAST(ROUND(bp_sys_ok, 0) AS FLOAT64) AS val,
    COUNT(*) AS cnt
  FROM canon
  WHERE bp_sys_ok IS NOT NULL
  GROUP BY 1,2,3,4,5
),
-- Modes for dia
bp_dia_hist AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    CAST(ROUND(bp_dia_ok, 0) AS FLOAT64) AS val,
    COUNT(*) AS cnt
  FROM canon
  WHERE bp_dia_ok IS NOT NULL
  GROUP BY 1,2,3,4,5
),
bp_sys_mode AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    val AS sysbp_mode
  FROM (
    SELECT
      t.*,
      ROW_NUMBER() OVER (
        PARTITION BY anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc
        ORDER BY cnt DESC, val ASC
      ) AS rn
    FROM bp_sys_hist t
  )
  WHERE rn = 1
),
bp_dia_mode AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    val AS diabp_mode
  FROM (
    SELECT
      t.*,
      ROW_NUMBER() OVER (
        PARTITION BY anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc
        ORDER BY cnt DESC, val ASC
      ) AS rn
    FROM bp_dia_hist t
  )
  WHERE rn = 1
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.template_list,
  a.bp_obs_n,
  a.sys_nonnull_n,
  a.dia_nonnull_n,
  a.sysbp_min,
  a.sysbp_avg,
  a.sysbp_max,
  a.sysbp_median,
  sm.sysbp_mode,
  a.diabp_min,
  a.diabp_avg,
  a.diabp_max,
  a.diabp_median,
  dm.diabp_mode
FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS b
LEFT JOIN bp_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
LEFT JOIN bp_sys_mode AS sm
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
LEFT JOIN bp_dia_mode AS dm
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_bp_only = client.query(query_bp).to_dataframe()


In [139]:
final_base_bp_only

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,blood_culture_order_datetime_utc,template_list,bp_obs_n,sys_nonnull_n,dia_nonnull_n,sysbp_min,sysbp_avg,sysbp_max,sysbp_median,sysbp_mode,diabp_min,diabp_avg,diabp_max,diabp_median,diabp_mode
0,JC1000964,131181803241,489700509,2016-03-09 05:24:00,"[PCT Vitals, Intake and Output, Vitals, Acute ...",8,4,4,105.0,109.000000,117.0,105.0,105.0,57.0,61.250000,71.0,58.0,57.0
1,JC1000964,313661322,717728953,2016-03-09 07:02:00,"[DATA VALIDATE, PCT Vitals, Intake and Output,...",9,5,5,104.0,108.000000,117.0,105.0,105.0,57.0,62.400000,71.0,59.0,57.0
2,JC1001312,131092528115,467723412,2015-06-19 17:18:00,"[Vitals, DATA VALIDATE]",3,3,3,115.0,119.000000,124.0,118.0,115.0,67.0,70.333333,75.0,69.0,67.0
3,JC1001312,311762468,711125650,2015-06-19 18:32:00,"[DATA VALIDATE, Vitals, ICU VS]",6,6,6,115.0,123.500000,134.0,120.0,115.0,62.0,70.666667,76.0,69.0,75.0
4,JC1001474,131270266094,615238072,2019-05-21 14:13:00,[Vitals],1,1,1,88.0,88.000000,88.0,88.0,88.0,45.0,45.000000,45.0,45.0,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27510,JC998040,311007275,708372050,2015-02-10 08:03:00,[Vitals],2,2,2,102.0,106.000000,110.0,102.0,102.0,63.0,63.500000,64.0,63.0,63.0
27511,JC998572,131264900613,598880066,2019-03-08 16:04:00,[Vitals],3,3,3,101.0,121.333333,155.0,108.0,101.0,63.0,70.666667,77.0,72.0,63.0
27512,JC998572,321846584,745779961,2019-03-08 16:31:00,[Vitals],5,5,5,101.0,128.400000,155.0,131.0,101.0,63.0,79.400000,96.0,77.0,63.0
27513,JC999859,131038386282,443087974,2014-08-04 20:48:00,[DATA VALIDATE],5,5,5,133.0,138.000000,149.0,137.0,133.0,84.0,95.000000,106.0,96.0,96.0


# spo2

In [140]:
SPO2_MIN = 0     # percent
SPO2_MAX = 100   # percent

# if you have a valid_template list, plug it here later
tmpl_in_shc = ""
tmpl_in_lpch = ""

query_spo2 = f"""
WITH flowsheet_window AS (
  -- SHC SpO2 rows
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    f_shc.recorded_time_jittered_utc AS ts,
    TRIM(f_shc.row_disp_name) AS row_name,
    CAST(f_shc.meas_value AS STRING) AS meas_value_str,
    CAST(f_shc.units AS STRING) AS units,
    CAST(f_shc.template AS STRING) AS template
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.shc_core_2024.flowsheet` AS f_shc
    ON c.anon_id = f_shc.anon_id
   AND TIMESTAMP_DIFF(
         f_shc.recorded_time_jittered_utc,
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
    {tmpl_in_shc}

  UNION ALL

  -- LPCH SpO2 rows
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP) AS ts,
    TRIM(f_lpch.row_disp_name) AS row_name,
    CAST(f_lpch.meas_value AS STRING) AS meas_value_str,
    CAST(f_lpch.units AS STRING) AS units,
    CAST(f_lpch.template AS STRING) AS template
  FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS c
  LEFT JOIN `som-nero-phi-jonc101.lpch_core_2024.lpch_flowsheet` AS f_lpch
    ON c.anon_id = f_lpch.anon_id
   AND TIMESTAMP_DIFF(
         CAST(f_lpch.recorded_time_jittered_utc AS TIMESTAMP),
         CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
         HOUR
       ) BETWEEN -48 AND 0
    {tmpl_in_lpch}
),

labeled AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ts,
    row_name,
    meas_value_str,
    units,
    template,
    CASE
      -- add more if you see LPCH variants like "Oxygen Saturation"
      WHEN UPPER(TRIM(row_name)) IN ('SPO2', 'O2', 'O2 SAT', 'OXYGEN SATURATION')
      THEN 'spo2'
      ELSE NULL
    END AS vital
  FROM flowsheet_window
  WHERE row_name IS NOT NULL
),

parsed AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ts,
    template,
    COALESCE(
      SAFE_CAST(meas_value_str AS FLOAT64),
      SAFE_CAST(REGEXP_EXTRACT(meas_value_str, r'-?\\d+(?:\\.\\d+)?') AS FLOAT64)
    ) AS spo2_val
  FROM labeled
  WHERE vital = 'spo2'
),

canon AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ts,
    template,
    CASE
      WHEN spo2_val IS NULL THEN NULL
      WHEN spo2_val < {SPO2_MIN} OR spo2_val > {SPO2_MAX} THEN NULL
      ELSE spo2_val
    END AS spo2_ok
  FROM parsed
),

spo2_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ARRAY_AGG(DISTINCT template IGNORE NULLS) AS template_list,
    COUNT(*) AS spo2_obs_n,
    COUNTIF(spo2_ok IS NOT NULL) AS spo2_nonnull_n,
    MIN(spo2_ok) AS spo2_min,
    AVG(spo2_ok) AS spo2_avg,
    MAX(spo2_ok) AS spo2_max,
    APPROX_QUANTILES(spo2_ok, 100)[OFFSET(50)] AS spo2_median
  FROM canon
  GROUP BY 1,2,3,4
),

spo2_hist AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    CAST(ROUND(spo2_ok, 0) AS FLOAT64) AS val,
    COUNT(*) AS cnt
  FROM canon
  WHERE spo2_ok IS NOT NULL
  GROUP BY 1,2,3,4,5
),

spo2_mode AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    val AS spo2_mode
  FROM (
    SELECT
      t.*,
      ROW_NUMBER() OVER (
        PARTITION BY anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc
        ORDER BY cnt DESC, val ASC
      ) AS rn
    FROM spo2_hist t
  )
  WHERE rn = 1
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.template_list,
  a.spo2_obs_n,
  a.spo2_nonnull_n,
  a.spo2_min,
  a.spo2_avg,
  a.spo2_max,
  a.spo2_median,
  m.spo2_mode
FROM `{table_id_final_base_bmi_age_race_gender_adi_temp}` AS b
LEFT JOIN spo2_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
LEFT JOIN spo2_mode AS m
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_spo2_only = client.query(query_spo2).to_dataframe()


In [141]:
final_base_spo2_only.isnull().mean()*100

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
template_list                       0.000000
spo2_obs_n                          1.962566
spo2_nonnull_n                      1.962566
spo2_min                            2.271488
spo2_avg                            2.271488
spo2_max                            2.271488
spo2_median                         2.271488
spo2_mode                           2.271488
dtype: float64

In [142]:
# Upload the DataFrame to BigQuery
table_id_final_base_spo2_only= f"{project_id}.blood_culture_stewardship_peds_sandy_2024.table_id_final_base_spo2_only_peds"

final_base_spo2_only.to_gbq(
    destination_table=table_id_final_base_spo2_only,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_base_spo2_only to {table_id_final_base_spo2_only}")

  final_base_spo2_only.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 1273.70it/s]

Uploaded final_base_spo2_only to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.table_id_final_base_spo2_only_peds





# concat

In [143]:
import pandas as pd
from functools import reduce

# List of all per-vital DataFrames
dfs = [final_base_temp_only,final_base_resp_only, final_base_hr_only, final_base_bp_only, final_base_spo2_only]

# Merge keys
merge_keys = ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "blood_culture_order_datetime_utc"]

# Keep only key columns + aggregate metrics (min, max, median, mode)
dfs_clean = [
    df.loc[:, df.columns.str.contains("|".join(merge_keys)) | df.columns.str.contains("(min|max|median|mode|avg)", case=False)]
    for df in dfs
]

# Merge all DataFrames on the same keys
final_base_vitals = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs_clean)


  df.loc[:, df.columns.str.contains("|".join(merge_keys)) | df.columns.str.contains("(min|max|median|mode|avg)", case=False)]


In [144]:
# Upload refactor_base_cohort to BigQuery as a temporary table
table_id_final_base_vitals = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.final_base_vitals_peds"

# Upload the DataFrame to BigQuery
final_base_vitals.to_gbq(
    destination_table=table_id_final_base_vitals,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_base_vitals to {table_id_final_base_vitals}")

  final_base_vitals.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 13189.64it/s]

Uploaded final_base_vitals to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.final_base_vitals_peds





# Complete blood count with differntial CBCd
### WBC, HgB, PLT, ANC, ALC, % Neutrophils, %Lymphocytes, glucose, lactate, crp, procalcitonin

# wbc

In [145]:
query_wbc = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

wbc_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      -- WBC already in 10^3/µL-ish units
      WHEN LOWER(lr.base_name) = 'wbc'
       AND LOWER(lr.reference_unit) IN (
         'thousand/ul','k/ul','10x3/ul','10*3/ul','x10e3/ul'
         ,'x10^3/ul'
       )
        THEN SAFE_CAST(lr.ord_value AS FLOAT64)
      -- raw per µL → convert
      WHEN LOWER(lr.base_name) = 'wbc'
       AND LOWER(lr.reference_unit) IN ('/ul','ul')
        THEN SAFE_CAST(lr.ord_value AS FLOAT64) / 1000
      -- missing unit but base_name = wbc → take as is
      WHEN LOWER(lr.base_name) = 'wbc'
       AND (lr.reference_unit IS NULL OR lr.reference_unit = '')
        THEN SAFE_CAST(lr.ord_value AS FLOAT64)
      ELSE NULL
    END AS wbc
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

wbc_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(wbc), 2) AS min_wbc,
    ROUND(MAX(wbc), 2) AS max_wbc,
    ROUND(AVG(wbc), 2) AS avg_wbc,
    ROUND(APPROX_QUANTILES(wbc, 100)[OFFSET(50)], 2) AS median_wbc
  FROM wbc_raw
  WHERE wbc IS NOT NULL
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_wbc,
  a.max_wbc,
  a.avg_wbc,
  a.median_wbc
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN wbc_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_wbc_only = client.query(query_wbc).to_dataframe()


In [146]:
final_base_wbc_only.isnull().mean()*100

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
min_wbc                             8.108305
max_wbc                             8.108305
avg_wbc                             8.108305
median_wbc                          8.108305
dtype: float64

### Neutrophils (%)

In [147]:
query_neut = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

neut_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      WHEN LOWER(TRIM(lr.lab_name)) IN ('neutrophil %')
      THEN SAFE_CAST(lr.ord_value AS FLOAT64)
    END AS neutrophils
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

neut_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(neutrophils), 2) AS min_neutrophils,
    ROUND(MAX(neutrophils), 2) AS max_neutrophils,
    ROUND(AVG(neutrophils), 2) AS avg_neutrophils,
    ROUND(APPROX_QUANTILES(neutrophils, 100)[OFFSET(50)], 2) AS median_neutrophils
  FROM neut_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_neutrophils,
  a.max_neutrophils,
  a.avg_neutrophils,
  a.median_neutrophils
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN neut_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_neutrophils_only = client.query(query_neut).to_dataframe()


In [148]:
final_base_neutrophils_only.isnull().mean()*100

anon_id                              0.000000
pat_enc_csn_id_coded                 0.000000
order_proc_id_coded                  0.000000
blood_culture_order_datetime_utc     0.000000
min_neutrophils                     51.266582
max_neutrophils                     51.266582
avg_neutrophils                     51.266582
median_neutrophils                  51.266582
dtype: float64

# ANC

In [149]:
query_anc = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

neut_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE 
      WHEN TRIM(lr.base_name) = 'NEUTAB'
      THEN SAFE_CAST(lr.ord_value AS FLOAT64)
    END AS neutrophils
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

neut_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(neutrophils), 2) AS min_anc,
    ROUND(MAX(neutrophils), 2) AS max_anc,
    ROUND(AVG(neutrophils), 2) AS avg_anc,
    ROUND(APPROX_QUANTILES(neutrophils, 100)[OFFSET(50)], 2) AS median_anc
  FROM neut_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_anc,
  a.max_anc,
  a.avg_anc,
  a.median_anc
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN neut_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_anc_only = client.query(query_anc).to_dataframe()


In [150]:
final_base_anc_only.isnull().mean()*100

anon_id                              0.000000
pat_enc_csn_id_coded                 0.000000
order_proc_id_coded                  0.000000
blood_culture_order_datetime_utc     0.000000
min_anc                             24.641105
max_anc                             24.641105
avg_anc                             24.641105
median_anc                          24.641105
dtype: float64

# lymphocytes (%)

In [151]:
query_lymph = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

lymph_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      WHEN TRIM(lr.base_name) IN ('LYM')
      THEN SAFE_CAST(lr.ord_value AS FLOAT64)
    END AS lymphocytes
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

lymph_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(lymphocytes), 2) AS min_lymphocytes,
    ROUND(MAX(lymphocytes), 2) AS max_lymphocytes,
    ROUND(AVG(lymphocytes), 2) AS avg_lymphocytes,
    ROUND(APPROX_QUANTILES(lymphocytes, 100)[OFFSET(50)], 2) AS median_lymphocytes
  FROM lymph_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_lymphocytes,
  a.max_lymphocytes,
  a.avg_lymphocytes,
  a.median_lymphocytes
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN lymph_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_lymphocytes_only = client.query(query_lymph).to_dataframe()


In [152]:
final_base_lymphocytes_only.isnull().mean() * 100

anon_id                              0.000000
pat_enc_csn_id_coded                 0.000000
order_proc_id_coded                  0.000000
blood_culture_order_datetime_utc     0.000000
min_lymphocytes                     18.026531
max_lymphocytes                     18.026531
avg_lymphocytes                     18.026531
median_lymphocytes                  18.026531
dtype: float64

# ALC

In [153]:
query_alc = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

lymph_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      WHEN TRIM(lr.base_name) IN ('LYMAB') AND LOWER(TRIM(lr.reference_unit)) = 'k/ul'
        THEN SAFE_CAST(lr.ord_value AS FLOAT64)
      WHEN TRIM(lr.base_name) IN ('LYMAB') AND LOWER(TRIM(lr.reference_unit)) = 'ul'
        THEN SAFE_CAST(lr.ord_value AS FLOAT64) / 1000
      WHEN TRIM(lr.base_name) IN ('LYMAB') AND LOWER(TRIM(lr.reference_unit)) = 'cells/ul'
        THEN SAFE_CAST(lr.ord_value AS FLOAT64) / 1000
      WHEN TRIM(lr.base_name) IN ('LYMAB') AND LOWER(TRIM(lr.reference_unit)) IS NULL
        THEN SAFE_CAST(lr.ord_value AS FLOAT64) / 1000
      ELSE NULL
    END AS lymphocytes
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

lymph_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(lymphocytes), 2) AS min_alc,
    ROUND(MAX(lymphocytes), 2) AS max_alc,
    ROUND(AVG(lymphocytes), 2) AS avg_alc,
    ROUND(APPROX_QUANTILES(lymphocytes, 100)[OFFSET(50)], 2) AS median_alc
  FROM lymph_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_alc,
  a.max_alc,
  a.avg_alc,
  a.median_alc
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN lymph_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_alc_only = client.query(query_alc).to_dataframe()


In [154]:
final_base_alc_only.isnull().mean()*100

anon_id                              0.000000
pat_enc_csn_id_coded                 0.000000
order_proc_id_coded                  0.000000
blood_culture_order_datetime_utc     0.000000
min_alc                             18.826095
max_alc                             18.826095
avg_alc                             18.826095
median_alc                          18.826095
dtype: float64

# hgb

In [155]:
query_hgb = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

hgb_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      WHEN LOWER(lr.base_name) = 'hgb'
           AND LOWER(TRIM(lr.reference_unit)) = 'mg/dl'
        THEN SAFE_CAST(lr.ord_value AS FLOAT64) / 1000
      WHEN LOWER(lr.base_name) = 'hgb'
           AND (LOWER(TRIM(lr.reference_unit)) LIKE '%g/dl%' OR LOWER(TRIM(lr.reference_unit)) LIKE '%gm/dl%')
        THEN SAFE_CAST(lr.ord_value AS FLOAT64)
    END AS hgb_mgdl
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

hgb_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(hgb_mgdl), 2) AS min_hgb,
    ROUND(MAX(hgb_mgdl), 2) AS max_hgb,
    ROUND(AVG(hgb_mgdl), 2) AS avg_hgb,
    ROUND(APPROX_QUANTILES(hgb_mgdl, 100)[OFFSET(50)], 2) AS median_hgb
  FROM hgb_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_hgb,
  a.max_hgb,
  a.avg_hgb,
  a.median_hgb
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN hgb_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_hgb_only = client.query(query_hgb).to_dataframe()


In [156]:
final_base_hgb_only.isnull().mean()

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
min_hgb                             0.077848
max_hgb                             0.077848
avg_hgb                             0.077848
median_hgb                          0.077848
dtype: float64

### platelets


In [157]:
query_plt = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

plt_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      WHEN LOWER(lr.base_name) = 'plt'
           AND LOWER(lr.reference_unit) IN ('x10e3/ul','10x3/ul','10*3/ul','k/ul','thousand/ul')
        THEN SAFE_CAST(lr.ord_value AS FLOAT64)
      WHEN LOWER(lr.base_name) = 'plt'
           AND LOWER(lr.reference_unit) IN ('/ul','ul')
        THEN SAFE_CAST(lr.ord_value AS FLOAT64) / 1000
    END AS plt_kul
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

plt_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(plt_kul), 2) AS min_plt,
    ROUND(MAX(plt_kul), 2) AS max_plt,
    ROUND(AVG(plt_kul), 2) AS avg_plt,
    ROUND(APPROX_QUANTILES(plt_kul, 100)[OFFSET(50)], 2) AS median_plt
  FROM plt_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_plt,
  a.max_plt,
  a.avg_plt,
  a.median_plt
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN plt_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_plt_only = client.query(query_plt).to_dataframe()


In [158]:
final_base_plt_only.isnull().mean()

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
min_plt                             0.082137
max_plt                             0.082137
avg_plt                             0.082137
median_plt                          0.082137
dtype: float64

# glucose

In [159]:
query_glucose = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

glu_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      WHEN LOWER(lr.base_name) = 'glu' THEN SAFE_CAST(lr.ord_value AS FLOAT64)
    END AS glucose
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

glu_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(glucose), 2) AS min_glucose,
    ROUND(MAX(glucose), 2) AS max_glucose,
    ROUND(AVG(glucose), 2) AS avg_glucose,
    ROUND(APPROX_QUANTILES(glucose, 100)[OFFSET(50)], 2) AS median_glucose
  FROM glu_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_glucose,
  a.max_glucose,
  a.avg_glucose,
  a.median_glucose
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN glu_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_glucose_only = client.query(query_glucose).to_dataframe()


In [160]:
final_base_glucose_only.isnull().mean()

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
min_glucose                         0.208468
max_glucose                         0.208468
avg_glucose                         0.208468
median_glucose                      0.208468
dtype: float64

# lactate

In [161]:
query_lactate = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

lac_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      WHEN LOWER(lr.base_name) IN ('lac', 'lacpoc', 'lacwbl')
           AND (
             LOWER(lr.reference_unit) IN ('mmol/l','mmole/l','mmoll')
             OR lr.reference_unit IS NULL
           )
        THEN SAFE_CAST(lr.ord_value AS FLOAT64)
      WHEN LOWER(lr.base_name) IN ('lac', 'lacpoc', 'lacwbl')
           AND LOWER(lr.reference_unit) IN ('mg/dl')
        THEN SAFE_CAST(lr.ord_value AS FLOAT64) / 9.008
    END AS lactate
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

lac_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(lactate), 2) AS min_lactate,
    ROUND(MAX(lactate), 2) AS max_lactate,
    ROUND(AVG(lactate), 2) AS avg_lactate,
    ROUND(APPROX_QUANTILES(lactate, 100)[OFFSET(50)], 2) AS median_lactate
  FROM lac_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_lactate,
  a.max_lactate,
  a.avg_lactate,
  a.median_lactate
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN lac_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_lactate_only = client.query(query_lactate).to_dataframe()


In [162]:
final_base_lactate_only.isnull().mean()

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
min_lactate                         0.832637
max_lactate                         0.832637
avg_lactate                         0.832637
median_lactate                      0.832637
dtype: float64

In [163]:
final_base_bmi_age_race_gender_adi.isnull().mean()

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
order_year                          0.000000
age_days                            0.000000
age_years                           0.000000
gender                              0.000000
race                                0.000000
bmi                                 0.073923
source                              0.000000
positive_blood_culture              0.000000
zip_clean                           0.066473
adi_score                           0.000000
adi_imputed_flag                    0.000000
dtype: float64

# Blood gas
## glucose and lactate , done in lab section

In [164]:
import pandas as pd
from functools import reduce

# List of all per-vital DataFrames
labs_df = [final_base_wbc_only, final_base_neutrophils_only,final_base_anc_only, final_base_lymphocytes_only, final_base_alc_only, final_base_hgb_only, final_base_plt_only, final_base_glucose_only, final_base_lactate_only]

# Merge keys
merge_keys = ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "blood_culture_order_datetime_utc"]

# Keep only key columns + aggregate metrics (min, max, median, mode)
dfs_clean = [
    df.loc[:, df.columns.str.contains("|".join(merge_keys)) | df.columns.str.contains("(min|max|median|mode|avg)", case=False)]
    for df in labs_df
]

# Merge all DataFrames on the same keys
final_base_labs_without_cpr_or_pct = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs_clean)


  df.loc[:, df.columns.str.contains("|".join(merge_keys)) | df.columns.str.contains("(min|max|median|mode|avg)", case=False)]


In [165]:

table_id_final_base_labs_without_cpr_or_pct = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.final_base_labs_without_cpr_or_pct_peds"

# Upload the DataFrame to BigQuery
final_base_labs_without_cpr_or_pct.to_gbq(
    destination_table=table_id_final_base_labs_without_cpr_or_pct,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_base_labs_without_cpr_or_pct to {table_id_final_base_labs_without_cpr_or_pct}")

  final_base_labs_without_cpr_or_pct.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 2882.68it/s]

Uploaded final_base_labs_without_cpr_or_pct to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.final_base_labs_without_cpr_or_pct_peds





In [166]:
final_base_labs_without_cpr_or_pct.isnull().mean()

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
min_wbc                             0.081083
max_wbc                             0.081083
avg_wbc                             0.081083
median_wbc                          0.081083
min_neutrophils                     0.512666
max_neutrophils                     0.512666
avg_neutrophils                     0.512666
median_neutrophils                  0.512666
min_anc                             0.246411
max_anc                             0.246411
avg_anc                             0.246411
median_anc                          0.246411
min_lymphocytes                     0.180265
max_lymphocytes                     0.180265
avg_lymphocytes                     0.180265
median_lymphocytes                  0.180265
min_alc                             0.188261
max_alc                             0.188261
avg_alc   

# UA 

In [175]:
query = f"""
-- UA features per blood-culture order (one row per order)
WITH base AS (
  SELECT *
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp}
),

-- limit UA table to patients in base, from BOTH SHC and LPCH
all_UA AS (
  -- SHC
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts,
    lab_name,
    ord_value,
    reference_unit,
    component_id
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE component_id IN (1230100515,1230100517,1230100518,1230100514)
    AND anon_id IN (
      SELECT DISTINCT anon_id
      FROM {table_id_final_base_bmi_age_race_gender_adi_temp}
    )
    AND ord_value IS NOT NULL

  UNION ALL

  -- LPCH
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts,
    lab_name,
    ord_value,
    reference_unit,
    component_id
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE component_id IN (1230100515,1230100517,1230100518,1230100514)
    AND anon_id IN (
      SELECT DISTINCT anon_id
      FROM {table_id_final_base_bmi_age_race_gender_adi_temp}
    )
    AND ord_value IS NOT NULL
),

-- attach each UA to each blood-culture order within [-48h, 0h]
ua_window AS (
  SELECT
    b.anon_id,
    b.pat_enc_csn_id_coded,
    b.order_proc_id_coded,
    b.blood_culture_order_datetime_utc,
    u.order_ts AS order_time_jittered,
    u.component_id,
    u.ord_value,
    u.reference_unit
  FROM base b
  JOIN all_UA u
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          u.order_ts,
          CAST(b.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

-- map raw ord_value -> POSITIVE/NEGATIVE per component
ua_labeled AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    order_time_jittered,
    component_id,

    CASE 
      WHEN component_id = 1230100515 AND LOWER(ord_value) IN ('negative','neg','neh') THEN 'NEGATIVE'
      WHEN component_id = 1230100515 AND LOWER(ord_value) IN (
        'n/a, color interference','positive','large','moderate2.0',
        'small','trace','2+','125++','3+','4+','1+'
      ) THEN 'POSITIVE'
    END AS Leukocyte_Esterase,

    CASE 
      WHEN component_id = 1230100517 AND reference_unit LIKE '%HPF%' THEN
        CASE 
          WHEN SAFE_CAST(REGEXP_EXTRACT(ord_value, r'(\\d+)') AS NUMERIC) >= 5 THEN 'POSITIVE'
          WHEN SAFE_CAST(REGEXP_EXTRACT(ord_value, r'(\\d+)') AS NUMERIC) < 5 THEN 'NEGATIVE'
        END
    END AS WBC_urine,

    CASE 
      WHEN component_id = 1230100518 AND LOWER(ord_value) IN ('none seen','no significant amount of bacteria detected.','none') THEN 'NEGATIVE'
      WHEN component_id = 1230100518 AND LOWER(ord_value) IN ('rare','occasional','many','moderate','few','41','profuse') THEN 'POSITIVE'
    END AS Bacteria_urine,

    CASE 
      WHEN component_id = 1230100514 AND LOWER(ord_value) IN ('negative','neg','neh') THEN 'NEGATIVE'
      WHEN component_id = 1230100514 AND LOWER(ord_value) IN ('n/a, color interference','positive') THEN 'POSITIVE'
    END AS Nitrite_urine
  FROM ua_window
),

-- pick the *latest* UA per component for each order
ua_ranked AS (
  SELECT
    *,
    ROW_NUMBER() OVER (
      PARTITION BY anon_id, pat_enc_csn_id_coded, order_proc_id_coded, component_id
      ORDER BY order_time_jittered DESC
    ) AS rn
  FROM ua_labeled
),
ua_latest AS (
  SELECT *
  FROM ua_ranked
  WHERE rn = 1
),

-- pivot/aggregate to 1 row per order
ua_pivot AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    MAX(Leukocyte_Esterase) AS Leukocyte_Esterase,
    MAX(WBC_urine)         AS WBC_urine,
    MAX(Bacteria_urine)    AS Bacteria_urine,
    MAX(Nitrite_urine)     AS Nitrite_urine
  FROM ua_latest
  GROUP BY 1,2,3
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  u.Leukocyte_Esterase,
  u.WBC_urine,
  u.Bacteria_urine,
  u.Nitrite_urine
FROM base b
LEFT JOIN ua_pivot u
USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded)
"""
final_base_ua = client.query(query).to_dataframe()


# LDA

In [179]:
query = f"""
WITH all_LDA AS (
  -- SHC LDA
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    CAST(placement_instant_jittered_utc AS TIMESTAMP) AS placement_ts,
    CAST(removal_instant_jittered_utc AS TIMESTAMP) AS removal_ts,
    description
  FROM `som-nero-phi-jonc101.shc_core_2024.lda`
  WHERE (
      description LIKE ANY ('%picc%','%ETT%','%CVC%','%ECMO%','%EVD%','%ET%Tube%')
      OR LOWER(description) LIKE ANY ('%icu%line%','%tunnel%catheter%','%surgical%ur%catheter%','%port%','%dialysis%catheter%')
    )
    AND anon_id IN (
      SELECT DISTINCT anon_id
      FROM {table_id_final_base_bmi_age_race_gender_adi_temp}
    )

  UNION ALL

  -- LPCH LDA
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    CAST(placement_instant_jittered_utc AS TIMESTAMP) AS placement_ts,
    CAST(removal_instant_jittered_utc AS TIMESTAMP) AS removal_ts,
    description
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lda`
  WHERE (
      description LIKE ANY ('%picc%','%ETT%','%CVC%','%ECMO%','%EVD%','%ET%Tube%')
      OR LOWER(description) LIKE ANY ('%icu%line%','%tunnel%catheter%','%surgical%ur%catheter%','%port%','%dialysis%catheter%')
    )
    AND anon_id IN (
      SELECT DISTINCT anon_id
      FROM {table_id_final_base_bmi_age_race_gender_adi_temp}
    )
),

LDA_pres AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    CASE 
      WHEN description LIKE '%EVD%' THEN 'EVD'
      WHEN description LIKE '%ET%Tube%' THEN 'ET_Tube'
      WHEN LOWER(description) LIKE '%surgical%ur%catheter%' THEN 'Surgical_Urin_Catheter'
      ELSE 'otherline'
    END AS Line_Presense
  FROM all_LDA AS l
  INNER JOIN {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
    USING (anon_id, pat_enc_csn_id_coded)
  -- line present at the time of blood-culture order:
  -- removal after order, placement before order
  WHERE TIMESTAMP_DIFF(l.removal_ts, CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP), HOUR) > 0
    AND TIMESTAMP_DIFF(l.placement_ts, CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP), HOUR) < 0
)

SELECT
  c.anon_id,
  c.pat_enc_csn_id_coded,
  c.order_proc_id_coded,
  c.blood_culture_order_datetime_utc,
  IF(COUNT(lda.Line_Presense) > 0, 1, 0) AS has_any_line
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
LEFT JOIN LDA_pres AS lda
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded)
GROUP BY ALL
"""
final_base_lda = client.query(query).to_dataframe()


In [171]:
final_base_lda["has_any_line"].value_counts()

has_any_line
0    27339
1      176
Name: count, dtype: Int64

# merge labs, ua, and lda
##### final_base_labs_without_cpr_or_pct
####  final_base_ua
#### final_base_lda




In [180]:
# Merge all three tables: labs, UA, and LDA
# All tables share the same merge keys
merge_keys = ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "blood_culture_order_datetime_utc"]

# First merge labs with UA
merged_labs_ua = pd.merge(
    final_base_labs_without_cpr_or_pct, 
    final_base_ua, 
    on=merge_keys, 
    how='outer'
)

# Then merge the result with LDA
final_labs_ua_lda = pd.merge(
    merged_labs_ua, 
    final_base_lda, 
    on=merge_keys, 
    how='outer'
)

print(f"Final merged table shape: {final_labs_ua_lda.shape}")
print(f"Columns: {list(final_labs_ua_lda.columns)}")

# Display the merged table
final_labs_ua_lda


Final merged table shape: (27515, 45)
Columns: ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'blood_culture_order_datetime_utc', 'min_wbc', 'max_wbc', 'avg_wbc', 'median_wbc', 'min_neutrophils', 'max_neutrophils', 'avg_neutrophils', 'median_neutrophils', 'min_anc', 'max_anc', 'avg_anc', 'median_anc', 'min_lymphocytes', 'max_lymphocytes', 'avg_lymphocytes', 'median_lymphocytes', 'min_alc', 'max_alc', 'avg_alc', 'median_alc', 'min_hgb', 'max_hgb', 'avg_hgb', 'median_hgb', 'min_plt', 'max_plt', 'avg_plt', 'median_plt', 'min_glucose', 'max_glucose', 'avg_glucose', 'median_glucose', 'min_lactate', 'max_lactate', 'avg_lactate', 'median_lactate', 'Leukocyte_Esterase', 'WBC_urine', 'Bacteria_urine', 'Nitrite_urine', 'has_any_line']


Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,blood_culture_order_datetime_utc,min_wbc,max_wbc,avg_wbc,median_wbc,min_neutrophils,max_neutrophils,avg_neutrophils,median_neutrophils,min_anc,max_anc,avg_anc,median_anc,min_lymphocytes,max_lymphocytes,avg_lymphocytes,median_lymphocytes,min_alc,max_alc,avg_alc,median_alc,min_hgb,max_hgb,avg_hgb,median_hgb,min_plt,max_plt,avg_plt,median_plt,min_glucose,max_glucose,avg_glucose,median_glucose,min_lactate,max_lactate,avg_lactate,median_lactate,Leukocyte_Esterase,WBC_urine,Bacteria_urine,Nitrite_urine,has_any_line
0,JC1000964,313661322,717728953,2016-03-09 07:02:00,13.0,13.0,13.0,13.0,74.3,74.3,74.3,74.3,9.67,9.67,9.67,9.67,16.3,16.3,16.3,16.3,2.13,2.13,2.13,2.13,11.3,11.3,11.3,11.3,267.0,267.0,267.0,267.0,,,,,,,,,,,,,0
1,JC1000964,131181803241,489700509,2016-03-09 05:24:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
2,JC1001312,311762468,711125650,2015-06-19 18:32:00,25.9,25.9,25.9,25.9,82.9,82.9,82.9,82.9,21.48,21.48,21.48,21.48,12.8,12.8,12.8,12.8,3.32,3.32,3.32,3.32,16.6,16.6,16.6,16.6,444.0,444.0,444.0,444.0,601.0,601.0,601.00,601.0,4.32,4.32,4.32,4.32,,,,,0
3,JC1001312,131092528115,467723412,2015-06-19 17:18:00,25.9,25.9,25.9,25.9,82.9,82.9,82.9,82.9,21.48,21.48,21.48,21.48,12.8,12.8,12.8,12.8,3.32,3.32,3.32,3.32,16.6,16.6,16.6,16.6,444.0,444.0,444.0,444.0,448.0,601.0,524.50,448.0,4.32,4.32,4.32,4.32,,,,,0
4,JC1001474,322642690,743704390,2019-05-21 14:39:00,18.5,18.5,18.5,18.5,89.6,89.6,89.6,89.6,16.63,16.63,16.63,16.63,6.1,6.1,6.1,6.1,1.13,1.13,1.13,1.13,13.6,13.6,13.6,13.6,295.0,295.0,295.0,295.0,287.0,287.0,287.00,287.0,6.12,6.12,6.12,6.12,NEGATIVE,NEGATIVE,POSITIVE,NEGATIVE,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27510,JC998040,131075054539,459336805,2015-02-10 07:16:00,15.4,15.4,15.4,15.4,87.0,87.0,87.0,87.0,13.37,13.37,13.37,13.37,5.2,5.2,5.2,5.2,0.80,0.80,0.80,0.80,10.7,10.7,10.7,10.7,169.0,169.0,169.0,169.0,103.0,103.0,103.00,103.0,,,,,,,,,0
27511,JC998572,321846584,745779961,2019-03-08 16:31:00,8.6,8.6,8.6,8.6,90.0,90.0,90.0,90.0,7.76,7.76,7.76,7.76,4.7,4.7,4.7,4.7,0.41,0.41,0.41,0.41,11.3,11.3,11.3,11.3,470.0,470.0,470.0,470.0,111.0,111.0,111.00,111.0,,,,,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,0
27512,JC998572,131264900613,598880066,2019-03-08 16:04:00,8.6,8.6,8.6,8.6,90.0,90.0,90.0,90.0,7.76,7.76,7.76,7.76,4.7,4.7,4.7,4.7,0.41,0.41,0.41,0.41,11.3,11.3,11.3,11.3,470.0,470.0,470.0,470.0,111.0,111.0,111.00,111.0,,,,,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,0
27513,JC999859,309520757,702205929,2014-08-04 20:50:00,17.6,17.6,17.6,17.6,,,,,,,,,9.2,9.2,9.2,9.2,1.63,1.63,1.63,1.63,14.6,17.0,15.8,14.6,379.0,379.0,379.0,379.0,109.0,115.0,112.33,113.0,7.44,7.44,7.44,7.44,,,,,0


In [182]:
table_id_final_labs_ua_lda = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.final_base_labs_without_cpr_or_pct_ua_lda_peds"

# Upload the DataFrame to BigQuery
final_labs_ua_lda.to_gbq(
    destination_table=table_id_final_labs_ua_lda,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_labs_ua_lda to {table_id_final_labs_ua_lda}")

  final_labs_ua_lda.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 4848.91it/s]

Uploaded final_labs_ua_lda to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.final_base_labs_without_cpr_or_pct_ua_lda_peds







#### the final labs_ua_lda table is stored at som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.final_base_labs_without_cpr_or_pct_ua_lda_peds

## supplement

In [183]:
table_id_final_base_bmi_age_race_gender_adi_temp = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.final_base_bmi_age_race_gender_adi_temp_peds"

query_cr = f"""
WITH all_lab_result AS (
  -- SHC labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.shc_core_2024.lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})

  UNION ALL

  -- LPCH labs
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    lab_name,
    base_name,
    ord_value,
    reference_unit,
    CAST(order_time_jittered_utc AS TIMESTAMP) AS order_ts
  FROM `som-nero-phi-jonc101.lpch_core_2024.lpch_lab_result`
  WHERE anon_id IN (SELECT DISTINCT anon_id FROM {table_id_final_base_bmi_age_race_gender_adi_temp})
),

cr_raw AS (
  SELECT
    c.anon_id,
    c.pat_enc_csn_id_coded,
    c.order_proc_id_coded,
    c.blood_culture_order_datetime_utc,
    CASE
      WHEN LOWER(lr.base_name) = 'cr'
           AND LOWER(lr.reference_unit) = 'mg/dl'
        THEN SAFE_CAST(lr.ord_value AS FLOAT64)
    END AS cr
  FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS c
  LEFT JOIN all_lab_result AS lr
    USING (anon_id, pat_enc_csn_id_coded)
  WHERE TIMESTAMP_DIFF(
          lr.order_ts,
          CAST(c.blood_culture_order_datetime_utc AS TIMESTAMP),
          HOUR
        ) BETWEEN -48 AND 0
),

cr_agg AS (
  SELECT
    anon_id,
    pat_enc_csn_id_coded,
    order_proc_id_coded,
    blood_culture_order_datetime_utc,
    ROUND(MIN(cr), 2) AS min_cr,
    ROUND(MAX(cr), 2) AS max_cr,
    ROUND(AVG(cr), 2) AS avg_cr,
    ROUND(APPROX_QUANTILES(cr, 100)[OFFSET(50)], 2) AS median_cr
  FROM cr_raw
  GROUP BY 1,2,3,4
)

SELECT
  b.anon_id,
  b.pat_enc_csn_id_coded,
  b.order_proc_id_coded,
  b.blood_culture_order_datetime_utc,
  a.min_cr,
  a.max_cr,
  a.avg_cr,
  a.median_cr
FROM {table_id_final_base_bmi_age_race_gender_adi_temp} AS b
LEFT JOIN cr_agg AS a
  USING (anon_id, pat_enc_csn_id_coded, order_proc_id_coded, blood_culture_order_datetime_utc)
ORDER BY b.anon_id, b.blood_culture_order_datetime_utc
"""
final_base_cr_only = client.query(query_cr).to_dataframe()


In [184]:
final_base_cr_only.isnull().mean()

anon_id                             0.000000
pat_enc_csn_id_coded                0.000000
order_proc_id_coded                 0.000000
blood_culture_order_datetime_utc    0.000000
min_cr                              0.215882
max_cr                              0.215882
avg_cr                              0.215882
median_cr                           0.215882
dtype: float64

In [185]:
# Upload the DataFrame to BigQuery
table_id_final_labs_cr_only = f"{project_id}.blood_culture_stewardship_peds_sandy_2024.final_base_labs_cr_only_peds"
final_base_cr_only.to_gbq(
    destination_table=table_id_final_labs_cr_only,
    project_id=project_id,
    if_exists='replace'  # This will replace the table if it exists
)

print(f"Uploaded final_base_cr_only to {table_id_final_labs_cr_only}")

  final_base_cr_only.to_gbq(
100%|██████████| 1/1 [00:00<00:00, 2325.00it/s]

Uploaded final_base_cr_only to som-nero-phi-jonc101.blood_culture_stewardship_peds_sandy_2024.final_base_labs_cr_only_peds



