## SQL & BQ
Collection of files queried from or pushed to BQ

All validation goes to 6_validation directory

NOTES:
- num_value1 --> numerical_val_1
- num_value2 --> numerical_val_2

In [1]:
# !pip install pandas-gbq

In [5]:
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# %matplotlib inline
# %load_ext rpy2.ipython

In [6]:
import os 
from google.cloud import bigquery
from google.cloud.bigquery import dbapi

##Use correct path based on whether you are, Nero or local
# use Ctrl + Insert to copy and Shift + Insert to paste

# for Nero:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/minh084/.config/gcloud/application_default_credentials.json' # old onprem
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/jupyter/.config/gcloud/application_default_credentials.json'

# for local computer:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'C:\Users\User\AppData\Roaming\gcloud\application_default_credentials.json' 

##set correct Nero project
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

##Setting up BQ API, m1:
client = bigquery.Client()

##using dbAPI connection, m2:
conn = dbapi.connect(client)

In [7]:
datadir = "../../DataTD/shc2021"
outdir = "../../OutputTD/shc2021"

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

### Run 1_cohort1_init notebook
Results in `1_cohort1` as the original validation cohort.

Label is the label for highest level of care within 24 hours since admission.

### Queries information for other inclusion/ex criteria
When querying, join with `1_cohort1`
- Inpatient/hospital encounters only
- Full code only
- Age 18 and older only

In [7]:
# m1_encounter.sql

q = """ 

SELECT c.*,
    e.inpatient_data_id_coded, 
    e.enc_type, e.visit_type, e.acuity_level, e.ACUITY_LEVEL_C,
    e.hosp_admsn_time_jittered_utc
FROM 
    `som-nero-phi-jonc101.shc_core_2021.encounter` as e
RIGHT JOIN 
    `som-nero-phi-jonc101.triageTD.1_cohort1` as c
ON (c.anon_id=e.anon_id and c.pat_enc_csn_id_coded=e.pat_enc_csn_id_coded)
ORDER BY
  c.anon_id
  
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'encounters_2021.csv'), index=False)

In [8]:
# m2_codestatus.sql

q = """ 

SELECT c.*,
    o.order_type, o.order_status, o.display_name, o.description, 
    o.order_time_jittered_utc
FROM 
    `som-nero-phi-jonc101.shc_core_2021.order_proc` as o
JOIN 
    `som-nero-phi-jonc101.triageTD.1_cohort1` as c
ON (c.anon_id=o.anon_id and c.pat_enc_csn_id_coded=o.pat_enc_csn_id_coded)
WHERE o.order_type = "Code Status"
ORDER BY
  c.anon_id
""" 

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'code_status_2021.csv'), index=False)

In [9]:
# m3_demographic.sql
# there is NO d.recent_conf_enc_jittered as recent_date,

q = """ 

SELECT c.anon_id,
    d.gender, d.canonical_race as race, d.language, 
    d.recent_ht_in_cms as recent_height_cm, d.recent_wt_in_kgs as recent_weight_kg,
    d.insurance_payor_name as insurance,  
    DATE(CAST(d.birth_date_jittered as TIMESTAMP)) as dob
FROM 
    `som-nero-phi-jonc101.shc_core_2021.demographic` as d
JOIN 
    `som-nero-phi-jonc101.triageTD.1_cohort1` as c
ON c.anon_id=d.anon_id
ORDER BY
  c.anon_id
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'demographics_2021.csv'), index=False)

### Pushed the 2_cohort2 after running 2_criteria_R notebook
Use `1_cohort1` to filter out patients with exclusion/inc criteria, resulting in updated `2_cohort2`

In [10]:
df = pd.read_csv(os.path.join(outdir, "2_cohort2.csv"))
print(len(df)) # 17128
print(list(df.columns))

63050
['anon_id', 'pat_enc_csn_id_coded', 'admit_time_jittered', 'label', 'admit_time', 'adm_year', 'adm_month', 'inpatient_data_id_coded', 'ESI', 'hosp_admsn_time', 'ed_time_hr', 'gender', 'race', 'language', 'recent_height_cm', 'recent_weight_kg', 'insurance', 'age']


In [11]:
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,age
0,JC1000116,131066472308,2015-01-28 00:46:00+00:00,0,2015-01-28 00:46:00,2015,1,19328596,3.0,2015-01-27 04:24:00,20.366667,Female,Other,Spanish,154.0,73.05,HPSM,38
1,JC1000116,131295313275,2020-09-29 22:45:00+00:00,0,2020-09-29 22:45:00,2020,9,57868578,3.0,2020-09-29 16:02:00,6.716667,Female,Other,Spanish,154.0,73.05,HPSM,44
2,JC1000296,131100574537,2015-07-03 04:51:00+00:00,0,2015-07-03 04:51:00,2015,7,21370977,2.0,2015-07-02 04:03:00,24.8,Male,Other,Spanish,163.4,61.25,MEDICARE,63
3,JC1000441,131074233004,2015-02-23 06:51:00+00:00,0,2015-02-23 06:51:00,2015,2,19747233,3.0,2015-02-22 20:52:00,9.983333,Female,Other,English,167.64,90.0,MEDI-CAL,22
4,JC1000441,131074471154,2015-02-26 23:07:00+00:00,0,2015-02-26 23:07:00,2015,2,19773625,3.0,2015-02-26 14:00:00,9.116667,Female,Other,English,167.64,90.0,MEDI-CAL,22


In [12]:
# removed {'name' : 'recent_date', 'type' : 'DATE'}
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'inpatient_data_id_coded', 'type': 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'label', 'type' : 'INTEGER'},
                {'name' : 'hosp_admsn_time', 'type': 'TIMESTAMP'},
                {'name' : 'ed_time_hr', 'type': 'FLOAT'},
                {'name' : 'ESI', 'type': 'INTEGER'},
                {'name' : 'age', 'type' : 'INTEGER'},
                {'name' : 'gender', 'type' : 'STRING'},
                {'name' : 'race', 'type' : 'STRING'},
                {'name' : 'language', 'type' : 'STRING'},
                {'name' : 'insurance', 'type' : 'STRING'},
                {'name' : 'recent_height_cm', 'type' : 'FLOAT'},
                {'name' : 'recent_weight_kg', 'type' : 'FLOAT'}]
DATASET_NAME = 'triageTD'
TABLE_NAME = '2_cohort2'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
                 project_id='som-nero-phi-jonc101',
                 table_schema=table_schema,
                 if_exists='replace')

1it [00:06,  6.72s/it]


### Use updated 6_2_cohort2 with inpatient_id_coded to query flowsheet

In [13]:
# m4_HWflowsheet.sql

q = """ 

SELECT c.anon_id, c.pat_enc_csn_id_coded, c.inpatient_data_id_coded,
    f.row_disp_name, f.units, f.recorded_time_utc, f.numerical_val_1 as num_value1, f.numerical_val_2 as num_value2
FROM 
    `som-nero-phi-jonc101.triageTD.2_cohort2` as c
JOIN 
    `som-nero-phi-jonc101.shc_core_2021.flowsheet` as f
ON 
    (c.anon_id=f.anon_id and c.inpatient_data_id_coded=f.inpatient_data_id_coded)
WHERE
(
(row_disp_name="Weight")
OR
(row_disp_name="Height")
)
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'HWflowsheet_2021.csv'), index=False)

In [14]:
# m5_flowsheet.sql

q = """ 

SELECT c.anon_id, c.pat_enc_csn_id_coded, c.inpatient_data_id_coded, c.admit_time, c.label,
    f.template, f.row_disp_name, f.units, f.recorded_time_utc, f.numerical_val_1 as num_value1, f.numerical_val_2 as num_value2
FROM 
    `som-nero-phi-jonc101.triageTD.2_cohort2` as c 
JOIN 
    `som-nero-phi-jonc101.shc_core_2021.flowsheet` as f
ON 
    (c.anon_id=f.anon_id and c.inpatient_data_id_coded=f.inpatient_data_id_coded)
WHERE
    recorded_time_utc < admit_time --, 'yyyy-mm-dd hh24:mi:ss'
AND row_disp_name in 
('Heart Rate', 'Pulse', "Resting HR", 'Resting Heart Rate (bpm)', 'Resting Pulse Rate: (Record BPM)', -- smaller number, might be too noisy
 'O2', 'O2 (LPM)', 'O2 Flow (L/min)', 'O2 Delivery Method', 
 'Resp Rate', 'Resp', 'Respiratory Rate', -- "Resting RR" not there
 'BP', 'NIBP', 'Arterial Systolic BP' , 'Arterial Diastolic BP' , 'Blood Pressure', "Resting BP", --'Resting Systolic Blood Pressure',
 'Temp', 'Temp (in Celsius)', 'Temperature (Blood - PA line)', 'Temp 2', 'Temperature', 
 'Activity', 'Mobility', 
 'acuity score', 'Acuity as Level of Care',
 'LOC', 'LOC Score')
-- removed GCS, too many missing and not consistent
--  'SpO2', "Resting SpO2", 'Oxygen Saturation', 'Resting O2 Saturation', -- difficult to interpret without O2 delivery
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'flowsheet_2021.csv'), index=False)

### Push 5_cohort3 file from R notebook to Big Query
- From 5_cohort3_vitalsigns_R.ipynb
- This is the updated cohort with at least a complete set of vital signs. First set in this file

In [6]:
# valdir = "../../OutputTD/6_validation"
df = pd.read_csv(os.path.join(outdir, "5_cohort3.csv"))
print(len(df)) # 61176
print(len(df.pat_enc_csn_id_coded.unique()))
df.head(5)

61176
61176


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,DBP,Pulse,RR,SBP,Temp
0,JC1000116,131066472308,19328596,2015-01-28 00:46:00+00:00,0,75,79,18,127,37.0
1,JC1000116,131295313275,57868578,2020-09-29 22:45:00+00:00,0,77,81,18,120,36.8
2,JC1000296,131100574537,21370977,2015-07-03 04:51:00+00:00,0,63,120,20,98,37.05
3,JC1000441,131074233004,19747233,2015-02-23 06:51:00+00:00,0,66,60,18,127,36.0
4,JC1000441,131074471154,19773625,2015-02-26 23:07:00+00:00,0,65,57,18,115,37.0


In [10]:
# %load_ext google.cloud.bigquery
# cohort file
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'inpatient_data_id_coded', 'type': 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'label', 'type' : 'INTEGER'},
                {'name' : 'DBP', 'type' : 'INTEGER'},
                {'name' : 'SBP', 'type' : 'INTEGER'},
                {'name' : 'Pulse', 'type' : 'INTEGER'},
                {'name' : 'RR', 'type' : 'INTEGER'},
                {'name' : 'Temp', 'type' : 'FLOAT'}]
                       
DATASET_NAME = 'triageTD'
TABLE_NAME = '5_cohort3'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101', 
          table_schema=table_schema,
          if_exists='replace')

1it [00:11, 11.38s/it]


### Get other tables using cohort3

In [11]:
# m6_labs.sql

q = """ 
SELECT cohort.*,
        order_id_coded, lab_name, base_name, ord_value, ord_num_value, 
        reference_low, reference_high, reference_unit, result_in_range_yn, result_flag, 
        result_time_utc, order_time_utc , taken_time_utc
      
FROM `som-nero-phi-jonc101.shc_core_2021.lab_result` as labs
RIGHT JOIN `som-nero-phi-jonc101.triageTD.5_cohort3` as cohort  -- # join labs to cohort

ON labs.pat_enc_csn_id_coded = cohort.pat_enc_csn_id_coded
AND labs.anon_id = cohort.anon_id

WHERE admit_time >= result_time_utc  -- # only labs before admit time
AND extract(year from admit_time) > 2014  -- # only CSNs after 2014
AND base_name in 
    ('AG', 'AGAP', 'BASOAB', 'BUN', 'CL', 'CR', 'EGFR', 'EOSAB', 'GLU', 'HCO3', 'HCO3A', 'HCO3V', 
 'HCT', 'HGB', 'INR', 'K', 'LAC', 'LACWBL', 'LYMAB', 'MONOAB', 'NEUTAB', 'NEUTABS', 'O2SATA', 
 'O2SATV', 'PCAGP', 'PCBUN', 'PCCL', 'PCO2A', 'PCO2V', 'PH', 'PHA', 'PHV', 'PLT', 'PO2A', 'PO2V',
 'PT', 'TBIL', 'TCO2A', 'TNI', 'WBC', 'NA', 'ALB', 'ALKP', 'ALT', 'AST', 'BE', 'CA', 'CO2', 
 'GLOB', 'MCH', 'RDW', 'TP') -- 'GLUURN' removed, all NA
 
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'labs_2021.csv'), index=False)

### Use cohort3 to get Tiffany's ADT and death tables --> create cohort4 with labels

In [7]:
q = """
SELECT adt.anon_id, adt.pat_enc_csn_id_coded, adt.effective_time_jittered_utc, adt.seq_num_in_enc,
        adt.pat_class, adt.base_pat_class_c, adt.pat_lvl_of_care_c, adt.pat_lv_of_care, 
        adt.event_type, adt.pat_service, adt.department_id
    
FROM shc_core_2021.adt adt
RIGHT JOIN triageTD.5_cohort3 c 
ON adt.anon_id = c.anon_id and adt.pat_enc_csn_id_coded = c.pat_enc_csn_id_coded
"""
query_job = client.query(q)
adt = query_job.to_dataframe() # not necessary?
# adt.to_csv(cohort_adt_file, index=False)
adt.to_csv(os.path.join(datadir, 'cohort3_adt_2021.csv'), index=None)

In [13]:
q = """
select anon_id, death_date_jittered
  from shc_core_2021.demographic
  where anon_id
  in (select anon_id from triageTD.5_cohort3)
"""

query_job = client.query(q)
df = query_job.to_dataframe() # not necessary?
# adt.to_csv(cohort_adt_file, index=False)
df.to_csv(os.path.join(datadir, 'cohort3_demo_deaths_2021.csv'), index=None)

### Run label notebooks

### Push cohort4 
- labels at 24hrs (OLD)
- with labels within 3 hours & no OR related dept (has all labels at 3/6/9/12/24 hour)

In [15]:
# from Tiffany's cohort with labels within 3 hours
df = pd.read_csv(os.path.join(outdir, "7_cohort4_labels.csv"))
df["admit_time"] = pd.to_datetime(df["admit_time"]) 
print(df.shape) # 16700 --> 16484 --> 60464 (merged) --> 60648 (cohort3hrlabels)
df.head(5) # 60648

(60648, 48)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,label_max3,label_3hr_recent,admit_label,has_admit_label,died_within_24hrs,death_3hr_max_label,death_3hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent_3hr,critical_to_acute_label_recent_3hr,acute_to_critical_label_max_3hr,critical_to_acute_label_max_3hr,label_max6,label_6hr_recent,death_6hr_max_label,death_6hr_recent_label,acute_to_critical_label_recent_6hr,critical_to_acute_label_recent_6hr,acute_to_critical_label_max_6hr,critical_to_acute_label_max_6hr,label_max9,label_9hr_recent,death_9hr_max_label,death_9hr_recent_label,acute_to_critical_label_recent_9hr,critical_to_acute_label_recent_9hr,acute_to_critical_label_max_9hr,critical_to_acute_label_max_9hr,label_max12,label_12hr_recent,death_12hr_max_label,death_12hr_recent_label,acute_to_critical_label_recent_12hr,critical_to_acute_label_recent_12hr,acute_to_critical_label_max_12hr,critical_to_acute_label_max_12hr,label_max24,label_24hr_recent,death_24hr_max_label,death_24hr_recent_label,acute_to_critical_label_recent_24hr,critical_to_acute_label_recent_24hr,acute_to_critical_label_max_24hr,critical_to_acute_label_max_24hr
0,JC1000116,131066472308,2015-01-28 00:46:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,JC1000116,131295313275,2020-09-29 22:45:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,JC1000296,131100574537,2015-07-03 04:51:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,JC1000441,131074233004,2015-02-23 06:51:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,JC1000527,131084989828,2015-05-26 23:19:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# can't push if colnames have a dot (from one-hot coding) --> need to rename to push             
DATASET_NAME = 'triageTD'
TABLE_NAME = '7_cohort4_labels'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101', 
          if_exists='replace')

1it [00:06,  6.38s/it]


#### This is the new cohort4, those with labels within the first 3 hours, and no OR-related events within the first 24 hours

In [9]:
# from Tiffany's cohort with labels within 3 hours, no OR related dept
df = pd.read_csv(os.path.join(outdir, "7_cohort4_3hr_labels_noOR.csv"))
df["admit_time"] = pd.to_datetime(df["admit_time"]) 
print(df.shape) 
df.head(5) # 52532

(52532, 48)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,label_max3,label_3hr_recent,admit_label,has_admit_label,died_within_24hrs,death_3hr_max_label,death_3hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent_3hr,critical_to_acute_label_recent_3hr,acute_to_critical_label_max_3hr,critical_to_acute_label_max_3hr,label_max6,label_6hr_recent,death_6hr_max_label,death_6hr_recent_label,acute_to_critical_label_recent_6hr,critical_to_acute_label_recent_6hr,acute_to_critical_label_max_6hr,critical_to_acute_label_max_6hr,label_max9,label_9hr_recent,death_9hr_max_label,death_9hr_recent_label,acute_to_critical_label_recent_9hr,critical_to_acute_label_recent_9hr,acute_to_critical_label_max_9hr,critical_to_acute_label_max_9hr,label_max12,label_12hr_recent,death_12hr_max_label,death_12hr_recent_label,acute_to_critical_label_recent_12hr,critical_to_acute_label_recent_12hr,acute_to_critical_label_max_12hr,critical_to_acute_label_max_12hr,label_max24,label_24hr_recent,death_24hr_max_label,death_24hr_recent_label,acute_to_critical_label_recent_24hr,critical_to_acute_label_recent_24hr,acute_to_critical_label_max_24hr,critical_to_acute_label_max_24hr
0,JC1000116,131066472308,2015-01-28 00:46:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,JC1000116,131295313275,2020-09-29 22:45:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,JC1000296,131100574537,2015-07-03 04:51:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,JC1000441,131074233004,2015-02-23 06:51:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,JC1000527,131084989828,2015-05-26 23:19:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
print(list(df.columns))

['anon_id', 'pat_enc_csn_id_coded', 'admit_time', 'label_max3', 'label_3hr_recent', 'admit_label', 'has_admit_label', 'died_within_24hrs', 'death_3hr_max_label', 'death_3hr_recent_label', 'first_label', 'first_label_minutes_since_admit', 'acute_to_critical_label_recent_3hr', 'critical_to_acute_label_recent_3hr', 'acute_to_critical_label_max_3hr', 'critical_to_acute_label_max_3hr', 'label_max6', 'label_6hr_recent', 'death_6hr_max_label', 'death_6hr_recent_label', 'acute_to_critical_label_recent_6hr', 'critical_to_acute_label_recent_6hr', 'acute_to_critical_label_max_6hr', 'critical_to_acute_label_max_6hr', 'label_max9', 'label_9hr_recent', 'death_9hr_max_label', 'death_9hr_recent_label', 'acute_to_critical_label_recent_9hr', 'critical_to_acute_label_recent_9hr', 'acute_to_critical_label_max_9hr', 'critical_to_acute_label_max_9hr', 'label_max12', 'label_12hr_recent', 'death_12hr_max_label', 'death_12hr_recent_label', 'acute_to_critical_label_recent_12hr', 'critical_to_acute_label_recent_

In [11]:
# can't push if colnames have a dot (from one-hot coding) --> need to rename to push             
DATASET_NAME = 'triageTD'
TABLE_NAME = '7_cohort4_3hr_labels_noOR'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101', 
          if_exists='replace')

1it [00:09,  9.68s/it]


### Check split from old cohort and new set that has not been used
- Old: from Triage 2015 - 03/2020
- New: from 04/2020 - 09/2021

In [12]:
# print(len(df1920[df1920.year == 2020]))
df['year'] = df['admit_time'].dt.year
df['month'] = df['admit_time'].dt.month
print(df['year'].value_counts())

df15_18 = df[df['admit_time'].dt.year < 2019]
print(len(df15_18)) # 31562

df19_ = df[df['admit_time'].dt.year > 2018]
print(df19_['year'].value_counts())

df20_ = df[df['admit_time'].dt.year > 2019]
print(df20_['year'].value_counts())
print(df20_.groupby(["year", "month"]).size().reset_index(name="n"))

2019    8709
2020    8607
2018    8248
2021    7759
2015    7219
2016    6116
2017    5874
Name: year, dtype: int64
27457
2019    8709
2020    8607
2021    7759
Name: year, dtype: int64
2020    8607
2021    7759
Name: year, dtype: int64
    year  month     n
0   2020      1   770
1   2020      2   714
2   2020      3   604
3   2020      4   566
4   2020      5   664
5   2020      6   680
6   2020      7   791
7   2020      8   746
8   2020      9   750
9   2020     10   804
10  2020     11   717
11  2020     12   801
12  2021      1   786
13  2021      2   689
14  2021      3   767
15  2021      4   772
16  2021      5   901
17  2021      6   906
18  2021      7   979
19  2021      8  1021
20  2021      9   938


In [13]:
# df["admit_time"] = pd.to_datetime(df["admit_time"])

# all correct counts
df1920 = df[(df['admit_time'].dt.year == 2019) | 
            (df['admit_time'].dt.year == 2020) & (df['admit_time'].dt.month < 4)]
print(len(df1920)) # 10797

df2020 = df[(df['admit_time'].dt.year == 2020) & (df['admit_time'].dt.month > 3)]
print(len(df2020)) # 6519
df2021 = df[(df['admit_time'].dt.year == 2021)]
print(len(df2021)) # 7759

print(len(df2020) + len(df2021)) # 14278

# df2020.month.unique()

10797
6519
7759
14278


### Push Tiffany's cohort4, has labels within 3hr, no OR related events, and with first ED time

In [5]:
# from Tiffany's cohort with labels within 3 hours, no OR related dept
df = pd.read_csv(os.path.join(outdir, "7_cohort4_3hr_noOR_first_ED_time.csv"))
df["admit_time"] = pd.to_datetime(df["admit_time"]) 
print(df.shape) 
df.head(5) # 52532

(52532, 3)


Unnamed: 0,anon_id,admit_time,first_ED_time
0,JC1170548,2015-01-02 03:48:00,2015-01-01 18:40:00 UTC
1,JC913990,2015-01-02 05:53:00,2015-01-02 01:56:00 UTC
2,JC529112,2015-01-05 03:20:00,2015-01-04 18:13:00 UTC
3,JC1702404,2015-01-06 14:47:00,2015-01-06 09:04:00 UTC
4,JC523028,2015-01-03 21:24:00,2015-01-03 14:51:00 UTC


In [9]:
# %load_ext google.cloud.bigquery
# cohort file
df.rename({'first_ED_time': 'first_ed_time'}, axis=1, inplace=True)
df.admit_time = pd.to_datetime(df.admit_time, utc=True)
df.first_ed_time = pd.to_datetime(df.first_ed_time, utc=True)

table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'first_ed_time', 'type' : 'TIMESTAMP'}]
                       
DATASET_NAME = 'triageTD'
TABLE_NAME = '7_cohort4_3hr_noOR_first_ED_time'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101', 
          table_schema=table_schema,
          if_exists='replace')

1it [00:05,  5.88s/it]


### CHECK year split for cohort in sparse_matrix

In [14]:
df_cohort = df
train_labels = df_cohort[df_cohort['admit_time'].dt.year < 2019] # 2015 - 2018, old data
validation_labels = df_cohort[(df_cohort['admit_time'].dt.year == 2019) | 
                               (df_cohort['admit_time'].dt.year == 2020) & (df_cohort['admit_time'].dt.month < 4)] # old data 2019 - 03/2020

train_and_val_labels = df_cohort[(df_cohort['admit_time'].dt.year < 2020) | 
                                 (df_cohort['admit_time'].dt.year == 2020) & (df['admit_time'].dt.month < 4)] # all old data
test_labels = df_cohort[(df_cohort['admit_time'].dt.year == 2020) & (df_cohort['admit_time'].dt.month > 3) |
                         (df_cohort['admit_time'].dt.year == 2021)] # new data has 04/2020 -2021
print(len(train_labels)) # should be 31562
print(len(validation_labels)) # should be 12418
print(len(train_and_val_labels)) # should be 43980 - the full old cohort
print(len(test_labels)) # should be 16484 - the full new cohort
# len(trainval_labels) + len(test_labels)

27457
10797
38254
14278


### Check other tables: 
- diagnosis
- department
- treatment team
- clinical_doc_meta for notes

In [17]:
# diagnosis under shc_core_2021 - to search for DKA 
q = """ 

SELECT d.dx_name, d.icd10, d.pat_enc_csn_id_jittered
FROM 
    `som-nero-phi-jonc101.shc_core_2021.diagnosis` as d
RIGHT JOIN 
    `som-nero-phi-jonc101.triageTD.7_cohort4_labels` as c
ON (c.anon_id=d.anon_id and c.pat_enc_csn_id_coded=d.pat_enc_csn_id_jittered)
ORDER BY
  c.anon_id
  
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'diagnosis_2021.csv'), index=False)

In [18]:
# departments shc_core_2021 - to get OR department names and IDs

q = """ 

SELECT d.department_id as dept_id, d.department_name as dept_name
FROM 
    `som-nero-phi-jonc101.shc_core_2021.dep_map` as d
INNER JOIN
    `som-nero-phi-jonc101.shc_core_2021.adt` as adt
ON (d.department_id = adt.department_id)
RIGHT JOIN 
    `som-nero-phi-jonc101.triageTD.7_cohort4_labels` as c
ON (c.anon_id=adt.anon_id and c.pat_enc_csn_id_coded=adt.pat_enc_csn_id_coded)
ORDER BY
  c.anon_id
  
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'dept_2021.csv'), index=False)

In [20]:
# treatment_team, shc_core_2021 - to get provider names and id

q = """ 

SELECT t.anon_id, t.pat_enc_csn_id_coded, t.name, t.prov_map_id, 
        t.trtmnt_tm_begin_dt_jittered_utc as treatment_start_time,
        c.admit_time
FROM 
    `som-nero-phi-jonc101.shc_core_2021.treatment_team` as t
RIGHT JOIN 
    `som-nero-phi-jonc101.triageTD.7_cohort4_labels` as c
ON (t.anon_id=c.anon_id and t.pat_enc_csn_id_coded=c.pat_enc_csn_id_coded)
ORDER BY
  c.anon_id
  
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'team_2021.csv'), index=False)

In [27]:
# clinical_doc_meta, shc_core_2021; to get note types type and providers

q = """ 

SELECT DISTINCT m.anon_id, m.pat_enc_csn_id_coded, m.author_prov_map_id, m.auth_lnked_prov_map_id, 
        m.note_status_c, m.note_status, m.note_type,
FROM 
    `som-nero-phi-jonc101.shc_core_2021.clinical_doc_meta` as m
RIGHT JOIN 
    `som-nero-phi-jonc101.triageTD.7_cohort4_labels` as c
ON (m.anon_id = c.anon_id)
  
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'note_type_2021.csv'), index=False)

In [4]:
# use cohort4_3hr_noOR to get number of treatment team member counts in the ED before admission
q = """
select distinct anon_id, team.pat_enc_csn_id_coded, name, prov_map_id, team.trtmnt_tm_begin_dt_jittered_utc
    from `som-nero-phi-jonc101.shc_core_2021.treatment_team` as team
    left join 
    (select distinct pat_enc_csn_id_coded, admit_time
       from `som-nero-phi-jonc101.triageTD.7_cohort4_3hr_labels_noOR`
    ) as cohort
    on team.pat_enc_csn_id_coded = cohort.pat_enc_csn_id_coded
    where trtmnt_tm_begin_dt_jittered_utc < admit_time
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'cohort4_3hr_noOR_team_2021.csv'), index=False)

In [11]:
# use cohort_4_3hr_noOR to get note counts in the ED before admission
q = """
select m.anon_id, m.pat_enc_csn_id_coded, m.author_prov_map_id, m.auth_lnked_prov_map_id, 
        m.effective_dept_id as dept_id, m.ambulatory, 
        m.note_status_c, m.note_status, m.note_type, 
        m.note_date_jittered_utc, m.filing_date_jittered_utc,
        cohort.admit_time, cohort.first_ed_time
        
from `som-nero-phi-jonc101.shc_core_2021.clinical_doc_meta` as m
inner join 
    (select anon_id, admit_time, first_ed_time
    from `som-nero-phi-jonc101.triageTD.7_cohort4_3hr_noOR_first_ED_time`) as cohort
    using (anon_id)
    where timestamp_add(cohort.admit_time, interval 24 HOUR) >= m.effective_time_jittered_utc
    and m.effective_time_jittered_utc >= cohort.first_ed_time
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'cohort4_3hr_noOR_notes_2021.csv'), index=False)

In [15]:
q = """
select distinct name, prov_map_id
    from `som-nero-phi-jonc101.shc_core_2021.treatment_team` as team
    where team.prov_map_id
    in 
    (select distinct notes.auth_lnked_prov_map_id
        from `som-nero-phi-jonc101.shc_core_2021.clinical_doc_meta` as notes
        inner join 
            (   select anon_id, admit_time, first_ed_time
                from `som-nero-phi-jonc101.triageTD.7_cohort4_3hr_noOR_first_ED_time`) as cohort
        using (anon_id)
        where timestamp_add(cohort.admit_time, interval 24 HOUR) >= notes.effective_time_jittered_utc
        and notes.effective_time_jittered_utc >= cohort.first_ed_time
    ) 
union distinct

select distinct name, prov_map_id
    from `som-nero-phi-jonc101.shc_core_2021.treatment_team` as team
    where team.prov_map_id
    in 
    (select distinct notes.auth_lnked_prov_map_id
        from `som-nero-phi-jonc101.shc_core_2021.clinical_doc_meta` as notes
        inner join 
            (   select anon_id, admit_time, first_ed_time
                from `som-nero-phi-jonc101.triageTD.7_cohort4_3hr_noOR_first_ED_time`) as cohort
        using (anon_id)
        where timestamp_add(cohort.admit_time, interval 24 HOUR) >= notes.effective_time_jittered_utc
        and notes.effective_time_jittered_utc >= cohort.first_ed_time
    ) 
"""

query_job=client.query(q)
df=query_job.to_dataframe().to_csv(os.path.join(datadir, 'cohort4_3hr_noOR_team_2021.csv'), index=False)

### Push simpledata

### Complex data

In [19]:
# final cohort data with all feature values, warning due to time NA for demos
# will be use with feature counts
df = pd.read_csv(os.path.join(outdir, "6_9_coh4_feature_values.csv"))
print(len(df)) # 1190477
df.head(5)

1175680


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,feature_type,features,values,time
0,JC1000116,131295313275,2020-09-29 22:45:00+00:00,0.0,0,0,demo,ESI_i,3.0,
1,JC1000939,131295018112,2020-08-22 11:41:00+00:00,0.0,1,1,demo,ESI_i,2.0,
2,JC1001688,131288774622,2020-06-19 21:36:00+00:00,0.0,0,0,demo,ESI_i,3.0,
3,JC1001688,131302440087,2021-02-26 11:46:00+00:00,0.0,0,0,demo,ESI_i,3.0,
4,JC1001842,131305861173,2021-02-23 08:08:00+00:00,0.0,0,0,demo,ESI_i,3.0,


In [30]:
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'first_label', 'type' : 'INTEGER'},
                {'name' : 'death_24hr_recent_label', 'type' : 'INTEGER'},
                {'name' : 'feature_type', 'type' : 'STRING'},
                {'name' : 'features', 'type' : 'STRING'},
                {'name' : 'values', 'type' : 'FLOAT'},
                {'name' : 'recorded_time', 'type' : 'TIMESTAMP'}]

                       
DATASET_NAME = 'triageTD'
TABLE_NAME = '6_9_coh4_feature_values'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101',
          table_schema=table_schema,
          if_exists='replace')

1it [00:33, 33.63s/it]


### Feature order counts 
- This is needed for 6.11 notebook

In [49]:
q = open('../SQL/feature_counts/Code_Counts_val.sql', 'r')
query_job =  client.query(q.read()).to_dataframe().to_csv(os.path.join(datadir, 'coh4_order_code_counts_2021.csv'), index=False)

In [50]:
df = pd.read_csv(os.path.join(datadir, "coh4_order_code_counts_2021.csv"))
print(len(df)) # 6086852
df.head(5)

2792909


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JC641497,131284412456,2020-04-01 17:34:00+00:00,Diagnosis,J10.00,1
1,JC641497,131284412456,2020-04-01 17:34:00+00:00,Lab,TROPONIN I,4
2,JC641497,131284412456,2020-04-01 17:34:00+00:00,Diagnosis,Z79.01,1
3,JC641497,131284412456,2020-04-01 17:34:00+00:00,Diagnosis,N39.0,1
4,JC641497,131284412456,2020-04-01 17:34:00+00:00,Diagnosis,I50.43,1


In [51]:
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'feature_type', 'type' : 'STRING'},
                {'name' : 'features', 'type' : 'STRING'},
                {'name' : 'values', 'type' : 'FLOAT'}]

                       
DATASET_NAME = 'triageTD'
TABLE_NAME = 'coh4_order_code_counts_2021'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101',
          table_schema=table_schema,
          if_exists='replace')

1it [01:16, 76.32s/it]


### Push 6_11_coh4_all_features_all_long_year (ALL old and new cohort, ALL features, train and test bins)
This is the data set contains demos, vital and lab counts in bins, and order counts -- long format for modeling
- all new and old cohort of 60,464 unique csn: `6_11_coh4_all_features_all_long_year`

In [53]:
# this one has trainbin and testbin in the same dataframe
# df0 = pd.read_csv(os.path.join(featuredir, "2_9_coh5_features_all_long_year.csv"))
df = pd.read_csv(os.path.join(outdir, "6_11_coh4_all_features_all_long_year.csv"))
print(len(df)) # 14,269,242
df.head(5)

14269242


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values,year
0,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3.0,2019
1,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,demo,ESI_i,3.0,2019
2,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,demo,ESI_i,2.0,2019
3,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,demo,ESI_i,3.0,2019
4,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,demo,ESI_i,3.0,2019


In [54]:
df["admit_time"] = pd.to_datetime(df["admit_time"])
df['year'] = df['admit_time'].dt.year

In [55]:
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'feature_type', 'type' : 'STRING'},
                {'name' : 'features', 'type' : 'STRING'},
                {'name' : 'values', 'type' : 'FLOAT'},
                {'name' : 'year', 'type': 'INTEGER'}]

                       
DATASET_NAME = 'triageTD'
TABLE_NAME = '6_11_coh4_all_features_all_long_year'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101',
          table_schema=table_schema,
          if_exists='replace')

1it [06:50, 410.37s/it]


### CHECK year split for sparse_matrix

In [9]:
dft = pd.read_csv(os.path.join(outdir, "6_10_coh4_binned_labs_vitals_train.csv"))
dft["admit_time"] = pd.to_datetime(dft["admit_time"])

print(len(dft)) # 1826919
dft.head(5)

1826919


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALB_3,1
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALK_7,1
2,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALT_0,1
3,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AST_1,1
4,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AnionGap_9,1


In [10]:
df["admit_time"] = pd.to_datetime(df["admit_time"])
dfmonth = df[(df['admit_time'].dt.year == 2020) & (df['admit_time'].dt.month > 3) | (df['admit_time'].dt.year == 2021)]
print(len(dfmonth))

df2020 = df[(df['admit_time'].dt.year == 2020) & (df['admit_time'].dt.month > 3)]
print(len(df2020))

df2021 = df[(df['admit_time'].dt.year == 2021)]
print(len(df2021))

print(len(df2020) + len(df2021))

df2020['year'] = df2020['admit_time'].dt.year
df2020['month'] = df2020['admit_time'].dt.month

# df2020.year.unique()
df2020.month.unique()

16484
7465
9019
16484


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2020['year'] = df2020['admit_time'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2020['month'] = df2020['admit_time'].dt.month


array([ 9,  8,  6, 10, 12, 11,  4,  7,  5], dtype=int64)

In [13]:
train_labels = dft[dft['admit_time'].dt.year < 2019]
validation_labels = dft[dft['admit_time'].dt.year >= 2019]
print(len(train_labels)) # 1323438
print(len(validation_labels)) # 503481
len(train_labels) + len(validation_labels) # 1826919

1323438
503481


1826919

In [14]:
dftv = pd.read_csv(os.path.join(outdir, "6_10_coh4_binned_labs_vitals_test.csv"))
dftv["admit_time"] = pd.to_datetime(dftv["admit_time"])

print(len(dftv)) # 2534674
dftv.head(5)

2534674


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JC1000116,131295313275,2020-09-29 22:45:00+00:00,labs_results_test,ALB_7,1
1,JC1000116,131295313275,2020-09-29 22:45:00+00:00,labs_results_test,ALK_1,1
2,JC1000116,131295313275,2020-09-29 22:45:00+00:00,labs_results_test,ALT_3,1
3,JC1000116,131295313275,2020-09-29 22:45:00+00:00,labs_results_test,AST_4,1
4,JC1000116,131295313275,2020-09-29 22:45:00+00:00,labs_results_test,AnionGap_4,1


In [18]:
trainval_labels = dft
test_labels = dftv[(dftv['admit_time'].dt.year > 2019)  & (dftv['admit_time'].dt.month > 3)]
print(len(trainval_labels))
print(len(test_labels))
len(trainval_labels) + len(test_labels)

1826919
594342


2421261

### Push 1_4_cohort test (2019 and 2020) with predicted results

In [7]:
df = pd.read_csv(os.path.join(modeldir4, "1_4_cohort_test_results.csv"))
print(len(df)) # 12418
df.head(5)

12418


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max,adm_year,transfer,pred_death_24hr_max,pred_death_24hr_recent,pred_first,diff0_24,diff0_True,diff24_True,abs_diff0_24
0,JCe8840f,131264906504,34995073,2019-02-14 22:22:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,2019,0,0.281178,0.209197,0.235676,0.03,0.24,0.21,0.0
1,JCdb7bb2,131280664882,44427056,2019-11-22 08:10:00,1,1,,0,0,1,1,1,123,0,0,0,0,2019,0,0.952692,0.846169,0.946322,0.1,-0.05,-0.15,0.1
2,JCdf010a,131282861801,45762025,2020-03-04 03:38:00,1,1,0.0,1,0,1,1,0,0,1,0,1,0,2020,1,0.767333,0.714786,0.688232,-0.03,0.69,-0.29,0.0
3,JC2a0fefa,131264837675,34946537,2019-03-01 18:26:00,1,1,1.0,1,0,1,1,1,0,0,0,0,0,2019,0,0.822907,0.485052,0.654735,0.17,-0.35,-0.51,0.2
4,JCcbc03d,131279238299,43525642,2019-11-08 01:14:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0,2019,0,0.359332,0.348007,0.306411,-0.04,0.31,0.35,0.0


In [8]:
DATASET_NAME = 'triageTD'
TABLE_NAME = '1_4_cohort_test_results'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101', 
          if_exists='replace')

1it [00:05,  5.91s/it]


In [4]:
df = pd.read_csv(os.path.join(modeldir4, "1_4_cohort_diff_full_features.csv"))
print(len(df)) # 71107 (modeldir4) vs 69334 modeldir4preadmit
df.tail(5)

71107


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,pred_first,pred_death_24hr_recent,abs_diff0_24,diff0_True,diff24_True,feature_type,features,values,time
71102,JCda7d53,131278811777,2019-12-06 03:25:00,1,1,1,0.26965,0.525167,0.3,-0.73,-0.47,labs,Eos,0.01,2019-12-06 00:48:00+00:00
71103,JCda7d53,131278811777,2019-12-06 03:25:00,1,1,1,0.26965,0.525167,0.3,-0.73,-0.47,labs,Lymp,1.23,2019-12-06 00:48:00+00:00
71104,JCda7d53,131278811777,2019-12-06 03:25:00,1,1,1,0.26965,0.525167,0.3,-0.73,-0.47,labs,Basos,0.03,2019-12-06 00:48:00+00:00
71105,JCda7d53,131278811777,2019-12-06 03:25:00,1,1,1,0.26965,0.525167,0.3,-0.73,-0.47,labs,Mono,0.7,2019-12-06 00:48:00+00:00
71106,JCda7d53,131278811777,2019-12-06 03:25:00,1,1,1,0.26965,0.525167,0.3,-0.73,-0.47,labs,Neut,19.62,2019-12-06 00:48:00+00:00


In [6]:
DATASET_NAME = 'triageTD'
TABLE_NAME = '1_4_cohort_diff_full_features'
# TABLE_NAME = '1_4_cohort_24hrpreadmit_diff_full_features'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101', 
          if_exists='replace')

1it [00:09,  9.40s/it]


### Push 1_4_cohort all with full features used for modeling

In [9]:
df = pd.read_csv(os.path.join(featuredir, "1_4_cohort_full_features.csv"), engine='python')
print(len(df)) # 9,171,908 modeldir4 vs modeldir4preadmit
df.head(5)

9171898


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max,feature_type,features,values,time,hr_before_admit
0,JCd97296,131176042095,18290644,2016-02-06 22:31:00,0,0,,0,0,0,0,0,1325,0,0,0,0,Lab,CBC WITH DIFFERENTIAL,2.0,,
1,JCd97296,131176042095,18290644,2016-02-06 22:31:00,0,0,,0,0,0,0,0,1325,0,0,0,0,Lab,AB SCREEN (ASI),2.0,,
2,JCd97296,131176042095,18290644,2016-02-06 22:31:00,0,0,,0,0,0,0,0,1325,0,0,0,0,Lab,"DRUGS OF ABUSE SCREEN, URINE",2.0,,
3,JCd97296,131176042095,18290644,2016-02-06 22:31:00,0,0,,0,0,0,0,0,1325,0,0,0,0,Lab,VOLATILE SCREEN,2.0,,
4,JCd97296,131176042095,18290644,2016-02-06 22:31:00,0,0,,0,0,0,0,0,1325,0,0,0,0,Lab,ABO/RH (ARI),2.0,,


In [None]:
DATASET_NAME = 'triageTD'
TABLE_NAME = '1_4_cohort_full_features'
# TABLE_NAME = '1_4_cohort_24hrpreadmit_full_features'
df.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
          project_id='som-nero-phi-jonc101', 
          if_exists='replace')

1it [04:21, 261.39s/it]


### Check csn in clinical_meta_doc

In [4]:
q = """
select * 
from som-nero-phi-jonc101.shc_core_2021.clinical_doc_meta as notes
where notes.pat_enc_csn_id_coded in 
    (select pat_enc_csn_id_coded
     from som-nero-phi-jonc101.triageTD.6_7_cohort4
    )
"""
query_job = client.query(q)
doc = query_job.to_dataframe()
print(len(doc)) #

0


In [11]:
q = """
select *
from som-nero-phi-naras-ric.Jon_Chen_data_Oct_2021.shc_clinical_doc_meta as notes
where notes.pat_enc_csn_id_coded in 
    (select pat_enc_csn_id_coded
     from `som-nero-phi-jonc101.triageTD.6_7_cohort4_all`
    )
"""
query_job = client.query(q)
doc = query_job.to_dataframe()
print(len(doc)) #

Forbidden: 403 POST https://bigquery.googleapis.com/bigquery/v2/projects/som-nero-phi-naras-ric/jobs?prettyPrint=false: Access Denied: Project som-nero-phi-naras-ric: User does not have bigquery.jobs.create permission in project som-nero-phi-naras-ric.

(job ID: 47619d7b-a383-4af1-850b-aac88330d8ea)

                           -----Query Job SQL Follows-----                            

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:select *
   3:from som-nero-phi-naras-ric.Jon_Chen_data_Oct_2021.shc_clinical_doc_meta as notes
   4:where notes.pat_enc_csn_id_coded in 
   5:    (select pat_enc_csn_id_coded
   6:     from `som-nero-phi-jonc101.triageTD.6_7_cohort4_all`
   7:    )
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |

### Misc