## SQL & BQ
Collection of files queried from or pushed to BQ

In [22]:
# !pip install pandas-gbq

In [1]:
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# %matplotlib inline
# %load_ext rpy2.ipython

In [2]:
import os 
from google.cloud import bigquery
from google.cloud.bigquery import dbapi

## Point GOOGLE_APPLICATION_CREDENTIALS at the gcloud application-default credentials
## file of whoever runs the notebook (Nero or a local computer), derived from the
## platform and home directory instead of hard-coding one user's path and
## commenting lines in and out.
# use Ctrl + Insert to copy and Shift + Insert to paste

if os.name == 'nt':
    # local Windows computer: %APPDATA%\gcloud\application_default_credentials.json
    gcloud_config_dir = os.path.join(
        os.environ.get('APPDATA', os.path.join(os.path.expanduser('~'), 'AppData', 'Roaming')),
        'gcloud')
else:
    # Nero (and other Linux/macOS hosts): ~/.config/gcloud/application_default_credentials.json
    gcloud_config_dir = os.path.join(os.path.expanduser('~'), '.config', 'gcloud')

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join(
    gcloud_config_dir, 'application_default_credentials.json')

##set correct Nero project
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101'

##Setting up BQ API, m1:
client = bigquery.Client()

##using dbAPI connection, m2:
conn = dbapi.connect(client)



### Check ADT patient class and service to identify admitted ED patients

In [3]:
# Relative folders used throughout the notebook: query dumps, cohort files, feature files.
datadir, cohortdir, featuredir = (
    "../../DataTD",
    "../../OutputTD/1_cohort",
    "../../OutputTD/2_features",
)

# Show at most 50 columns and 50 rows when pandas renders a frame.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 50)

In [4]:
# Check information on ADT to select the patient class and service for querying the original cohort.
# The filter keeps an ADT row when at least one of these holds: pat_class_c is "112",
# pat_class is "Emergency Services", or pat_service contains "Emergency".
q = """
SELECT adt.pat_class_c, adt.pat_class, adt.base_pat_class, adt.pat_service, adt.pat_lv_of_care
FROM 
    `som-nero-phi-jonc101.shc_core.adt` as adt
WHERE adt.pat_class_c = "112"
or adt.pat_class = "Emergency Services"
or adt.pat_service LIKE  "%Emergency%"
"""
# Submit the query, then pull the full result into a pandas DataFrame.
query_job = client.query(q)
df = query_job.to_dataframe()

In [5]:
# Write the ADT extract to disk, then load it back from that same CSV.
adt_class_csv = os.path.join(datadir, "adt_class_serv_loc.csv")
df.to_csv(adt_class_csv, index=False)
df = pd.read_csv(adt_class_csv)
print(len(df)) #2711384

2711384


### Run 1.1_cohort_init notebook
Results in `1_1_cohort` as the init (original) cohort.

Label is the label for highest level of care within 24 hours since admission.

### Query information for the other inclusion/exclusion criteria
When querying, join with `1_1_cohort`
- Inpatient/hospital encounters only
- Full code only
- Age 18 and older only

In [4]:
# Run the query stored in m1_encounter.sql and save the result as encounters.csv.
# The `with` block closes the SQL file once it is read (the handle was previously left
# open). The result is not bound to `query_job`: `to_csv` returns None when given a path.
with open('../SQL/feature_values/m1_encounter.sql', 'r') as q:
    sql_text = q.read()
client.query(sql_text).to_dataframe().to_csv(os.path.join(datadir, 'encounters.csv'), index=False)

In [8]:
# Run the query stored in m2_codestatus.sql and save the result as code_status.csv.
# The `with` block closes the SQL file once it is read (the handle was previously left
# open). The result is not bound to `query_job`: `to_csv` returns None when given a path.
with open('../SQL/feature_values/m2_codestatus.sql', 'r') as q:
    sql_text = q.read()
client.query(sql_text).to_dataframe().to_csv(os.path.join(datadir, 'code_status.csv'), index=False)

In [6]:
# Run the query stored in m3_demographic.sql and save the result as demographics.csv.
# The `with` block closes the SQL file once it is read (the handle was previously left
# open). The result is not bound to `query_job`: `to_csv` returns None when given a path.
with open('../SQL/feature_values/m3_demographic.sql', 'r') as q:
    sql_text = q.read()
client.query(sql_text).to_dataframe().to_csv(os.path.join(datadir, 'demographics.csv'), index=False)

### Push the 1_2_cohort after running the 1.2_criteria_R notebook
The inclusion/exclusion criteria are applied to `1_1_cohort` to filter patients, resulting in the updated `1_2_cohort`

In [11]:
# Load the 1_2_cohort file and report its row count and column names.
cohort_csv = os.path.join(cohortdir, "1_2_cohort.csv")
df = pd.read_csv(cohort_csv)
print(len(df)) # 45796 vs 45794
print(df.columns.tolist())

45794
['anon_id', 'pat_enc_csn_id_coded', 'label', 'admit_time', 'inpatient_data_id_coded', 'ESI', 'hosp_admsn_time', 'ed_time_hr', 'gender', 'race', 'language', 'recent_height_cm', 'recent_weight_kg', 'insurance', 'recent_date', 'age']


In [12]:
# Preview the first rows of the frame currently held in `df`.
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,recent_date,age
0,JC29f8ad2,131274729058,0,2019-08-31 12:52:00,40679773,3.0,2019-08-31 09:59:00,2.883333,Female,White,English,165.1,83.46,UNITED HEALTHCARE,2020-03-31,52
1,JC29f8ad3,131278291027,0,2019-10-05 23:48:00,42992239,3.0,2019-10-05 20:07:00,3.683333,Female,Asian,English,,,BLUE CROSS,2020-02-24,35
2,JC29f8b9c,131266787806,0,2019-05-05 01:07:00,36261582,2.0,2019-05-05 00:06:00,1.016667,Male,Unknown,,,,,2019-01-12,59
3,JC29f8beb,131264387263,0,2019-03-15 03:35:00,34626013,3.0,2019-03-15 00:16:00,3.316667,Female,White,English,154.9,65.4,CIGNA,2019-12-22,30
4,JC29f8beb,131279241689,0,2019-11-27 15:29:00,43527040,3.0,2019-11-27 13:32:00,1.95,Female,White,English,154.9,65.4,CIGNA,2019-12-22,30


In [13]:
# BigQuery column types for the 1_2_cohort upload, as (column, type) pairs.
column_types = [
    ('anon_id', 'STRING'),
    ('pat_enc_csn_id_coded', 'INTEGER'),
    ('inpatient_data_id_coded', 'INTEGER'),
    ('admit_time', 'TIMESTAMP'),
    ('label', 'INTEGER'),
    ('hosp_admsn_time', 'TIMESTAMP'),
    ('ed_time_hr', 'FLOAT'),
    ('ESI', 'INTEGER'),
    ('age', 'INTEGER'),
    ('gender', 'STRING'),
    ('race', 'STRING'),
    ('language', 'STRING'),
    ('insurance', 'STRING'),
    ('recent_height_cm', 'FLOAT'),
    ('recent_weight_kg', 'FLOAT'),
    ('recent_date', 'DATE'),
]
table_schema = [{'name': column, 'type': bq_type} for column, bq_type in column_types]

DATASET_NAME = 'triageTD'
TABLE_NAME = '1_2_cohort'
# Upload the frame, replacing the table if it already exists.
df.to_gbq(
    destination_table='triageTD.%s' % TABLE_NAME,
    project_id='som-nero-phi-jonc101',
    table_schema=table_schema,
    if_exists='replace',
)

1it [00:46, 46.38s/it]


### Use updated 1_2_cohort with inpatient_data_id_coded to query flowsheet

In [18]:
# Run the query stored in m4_HWflowsheet.sql and save the result as HWflowsheet.csv.
# The `with` block closes the SQL file once it is read (the handle was previously left
# open). The result is not bound to `query_job`: `to_csv` returns None when given a path.
with open('../SQL/feature_values/m4_HWflowsheet.sql', 'r') as q:
    sql_text = q.read()
client.query(sql_text).to_dataframe().to_csv(os.path.join(datadir, 'HWflowsheet.csv'), index=False)

In [20]:
# Run the query stored in m5_flowsheet.sql and save the result as flowsheet.csv.
# The `with` block closes the SQL file once it is read (the handle was previously left
# open). The result is not bound to `query_job`: `to_csv` returns None when given a path.
with open('../SQL/feature_values/m5_flowsheet.sql', 'r') as q:
    sql_text = q.read()
client.query(sql_text).to_dataframe().to_csv(os.path.join(datadir, 'flowsheet.csv'), index=False)

In [16]:
# Close the SQL file handle that `q` currently refers to (only the most recently opened one).
q.close()

### Query and save files (demographics, code status, and admission) for joining later

### Push 1_3_cohort file from R notebook to Big Query
- From 1.3_cohort_complete1vitals.ipynb or part 2 of 2.3_vitalsigns_R.ipynb
- This is the updated cohort with at least a complete set of vital signs. First set in this file

In [23]:
# Load the 1_3_cohort file, print its row count and preview the first five rows.
cohort_csv = os.path.join(cohortdir, "1_3_cohort.csv")
df = pd.read_csv(cohort_csv)
print(len(df)) # 44258
df.head(n=5)

44258


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,DBP,Pulse,RR,SBP,Temp
0,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00+00:00,0,74,95,20,118,36.95
1,JC29f8ad3,131278291027,42992239,2019-10-05 23:48:00+00:00,0,59,78,18,90,36.85
2,JC29f8b9c,131266787806,36261582,2019-05-05 01:07:00+00:00,0,66,108,26,165,39.2
3,JC29f8beb,131264387263,34626013,2019-03-15 03:35:00+00:00,0,82,93,16,127,37.05
4,JC29f8beb,131279241689,43527040,2019-11-27 15:29:00+00:00,0,64,118,18,116,37.05


In [24]:
# cohort file
# Explicit BigQuery column types for the 1_3_cohort upload. The id column of this frame
# is 'anon_id'; the earlier 'jc_uid' entry matched no column (pandas-gbq ignores schema
# entries for columns that are not in the DataFrame), so it has been renamed.
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'inpatient_data_id_coded', 'type': 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'label', 'type' : 'INTEGER'},
                {'name' : 'DBP', 'type' : 'INTEGER'},
                {'name' : 'SBP', 'type' : 'INTEGER'},
                {'name' : 'Pulse', 'type' : 'INTEGER'},
                {'name' : 'RR', 'type' : 'INTEGER'},
                {'name' : 'Temp', 'type' : 'FLOAT'}]

DATASET_NAME = 'triageTD'
TABLE_NAME = '1_3_cohort'
# The destination is built from DATASET_NAME so the dataset is named in one place only.
df.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
          project_id='som-nero-phi-jonc101',
          table_schema=table_schema,
          if_exists='replace')

1it [00:03,  3.91s/it]


### Use updated 1_3_cohort with reduced observations to query labs
And push the cohort with labs to BQ

In [26]:
# Run the query stored in m6_labs.sql and save the result as labs.csv.
# The `with` block closes the SQL file once it is read (the handle was previously left
# open). The result is not bound to `query_job`: `to_csv` returns None when given a path.
with open('../SQL/feature_values/m6_labs.sql', 'r') as q:
    sql_text = q.read()
client.query(sql_text).to_dataframe().to_csv(os.path.join(datadir, 'labs.csv'), index=False)

In [29]:
# After processing labs, load the retained cohort (encounters with labs).
df = pd.read_csv(os.path.join(cohortdir, "cohort3L_withlabs.csv"))
print(len(df)) # 41627 in the recorded run
df.head(5)

41627


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label
0,JCe33305,131063880385,13777312,2015-01-04 08:11:00+00:00,0
1,JCcd4014,131178061474,18550319,2016-03-16 00:01:00+00:00,0
2,JCdbdb3f,131080211867,14791894,2015-03-09 01:31:00+00:00,0
3,JCeaa1bc,131064623255,13866526,2015-01-03 22:02:00+00:00,1
4,JCe46ec7,131082536687,14979032,2015-05-01 21:50:00+00:00,0


In [30]:
# cohort file
# Explicit BigQuery column types for the cohort3L_withlabs upload. The id column of this
# frame is 'anon_id'; the earlier 'jc_uid' entry matched no column (pandas-gbq ignores
# schema entries for columns that are not in the DataFrame), so it has been renamed.
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'inpatient_data_id_coded', 'type': 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'label', 'type' : 'INTEGER'}]

DATASET_NAME = 'triageTD'
TABLE_NAME = 'cohort3L_withlabs'
# The destination is built from DATASET_NAME so the dataset is named in one place only.
df.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
          project_id='som-nero-phi-jonc101',
          table_schema=table_schema,
          if_exists='replace')

1it [00:04,  4.93s/it]


### Push some final datasets:
- 1_4_cohort
- 1_5_cohort_final
- simple data (columns such as race.White must be renamed to race_White, etc., before pushing, to avoid having a `.` in column names)
- full feature values dataset

In [5]:
# from Tiffany's: load the 1_4_cohort file, print its shape and preview five rows
cohort_csv = os.path.join(cohortdir, "1_4_cohort.csv")
df = pd.read_csv(cohort_csv)
print(df.shape) # 43980
df.head(n=5)

(43980, 17)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max
0,JCd97296,131176042095,18290644,2016-02-06 22:31:00+00:00,0,0,,0,0,0,0,0,1325.0,0,0,0,0
1,JCcdc7e1,131064611420,13865299,2015-01-15 21:16:00+00:00,1,1,1.0,1,0,1,1,1,0.0,0,0,0,0
2,JCe3e5f4,131072326078,14296997,2015-01-28 11:12:00+00:00,1,1,1.0,1,0,1,1,1,0.0,0,0,0,0
3,JCdcfce9,131178712824,18633398,2016-03-04 17:01:00+00:00,1,1,1.0,1,0,1,1,1,0.0,0,0,0,0
4,JCdaaaa6,131211945620,22773101,2016-12-07 22:17:00+00:00,0,0,0.0,1,0,0,0,0,0.0,0,0,0,0


In [6]:
# Convert the admit_time column (read from the CSV) to pandas datetimes before uploading.
df["admit_time"] = pd.to_datetime(df["admit_time"]) 

In [7]:
# Upload the 1_4_cohort frame to BigQuery without an explicit schema,
# replacing the table if it already exists.
DATASET_NAME = 'triageTD'
TABLE_NAME = '1_4_cohort'
destination = 'triageTD.%s' % TABLE_NAME
df.to_gbq(destination_table=destination, project_id='som-nero-phi-jonc101', if_exists='replace')

1it [00:04,  4.65s/it]


In [14]:
# Load the 1_5_cohort_final file, print its shape and preview five rows.
cohort_csv = os.path.join(cohortdir, "1_5_cohort_final.csv")
df = pd.read_csv(cohort_csv)
print(df.shape) # 41366
df.head(n=5)

(41366, 17)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max
0,JCe33305,131063880385,13777312,2015-01-04 08:11:00+00:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0
1,JCcd4014,131178061474,18550319,2016-03-16 00:01:00+00:00,0,0,,0,0,0,0,0,233,0,0,0,0
2,JCdbdb3f,131080211867,14791894,2015-03-09 01:31:00+00:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0
3,JCeaa1bc,131064623255,13866526,2015-01-03 22:02:00+00:00,1,1,1.0,1,0,1,1,1,0,0,0,0,0
4,JCe46ec7,131082536687,14979032,2015-05-01 21:50:00+00:00,0,0,0.0,1,0,0,0,0,0,0,0,0,0


In [12]:
# List the column names of the frame currently held in `df`.
print(df.columns.tolist())

['anon_id', 'pat_enc_csn_id_coded', 'inpatient_data_id_coded', 'admit_time', 'label_max24', 'label_24hr_recent', 'admit_label', 'has_admit_label', 'died_within_24hrs', 'death_24hr_max_label', 'death_24hr_recent_label', 'first_label', 'first_label_minutes_since_admit', 'acute_to_critical_label_recent', 'critical_to_acute_label_recent', 'acute_to_critical_label_max', 'critical_to_acute_label_max']


In [15]:
# cohort file
# Explicit BigQuery column types for the 1_5_cohort_final upload. The id column of this
# frame is 'anon_id'; the earlier 'jc_uid' entry matched no column (pandas-gbq ignores
# schema entries for columns that are not in the DataFrame), so it has been renamed.
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'inpatient_data_id_coded', 'type': 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'label_max24', 'type' : 'INTEGER'},
                {'name' : 'label_24hr_recent', 'type' : 'INTEGER'},
                {'name' : 'admit_label', 'type' : 'INTEGER'},
                {'name' : 'has_admit_label', 'type' : 'INTEGER'},
                {'name' : 'died_within_24hrs', 'type' : 'INTEGER'},
                {'name' : 'death_24hr_max_label', 'type' : 'INTEGER'},
                {'name' : 'death_24hr_recent_label', 'type' : 'INTEGER'},
                {'name' : 'first_label', 'type' : 'INTEGER'},
                {'name' : 'first_label_minutes_since_admit', 'type' : 'INTEGER'},
                {'name' : 'acute_to_critical_label_recent', 'type' : 'INTEGER'},
                {'name' : 'critical_to_acute_label_recent', 'type' : 'INTEGER'},
                {'name' : 'acute_to_critical_label_max', 'type' : 'INTEGER'},
                {'name' : 'critical_to_acute_label_max', 'type' : 'INTEGER'}]

DATASET_NAME = 'triageTD'
TABLE_NAME = '1_5_cohort_final'
# The destination is built from DATASET_NAME so the dataset is named in one place only.
df.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
          project_id='som-nero-phi-jonc101',
          table_schema=table_schema,
          if_exists='replace')

1it [00:06,  6.80s/it]


In [33]:
# simple data with vitals sum stats: load, print the shape, show the first 4 rows x 23 columns
simpledata_csv = os.path.join(featuredir, "2_6_simpledata.csv")
df = pd.read_csv(simpledata_csv)
print(df.shape) # 41366
df.iloc[:4, :23]

(41366, 81)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,first_label,death_24hr_recent_label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,race.Asian,race.Black,race.Native.American,race.Other,race.Pacific.Islander,race.Unknown,race.White,DBP_count,DBP_first_val
0,JCe33305,131063880385,0,0,3,0,1,56,1,1,163.0,0,60.0,0,0,1,0,0,0,0,0,13,68
1,JCcd4014,131178061474,0,0,3,0,1,81,1,1,163.0,0,97.0,0,0,1,0,0,0,0,0,13,109
2,JCdbdb3f,131080211867,0,0,3,0,1,79,0,1,155.0,0,56.0,0,1,0,0,0,0,0,0,6,60
3,JCeaa1bc,131064623255,1,1,2,0,0,51,1,1,190.0,0,79.0,0,0,0,0,0,0,0,1,28,62


In [34]:
# Replace every "." in the column names (e.g. race.White -> race_White) with "_".
# regex=False forces "." to be treated as a literal dot on every pandas version;
# interpreted as a regular expression it would match any character.
df.columns = df.columns.str.replace(".", "_", regex=False)
df.iloc[0:4, 0:23]

Unnamed: 0,anon_id,pat_enc_csn_id_coded,first_label,death_24hr_recent_label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,race_Asian,race_Black,race_Native_American,race_Other,race_Pacific_Islander,race_Unknown,race_White,DBP_count,DBP_first_val
0,JCe33305,131063880385,0,0,3,0,1,56,1,1,163.0,0,60.0,0,0,1,0,0,0,0,0,13,68
1,JCcd4014,131178061474,0,0,3,0,1,81,1,1,163.0,0,97.0,0,0,1,0,0,0,0,0,13,109
2,JCdbdb3f,131080211867,0,0,3,0,1,79,0,1,155.0,0,56.0,0,1,0,0,0,0,0,0,6,60
3,JCeaa1bc,131064623255,1,1,2,0,0,51,1,1,190.0,0,79.0,0,0,0,0,0,0,0,1,28,62


In [35]:
# Column names containing a dot (from one-hot coding) can't be pushed, so the frame
# must have been renamed before this upload. No explicit schema is passed.
DATASET_NAME = 'triageTD'
TABLE_NAME = '2_6_simpledata'
destination = 'triageTD.%s' % TABLE_NAME
df.to_gbq(destination_table=destination, project_id='som-nero-phi-jonc101', if_exists='replace')

1it [00:22, 22.16s/it]


In [18]:
# Final cohort data with all feature values; it will be used together with the feature counts.
# The warning shown in the output is attributed to `time` being NA for the demo rows.
df = pd.read_csv(os.path.join(featuredir, "2_7_feature_values.csv"))
print(len(df)) # 3012942
df.head(5)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


3012942


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,feature_type,features,values,time
0,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,0,0,demo,ESI_i,3.0,
1,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,0,0,demo,ESI_i,3.0,
2,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,0,0,demo,ESI_i,2.0,
3,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,0,0,demo,ESI_i,3.0,
4,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,0,0,demo,ESI_i,3.0,


In [20]:
# Explicit BigQuery column types for the 2_7_feature_values upload.
# The timestamp column of this frame is called 'time'; the earlier schema entry
# 'recorded_time' matched no column (pandas-gbq ignores schema entries for columns
# that are not in the DataFrame), so 'time' was never typed as TIMESTAMP.
# NOTE(review): this changes the declared type of `time` in the pushed table —
# confirm that downstream queries expect a TIMESTAMP there.
table_schema = [{'name' : 'anon_id', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'first_label', 'type' : 'INTEGER'},
                {'name' : 'death_24hr_recent_label', 'type' : 'INTEGER'},
                {'name' : 'feature_type', 'type' : 'STRING'},
                {'name' : 'features', 'type' : 'STRING'},
                {'name' : 'values', 'type' : 'FLOAT'},
                {'name' : 'time', 'type' : 'TIMESTAMP'}]

DATASET_NAME = 'triageTD'
TABLE_NAME = '2_7_feature_values'
# The destination is built from DATASET_NAME so the dataset is named in one place only.
df.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
          project_id='som-nero-phi-jonc101',
          table_schema=table_schema,
          if_exists='replace')

1it [02:55, 175.03s/it]


### Feature order counts 

In [8]:
# Run the query stored in Code_Counts.sql and save the result as coh4_order_code_counts.csv.
# The `with` block closes the SQL file once it is read (the handle was previously left
# open). The result is not bound to `query_job`: `to_csv` returns None when given a path.
with open('../SQL/feature_counts/Code_Counts.sql', 'r') as q:
    sql_text = q.read()
client.query(sql_text).to_dataframe().to_csv(os.path.join(datadir, 'coh4_order_code_counts.csv'), index=False)

In [8]:
# Load the order/code counts, print the row count and preview five rows.
# NOTE(review): this reads "order_code_counts.csv", whereas the Code_Counts.sql cell
# writes "coh4_order_code_counts.csv" — confirm which file is intended here.
df = pd.read_csv(os.path.join(datadir, "order_code_counts.csv"))
print(len(df)) # 5730678
df.head(5)

5730678


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I95.9,1
1,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,F41.1,1
2,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I12.9,1
3,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I99.8,1
4,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Lab,"MAGNESIUM, SERUM/PLASMA",1


### Push 2_9_features_all_long
This data set contains demographics, vital-sign and lab counts in bins, and order counts, in long format for modeling

In [17]:
# this one only has trainbin
features_long_csv = os.path.join(featuredir, "2_9_features_all_long.csv")
df0 = pd.read_csv(features_long_csv)
print(len(df0)) # 8258975
df0.head(n=5)

8258975


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3.0
1,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,demo,ESI_i,3.0
2,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,demo,ESI_i,2.0
3,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,demo,ESI_i,3.0
4,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,demo,ESI_i,3.0


In [20]:
# Parse admit_time once, store it back, and derive the admission year from the parsed values.
admit_timestamps = pd.to_datetime(df0["admit_time"])
df0["admit_time"] = admit_timestamps
df0['year'] = admit_timestamps.dt.year

In [28]:
# BigQuery column types for the 2_9_features_all_long upload, as (column, type) pairs.
column_types = [
    ('anon_id', 'STRING'),
    ('pat_enc_csn_id_coded', 'INTEGER'),
    ('admit_time', 'TIMESTAMP'),
    ('feature_type', 'STRING'),
    ('features', 'STRING'),
    ('values', 'FLOAT'),
    ('year', 'INTEGER'),
]
table_schema = [{'name': column, 'type': bq_type} for column, bq_type in column_types]

DATASET_NAME = 'triageTD'
TABLE_NAME = '2_9_features_all_long'
# Upload df0, replacing the table if it already exists.
df0.to_gbq(
    destination_table='triageTD.%s' % TABLE_NAME,
    project_id='som-nero-phi-jonc101',
    table_schema=table_schema,
    if_exists='replace',
)

1it [04:00, 240.71s/it]


In [11]:
# this one only has trainbin and testbin for vitals and labs
features_year_csv = os.path.join(featuredir, "2_9_coh4_features_all_long_year.csv")
df = pd.read_csv(features_year_csv)
print(len(df)) # 10529994
df.head(n=5)

10529994


Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values,year
0,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3.0,2019
1,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,demo,ESI_i,3.0,2019
2,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,demo,ESI_i,2.0,2019
3,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,demo,ESI_i,3.0,2019
4,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,demo,ESI_i,3.0,2019


In [12]:
# BigQuery column types for the 2_9_coh4_features_all_long_year upload, as (column, type) pairs.
column_types = [
    ('anon_id', 'STRING'),
    ('pat_enc_csn_id_coded', 'INTEGER'),
    ('admit_time', 'TIMESTAMP'),
    ('feature_type', 'STRING'),
    ('features', 'STRING'),
    ('values', 'FLOAT'),
    ('year', 'INTEGER'),
]
table_schema = [{'name': column, 'type': bq_type} for column, bq_type in column_types]

DATASET_NAME = 'triageTD'
TABLE_NAME = '2_9_coh4_features_all_long_year'
# Upload the frame, replacing the table if it already exists.
df.to_gbq(
    destination_table='triageTD.%s' % TABLE_NAME,
    project_id='som-nero-phi-jonc101',
    table_schema=table_schema,
    if_exists='replace',
)

1it [02:50, 170.02s/it]


### CHECK ADT

In [10]:
# This one is for Tif's label notebook: ADT information used for label processing.
# The RIGHT JOIN keeps every row of triageTD.1_2_cohort, matched to ADT rows on
# anon_id and pat_enc_csn_id_coded.
q = """
SELECT adt.anon_id, adt.pat_enc_csn_id_coded, adt.effective_time_jittered_utc, adt.seq_num_in_enc,
        adt.pat_class, adt.base_pat_class_c, adt.pat_lvl_of_care_c, adt.pat_lv_of_care, 
        adt.event_type, adt.pat_service
    
FROM shc_core.adt adt
RIGHT JOIN triageTD.1_2_cohort c 
ON adt.anon_id = c.anon_id and adt.pat_enc_csn_id_coded = c.pat_enc_csn_id_coded
"""
# Run the query and load the full result into a DataFrame.
query_job = client.query(q)
adt = query_job.to_dataframe()
print(len(adt)) # 886988

# Save the ADT rows for the cohort as cohort_adt.csv in the data folder.
adt.to_csv(os.path.join(datadir, 'cohort_adt.csv'), index=False)

886988


### Check Admission under order_proc datatable

In [7]:
# Pull "Admission" orders from order_proc together with all 1_4_cohort columns.
# NOTE(review): the WHERE clause filters on a column of `o`, so cohort rows without a
# matching Admission order (NULL o.order_type) are dropped — the RIGHT JOIN therefore
# behaves like an inner join here. Confirm that this is intended.
q = """
SELECT o.order_type, o.display_name, o.description, o.order_class, 
       o.order_status, o.ordering_mode, o.order_time_jittered_utc, c.*
FROM shc_core.order_proc o
RIGHT JOIN triageTD.1_4_cohort c 
ON o.anon_id = c.anon_id and o.pat_enc_csn_id_coded = c.pat_enc_csn_id_coded
WHERE o.order_type = "Admission"
"""
# Run the query and load the full result into a DataFrame.
query_job = client.query(q)
ordproc = query_job.to_dataframe()
print(len(ordproc)) #  all 17646270 takes a long time, only admission 112576

112576


In [8]:
# Save the admission-order rows as 1_4_cohort_orderproc_adm.csv in the data folder.
ordproc.to_csv(os.path.join(datadir, '1_4_cohort_orderproc_adm.csv'), index=False)

### Push to BQ other files

In [17]:
# Load the imputed height/weight/ESI feature file, print its size and columns, preview five rows.
df = pd.read_csv(os.path.join(featuredir, "2_4_coh3_imputedHWESI.csv"))
print(len(df)) # 44258 in the recorded run
print(list(df.columns))
df.head(5)

44258
['anon_id', 'pat_enc_csn_id_coded', 'inpatient_data_id_coded', 'admit_time', 'label', 'ESI_i', 'delta_ESI', 'gender', 'age', 'insurance', 'English', 'Height_i', 'delta_H', 'Weight_i', 'delta_W', 'race.Asian', 'race.Black', 'race.Native.American', 'race.Other', 'race.Pacific.Islander', 'race.Unknown', 'race.White']


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,ESI_i,delta_ESI,gender,age,insurance,...,delta_H,Weight_i,delta_W,race.Asian,race.Black,race.Native.American,race.Other,race.Pacific.Islander,race.Unknown,race.White
0,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,3,0,1,52,1,...,0,81.0,0,0,0,0,0,0,0,1
1,JC29f8ad3,131278291027,42992239,2019-10-05 23:48:00,0,3,0,1,35,1,...,1,71.0,1,1,0,0,0,0,0,0
2,JC29f8b9c,131266787806,36261582,2019-05-05 01:07:00,0,2,0,0,59,0,...,1,58.0,1,0,0,0,0,0,1,0
3,JC29f8beb,131264387263,34626013,2019-03-15 03:35:00,0,3,0,1,30,1,...,0,65.4,0,0,0,0,0,0,0,1
4,JC29f8beb,131279241689,43527040,2019-11-27 15:29:00,0,3,0,1,30,1,...,0,63.0,0,0,0,0,0,0,0,1


In [12]:
# demographics file
# NOTE(review): `cohort_demo` is not created in any visible cell of this notebook, so
# this cell relies on a variable that already exists in the kernel.
# NOTE(review): the schema below (jc_uid, race, language, recent_height, ...) does not
# match the columns of the 2_4_coh3_imputedHWESI.csv frame loaded in the cell before it
# (anon_id, ESI_i, race.Asian, ...) — confirm which frame and schema are meant for
# this table.
table_schema = [{'name' : 'jc_uid', 'type' : 'STRING'},
                {'name' : 'pat_enc_csn_id_coded', 'type' : 'INTEGER'},
                {'name' : 'inpatient_data_id_coded', 'type': 'INTEGER'},
                {'name' : 'admit_time', 'type' : 'TIMESTAMP'},
                {'name' : 'label', 'type' : 'INTEGER'},
                {'name' : 'gender', 'type' : 'STRING'},
                {'name' : 'race', 'type' : 'STRING'},
                {'name' : 'language', 'type' : 'STRING'},
                {'name' : 'insurance', 'type' : 'STRING'},
                {'name' : 'recent_height', 'type' : 'INTEGER'},
                {'name' : 'recent_weight', 'type' : 'INTEGER'}, 
                {'name' : 'age', 'type' : 'INTEGER'},
                {'name' : 'recent_date', 'type' : 'DATE'}]
                       
DATASET_NAME = 'triageTD'
TABLE_NAME = '2_4_coh3_imputedHWESI'
# Upload `cohort_demo`, replacing the table if it already exists.
cohort_demo.to_gbq(destination_table='triageTD.%s' % TABLE_NAME,
                 project_id='som-nero-phi-jonc101',
                 table_schema=table_schema,
                 if_exists='replace')

1it [00:12, 12.14s/it]


In [15]:
# Clean demographics table (with the medis and Engl variables), pushed to the `triage` dataset.
# NOTE(review): `cohort_demo_clean` is not created in any visible cell of this notebook.
column_types = [
    ('jc_uid', 'STRING'),
    ('pat_enc_csn_id_coded', 'INTEGER'),
    ('inpatient_data_id_coded', 'INTEGER'),
    ('admit_time', 'TIMESTAMP'),
    ('label', 'INTEGER'),
    ('gender', 'STRING'),
    ('race', 'STRING'),
    ('age', 'INTEGER'),
    ('medis', 'INTEGER'),
    ('Engl', 'INTEGER'),
    ('Height', 'INTEGER'),
    ('Weight', 'INTEGER'),
]
table_schema = [{'name': column, 'type': bq_type} for column, bq_type in column_types]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_demographics_clean'
# Upload the frame, replacing the table if it already exists.
cohort_demo_clean.to_gbq(
    destination_table='triage.%s' % TABLE_NAME,
    project_id='som-nero-phi-jonc101',
    table_schema=table_schema,
    if_exists='replace',
)

1it [00:05,  5.82s/it]


In [17]:
# Imputed demographics table, pushed to the `triage` dataset.
# NOTE(review): `cohort_demo_imputed` is not created in any visible cell of this notebook.
column_types = [
    ('jc_uid', 'STRING'),
    ('pat_enc_csn_id_coded', 'INTEGER'),
    ('inpatient_data_id_coded', 'INTEGER'),
    ('admit_time', 'TIMESTAMP'),
    ('label', 'INTEGER'),
    ('Engl', 'INTEGER'),
    ('gender1', 'STRING'),
    ('race1', 'STRING'),
    ('age1', 'INTEGER'),
    ('medis1', 'INTEGER'),
    ('Height1', 'INTEGER'),
    ('Weight1', 'INTEGER'),
    ('delta_H', 'INTEGER'),
    ('delta_W', 'INTEGER'),
]
table_schema = [{'name': column, 'type': bq_type} for column, bq_type in column_types]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_demographics_imputed'
# Upload the frame, replacing the table if it already exists.
cohort_demo_imputed.to_gbq(
    destination_table='triage.%s' % TABLE_NAME,
    project_id='som-nero-phi-jonc101',
    table_schema=table_schema,
    if_exists='replace',
)

1it [00:11, 11.92s/it]


In [28]:
# cleaned vitals, pushed to the `triage` dataset
# NOTE(review): `cohort_vitals_clean` is not created in any visible cell of this notebook.
column_types = [
    ('jc_uid', 'STRING'),
    ('pat_enc_csn_id_coded', 'INTEGER'),
    ('admit_time', 'TIMESTAMP'),
    ('features', 'STRING'),
    ('recorded_time', 'TIMESTAMP'),
    ('values', 'FLOAT'),
    ('feature_type', 'STRING'),
]
table_schema = [{'name': column, 'type': bq_type} for column, bq_type in column_types]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_vitals_clean'
# Upload the frame, replacing the table if it already exists.
cohort_vitals_clean.to_gbq(
    destination_table='triage.%s' % TABLE_NAME,
    project_id='som-nero-phi-jonc101',
    table_schema=table_schema,
    if_exists='replace',
)

1it [00:26, 26.69s/it]


In [33]:
# Load the cleaned short-labs file, print its row count and preview five rows.
short_labs_csv = './Data/cohort_short_labs_clean_4binning.csv'
cohort_short_labs_clean_4binning = pd.read_csv(short_labs_csv)
print(len(cohort_short_labs_clean_4binning))
cohort_short_labs_clean_4binning.head(n=5)

313230


Unnamed: 0,jc_uid,pat_enc_csn_id_coded,admit_time,features,values,result_time,feature_type
0,JCcc15c6,131254215457,2018-07-09 03:24:00+00:00,"Magnesium, Ser/Plas",2.3,2018-07-08 23:47:00+00:00,labs
1,JCcc67c4,131170882153,2016-01-15 00:38:00+00:00,"Magnesium, Ser/Plas",1.9,2016-01-14 23:24:00+00:00,labs
2,JCccde6e,131259373038,2018-10-10 01:17:00+00:00,Glucose by Meter,155.0,2018-10-10 00:09:00+00:00,labs
3,JCccf3ed,131254647220,2018-07-05 19:23:00+00:00,"Magnesium, Ser/Plas",1.7,2018-07-05 16:19:00+00:00,labs
4,JCccf3ed,131254647220,2018-07-05 19:23:00+00:00,Glucose by Meter,187.0,2018-07-05 16:33:00+00:00,labs


In [18]:
# cleaned labs, pushed to the `triage` dataset
column_types = [
    ('jc_uid', 'STRING'),
    ('pat_enc_csn_id_coded', 'INTEGER'),
    ('admit_time', 'TIMESTAMP'),
    ('features', 'STRING'),
    ('result_time', 'TIMESTAMP'),
    ('values', 'FLOAT'),
    ('feature_type', 'STRING'),
]
table_schema = [{'name': column, 'type': bq_type} for column, bq_type in column_types]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_short_labs_clean_4binning'
# Upload the frame, replacing the table if it already exists.
cohort_short_labs_clean_4binning.to_gbq(
    destination_table='triage.%s' % TABLE_NAME,
    project_id='som-nero-phi-jonc101',
    table_schema=table_schema,
    if_exists='replace',
)

1it [00:27, 27.51s/it]


In [63]:
# Load the long-format vitals + labs file and print its row count.
cohort_vitals_labs_long = pd.read_csv('./Data/cohort_vitals_labs_long.csv')
# NOTE(review): this tail(5) is not the last expression of the cell, so its result is
# discarded and nothing is displayed for it.
cohort_vitals_labs_long.tail(5)
print(len(cohort_vitals_labs_long))

844805


In [64]:
# Summary statistics for the rows whose `features` value is "DBP".
is_dbp_row = cohort_vitals_labs_long['features'] == "DBP"
cohort_vitals_labs_long.loc[is_dbp_row].describe()

Unnamed: 0,pat_enc_csn_id_coded,values
count,92744.0,92744.0
mean,131185700000.0,76.684885
std,63706800.0,17.482878
min,131037300000.0,15.0
25%,131124900000.0,65.0
50%,131201800000.0,76.0
75%,131242200000.0,87.0
max,131257800000.0,225.0


In [58]:
# Remove the 'time' column, then preview the rows whose `features` value is "DBP".
# errors='ignore' keeps the cell re-runnable: once 'time' has been dropped, running the
# cell again is a no-op instead of raising KeyError.
cohort_vitals_labs_long = cohort_vitals_labs_long.drop(columns='time', errors='ignore')
cohort_vitals_labs_long.loc[cohort_vitals_labs_long['features'] == "DBP"].head()

Unnamed: 0,jc_uid,pat_enc_csn_id_coded,admit_time,features,values,feature_type
438831,JCcb91cd,131226312076,2017-04-08 03:14:00+00:00,DBP,73.0,vitals
438832,JCcbb675,131239642003,2017-09-15 02:31:00+00:00,DBP,72.0,vitals
438833,JCcbb675,131239642003,2017-09-15 02:31:00+00:00,DBP,76.0,vitals
438834,JCcbba6b,131181787574,2016-04-06 04:30:00+00:00,DBP,77.0,vitals
438835,JCcbbdc1,131082563304,2015-03-19 02:58:00+00:00,DBP,63.0,vitals
