### Query files and push files to Big Query
* (other names: BMI212_demo_code_admit, cohort_adjusted)
* Use Cohort's original cohort to join with demographics, order_proc for code status and admission orders
* After processing and joining these files in R (Cohort_queries notebook), push the results to BigQuery,
including the final adjusted cohort (processed in R, saved, loaded here, pushed to BQ)

* Generate: admit_code and admit_inpatient from SQL. 
* Process and save as: ordertypes and admit_inpatient.

Should have done the timediff and explored the distributions in R, but they were already done here...

**UPDATE**: changed *traige_TE* to *triage* under *som-nero-phi-jonc101*

In [22]:
# !pip install pandas-gbq

In [1]:
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

%matplotlib inline
%load_ext rpy2.ipython

In [2]:
%%R
library(data.table)
library(tidyverse)
library(lubridate)
library(Matrix)
# library(slam)
library(bit64)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

R[write to console]: data.table 1.14.0 using 4 threads (see ?getDTthreads).  Latest news: r-datatable.com

R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

R[write to console]: ✔ ggplot2 3.3.3     ✔ purrr   0.3.4
✔ tibble  3.1.2     ✔ dplyr   1.0.6
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.1

R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()   masks data.table::between()
✖ dplyr::filter()    masks stats::filter()
✖ dplyr::first()     masks data.table::first()
✖ dplyr::lag()       masks stats::lag()
✖ dplyr::last()      masks data.table::last()
✖ purrr::transpose() masks data.table::transpose()

R[write to console]: 
Attaching package: ‘lubridate’


R[write to console]: The following objects are masked from ‘package:data.table’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


R[write to console]: T

In [3]:
import os 
from google.cloud import bigquery
from google.cloud.bigquery import dbapi

# Select the credentials file that matches where this notebook runs (Nero or local).
# Tip: Ctrl + Insert copies and Shift + Insert pastes.
#
# Other credential locations that have been used (swap in as needed):
#   Nero:  '/home/minh084/.config/gcloud/application_default_credentials.json'
#   local: r'C:\Users\User\AppData\Roaming\gcloud\application_default_credentials.json'
os.environ.update({
    'GOOGLE_APPLICATION_CREDENTIALS': '/home/jupyter/.config/gcloud/application_default_credentials.json',
    # Nero project id
    'GCLOUD_PROJECT': 'som-nero-phi-jonc101',
})

# Method 1: BigQuery API client
client = bigquery.Client()

# Method 2: DB-API connection built on that client
conn = dbapi.connect(client)



### ADT check service, class to get admitted ED patients

In [5]:
# Select patient class / service / level-of-care columns from the ADT table for
# rows matching any of: class code "112", class "Emergency Services", or a
# service name containing "Emergency".
q = """
SELECT adt.pat_class_c, adt.pat_class, adt.base_pat_class, adt.pat_service, adt.pat_lv_of_care
FROM 
    `som-nero-phi-jonc101.shc_core.adt` as adt
WHERE adt.pat_class_c = "112"
or adt.pat_class = "Emergency Services"
or adt.pat_service LIKE  "%Emergency%"
"""
# Submit the query, then materialize the result as a pandas DataFrame.
query_job = client.query(q)
df = query_job.to_dataframe()

In [7]:
# Persist the ADT extract to CSV, reload it from that file, and report the row count.
adt_csv = '../Data/adt_class_serv_loc.csv'
df.to_csv(adt_csv, index=False)
df = pd.read_csv(adt_csv)
print(df.shape[0])

2711384


### Process 01_cohort_init to grab init (original) cohort

### Queries for other in/ex criteria

In [24]:
# Run the encounter query stored on disk and save the result as CSV.
# `with` closes the SQL file as soon as the block exits; `query_job` holds the
# submitted BigQuery job rather than to_csv()'s None return value.
with open('../SQL/feature_values/m1_encounter.sql', 'r') as q:
    query_job = client.query(q.read())
query_job.to_dataframe().to_csv('../Data/encounters.csv', index=False)

In [25]:
# Run the code-status query stored on disk and save the result as CSV.
# `with` closes the SQL file as soon as the block exits; `query_job` holds the
# submitted BigQuery job rather than to_csv()'s None return value.
with open('../SQL/feature_values/m2_codestatus.sql', 'r') as q:
    query_job = client.query(q.read())
query_job.to_dataframe().to_csv('../Data/code_status.csv', index=False)

In [27]:
# Run the demographics query stored on disk and save the result as CSV.
# `with` closes the SQL file as soon as the block exits; `query_job` holds the
# submitted BigQuery job rather than to_csv()'s None return value.
with open('../SQL/feature_values/m3_demographic.sql', 'r') as q:
    query_job = client.query(q.read())
query_job.to_dataframe().to_csv('../Data/demographics.csv', index=False)

### Pushed the updated cohort after meeting encounter/code status/age criteria

In [14]:
# Directory and file name of the criteria-filtered cohort CSV.
datadir = "../../DataTD"
file1 = "cohort1_criteria.csv"
# Show the joined path as the cell output.
os.path.join(datadir, file1)

'../../DataTD/cohort1_criteria.csv'

In [13]:
# Load the criteria-filtered cohort and report its row count (45986 when last run).
cohort1_path = os.path.join(datadir, "cohort1_criteria.csv")
df = pd.read_csv(cohort1_path)
print(df.shape[0])

45986


In [42]:
# Preview the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,recent_date,age
0,JCe78a06,131062667066,0,2015-01-02 01:01:00,13616753,,2015-01-01 17:10:00,7.85,Female,White,English,152.4,55.6,HEALTHNET SENIOR ADVANTAGE,2020-01-10,83
1,JCd1c19e,131062745090,0,2015-01-03 05:53:00,13628503,3.0,2015-01-03 01:56:00,3.95,Female,White,English,154.5,49.2,BLUE SHIELD,2020-03-31,53
2,JCd91eb2,131062747648,0,2015-01-01 08:24:00,13628701,2.0,2015-01-01 01:10:00,7.233333,Female,White,English,170.18,92.7,MEDICARE,2019-01-01,64
3,JCe7cb4d,131062788358,0,2015-01-01 23:39:00,13631833,3.0,2015-01-01 17:07:00,6.533333,Male,White,English,185.42,101.9,AETNA,2019-01-01,25
4,JCe293de,131063044001,0,2015-01-05 02:23:00,13663181,3.0,2015-01-04 18:13:00,8.166667,Female,Other,Spanish,152.4,40.37,COMMERCIAL OTHER,2019-01-01,34


In [43]:
# Explicit BigQuery column types for the criteria-filtered cohort.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('anon_id', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
        ('hosp_admsn_time', 'TIMESTAMP'),
        ('ed_time_hr', 'FLOAT'),
        ('ESI', 'INTEGER'),
        ('age', 'INTEGER'),
        ('gender', 'STRING'),
        ('race', 'STRING'),
        ('language', 'STRING'),
        ('insurance', 'STRING'),
        ('recent_height_cm', 'FLOAT'),
        ('recent_weight_kg', 'FLOAT'),
        ('recent_date', 'DATE'),
    ]
]
DATASET_NAME = 'triageTD'
TABLE_NAME = 'cohort_enc_code_age'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
df.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
          project_id='som-nero-phi-jonc101',
          table_schema=table_schema,
          if_exists='replace')

1it [00:05,  5.56s/it]


### Use updated cohort with inpatient_id_coded to query labs and flowsheet

In [37]:
# Run the height/weight flowsheet query stored on disk and save the result as CSV.
# `with` closes the SQL file as soon as the block exits; `query_job` holds the
# submitted BigQuery job rather than to_csv()'s None return value.
with open('../SQL/feature_values/m4_HWflowsheet.sql', 'r') as q:
    query_job = client.query(q.read())
query_job.to_dataframe().to_csv('../Data/HWflowsheet.csv', index=False)

In [44]:
# Run the flowsheet query stored on disk and save the result as CSV.
# `with` closes the SQL file as soon as the block exits; `query_job` holds the
# submitted BigQuery job rather than to_csv()'s None return value.
with open('../SQL/feature_values/m5_flowsheet.sql', 'r') as q:
    query_job = client.query(q.read())
query_job.to_dataframe().to_csv('../Data/flowsheet.csv', index=False)

In [45]:
# Run the labs query stored on disk and save the result as CSV.
# `with` closes the SQL file as soon as the block exits; `q` stays bound to the
# (closed) file object, and `query_job` holds the submitted BigQuery job rather
# than to_csv()'s None return value.
with open('../SQL/feature_values/m6_labs.sql', 'r') as q:
    query_job = client.query(q.read())
query_job.to_dataframe().to_csv('../Data/labs.csv', index=False)

In [46]:
# Close whichever file object `q` currently refers to. Only that one handle is
# closed here; calling close() on an already-closed file is a no-op.
q.close()

### Query and save files (demographics, code status, and admission) for joining later

In [19]:
# adapted Tiffany's function to save files
def query_data(sqlfile, savefile):
    """Run the SQL stored in a file and save the result as CSV.

    Args:
        sqlfile: Path to a text file containing the SQL query.
        savefile: Path of the CSV file to write (written without the index).

    Returns:
        The query result as a pandas DataFrame.

    Note:
        The query runs over the module-level ``conn`` connection.
    """
    # Read the query inside `with` so the file is closed even if a later step raises.
    with open(sqlfile, 'r') as query:
        sql_text = query.read()

    resultsDF = pd.read_sql_query(sql_text, conn)

    # Save the results to a file and hand the frame back to the caller.
    resultsDF.to_csv(savefile, index=False)
    return resultsDF

In [23]:
# Demographic table, to be joined with the cohort.
sqlfile, outfile = 'SQL/m4_demographic.sql', './Data/demographic.csv'
df = query_data(sqlfile=sqlfile, savefile=outfile)

### Save adjusted cohort (done in R) to Big Query 
Check R notebook: 

Processing the cohort to get the adjusted admit time, excluded non-fullcodes for adult patients >= 18y.o

In [60]:
# Load the adjusted cohort, report its row count, and preview it.
cohort_adj = pd.read_csv('./Data/cohort_adjusted.csv')
print(cohort_adj.shape[0])
cohort_adj.head(n=5)

33314


Unnamed: 0,jc_uid,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label
0,JCcb9495,131066982419,14157061.0,2015-02-13 07:09:00,0
1,JCcb9599,131235926251,25238287.0,2017-08-03 03:50:00,0
2,JCcba5ad,131177568449,18459677.0,2016-02-20 19:27:00,0
3,JCcbfbf7,131193455501,20798263.0,2016-08-21 02:08:00,0
4,JCcc15c6,131254215457,31126955.0,2018-07-09 03:24:00,0


In [61]:
# Cohort file: explicit BigQuery column types for the adjusted cohort.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort_adj.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
                  project_id='som-nero-phi-jonc101',
                  table_schema=table_schema,
                  if_exists='replace')

1it [00:10, 10.49s/it]


In [62]:
# Load the cohort-with-demographics file, report its row count, and preview it.
cohort_demo = pd.read_csv('./Data/cohort_demo.csv')
print(cohort_demo.shape[0])
cohort_demo.head(n=5)

33315


Unnamed: 0,jc_uid,pat_enc_csn_id_coded,inpatient_data_id_coded,label,admit_time,gender,race,language,recent_height,recent_weight_kg,insurance,recent_date,age
0,JCcb9495,131066982419,14157061.0,0,2015-02-13 07:09:00,Male,White,English,185,121,,2018-12-31,27
1,JCcb9599,131235926251,25238287.0,0,2017-08-03 03:50:00,Female,Black,English,152,49,MEDICARE,2018-12-31,69
2,JCcba5ad,131177568449,18459677.0,0,2016-02-20 19:27:00,Female,Asian,Mandarin,149,47,MEDICARE,2016-08-15,87
3,JCcbfbf7,131193455501,20798263.0,0,2016-08-21 02:08:00,Female,Native American,English,162,81,MEDICARE,2018-12-31,66
4,JCcc15c6,131254215457,31126955.0,0,2018-07-09 03:24:00,Male,Asian,English,167,58,UNITED HEALTHCARE,2018-12-07,66


In [12]:
# Demographics file: explicit BigQuery column types.
# The weight entry is keyed 'recent_weight_kg' to match the column name shown in
# the loaded frame's preview; an entry keyed 'recent_weight' names no column in
# that frame.
# NOTE(review): INTEGER for height/weight matches the integer values in the
# preview — confirm no fractional values are expected.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
        ('gender', 'STRING'),
        ('race', 'STRING'),
        ('language', 'STRING'),
        ('insurance', 'STRING'),
        ('recent_height', 'INTEGER'),
        ('recent_weight_kg', 'INTEGER'),
        ('age', 'INTEGER'),
        ('recent_date', 'DATE'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_demographics'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort_demo.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
                   project_id='som-nero-phi-jonc101',
                   table_schema=table_schema,
                   if_exists='replace')

1it [00:12, 12.14s/it]


### Get tables from BigQuery

In [6]:
# Download the adjusted-cohort flowsheet table and save it as CSV.
# `query_job` holds the submitted BigQuery job; the CSV write is a separate
# statement because to_csv() returns None when given a path.
q = "SELECT * FROM triage.triage_cohort_adjusted_flowsheet"
query_job = client.query(q)
query_job.to_dataframe().to_csv('./Data/flowsheet.csv', index=False)

In [7]:
# Download the adjusted-cohort labs table and save it as CSV.
# `query_job` holds the submitted BigQuery job; the CSV write is a separate
# statement because to_csv() returns None when given a path.
q = "SELECT * FROM triage.triage_cohort_adjusted_labs"
query_job = client.query(q)
query_job.to_dataframe().to_csv('./Data/labs.csv', index=False)

### PUSH tables to BigQuery

In [17]:
# Updated cohort: load it, then report row count and column names before previewing.
cohort = pd.read_csv('../../../Data/cohort.csv')
print(cohort.shape[0])
print(cohort.columns.tolist())
cohort.head(n=5)

43493
['anon_id', 'pat_enc_csn_id_coded', 'inpatient_data_id_coded', 'label_max24', 'admit_time']


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,label_max24,admit_time
0,JCcb658e,131231466934,24822070,0,2017-06-24 12:56:00
1,JCcb65bd,131072545278,14327919,0,2015-02-18 20:54:00
2,JCcb65e1,131239638605,26535361,0,2017-11-08 09:51:00
3,JCcb65e1,131265028886,35078537,0,2019-04-17 16:25:00
4,JCcb6601,131125490360,16936826,0,2015-09-15 04:10:00


In [23]:
# Explicit BigQuery column types for the updated cohort.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('anon_id', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label_max24', 'INTEGER'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'cohort'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
              project_id='som-nero-phi-jonc101',
              table_schema=table_schema,
              if_exists='replace')

1it [00:03,  3.75s/it]


In [16]:
# Load the cleaned demographics file, report its row count, and preview it.
cohort_demo_clean = pd.read_csv('./Data/cohort_demo_clean.csv')
print(cohort_demo_clean.shape[0])
cohort_demo_clean.head(n=5)

43493
['anon_id', 'pat_enc_csn_id_coded', 'inpatient_data_id_coded', 'label_max24', 'admit_time']


Unnamed: 0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,label_max24,admit_time
0,JCcb658e,131231466934,24822070,0,2017-06-24 12:56:00
1,JCcb65bd,131072545278,14327919,0,2015-02-18 20:54:00
2,JCcb65e1,131239638605,26535361,0,2017-11-08 09:51:00
3,JCcb65e1,131265028886,35078537,0,2019-04-17 16:25:00
4,JCcb6601,131125490360,16936826,0,2015-09-15 04:10:00


In [15]:
# Push the cleaned demographics file (with the new variables medis and Engl) to BQ.
# NOTE(review): the preview saved under the cell that loads cohort_demo_clean
# shows columns anon_id / label_max24 rather than jc_uid / label and none of the
# demographic fields listed here — confirm the CSV's actual columns match this schema.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
        ('gender', 'STRING'),
        ('race', 'STRING'),
        ('age', 'INTEGER'),
        ('medis', 'INTEGER'),
        ('Engl', 'INTEGER'),
        ('Height', 'INTEGER'),
        ('Weight', 'INTEGER'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_demographics_clean'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort_demo_clean.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
                         project_id='som-nero-phi-jonc101',
                         table_schema=table_schema,
                         if_exists='replace')

1it [00:05,  5.82s/it]


In [39]:
# Load the imputed demographics file, report its row count, and preview it.
cohort_demo_imputed = pd.read_csv('./Data/cohort_demo_imputed.csv')
print(cohort_demo_imputed.shape[0])
cohort_demo_imputed.head(n=5)

33314


Unnamed: 0,jc_uid,pat_enc_csn_id_coded,inpatient_data_id_coded,label,admit_time,Engl,gender1,race1,age1,medis1,Height1,Weight1,delta_H,delta_W
0,JCcb9495,131066982419,14157061.0,0,2015-02-13 07:09:00,1,Male,White,27,1,185,121,0,0
1,JCcb9599,131235926251,25238287.0,0,2017-08-03 03:50:00,1,Female,Black,69,1,152,72,0,0
2,JCcba5ad,131177568449,18459677.0,0,2016-02-20 19:27:00,0,Female,Asian,87,1,149,47,0,0
3,JCcbfbf7,131193455501,20798263.0,0,2016-08-21 02:08:00,1,Female,Native American,66,1,162,81,0,0
4,JCcc15c6,131254215457,31126955.0,0,2018-07-09 03:24:00,1,Male,Asian,66,0,167,54,0,0


In [17]:
# Push the imputed demographics file to BQ with explicit column types.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
        ('Engl', 'INTEGER'),
        ('gender1', 'STRING'),
        ('race1', 'STRING'),
        ('age1', 'INTEGER'),
        ('medis1', 'INTEGER'),
        ('Height1', 'INTEGER'),
        ('Weight1', 'INTEGER'),
        ('delta_H', 'INTEGER'),
        ('delta_W', 'INTEGER'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_demographics_imputed'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort_demo_imputed.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
                           project_id='som-nero-phi-jonc101',
                           table_schema=table_schema,
                           if_exists='replace')

1it [00:11, 11.92s/it]


In [31]:
# Load the cleaned vitals file, report its row count, and show the last rows.
cohort_vitals_clean = pd.read_csv('./Data/cohort_vitals_clean.csv')
print(cohort_vitals_clean.shape[0])
cohort_vitals_clean.tail(n=5)

531575


Unnamed: 0,jc_uid,pat_enc_csn_id_coded,admit_time,features,recorded_time,values,feature_type
531570,JCea71c5,131191900862,2016-08-01 00:16:00+00:00,DBP,2016-07-31 20:59:00+00:00,93.0,vitals
531571,JCea71c5,131191900862,2016-08-01 00:16:00+00:00,DBP,2016-07-31 20:49:00+00:00,99.0,vitals
531572,JCea71c5,131191900862,2016-08-01 00:16:00+00:00,DBP,2016-07-31 21:19:00+00:00,89.0,vitals
531573,JCebc514,131207442721,2016-11-01 01:17:00+00:00,DBP,2016-10-31 22:41:00+00:00,76.0,vitals
531574,JCebc514,131207442721,2016-11-01 01:17:00+00:00,DBP,2016-10-31 22:42:00+00:00,75.0,vitals


In [28]:
# Cleaned vitals: explicit BigQuery column types.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('features', 'STRING'),
        ('recorded_time', 'TIMESTAMP'),
        ('values', 'FLOAT'),
        ('feature_type', 'STRING'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_vitals_clean'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort_vitals_clean.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
                           project_id='som-nero-phi-jonc101',
                           table_schema=table_schema,
                           if_exists='replace')

1it [00:26, 26.69s/it]


In [33]:
# Load the cleaned short-labs file, report its row count, and preview it.
cohort_short_labs_clean_4binning = pd.read_csv('./Data/cohort_short_labs_clean_4binning.csv')
print(cohort_short_labs_clean_4binning.shape[0])
cohort_short_labs_clean_4binning.head(n=5)

313230


Unnamed: 0,jc_uid,pat_enc_csn_id_coded,admit_time,features,values,result_time,feature_type
0,JCcc15c6,131254215457,2018-07-09 03:24:00+00:00,"Magnesium, Ser/Plas",2.3,2018-07-08 23:47:00+00:00,labs
1,JCcc67c4,131170882153,2016-01-15 00:38:00+00:00,"Magnesium, Ser/Plas",1.9,2016-01-14 23:24:00+00:00,labs
2,JCccde6e,131259373038,2018-10-10 01:17:00+00:00,Glucose by Meter,155.0,2018-10-10 00:09:00+00:00,labs
3,JCccf3ed,131254647220,2018-07-05 19:23:00+00:00,"Magnesium, Ser/Plas",1.7,2018-07-05 16:19:00+00:00,labs
4,JCccf3ed,131254647220,2018-07-05 19:23:00+00:00,Glucose by Meter,187.0,2018-07-05 16:33:00+00:00,labs


In [18]:
# Cleaned labs: explicit BigQuery column types.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('features', 'STRING'),
        ('result_time', 'TIMESTAMP'),
        ('values', 'FLOAT'),
        ('feature_type', 'STRING'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_short_labs_clean_4binning'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort_short_labs_clean_4binning.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
                                        project_id='som-nero-phi-jonc101',
                                        table_schema=table_schema,
                                        if_exists='replace')

1it [00:27, 27.51s/it]


In [63]:
# Load the combined vitals + labs long table and report its row count.
cohort_vitals_labs_long = pd.read_csv('./Data/cohort_vitals_labs_long.csv')
print(len(cohort_vitals_labs_long))
# Keep the preview as the cell's final expression so the notebook renders it;
# a bare expression earlier in the cell is evaluated and then discarded.
cohort_vitals_labs_long.tail(5)

844805


In [64]:
# Summary statistics for the rows whose feature is DBP.
cohort_vitals_labs_long[cohort_vitals_labs_long['features'].eq("DBP")].describe()

Unnamed: 0,pat_enc_csn_id_coded,values
count,92744.0,92744.0
mean,131185700000.0,76.684885
std,63706800.0,17.482878
min,131037300000.0,15.0
25%,131124900000.0,65.0
50%,131201800000.0,76.0
75%,131242200000.0,87.0
max,131257800000.0,225.0


In [58]:
# Remove the 'time' column before pushing the long table. errors='ignore' keeps
# the cell re-runnable once the column is already gone (a plain drop would raise
# KeyError on the second run).
cohort_vitals_labs_long = cohort_vitals_labs_long.drop(columns='time', errors='ignore')
cohort_vitals_labs_long.loc[cohort_vitals_labs_long['features'] == "DBP"].head()

Unnamed: 0,jc_uid,pat_enc_csn_id_coded,admit_time,features,values,feature_type
438831,JCcb91cd,131226312076,2017-04-08 03:14:00+00:00,DBP,73.0,vitals
438832,JCcbb675,131239642003,2017-09-15 02:31:00+00:00,DBP,72.0,vitals
438833,JCcbb675,131239642003,2017-09-15 02:31:00+00:00,DBP,76.0,vitals
438834,JCcbba6b,131181787574,2016-04-06 04:30:00+00:00,DBP,77.0,vitals
438835,JCcbbdc1,131082563304,2015-03-19 02:58:00+00:00,DBP,63.0,vitals


In [59]:
# Vitals and labs combined in a specific (long) format.
# There is deliberately no 'time' entry in this schema.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('features', 'STRING'),
        ('values', 'FLOAT'),
        ('feature_type', 'STRING'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_vitals_labs_4binning'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort_vitals_labs_long.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
                               project_id='som-nero-phi-jonc101',
                               table_schema=table_schema,
                               if_exists='replace')

1it [01:09, 69.18s/it]


In [3]:
# Demographics (one-hot) + vitals + labs long table; this file has no time column.
cohort_demo_vitals_labs_long = pd.read_csv('./Data/cohort_demo1hot_vitals_labs_long.csv')
print(cohort_demo_vitals_labs_long.shape[0])
cohort_demo_vitals_labs_long.head(n=5)

1377829


Unnamed: 0,jc_uid,pat_enc_csn_id_coded,admit_time,features,values,feature_type
0,JCcb9495,131066982419,2015-02-13 07:09:00,Engl,1.0,demo
1,JCcb9599,131235926251,2017-08-03 03:50:00,Engl,1.0,demo
2,JCcba5ad,131177568449,2016-02-20 19:27:00,Engl,0.0,demo
3,JCcbfbf7,131193455501,2016-08-21 02:08:00,Engl,1.0,demo
4,JCcc15c6,131254215457,2018-07-09 03:24:00,Engl,1.0,demo


In [30]:
# Demographics, vitals, and labs in the specific (long) format.
# Schema entries are kept only for columns the frame actually has: the loaded
# file's preview shows no 'time' column, so that entry is dropped unless a
# variant of the file that includes it is loaded.
table_schema = [
    {'name': col, 'type': bq_type}
    for col, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('features', 'STRING'),
        ('time', 'TIMESTAMP'),
        ('values', 'FLOAT'),
        ('feature_type', 'STRING'),
    ]
    if col in cohort_demo_vitals_labs_long.columns
]

DATASET_NAME = 'triage'
TABLE_NAME = 'triage_cohort_adjusted_demo1hot_vitals_labs_long'
# Build the destination from both constants so the dataset is named in exactly
# one place; if_exists='replace' overwrites any existing table.
cohort_demo_vitals_labs_long.to_gbq(destination_table='%s.%s' % (DATASET_NAME, TABLE_NAME),
                                    project_id='som-nero-phi-jonc101',
                                    table_schema=table_schema,
                                    if_exists='replace')

1it [01:00, 60.91s/it]
