In [1]:
import pandas as pd
import numpy as np
import os 

### THIS IS MEANT TO RUN ON NERO - NEEDS TO BE CHANGED IF YOU RUN LOCALLY
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/ccorbin/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions' 
%load_ext google.cloud.bigquery

from google.cloud import bigquery
client=bigquery.Client()



## Create Count Based Representations of the following
* Dx Codes (making sure to only use codes from prior admissions)
* Lab/ Microbiology Lab orders (up until index time)
* Procedures (up until index time)
* Imaging orders (up until index time)
* Medication Orders (up until index time)

In [2]:
def featurize_code_counts(query):
    """ 
    Turns a sql query into long form dataframe of of counts
    supported queries for ref: dx_codes.sql, proc_orders.sql, meds.sql

    Output: Long form dataframe with cols:
        jc_uid : pat_id
        pat_enc_csn_id_coded : csn (our unit of analysis)
        admit_time : our index time + 1 hour
        feature_type : dx, meds, labs etc
        features : name of feature
        values : value of feature (which is a count)
        
    Notes: 
        * diagnosis from current encounter not included - but we look back through entire patient timeline
        * meds, labs, imaging, procs we only look back a year. 
    """

    query_job = client.query(query)
    df = query_job.result().to_dataframe()

    df = df[['jc_uid',
             'pat_enc_csn_id_coded',
             'admit_time',
             'code',
             'order_id',
             'feature_type']].groupby(['pat_enc_csn_id_coded', 'admit_time', 'code']).agg({
             'jc_uid' : 'first',
             'order_id' : 'count',
             'feature_type' : 'first'}).reset_index().rename(columns={
                               'order_id' : 'values',
                               'code' : 'features'})[['jc_uid', 'pat_enc_csn_id_coded', 'admit_time', 'feature_type', 'features', 'values']]
    return df

In [12]:
# Get counts for each of the kinds of orders we care about
feature_types = ['proc_orders', 'meds']
long_df = pd.DataFrame()

for f in feature_types:
    sql_file = ''.join(['../SQL/', f, '.sql'])
    with open(sql_file, 'r') as fr:
        query = fr.read()
    df = featurize_code_counts(query)
    
    long_df = pd.concat([long_df, df])
    print("processed: ", f)

processed:  proc_orders
processed:  meds


In [13]:
# Save to csv
long_df['feature_type'] = long_df['feature_type'].transform(lambda x: '_'.join([x, 'current_csn']))
long_df['features'] = long_df['features'].transform(lambda x: '_'.join([x, 'current_csn']))
long_df.to_csv('count_features_long_one_month.csv', index=None)

In [10]:
long_df[long_df['feature_type'] == 'Lab'].head(20)

Unnamed: 0,jc_uid,pat_enc_csn_id_coded,admit_time,feature_type,features,values


In [14]:
long_df.head()

Unnamed: 0,jc_uid,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JCda8a27,131037296250,2014-07-19 02:54:00+00:00,Imaging_current_csn,IMGCTAP_current_csn,2
1,JCda8a27,131037296250,2014-07-19 02:54:00+00:00,Imaging_current_csn,IMGCTAPW_current_csn,2
2,JCda8a27,131037296250,2014-07-19 02:54:00+00:00,Imaging_current_csn,IMGDXCH1_current_csn,2
3,JCda8a27,131037296250,2014-07-19 02:54:00+00:00,Lab_current_csn,LABBLC_current_csn,2
4,JCda8a27,131037296250,2014-07-19 02:54:00+00:00,Lab_current_csn,LABBLC2_current_csn,2


In [6]:
long_df[['pat_enc_csn_id_coded', 'features', 'values']].pivot(index='pat_enc_csn_id_coded', columns='features', values='values').shape

(30625, 5800)