In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os 

### THIS IS MEANT TO RUN ON NERO - NEEDS TO BE CHANGED IF YOU RUN LOCALLY
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/ccorbin/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions' 
%load_ext google.cloud.bigquery

from google.cloud import bigquery
client=bigquery.Client()



In [2]:
query = """select * from traige_TE.triage_features_demos_vitals_labs"""
query_job = client.query(query)
df = query_job.result().to_dataframe()

In [3]:
len(df)

3308906

In [5]:
df = df[df['feature_type'].isin(['labs', 'vitals'])]

### Process Continuous Features
Create a function that "trains" binning featurizer (computes distribution of values) based on subset of the data.  This is important because we only want to build the distribution with our training set and apply the bin featurizer
to the test set (prevents leakage). 

Then create a function that "applies" the trained featurizer on a set of data. 


In [6]:

def convert_to_dict(look_up_table):
    """Converts df look up table to dictionary for faster look up later"""
    bin_val_dict = {}
    for feature in look_up_table['features'].unique():
        feature_bin_vals = look_up_table[look_up_table['features'] == feature]
        for _bin in feature_bin_vals['bins'].unique():
            if feature not in bin_val_dict:
                bin_val_dict[feature] = {}
                bin_val_dict[feature]['min'] = []
                bin_val_dict[feature]['max'] = []

            min_val_for_bin = feature_bin_vals[feature_bin_vals['bins'] == _bin]['values']['min'].values[0]
            max_val_for_bin = feature_bin_vals[feature_bin_vals['bins'] == _bin]['values']['max'].values[0]

            bin_val_dict[feature]['min'].append(min_val_for_bin)
            bin_val_dict[feature]['max'].append(max_val_for_bin)
    return bin_val_dict

    
def train_featurizer(df_train):
    """
    Compute percent_ranks and generates a look up table of min and max bin values
    Input : long form dataframe with features and values where values are the continuous values of labs / vitals
    Output: look up table - dict of dict of lists (key1 = feature_name, key2 = max or min, values = lists of values)
    """
    # Compute percentiles and bins
    df_train['percentiles'] = df_train.groupby('features')['values'].transform(lambda x: x.rank(pct=True))
    df_train['bins'] = df_train['percentiles'].apply(lambda x: int(x * 10))
    
    # Generate look up table and conver to dictionary stucture
    look_up_table_df = df_train.groupby(['features', 'bins']).agg({'values' : ['min', 'max']}).reset_index()
    look_up_table = convert_to_dict(look_up_table_df)
    
    ### Sanity Check. Ensure that min vector for each feature is strictly increasing (no ties!)
    # Should be the case because ties are given same percentile rank in default pandas rank function
    for feature in look_up_table:
        mins = look_up_table[feature]['min']
        for i in range(len(mins)-1):
            assert mins[i] < mins[i+1]
    
    return look_up_table


def apply_featurizer(df, look_up_table):
    
    def get_appropriate_bin(feature, value, look_up_table):
        """Takes in feature, value and look up table and returns appropriate bin

        Quick Note: For some features, we do not have 10 bins.  This happens when we have many many ties in the 
        percent rank - and the percent rank alg returns ties as the average rank within that tie. So for instance
        we're trying to break each feature up into deciles where each bin covers range of 10% of the examples. But if more
        than 10% of the examples take on 1 value - then bins can be skipped. This shouldn't really be a problem
        for downstream tasks - just something to be aware of. This also means 'bins' and 'bins_applied' won't have
        perfect overlap in features that end up having less than 10 bins

        """
        mins = look_up_table[feature]['min']
        for i in range(len(mins) - 1):
            # If value is smaller than min value of smallest bin (in test time) - then return 0 (smallest bin)
            if i == 0 and value < mins[i]:
                return i

            if value >= mins[i] and value < mins[i+1] :
                return i

        # Then in last bin
        return len(mins)-1
    
    df['bins_applied'] = df[['features', 'values']].apply(
        lambda x: get_appropriate_bin(x['features'], x['values'], look_up_table), axis=1)
    
    return df
    
    



### Train And Apply Featurizer

In [15]:
df_train = df[df['admit_time'] < '2018-01-01']

look_up_table = train_featurizer(df_train)
df_featurized = apply_featurizer(df, look_up_table)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Quick Sanity Check
For features that have 10 bins from 0 to 9 - `bins` should be same as `bins_applied`

In [16]:
df_train = apply_featurizer(df_train, look_up_table)
look_up_table_df = df_train.groupby(['features', 'bins']).agg({'values' : ['min', 'max']}).reset_index()

features_with_0_9_bins = []
for feature in look_up_table_df:
    num_bins = len(look_up_table_df[look_up_table_df['features'] == feature]['bins'].values)
    ten_in_bins = 10 in look_up_table_df[look_up_table_df['features'] == feature]['bins'].values
    if num_bins == 10 and not ten_in_bins:
        features_with_0_9_bins.append(feature)

for feature in features_with_0_9_bins:
    df_test = df_train[df_train['features'] == 'feature']
    for b_real, b_computed in zip(df_test['bins'].values, df_test['bins_applied'].values):
        assert(b_real == b_computed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Little bit of house cleaning
Create new feature names that reflect which bin the value belongs in

In [17]:
columns = ['anon_id', 'pat_enc_csn_id_coded', 'admit_time', 'feature_type', 'features', 'values', 'bins_applied']
df_new = df_featurized[columns]

In [18]:
df_new['features'] = ['_'.join([x, str(y)]) for x, y in zip(df_new['features'].values, df_new['bins_applied'].values)] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Get Counts representation
Group by patient, cns, and feature name (with bin value appended to feature name) and make value the number of times
that particular feature appears for that csn id. 

In [19]:
df_final = df_new.groupby(['anon_id', 'pat_enc_csn_id_coded', 'features']).agg(
    {'admit_time' : 'first',
     'feature_type' : 'first',
     'values' : 'count'}).reset_index()

columns = ['anon_id', 'pat_enc_csn_id_coded', 'admit_time', 'feature_type', 'features', 'values']
df_final = df_final[columns] # reorder columns
 
# Rename feature_type to reflect training set used.  'vitals_test' means everything up to 2018 used. (train + dev)
# 'vitals_train' means everything up to July 2017 used. (train)
df_final['feature_type'] = [x + '_results' if x == 'labs' else x for x in df_final['feature_type'].values]
df_final['feature_type'] = [x + '_train' for x in df_final['feature_type'].values]


In [20]:
# Sanity check - sum of the counts should be length of the orginal dataframe
assert df_final['values'].sum() == len(df)

In [21]:
df_final.head(20)

Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,"ALT (SGPT), Ser/Plas_0",1
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,"AST (SGOT), Ser/Plas_1",1
2,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,"Albumin, Ser/Plas_3",1
3,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,"Alk P'TASE, Total, Ser/Plas_7",1
4,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AnionGap_9,1
5,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,BUN_8,1
6,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,Base_0,2
7,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,Basos_3,1
8,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,"CO2, Ser/Plas_0",1
9,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,"Calcium, Ser/Plas_2",1


In [22]:
# Save to CSV until I can get my own custom env on nero to upload to big query from here
df_final.to_csv('bins_labs_vitals_train.csv', index=False)

In [23]:
for feature in look_up_table_df['features'].unique():
    print(look_up_table_df[look_up_table_df['features'] == feature])

               features bins values        
                                min     max
0  ALT (SGPT), Ser/Plas    0   10.0    15.0
1  ALT (SGPT), Ser/Plas    1   16.0    19.0
2  ALT (SGPT), Ser/Plas    2   20.0    22.0
3  ALT (SGPT), Ser/Plas    3   23.0    25.0
4  ALT (SGPT), Ser/Plas    4   26.0    29.0
5  ALT (SGPT), Ser/Plas    5   30.0    34.0
6  ALT (SGPT), Ser/Plas    6   35.0    41.0
7  ALT (SGPT), Ser/Plas    7   42.0    54.0
8  ALT (SGPT), Ser/Plas    8   55.0    85.0
9  ALT (SGPT), Ser/Plas    9   86.0  3500.0
                features bins  values        
                                  min     max
10  AST (SGOT), Ser/Plas    0    10.0    15.0
11  AST (SGOT), Ser/Plas    1    16.0    18.0
12  AST (SGOT), Ser/Plas    2    19.0    21.0
13  AST (SGOT), Ser/Plas    3    22.0    23.0
14  AST (SGOT), Ser/Plas    4    24.0    27.0
15  AST (SGOT), Ser/Plas    5    28.0    32.0
16  AST (SGOT), Ser/Plas    6    33.0    39.0
17  AST (SGOT), Ser/Plas    7    40.0    52.0
18  AST (SGO