This notebook was developed by Gautam Machiraju and Conor Corbin, modified by Minh Nguyen

### Description:
- Use 2015 - 2017 as the training data to check for value distributions with 10 quantiles
- Use these distributions to assign new validation and test values to quantile bins

In [1]:
import os 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
import os 
from google.cloud import bigquery
from google.cloud.bigquery import dbapi

##Use correct path based on whether you are, Nero or local
# use Ctrl + Insert to copy and Shift + Insert to paste

# for Nero:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/minh084/.config/gcloud/application_default_credentials.json' 
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/jupyter/.config/gcloud/application_default_credentials.json'

# for local computer:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'C:\Users\User\AppData\Roaming\gcloud\application_default_credentials.json' 

##set correct Nero project
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

In [3]:
featuredir = "../../OutputTD/2_features"

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

In [4]:
# or read from local folder
df = pd.read_csv(os.path.join(featuredir, "2_7_feature_values.csv"))
print(len(df)) # 3012942

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


3012942


In [6]:
df.head(5)

Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,feature_type,features,values,time
0,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,0,0,demo,ESI_i,3.0,
1,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,0,0,demo,ESI_i,3.0,
2,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,0,0,demo,ESI_i,2.0,
3,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,0,0,demo,ESI_i,3.0,
4,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,0,0,demo,ESI_i,3.0,


In [7]:
df = df[df['feature_type'].isin(['labs', 'vitals'])]
print(len(df)) # 

In [8]:
df.head(5)

Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,feature_type,features,values,time
703222,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,0,0,vitals,Temp,36.9,2019-08-31 10:14:00+00:00
703223,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,0,0,vitals,DBP,62.0,2019-08-31 12:00:00+00:00
703224,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,0,0,vitals,SBP,124.0,2019-08-31 12:00:00+00:00
703225,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,0,0,vitals,Pulse,102.0,2019-08-31 12:00:00+00:00
703226,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,0,0,vitals,RR,11.0,2019-08-31 12:00:00+00:00


### Process Continuous Features
Create a function that "trains" binning featurizer (computes distribution of values) based on subset of the data.  This is important because we only want to build the distribution with our training set and apply the bin featurizer
to the test set (prevents leakage). 

Then create a function that "applies" the trained featurizer on a set of data. 


In [9]:

def convert_to_dict(look_up_table):
    """Converts df look up table to dictionary for faster look up later"""
    bin_val_dict = {}
    for feature in look_up_table['features'].unique():
        feature_bin_vals = look_up_table[look_up_table['features'] == feature]
        for _bin in feature_bin_vals['bins'].unique():
            if feature not in bin_val_dict:
                bin_val_dict[feature] = {}
                bin_val_dict[feature]['min'] = []
                bin_val_dict[feature]['max'] = []

            min_val_for_bin = feature_bin_vals[feature_bin_vals['bins'] == _bin]['values']['min'].values[0]
            max_val_for_bin = feature_bin_vals[feature_bin_vals['bins'] == _bin]['values']['max'].values[0]

            bin_val_dict[feature]['min'].append(min_val_for_bin)
            bin_val_dict[feature]['max'].append(max_val_for_bin)
    return bin_val_dict

    
def train_featurizer(df_train):
    """
    Compute percent_ranks and generates a look up table of min and max bin values
    Input : long form dataframe with features and values where values are the continuous values of labs / vitals
    Output: look up table - dict of dict of lists (key1 = feature_name, key2 = max or min, values = lists of values)
    """
    # Compute percentiles and bins
    df_train['percentiles'] = df_train.groupby('features')['values'].transform(lambda x: x.rank(pct=True))
    df_train['bins'] = df_train['percentiles'].apply(lambda x: int(x * 10))
    
    # Generate look up table and conver to dictionary stucture
    look_up_table_df = df_train.groupby(['features', 'bins']).agg({'values' : ['min', 'max']}).reset_index()
    look_up_table = convert_to_dict(look_up_table_df)
    
    ### Sanity Check. Ensure that min vector for each feature is strictly increasing (no ties!)
    # Should be the case because ties are given same percentile rank in default pandas rank function
    for feature in look_up_table:
        mins = look_up_table[feature]['min']
        for i in range(len(mins)-1):
            assert mins[i] < mins[i+1]
    
    return look_up_table


def apply_featurizer(df, look_up_table):
    
    def get_appropriate_bin(feature, value, look_up_table):
        """Takes in feature, value and look up table and returns appropriate bin

        Quick Note: For some features, we do not have 10 bins.  This happens when we have many many ties in the 
        percent rank - and the percent rank alg returns ties as the average rank within that tie. So for instance
        we're trying to break each feature up into deciles where each bin covers range of 10% of the examples. But if more
        than 10% of the examples take on 1 value - then bins can be skipped. This shouldn't really be a problem
        for downstream tasks - just something to be aware of. This also means 'bins' and 'bins_applied' won't have
        perfect overlap in features that end up having less than 10 bins

        """
        mins = look_up_table[feature]['min']
        for i in range(len(mins) - 1):
            # If value is smaller than min value of smallest bin (in test time) - then return 0 (smallest bin)
            if i == 0 and value < mins[i]:
                return i

            if value >= mins[i] and value < mins[i+1] :
                return i

        # Then in last bin
        return len(mins)-1
    
    df['bins_applied'] = df[['features', 'values']].apply(
        lambda x: get_appropriate_bin(x['features'], x['values'], look_up_table), axis=1)
    
    return df

### Train And Apply Featurizer

In [10]:
df_train = df[df['admit_time'] < '2018-01-01']

look_up_table = train_featurizer(df_train)
df_featurized = apply_featurizer(df, look_up_table)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['percentiles'] = df_train.groupby('features')['values'].transform(lambda x: x.rank(pct=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['bins'] = df_train['percentiles'].apply(lambda x: int(x * 10))


### Quick Sanity Check
For features that have 10 bins from 0 to 9 - `bins` should be same as `bins_applied`

In [11]:
df_train = apply_featurizer(df_train, look_up_table)
look_up_table_df = df_train.groupby(['features', 'bins']).agg({'values' : ['min', 'max']}).reset_index()

features_with_0_9_bins = []
for feature in look_up_table_df:
    num_bins = len(look_up_table_df[look_up_table_df['features'] == feature]['bins'].values)
    ten_in_bins = 10 in look_up_table_df[look_up_table_df['features'] == feature]['bins'].values
    if num_bins == 10 and not ten_in_bins:
        features_with_0_9_bins.append(feature)

for feature in features_with_0_9_bins:
    df_test = df_train[df_train['features'] == 'feature']
    for b_real, b_computed in zip(df_test['bins'].values, df_test['bins_applied'].values):
        assert(b_real == b_computed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bins_applied'] = df[['features', 'values']].apply(


### Little bit of house cleaning
Create new feature names that reflect which bin the value belongs in

In [12]:
columns = ['anon_id', 'pat_enc_csn_id_coded', 'admit_time', 'feature_type', 'features', 'values', 'bins_applied']
df_new = df_featurized[columns]

In [13]:
df_new['features'] = ['_'.join([x, str(y)]) for x, y in zip(df_new['features'].values, df_new['bins_applied'].values)] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['features'] = ['_'.join([x, str(y)]) for x, y in zip(df_new['features'].values, df_new['bins_applied'].values)]


### Get Counts representation
Group by patient, cns, and feature name (with bin value appended to feature name) and make value the number of times
that particular feature appears for that csn id. 

In [14]:
df_final = df_new.groupby(['anon_id', 'pat_enc_csn_id_coded', 'features']).agg(
    {'admit_time' : 'first',
     'feature_type' : 'first',
     'values' : 'count'}).reset_index()

columns = ['anon_id', 'pat_enc_csn_id_coded', 'admit_time', 'feature_type', 'features', 'values']
df_final = df_final[columns] # reorder columns
 
# Rename feature_type to reflect training set used.  'vitals_test' means everything up to 2018 used. (train + dev)
# 'vitals_train' means everything up to July 2017 used. (train)
df_final['feature_type'] = [x + '_results' if x == 'labs' else x for x in df_final['feature_type'].values]
df_final['feature_type'] = [x + '_train' for x in df_final['feature_type'].values]

In [16]:
# Sanity check - sum of the counts should be length of the orginal dataframe
assert df_final['values'].sum() == len(df)

In [17]:
df_final.head(20)

Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALB_3,1
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALK_7,1
2,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALT_0,1
3,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AST_1,1
4,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AnionGap_9,1
5,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,BUN_8,1
6,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,Base_0,2
7,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,Basos_3,1
8,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,CO2_0,1
9,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,Ca_2,1


In [19]:
# Save to CSV until I can get my own custom env on nero to upload to big query from here
df_final.to_csv(os.path.join(featuredir, '2_8_binned_labs_vitals.csv'), index=False)

In [20]:
for feature in look_up_table_df['features'].unique():
    print(look_up_table_df[look_up_table_df['features'] == feature])

   features bins values     
                    min  max
0       ALB    0    1.0  2.3
1       ALB    1    2.4  2.7
2       ALB    2    2.8  2.9
3       ALB    3    3.0  3.2
4       ALB    4    3.3  3.3
5       ALB    5    3.4  3.5
6       ALB    6    3.6  3.7
7       ALB    7    3.8  3.8
8       ALB    8    3.9  4.1
9       ALB    9    4.2  5.8
10      ALB   10    6.4  6.4
   features bins values        
                    min     max
11      ALK    0   14.0    57.0
12      ALK    1   58.0    66.0
13      ALK    2   67.0    75.0
14      ALK    3   76.0    83.0
15      ALK    4   84.0    93.0
16      ALK    5   94.0   104.0
17      ALK    6  105.0   120.0
18      ALK    7  121.0   147.0
19      ALK    8  148.0   216.0
20      ALK    9  217.0  3500.0
   features bins values        
                    min     max
21      ALT    0   10.0    15.0
22      ALT    1   16.0    19.0
23      ALT    2   20.0    22.0
24      ALT    3   23.0    25.0
25      ALT    4   26.0    29.0
26      ALT    