In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os 

### THIS IS MEANT TO RUN ON NERO - NEEDS TO BE CHANGED IF YOU RUN LOCALLY
# Point the Google client libraries at a credentials file and a project before the client
# is created. Both values are hard-coded; the credentials path lives in one user's home dir.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/ccorbin/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions' 
%reload_ext google.cloud.bigquery

from google.cloud import bigquery
# Shared BigQuery client used by the query cells below.
client=bigquery.Client()



### Load in Long Form Feature Matrix and Cohort

In [2]:
# Fetch every column of the cohort table from BigQuery into a pandas DataFrame.
q_cohort = """select * from traige_TE.triage_cohort_final_with_labels_complete1vs"""
query_job = client.query(q_cohort)
# result() waits for the query job to finish before the rows are converted.
df_cohort = query_job.result().to_dataframe()

In [4]:
# Order cohort rows by encounter id. The label frames are sliced from this frame, and the
# alignment-check cell further down compares their order against the csn lists returned by
# create_sparse_feature_matrix.
df_cohort = df_cohort.sort_values('pat_enc_csn_id_coded')

In [5]:
### Save labels to file
# Temporal split on the admit year: before 2018 -> train, 2018 -> validation.
# NOTE(review): rows with an admit year after 2019 (if any exist) end up in no split.
train_labels = df_cohort[df_cohort['admit_time'].dt.year < 2018]
validation_labels = df_cohort[df_cohort['admit_time'].dt.year == 2018]

# Train and validation combined (every year before 2019), alongside the 2019 test split.
train_and_val_labels = df_cohort[df_cohort['admit_time'].dt.year < 2019]
test_labels = df_cohort[df_cohort['admit_time'].dt.year == 2019]


# Hard-coded, user-specific output directory; one CSV is written per split.
path = '/home/ccorbin/BMI212/data/'
train_labels.to_csv(os.path.join(path, 'training_labels.csv'), index=None)
validation_labels.to_csv(os.path.join(path, 'validation_labels.csv'), index=None)
train_and_val_labels.to_csv(os.path.join(path, 'train_and_val_labels.csv'), index=None)
test_labels.to_csv(os.path.join(path, 'test_labels.csv'), index=None)

In [6]:
# Long-form feature rows joined to the cohort on encounter id, plus the admit year that the
# later cells use for the temporal split.
# NOTE(review): only f.* columns are selected, so with the RIGHT JOIN a cohort encounter
# that has no feature rows would presumably come back as a row of NULLs (including its
# encounter id) — confirm whether an INNER JOIN was intended.
q_features = """
SELECT f.*, EXTRACT(YEAR from f.admit_time) year
FROM traige_TE.triage_features_counts_long f
RIGHT JOIN traige_TE.triage_cohort_final_with_labels_complete1vs l
USING (pat_enc_csn_id_coded)
"""
query_job = client.query(q_features)
df_features = query_job.result().to_dataframe()

In [8]:
# Sort by encounter id, then derive two views over the same rows:
#   df_features_val  excludes the '*_test' lab-result/vital feature types
#                    (source of the train / validation examples below)
#   df_features_test excludes the '*_train' lab-result/vital feature types
#                    (source of the train+val / test examples below)
df_features = df_features.sort_values('pat_enc_csn_id_coded')
df_features_val = df_features[~df_features['feature_type'].isin(['labs_results_test', 'vitals_test'])]
df_features_test = df_features[~df_features['feature_type'].isin(['labs_results_train', 'vitals_train'])]

In [9]:
# Same admit-year cut-offs as the label splits: < 2018 and == 2018 taken from the
# validation view, < 2019 and == 2019 taken from the test view.
training_examples = df_features_val[df_features_val['year'] < 2018]
validation_examples = df_features_val[df_features_val['year'] == 2018]
training_and_val_examples = df_features_test[df_features_test['year'] < 2019]
test_examples = df_features_test[df_features_test['year'] == 2019]

In [10]:
training_and_val_examples['features'].nunique()

37761

In [None]:
# Scratch SQL held in a bare string literal: this cell only evaluates the string and never
# passes it to the BigQuery client, and nothing below references it.
"""WITH ed_admit_discharge_time AS (
SELECT anon_id, pat_enc_csn_id_coded, index_time, max(effective_time_jittered_utc) discharge_time
FROM
  (SELECT c.anon_id, c.pat_enc_csn_id_coded, c.index_time, adt.effective_time_jittered_utc
  FROM `mining-clinical-decisions.abx.interm_cohort_with_no_inf_rules` c
  INNER JOIN `shc_core.adt` adt
  USING (pat_enc_csn_id_coded)) t
GROUP BY anon_id, pat_enc_csn_id_coded, index_time 
)

SELECT ed.*, om.med_description, om.order_start_time_utc
FROM ed_admit_discharge_time ed
INNER JOIN `shc_core.order_med` om
USING (anon_id)
INNER JOIN `mining-clinical-decisions.abx.abx_types` abx_types 
USING (med_description)
WHERE om.order_start_time_utc BETWEEN index_time AND TIMESTAMP_ADD(index_time, INTERVAL 14*24 HOUR) 
AND abx_types.is_include_abx = 0 AND abx_types.affects_not_infected_label = 1
ORDER BY anon_id, pat_enc_csn_id_coded, om.order_start_time_utc


"""

In [27]:
from scipy.sparse import csr_matrix, save_npz
import pdb

def build_vocab(data):
    """Build a vocabulary that maps each unique term in `data` to an integer index.

    Parameters
    ----------
    data : iterable of iterables
        One inner iterable of (hashable) terms per document / encounter.

    Returns
    -------
    vocabulary : dict
        Maps term -> index. Indices are handed out in order of first appearance,
        starting at 0, so they cover 0..len(vocabulary)-1 without gaps.
    """
    vocabulary = {}
    for doc in data:
        for term in doc:
            vocabulary.setdefault(term, len(vocabulary))
    return vocabulary


def _group_features_by_encounter(long_df):
    """Collapse a long-form frame to one row per encounter holding parallel lists of
    feature names ('features') and feature values ('values')."""
    return long_df.groupby('pat_enc_csn_id_coded').agg({
        'features' : lambda x: list(x),
        'values' : lambda x: list(x)}).reset_index()


def create_sparse_feature_matrix(train_data, apply_data):
    """Creates a sparse matrix efficiently from a long form dataframe. The vocabulary
       (feature name -> column index) is built from train_data only and then applied
       to apply_data; features of apply_data that are not in the vocabulary are dropped.

       Parameters
       ----------
       train_data : long form pandas DataFrame
           Data to use to build vocabulary. Needs the columns
           'pat_enc_csn_id_coded', 'features' and 'values'.
       apply_data : long form pandas DataFrame
           Data to transform to sparse matrix for input to ML models (same columns).

       Returns
       -------
       csr_data : scipy csr_matrix
           Sparse matrix version of apply_data with one row per encounter and exactly
           len(vocabulary) columns.
       apply_csns : list
           Encounter ids in row order of csr_data (groupby order, i.e. sorted by id
           with pandas' default sort=True).
       vocabulary : dict
           Feature name -> column index mapping built from train_data.
    """
    train_features = _group_features_by_encounter(train_data)
    vocabulary = build_vocab(train_features['features'].values)

    apply_features = _group_features_by_encounter(apply_data)
    apply_features_names = apply_features['features'].values
    apply_features_values = apply_features['values'].values
    apply_csns = list(apply_features['pat_enc_csn_id_coded'].values)

    indptr = [0]
    indices = []
    data = []
    for names, values in zip(apply_features_names, apply_features_values):
        for term, value in zip(names, values):
            column = vocabulary.get(term)
            if column is None:
                # Feature never seen in train_data: it has no column, so skip it.
                continue
            indices.append(column)
            data.append(value)
        indptr.append(len(indices))

    # The width is fixed through `shape` instead of padding a zero entry into the last
    # column: the padding was skipped whenever an encounter's first feature was out of
    # vocabulary, which could leave the matrix narrower than the vocabulary.
    csr_data = csr_matrix((data, indices, indptr),
                          shape=(len(apply_csns), len(vocabulary)),
                          dtype=float)

    return csr_data, apply_csns, vocabulary


In [28]:
# Each matrix is encoded with a vocabulary built from its own training portion only
# (first argument), so features that appear only in the held-out data are dropped.
# The train / validation matrices and csn lists are built here as well because the
# alignment-check and save cells below use train_csr, train_csns, validation_csr and val_csns.
# create_sparse_feature_matrix returns three values: (matrix, csns, vocabulary).
train_csr, train_csns, train_vocab = create_sparse_feature_matrix(training_examples, training_examples)
validation_csr, val_csns, val_vocab = create_sparse_feature_matrix(training_examples, validation_examples)
train_and_val_csr, train_and_val_csns, train_and_val_vocab = create_sparse_feature_matrix(training_and_val_examples, training_and_val_examples)
test_csr, test_csns, test_and_val_vocab = create_sparse_feature_matrix(training_and_val_examples, test_examples)

In [30]:
# Read the shape from the sparse matrix itself; .toarray() would first allocate the full
# dense array just to report the same tuple.
test_csr.shape

(10096, 37761)

In [20]:
len(set(training_and_val_examples['features'].values).difference(set(test_examples['features'].values)))

13500

In [22]:
len(set(test_examples['features'].values))

28472

In [13]:
len(train_csns)

22037

In [14]:
len(train_labels['pat_enc_csn_id_coded'].values)

22037

In [31]:
# Row i of every sparse matrix must belong to the same encounter as row i of the matching
# label frame. Lengths are compared first because a plain zip() stops at the shorter
# sequence and would let missing or extra encounters slip through unnoticed.
for split_name, split_labels, split_csns in [
    ('train', train_labels, train_csns),
    ('validation', validation_labels, val_csns),
    ('train_and_val', train_and_val_labels, train_and_val_csns),
    ('test', test_labels, test_csns),
]:
    label_csns = split_labels['pat_enc_csn_id_coded'].values
    assert len(label_csns) == len(split_csns), (
        '%s: %d label rows vs %d feature rows' % (split_name, len(label_csns), len(split_csns)))
    assert np.array_equal(label_csns, np.asarray(split_csns)), (
        '%s: label and feature encounter ids are not in the same order' % split_name)

In [32]:
from scipy.sparse import csr_matrix, save_npz

import os
# Write each split's sparse matrix as an .npz file into the same hard-coded directory
# used for the label CSVs.
# NOTE(review): train_csr and validation_csr must already have been defined by an earlier cell.
path = '/home/ccorbin/BMI212/data/'
save_npz(os.path.join(path, 'training_examples.npz'), train_csr)
save_npz(os.path.join(path, 'validation_examples.npz'), validation_csr)
save_npz(os.path.join(path, 'training_and_val_examples.npz'), train_and_val_csr)
save_npz(os.path.join(path, 'test_examples.npz'), test_csr)

In [16]:
# ### Create dictionary of all unique features and the set of csns that have the feature
# feature_dict = {}
# for feature, csn in zip(df_features.feature_name.values, df_features.pat_enc_csn_id_coded.values):
#     if feature in feature_dict:
#         feature_dict[feature].add(csn)
#     else:
#         feature_dict[feature] = set()
#         feature_dict[feature].add(csn)
# ### Remove features where less than 50 patients have that features
# feature_dict = {key : value for key, value in feature_dict.items() if len(value) >= 50}

# # Get df into document format where we have a list of csn's that each are a list of features. 
# # index - index of each feature
# # indices - list of indexes in order you loop through csn's and features
# # indptr - num of indices in a document
# # data - the value for each index

# ### Save features matrix in sparse matrix
from scipy.sparse import csr_matrix

# NOTE(review): feature_names and feature_values are not defined in any cell visible here,
# so this cell raises NameError on a fresh kernel unless they are created elsewhere.
# NOTE(review): this rebinds test_csr, data, indices, indptr and vocabulary at notebook
# scope; test_csr becomes an int-typed matrix whose vocabulary is built from `docs` itself.
docs = feature_names
indptr = [0]
indices = []
data = []
vocabulary = {}
for i, d in enumerate(docs):
    for j, term in enumerate(d):
        # setdefault assigns the next free column index the first time a term is seen.
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(feature_values[i][j])
    # Row boundary: everything appended since the previous entry belongs to document i.
    indptr.append(len(indices))

test_csr = csr_matrix((data, indices, indptr), dtype=int)


In [18]:
# Read the shape from the sparse matrix itself; .toarray() would first allocate the full
# dense array just to report the same tuple.
test_csr.shape

(41654, 41974)

In [7]:
# Dump the long-form features and the cohort as CSV; the file names are relative, so they
# land in the kernel's current working directory.
df_features.to_csv('features_long.csv', index=None)
df_cohort.to_csv('cohort.csv', index=None)

In [4]:
# Feature types kept for the dense (pivoted) experiment in the cells below.
feature_types = {
    'demo',
    'Lab',
    'Meds',
    'Imaging',
    'Procedures',
    'Diagnosis',
    'vitals_train',
    'labs_results_train',
}

### Randomly sample 5000 examples from our 30k (currently skipped: the full cohort is used)

In [5]:
# Keep only rows whose feature_type is listed in feature_types.
df_features_full = df_features[df_features['feature_type'].isin(feature_types)]
# The full cohort is used. The former df_cohort.sample(n=5000) assignment was overwritten
# on the very next line, so it only cost time and drew unseeded random numbers.
df_cohort_small = df_cohort
# Set of encounter ids, used below to filter the feature rows.
csns = set(df_cohort_small['pat_enc_csn_id_coded'].values)
len(csns)

23626

In [11]:
df_features_full.groupby('feature_type').count()

Unnamed: 0_level_0,jc_uid,pat_enc_csn_id_coded,admit_time,features,values
feature_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Diagnosis,2998309,2998309,2998309,2998309,2998309
Imaging,220761,220761,220761,220761,220761
Lab,787576,787576,787576,787576,787576
Meds,1434271,1434271,1434271,1434271,1434271
Procedures,32401,32401,32401,32401,32401
demo,533024,533024,533024,533024,533024
labs_results_train,308754,308754,308754,308754,308754
vitals_train,333645,333645,333645,333645,333645


In [12]:
df_features_small = df_features_full[df_features_full['pat_enc_csn_id_coded'].isin(csns)]

In [13]:
len(df_features_small)

4736369

### Pivot Features (will be very memory intensive)

In [16]:
# Long -> wide: one row per encounter id, one column per feature name; combinations with
# no row in the long table become 0.0. Note that DataFrame.pivot raises if the same
# (encounter, feature) pair occurs more than once.
features = df_features_small.pivot(index='pat_enc_csn_id_coded',
                          columns='features',
                          values='values').fillna(0.0)

In [17]:
features.shape

(23626, 84631)

### Merge features and cohort labels


In [18]:
# Left join keeps every cohort encounter, even one without any feature row.
# NOTE(review): fillna(0.0) runs over the whole merged frame, so missing values in the
# cohort's own columns are zero-filled as well, not only the feature columns.
data = pd.merge(df_cohort_small,
                features,
                on='pat_enc_csn_id_coded',
                how='left').fillna(0.0) # if no features there means none of these things ordered and count should be zero

In [None]:
data[data['admit_time'] >= '2017-06-30'].shape

### Split into train and test based on admit time, where admissions from July 2017 onward are the test set

In [19]:
# Temporal hold-out split on admit_time.
# NOTE(review): the two bounds differ ('2017-06-30' vs '2017-07-01'), so rows whose
# admit_time falls between them land in neither set — confirm the gap is intended.
data_train = data[data['admit_time'] < '2017-06-30']
data_test = data[data['admit_time'] >= '2017-07-01']

### Create X_train, X_test, Y_train Y_test

In [20]:
# Dense design matrices: only the pivoted feature columns, selected in the same column
# order for both splits.
X_train = data_train[features.columns].values
X_test = data_test[features.columns].values

# Targets are read from the cohort's 'label' column.
Y_train = data_train['label'].values
Y_test = data_test['label'].values

### Just use sklearn's Random Forest for now (need my own env to install LGBM - getting help from SRCC)

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Using default params except num_trees=1000
# random_state is pinned so that refitting on the same data is repeatable.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(X_train, Y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [23]:
# predict_proba returns one probability column per class; column 1 is the score that is
# compared against the test labels for the AUROC.
predictions = rf.predict_proba(X_test)
auroc = roc_auc_score(Y_test, predictions[:, 1])
print(auroc)

0.8181121918378957


In [24]:
len(predictions)

3656

In [27]:
# Pair each test-set score (column 1 of predict_proba) with its label, one row per example.
out_dict = {
    'yhat': predictions[:, 1],
    'label': Y_test,
}
df_out = pd.DataFrame(out_dict)

In [29]:
df_out.to_csv('rf_yhats.csv', index=False)

In [30]:
np.sum(Y_test)

487

In [31]:
# Sum of the test labels divided by the number of test rows, computed from Y_test so the
# figure cannot go stale (it was previously typed in from earlier outputs as 487/3656).
np.sum(Y_test) / len(Y_test)

0.13320568927789933

In [36]:
rf.feature_importances_.shape

(84631,)

In [38]:
# Pair every pivoted feature column with the forest's feature_importances_ value and show
# the 30 largest.
pd.DataFrame({'features' : features.columns, 'imps' : rf.feature_importances_}).sort_values('imps', ascending=False).head(30)

Unnamed: 0,features,imps
83959,Weight1,0.011707
84616,age1,0.011659
34256,Height1,0.010714
68215,RR_7,0.006511
38000,IMGDXCH1,0.005963
37906,IMGCTHSC,0.00509
70646,SBP_0,0.004514
68211,RR_3,0.004277
44237,LABMETC,0.004242
43562,LABCBCD,0.004228
