In [1]:
import pandas as pd
import numpy as np
#import matplotlib
#matplotlib.use('Agg')
#import matplotlib.pyplot as plt
#%matplotlib inline

import umap

#import seaborn as sns

#from sklearn.decomposition import PCA
#from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.model_selection import train_test_split, KFold

#This needs to be initialized for UMAP repeatability
np.random.seed(15)
from edviz_help import pre_proc, compare_clusters, describe_cluster, pre_proc_no_norm

In [2]:
import warnings
# Suppress warnings for the rest of the session. The call has no category or
# message filter, so it silences every warning, not only the one that motivated it.
# Motivation: the UMAP spectral-embedding warning - see https://github.com/lmcinnes/umap/issues/90
warnings.filterwarnings('ignore')

In [3]:
import datetime
import os

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
def paired_pre_proc(df_train, df_test):
    """Pre-process a train/test pair using statistics of the training frame only.

    Columns whose dtype is not int64/float64 are one-hot encoded with
    ``pd.get_dummies`` (separately for each frame). int64/float64 columns are
    min-max scaled with the training min/max and then centred by subtracting
    the mean of the *scaled* training columns, so that train and test end up
    on the same scale and the same offset.

    Remaining missing values in both frames are filled with the processed
    training column means, columns that are entirely NaN are dropped, and both
    frames are finally restricted to the columns they have in common.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame from which all scaling / centring / imputation statistics are taken.
    df_test : pandas.DataFrame
        Frame transformed with the training statistics.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Processed ``(train, test)`` frames with identical columns, in the
        column order of the processed training frame.
    """
    numeric_dtypes = ['int64', 'float64']

    cat_train = df_train.select_dtypes(exclude=numeric_dtypes)
    num_train = df_train.select_dtypes(include=numeric_dtypes)
    if cat_train.shape[1]:
        cat_train = pd.get_dummies(cat_train, drop_first=False)

    cat_test = df_test.select_dtypes(exclude=numeric_dtypes)
    num_test = df_test.select_dtypes(include=numeric_dtypes)
    if cat_test.shape[1]:
        cat_test = pd.get_dummies(cat_test, drop_first=False)

    # Scaling statistics come from the raw training columns.
    train_min = num_train.min()
    train_range = num_train.max() - train_min

    # Scale the training data first so the centring offset can be computed on
    # the scaled values. A training column with zero range becomes all-NaN here
    # and is removed by the dropna / column-intersection steps below.
    num_train = (num_train - train_min) / train_range
    scaled_train_mean = num_train.mean()
    num_train = num_train - scaled_train_mean

    # Apply exactly the same scale and offset to the test data. (Previously the
    # test data was centred with the mean of the *unscaled* training columns.)
    num_test = (num_test - train_min) / train_range
    num_test = num_test - scaled_train_mean

    df_train = pd.concat([cat_train, num_train], axis=1)
    df_test = pd.concat([cat_test, num_test], axis=1)

    # Impute with training means only, so nothing from the test frame is used.
    df_train = df_train.fillna(df_train.mean())
    df_test = df_test.fillna(df_train.mean())

    df_train = df_train.dropna(axis=1, how='all')
    df_test = df_test.dropna(axis=1, how='all')

    # Keep only columns present in both frames (e.g. dummy levels seen on one
    # side only are discarded), preserving the training column order.
    columns = [col for col in df_train.columns if col in df_test.columns]

    df_train = df_train[columns]
    df_test = df_test[columns]

    return df_train, df_test

## Real Data
Now we'll treat our real dataset like we treated the synthetic data.

In [6]:
# Load the raw dataset from a CSV located via a relative path.
# NOTE(review): the previous remark here said "Renamed ed_data => X for memory
# reasons overnight", but this cell binds the frame to `ed_data` - the remark
# looks stale; confirm before relying on it.
ed_data = pd.read_csv('../../edviz_raw.csv')

In [7]:
# Named groups of ed_data columns.
# `outcomes` is dropped from the feature frame by the grid-search cells;
# `demographics` and `other_cat` are not referenced in the cells shown here.
demographics = [
    'ethnicity',
    'race',
    'lang',
    'religion',
    'maritalstatus',
    'employstatus',
    'insurance_status',
    'dep_name',
    'arrivalmode',
    'gender',
    'previousdispo',
]
outcomes = [
    'disposition',
    'esi',
]
other_cat = [
    'arrivalmonth',
    'arrivalday',
    'arrivalhour_bin',
]

In [8]:
# Names of every ed_data column containing the substring 'cc_'.
# NOTE(review): this is a substring test, not a prefix test, so a name such as
# 'acc_x' would also match - confirm no such columns exist.
cc_cols = [col_name for col_name in ed_data.columns if 'cc_' in col_name]

In [9]:
# Per-column summary statistics for the cc_ columns: one row per column,
# ordered from the highest mean to the lowest.
foo = (
    ed_data[cc_cols]
    .describe()
    .transpose()
    .sort_values(by='mean', ascending=False)
)

In [10]:
# Show the 15 cc_ columns with the highest mean (foo is sorted by 'mean', descending).
foo.head(15)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cc_abdominalpain,557901.0,0.09741,0.296606,0.0,0.0,0.0,0.0,2.0
cc_other,557901.0,0.091459,0.293108,0.0,0.0,0.0,0.0,3.0
cc_chestpain,557901.0,0.064166,0.245121,0.0,0.0,0.0,0.0,2.0
cc_shortnessofbreath,557901.0,0.044212,0.205627,0.0,0.0,0.0,0.0,2.0
cc_backpain,557901.0,0.037001,0.188812,0.0,0.0,0.0,0.0,2.0
cc_fall,557901.0,0.034103,0.181563,0.0,0.0,0.0,0.0,2.0
cc_alcoholintoxication,557901.0,0.02857,0.166637,0.0,0.0,0.0,0.0,2.0
cc_motorvehiclecrash,557901.0,0.026691,0.161235,0.0,0.0,0.0,0.0,2.0
cc_dizziness,557901.0,0.022923,0.149707,0.0,0.0,0.0,0.0,2.0
cc_cough,557901.0,0.022506,0.148346,0.0,0.0,0.0,0.0,2.0


In [11]:
# Sum of the per-column means over all cc_ columns. By linearity this equals the
# mean of the row-wise total across those columns (assuming no missing values).
foo['mean'].sum()

1.1383023152853282

In [12]:
# UMAP hyper-parameter grid search.
# For each column named in cc_list and every (n_neighbors, min_dist, metric)
# combination: fit UMAP on the training part of each of the 5 KFold splits,
# project the held-out test set with the fitted model, and write both
# embeddings to CSV files under grid_search_data/.
print(f'Starting at {datetime.datetime.now()}')
recent = datetime.datetime.now()
cc_list = ['cc_abdominalpain']
neighbor_list = [15, 150] #Default is 15
min_dist_list = [0, 0.1, .25] #Default is 0.1
#metric_list = ['euclidean', 'mahalanobis'] #Default is 'euclidean'
metric_list = ['euclidean']
# Skip the very first parameter combination (with the lists above:
# n_neighbors=15, min_dist=0).
# NOTE(review): presumably that combination was already produced by an earlier
# run - confirm before re-running on a fresh output folder.
skip_first = True

# np.savetxt does not create missing directories; make sure the output folder
# exists before spending minutes on the first fit.
os.makedirs('grid_search_data', exist_ok=True)

for target_cc in cc_list:
    cc = target_cc[3:]  # drop the leading 'cc_' for log lines and file names

    # NOTE(review): the describe() output earlier in this notebook reports
    # max = 2.0 for cc_abdominalpain, so `== 1` leaves out rows whose value
    # is 2 - confirm that is intended.
    X = ed_data[ed_data[target_cc] == 1]
    #X = ed_data[ed_data['cc_backpain'] == 1].drop(columns=cc_cols)
    # y / y_train / y_test are not used further in this cell.
    y = pd.get_dummies(X['disposition']).drop(columns='Discharge')
    X = X.drop(columns=outcomes)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, random_state = 31415
    )

    X_train = X_train.copy()
    X_test = X_test.copy()

    kf = KFold(n_splits=5, random_state=1, shuffle=True)

    for n_neighbors in neighbor_list:
        for min_dist in min_dist_list:
            for metric in metric_list:
                if skip_first:
                    skip_first = False
                    continue

                print(f"cc = '{cc}', n_neighbors = {n_neighbors}, min_dist = {min_dist}, metric = '{metric}'")
                recent = datetime.datetime.now()

                # Only the training indices of each split are used; the
                # validation part of the fold is discarded.
                for i, (train_index, _) in enumerate(kf.split(X_train)):
                    X_train_fold = pd.DataFrame(X_train.iloc[train_index])
                    X_train_fold, X_test_fold = paired_pre_proc(X_train_fold, X_test)

                    # The fold number doubles as the UMAP random seed.
                    ebd = umap.UMAP(
                        n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        metric=metric,
                        random_state=i
                    )
                    X_train_fold = ebd.fit_transform(X_train_fold)

                    # total_seconds() covers the whole interval; the .seconds
                    # attribute silently drops whole days.
                    now = datetime.datetime.now()
                    print(f'{now}--- Fold {i} train embedding complete. '
                          f'({(now - recent).total_seconds()/60:.2f} '
                          f'minutes since last train completion).')
                    recent = now
                    np.savetxt(f'grid_search_data/{cc}_{n_neighbors}_{min_dist}_{metric}_train_embedding_fold_{i}.csv', X_train_fold, delimiter=',')

                    X_test_fold = ebd.transform(X_test_fold)

                    print(f'{datetime.datetime.now()}--- Fold {i} test embedding complete.')
                    np.savetxt(f'grid_search_data/{cc}_{n_neighbors}_{min_dist}_{metric}_test_embedding_fold_{i}.csv', X_test_fold, delimiter=',')

Starting at 2019-04-03 06:55:50.206611
cc = 'abdominalpain', n_neighbors = 15, min_dist = 0.1, metric = 'euclidean'
2019-04-03 07:03:38.830090--- Fold 0 train embedding complete. (7.78 minutes since last train completion).
2019-04-03 07:04:33.672329--- Fold 0 test embedding complete.
2019-04-03 07:13:11.948359--- Fold 1 train embedding complete. (9.55 minutes since last train completion).
2019-04-03 07:14:15.278998--- Fold 1 test embedding complete.
2019-04-03 07:21:52.650838--- Fold 2 train embedding complete. (8.67 minutes since last train completion).
2019-04-03 07:22:40.931832--- Fold 2 test embedding complete.
2019-04-03 07:30:46.121685--- Fold 3 train embedding complete. (8.88 minutes since last train completion).
2019-04-03 07:31:34.781317--- Fold 3 test embedding complete.
2019-04-03 07:40:29.165720--- Fold 4 train embedding complete. (9.72 minutes since last train completion).
2019-04-03 07:41:18.822669--- Fold 4 test embedding complete.
cc = 'abdominalpain', n_neighbors = 15,

In [14]:
# UMAP hyper-parameter grid search with n_neighbors = 2.
# For each column named in cc_list and every (n_neighbors, min_dist, metric)
# combination: fit UMAP on the training part of each of the 5 KFold splits,
# project the held-out test set with the fitted model, and write both
# embeddings to CSV files under grid_search_data/.
print(f'Starting at {datetime.datetime.now()}')
recent = datetime.datetime.now()
cc_list = ['cc_abdominalpain']
neighbor_list = [2] #Default is 15
min_dist_list = [0, 0.1, .25] #Default is 0.1
#metric_list = ['euclidean', 'mahalanobis'] #Default is 'euclidean'
metric_list = ['euclidean']
# Skip the very first parameter combination (with the lists above:
# n_neighbors=2, min_dist=0).
# NOTE(review): the reason for skipping is not stated in this cell - confirm.
skip_first = True

# np.savetxt does not create missing directories; make sure the output folder
# exists before spending minutes on the first fit.
os.makedirs('grid_search_data', exist_ok=True)

for target_cc in cc_list:
    cc = target_cc[3:]  # drop the leading 'cc_' for log lines and file names

    # NOTE(review): the describe() output earlier in this notebook reports
    # max = 2.0 for cc_abdominalpain, so `== 1` leaves out rows whose value
    # is 2 - confirm that is intended.
    X = ed_data[ed_data[target_cc] == 1]
    #X = ed_data[ed_data['cc_backpain'] == 1].drop(columns=cc_cols)
    # y / y_train / y_test are not used further in this cell.
    y = pd.get_dummies(X['disposition']).drop(columns='Discharge')
    X = X.drop(columns=outcomes)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, random_state = 31415
    )

    X_train = X_train.copy()
    X_test = X_test.copy()

    kf = KFold(n_splits=5, random_state=1, shuffle=True)

    for n_neighbors in neighbor_list:
        for min_dist in min_dist_list:
            for metric in metric_list:
                if skip_first:
                    skip_first = False
                    continue

                print(f"cc = '{cc}', n_neighbors = {n_neighbors}, min_dist = {min_dist}, metric = '{metric}'")
                recent = datetime.datetime.now()

                # Only the training indices of each split are used; the
                # validation part of the fold is discarded.
                for i, (train_index, _) in enumerate(kf.split(X_train)):
                    X_train_fold = pd.DataFrame(X_train.iloc[train_index])
                    X_train_fold, X_test_fold = paired_pre_proc(X_train_fold, X_test)

                    # The fold number doubles as the UMAP random seed.
                    ebd = umap.UMAP(
                        n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        metric=metric,
                        random_state=i
                    )
                    X_train_fold = ebd.fit_transform(X_train_fold)

                    # total_seconds() covers the whole interval; the .seconds
                    # attribute silently drops whole days.
                    now = datetime.datetime.now()
                    print(f'{now}--- Fold {i} train embedding complete. '
                          f'({(now - recent).total_seconds()/60:.2f} '
                          f'minutes since last train completion).')
                    recent = now
                    np.savetxt(f'grid_search_data/{cc}_{n_neighbors}_{min_dist}_{metric}_train_embedding_fold_{i}.csv', X_train_fold, delimiter=',')

                    X_test_fold = ebd.transform(X_test_fold)

                    print(f'{datetime.datetime.now()}--- Fold {i} test embedding complete.')
                    np.savetxt(f'grid_search_data/{cc}_{n_neighbors}_{min_dist}_{metric}_test_embedding_fold_{i}.csv', X_test_fold, delimiter=',')

Starting at 2019-04-03 18:09:39.717543
cc = 'abdominalpain', n_neighbors = 2, min_dist = 0.1, metric = 'euclidean'
2019-04-03 18:14:07.562993--- Fold 0 train embedding complete. (4.42 minutes since last train completion).
2019-04-03 18:14:26.977546--- Fold 0 test embedding complete.
2019-04-03 18:20:05.670979--- Fold 1 train embedding complete. (5.97 minutes since last train completion).
2019-04-03 18:20:23.347956--- Fold 1 test embedding complete.
2019-04-03 18:24:38.479546--- Fold 2 train embedding complete. (4.53 minutes since last train completion).
2019-04-03 18:24:56.277771--- Fold 2 test embedding complete.
2019-04-03 18:29:23.115266--- Fold 3 train embedding complete. (4.73 minutes since last train completion).
2019-04-03 18:29:39.474087--- Fold 3 test embedding complete.
2019-04-03 18:33:53.633571--- Fold 4 train embedding complete. (4.50 minutes since last train completion).
2019-04-03 18:34:11.406270--- Fold 4 test embedding complete.
cc = 'abdominalpain', n_neighbors = 2, m

In [15]:
# UMAP hyper-parameter grid search for min_dist = 0.
# For each column named in cc_list and every (n_neighbors, min_dist, metric)
# combination: fit UMAP on the training part of each of the 5 KFold splits,
# project the held-out test set with the fitted model, and write both
# embeddings to CSV files under grid_search_data/.
print(f'Starting at {datetime.datetime.now()}')
recent = datetime.datetime.now()
cc_list = ['cc_abdominalpain']
neighbor_list = [2, 15] #Default is 15
min_dist_list = [0] #Default is 0.1
#metric_list = ['euclidean', 'mahalanobis'] #Default is 'euclidean'
metric_list = ['euclidean']

# np.savetxt does not create missing directories; make sure the output folder
# exists before spending minutes on the first fit.
os.makedirs('grid_search_data', exist_ok=True)

for target_cc in cc_list:
    cc = target_cc[3:]  # drop the leading 'cc_' for log lines and file names

    # NOTE(review): the describe() output earlier in this notebook reports
    # max = 2.0 for cc_abdominalpain, so `== 1` leaves out rows whose value
    # is 2 - confirm that is intended.
    X = ed_data[ed_data[target_cc] == 1]
    #X = ed_data[ed_data['cc_backpain'] == 1].drop(columns=cc_cols)
    # y / y_train / y_test are not used further in this cell.
    y = pd.get_dummies(X['disposition']).drop(columns='Discharge')
    X = X.drop(columns=outcomes)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, random_state = 31415
    )

    X_train = X_train.copy()
    X_test = X_test.copy()

    kf = KFold(n_splits=5, random_state=1, shuffle=True)

    for n_neighbors in neighbor_list:
        for min_dist in min_dist_list:
            for metric in metric_list:
                print(f"cc = '{cc}', n_neighbors = {n_neighbors}, min_dist = {min_dist}, metric = '{metric}'")
                recent = datetime.datetime.now()

                # Only the training indices of each split are used; the
                # validation part of the fold is discarded.
                for i, (train_index, _) in enumerate(kf.split(X_train)):
                    X_train_fold = pd.DataFrame(X_train.iloc[train_index])
                    X_train_fold, X_test_fold = paired_pre_proc(X_train_fold, X_test)

                    # The fold number doubles as the UMAP random seed.
                    ebd = umap.UMAP(
                        n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        metric=metric,
                        random_state=i
                    )
                    X_train_fold = ebd.fit_transform(X_train_fold)

                    # total_seconds() covers the whole interval; the .seconds
                    # attribute silently drops whole days.
                    now = datetime.datetime.now()
                    print(f'{now}--- Fold {i} train embedding complete. '
                          f'({(now - recent).total_seconds()/60:.2f} '
                          f'minutes since last train completion).')
                    recent = now
                    np.savetxt(f'grid_search_data/{cc}_{n_neighbors}_{min_dist}_{metric}_train_embedding_fold_{i}.csv', X_train_fold, delimiter=',')

                    X_test_fold = ebd.transform(X_test_fold)

                    print(f'{datetime.datetime.now()}--- Fold {i} test embedding complete.')
                    np.savetxt(f'grid_search_data/{cc}_{n_neighbors}_{min_dist}_{metric}_test_embedding_fold_{i}.csv', X_test_fold, delimiter=',')

Starting at 2019-04-03 19:01:02.472626
cc = 'abdominalpain', n_neighbors = 2, min_dist = 0, metric = 'euclidean'
2019-04-03 19:05:28.980155--- Fold 0 train embedding complete. (4.40 minutes since last train completion).
2019-04-03 19:05:47.274929--- Fold 0 test embedding complete.
2019-04-03 19:10:24.620312--- Fold 1 train embedding complete. (4.92 minutes since last train completion).
2019-04-03 19:10:41.987721--- Fold 1 test embedding complete.
2019-04-03 19:14:40.478213--- Fold 2 train embedding complete. (4.25 minutes since last train completion).
2019-04-03 19:14:58.061147--- Fold 2 test embedding complete.
2019-04-03 19:19:25.964475--- Fold 3 train embedding complete. (4.75 minutes since last train completion).
2019-04-03 19:19:42.596682--- Fold 3 test embedding complete.
2019-04-03 19:24:08.091639--- Fold 4 train embedding complete. (4.70 minutes since last train completion).
2019-04-03 19:24:25.129362--- Fold 4 test embedding complete.
cc = 'abdominalpain', n_neighbors = 15, mi