In [0]:
import numpy as np
import pandas as pd

In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Data Description

In [0]:
# Shared drive read_csv
wids = pd.read_csv('/content/drive/Shared drives/DataMining_FinalProject/Data/training_v2.csv')
print("This data has {} rows and {} columns.".format(wids.shape[0], wids.shape[1]))

This data has 91713 rows and 186 columns.


In [0]:
wids[['encounter_id', 'patient_id', 'hospital_id']].nunique()
print("Patient_id and encounter_id are unique.")

Patient_id and encounter_id are unique.


In [0]:
wids['hospital_death'].value_counts(normalize=True)
print("We have an imbalanced dataset. Only around 8% of our patients have hospital_death =1.")

We have an imbalanced dataset. Only around 8% of our patients have hospital_death =1.


# Feature Engineering
We are going to create features relevant to a patient's possibility of death. The following are the functions for feature engineering. 

In [0]:
# Viviana
def gcs_finalscore(df):
    """
    Parameters
    ----------
    df : dataframe
        full dataframe.

    Returns
    -------
    df['gcs_score'] : series
        Gasglow Coma Scale Score.
    """
    df['gcs_score'] =  df['gcs_eyes_apache'] + df['gcs_verbal_apache'] + df['gcs_motor_apache']
    return df['gcs_score']

def creatine_risks(df):
    """

    Parameters
    ----------
    df : dataframe
        full dataframe.

    Returns
    -------
    df['creatine_risks'] : series
        Series defines risk from creatine (0) if they are below the risk threshold and (1) if they are at risk .

    """
    
    df['creatine_atrisk'] = pd.cut(pt_df['creatinine_apache'],bins = [0,1.1,np.inf], labels = [0,1])
    
    return df['creatine_atrisk']
  

def bilirubin_risk (df):
    """ Column that tells us if the individual is at risk liver problems. High bilirubin is associated with a higher risk.

    Parameters
    ----------
    df : Dataframe
        original dataframe.

    Returns
    -------
    df['bilirubin_risk']: Series
        a colum of 0s and 1s, 1 if you are above .6, 0 otherwise.

    """
    df['bilirubin_risk'] = pd.cut(df['bilirubin_apache'], bins = [0,.6,np.inf], labels = [0,1])

    return df['bilirubin_risk']

def glucose_risk (df):
    """ Column that tells us if the individual is at risk. High glucose above 240 is associated with a higher risk of ICU death. 

    Parameters
    ----------
    df : Dataframe
        original dataframe.

    Returns
    -------
    df['glucose_risk']: Series
        a colum of 0s and 1s, 1 if you are above 240, 0 otherwise.

    """
    df['glucose_risk'] = pd.cut(df['glucose_apache'], bins = [0,240,np.inf], labels = [0,1])


    return df['glucose_risk']




In [0]:
# Jessica

#number of test done in an hour
def h1_test(df):
    column = df.columns
    h1_list = []
    for i in column:
        if (i[:2]=='h1'):
            h1_list.append(i)
    return df[h1_list].notnull().sum(axis=1)/2


#number of test done in a day
def d1_test(df):
    column = df.columns
    d1_list = []
    for i in column:
        if (i[:2]=='d1'):
            d1_list.append(i)
    return df[d1_list].notnull().sum(axis=1)/2



#number of test are missing
def missing_function(df):
    df_sub = df.loc[:,'d1_diasbp_invasive_max':'h1_pao2fio2ratio_min']
    return df_sub.isnull().sum(axis=1)/2


#number of chronic conditions
def chronic_function(df):
    df_sub = df.loc[:, 'aids':'solid_tumor_with_metastasis'] 
    return df_sub.sum(axis=1)

In [0]:
# Vamika

def avg_h1_cols(df):
    '''
    Creates column with average h1 data using col_max and col_min variables

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    avg_h1_data         pd.DataFrame

    '''
    avg_h1_list = ['h1_diasbp', 'h1_heartrate', 'h1_mbp', 'h1_resprate', 'h1_spo2', 'h1_sysbp',
                        'h1_temp', 'h1_albumin', 'h1_bilirubin', 'h1_bun', 'h1_calcium', 'h1_creatinine',
                        'h1_glucose', 'h1_hco3', 'h1_hemaglobin', 'h1_hematocrit', 'h1_inr', 'h1_lactate', 
                        'h1_platelets', 'h1_potassium', 'h1_sodium', 'h1_wbc', 'h1_arterial_pco2', 
                        'h1_arterial_ph', 'h1_arterial_po2', 'h1_pao2fio2ratio']
    avg_h1_data = pd.DataFrame()
    for col in avg_h1_list:
        avg_h1_data[col + '_avg'] = df[[col + '_min', col + '_max']].mean(axis=1)
    return(avg_h1_data) 

    

def diff_h1d1_cols(df):
    '''
    Creates column with difference in h1 and d1 data using col_max and col_min variables

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    diff_h1d1_data         pd.DataFrame

    '''
    diff_h1d1_list = ['h1_diasbp', 'h1_heartrate', 'h1_mbp', 'h1_resprate', 'h1_spo2', 'h1_sysbp',
                        'h1_temp', 'h1_albumin', 'h1_bilirubin', 'h1_bun', 'h1_calcium', 'h1_creatinine',
                        'h1_glucose', 'h1_hco3', 'h1_hemaglobin', 'h1_hematocrit', 'h1_inr', 'h1_lactate', 
                        'h1_platelets', 'h1_potassium', 'h1_sodium', 'h1_wbc', 'h1_arterial_pco2', 
                        'h1_arterial_ph', 'h1_arterial_po2', 'h1_pao2fio2ratio', 'd1_diasbp','d1_heartrate',
                        'd1_mbp','d1_resprate','d1_spo2', 'd1_sysbp', 'd1_temp', 'd1_albumin',
                        'd1_bilirubin', 'd1_bun', 'd1_calcium', 'd1_creatinine', 'd1_glucose', 'd1_hco3',
                        'd1_hemaglobin', 'd1_hematocrit', 'd1_inr', 'd1_lactate', 'd1_platelets', 'd1_potassium',
                        'd1_sodium', 'd1_wbc', 'd1_arterial_pco2', 'd1_arterial_ph', 'd1_arterial_po2', 'd1_pao2fio2ratio']
    diff_h1d1_data = pd.DataFrame()
    for col in diff_h1d1_list:
        diff_h1d1_data[col + '_diff'] = df[col + '_max'] - df[col + '_min']
    return(diff_h1d1_data)

    
def med_diff_cols(df):
    '''
    Creates column with difference in median(h1) and median(d1) data using col_max and col_min variables

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    med_diff_data         pd.DataFrame

    '''

    h1d1_list = ['diasbp', 'heartrate', 'mbp', 'resprate', 'spo2', 'sysbp', 'temp', 'albumin', 'bilirubin', 'bun',
                 'calcium', 'creatinine', 'glucose', 'hco3', 'hemaglobin', 'hematocrit', 'inr', 'lactate', 'platelets',
                 'potassium', 'sodium', 'wbc', 'arterial_pco2', 'arterial_ph', 'arterial_po2', 'pao2fio2ratio']
    med_diff_data = pd.DataFrame()
    for col in h1d1_list:
        med_diff_data[col + 'med_diff'] = ((df[['d1_' + col + '_min', 'd1_' + col + '_max']].median(axis=1)) - df[['h1_' + col + '_min', 'h1_' + col + '_max']].median(axis=1))
    return(med_diff_data)

def h1_ppv(df):
    '''
    Creates column with the h1 blood pressure ratio (sys / dias)

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    h1_ppv         pd.DataFrame

    '''
    h1_ppv = pd.DataFrame()
    h1_ppv['h1_ppv'] = (df[['h1_sysbp_max', 'h1_sysbp_min']].median(axis=1)) / (df[['h1_diasbp_max', 'h1_diasbp_min']].median(axis=1))
    return(h1_ppv)


def d1_ppv(df):
    '''
    Creates column with the d1 blood pressure ratio (sys / dias)

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    d1_ppv         pd.DataFrame

    '''
    d1_ppv = pd.DataFrame()
    d1_ppv['d1_ppv'] = (df[['d1_sysbp_max', 'd1_sysbp_min']].median(axis=1)) / (df[['d1_diasbp_max', 'd1_diasbp_min']].median(axis=1))
    return(d1_ppv)

In [0]:
diff_h1d1_cols(wids)

Unnamed: 0,h1_diasbp_diff,h1_heartrate_diff,h1_mbp_diff,h1_resprate_diff,h1_spo2_diff,h1_sysbp_diff,h1_temp_diff,h1_albumin_diff,h1_bilirubin_diff,h1_bun_diff,h1_calcium_diff,h1_creatinine_diff,h1_glucose_diff,h1_hco3_diff,h1_hemaglobin_diff,h1_hematocrit_diff,h1_inr_diff,h1_lactate_diff,h1_platelets_diff,h1_potassium_diff,h1_sodium_diff,h1_wbc_diff,h1_arterial_pco2_diff,h1_arterial_ph_diff,h1_arterial_po2_diff,h1_pao2fio2ratio_diff,d1_diasbp_diff,d1_heartrate_diff,d1_mbp_diff,d1_resprate_diff,d1_spo2_diff,d1_sysbp_diff,d1_temp_diff,d1_albumin_diff,d1_bilirubin_diff,d1_bun_diff,d1_calcium_diff,d1_creatinine_diff,d1_glucose_diff,d1_hco3_diff,d1_hemaglobin_diff,d1_hematocrit_diff,d1_inr_diff,d1_lactate_diff,d1_platelets_diff,d1_potassium_diff,d1_sodium_diff,d1_wbc_diff,d1_arterial_pco2_diff,d1_arterial_ph_diff,d1_arterial_po2_diff,d1_pao2fio2ratio_diff
0,5.0,11.0,1.0,8.0,26.0,16.0,2.0,,,,,,,,,,,,,,,,,,,,31.0,47.0,43.0,24.0,26.0,58.0,2.7,0.0,0.0,1.0,1.1,0.28,59.0,4.0,0.0,0.0,,0.3,0.0,0.6,2.0,0.0,,,,
1,13.0,14.0,28.0,3.0,25.0,24.0,0.0,,,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.00,0.0,0.0,64.0,46.0,82.0,20.0,30.0,92.0,1.2,0.0,0.0,2.0,0.6,0.15,17.0,1.0,0.2,0.8,0.0,0.0,70.0,0.4,0.0,10.6,0.0,0.00,0.0,3.800000
2,30.0,18.0,8.0,4.0,7.0,24.0,0.0,,,,,,,,,,,,,,,,,,,,40.0,28.0,34.0,13.0,7.0,43.0,0.3,,,,,,,,,,,,,,,,,,,
3,18.0,4.0,21.0,1.0,1.0,30.0,0.8,,,,,,,,0.0,0.0,0.5,,0.0,,,0.0,3.000,0.03,72.0,0.0,6.0,24.0,0.0,16.0,5.0,74.0,3.2,,,,,,97.0,,2.7,8.1,0.5,,155.0,1.5,,1.0,10.0,0.10,235.0,105.833333
4,31.0,13.0,12.0,,0.0,10.0,,,,,,,,,,,,,,,,,,,,,42.0,29.0,14.0,2.0,4.0,27.0,0.5,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91708,11.0,15.0,9.0,8.0,1.0,7.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.000,0.00,0.0,,60.0,45.0,61.0,18.0,15.0,61.0,1.6,,,0.0,0.0,0.00,213.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.00,0.0,
91709,8.0,6.0,10.0,6.0,3.0,20.0,0.0,,,,,,,,,,,,,,,,,,,,33.0,24.0,35.0,21.0,55.0,62.0,1.3,,,0.0,0.0,0.00,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,27.0,0.11,124.0,124.000000
91710,,,,,,,0.0,,,,,,,,,,,,,,,,0.003,0.00,0.0,,28.0,79.0,25.0,34.0,18.0,35.0,1.1,0.3,,15.0,0.9,1.30,452.0,13.0,0.0,0.0,,,0.0,1.5,4.0,0.0,5.1,0.43,76.0,
91711,,,,,,,,,,,,,,,,,,,,,,,,,,,67.0,22.0,54.0,9.0,16.0,42.0,0.2,,,,,,,,,,,,,,,,0.0,0.00,0.0,0.000000


In [0]:
# Meghna

# Flag for Urine output level
def urine_flag(df):
    '''
    Creates 1 for patients with less than 800 mL of urine output in the first 24 hours
    0 Otherwise

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    low_urine         pd.Series

    '''
    bins = [0, 800, np.inf]
    names = [1, 0]
    
    low_urine = pd.cut(df['urineoutput_apache'], bins, labels=names, include_lowest = True)
    return pd.Series(low_urine)

# Flag for white blood cells count
def wbc_level(df):
    '''
    Creates three levels for different levels of White Blood Cells
    Level = 0 --> wbc <10
    Level = 1 --> 10 <= wbc <s10
    Level = 2 --> 30 <= wbc

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    wbc_level         pd.Series

    '''
    bins = [0, 10, 30, np.inf]
    names = [0, 1, 2]
    
    wbc_level = pd.cut(df['wbc_apache'], bins, labels=names, include_lowest = True)
    return pd.Series(wbc_level)

# Flag for lactate level
def lactate_level(df):
    '''
    Creates 1 for patients with lactate greater than 2.2 in the first 24 hours
    0 Otherwise
    
    Used d1_lactate_max as a more conservative measure

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    high_lactate       pd.Series

    '''
    bins = [0, 2.2, np.inf]
    names = [1, 0]
    
    high_lactate = pd.cut(df['d1_lactate_max'], bins, labels=names, include_lowest = True)
    return pd.Series(high_lactate)

# Flag for platelets level
def platelets_level(df):
    '''
    Creates 1 for patients with platelets less than 100 in the first 24 hours
    0 Otherwise
    
    Used d1_plateslets_min as a more conservative measure

    Parameters:
    -----------------------------
    df                pd.DataFrame

    Return:
    -----------------------------
    low_platelets    pd.Series

    '''
    bins = [0, 100, np.inf]
    names = [1, 0]
    
    low_platelets = pd.cut(df['d1_platelets_min'], bins, labels=names, include_lowest = True)
    return pd.Series(low_platelets)



In [0]:
## Jenny
# Diff between first_hour and first_day max/min variables
def diff_h1_d1(df):
    """
    Diff between first_hour and first_day max/min variables. 
    It shows the drop/increase of the measures (max/min).
    
    Parameters:
    -----------------------
    df:                pd.DataFrame
    
    
    Return:
    -----------------------
    output:            pd.DataFrame or pd.Series
                       one or multiple columns that went through the feature engineering. 
    """
    
    
    hd = df[['d1_diasbp_invasive_max', 'd1_diasbp_invasive_min',
       'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max',
       'd1_diasbp_noninvasive_min', 'd1_heartrate_max',
       'd1_heartrate_min', 'd1_mbp_invasive_max', 'd1_mbp_invasive_min',
       'd1_mbp_max', 'd1_mbp_min', 'd1_mbp_noninvasive_max',
       'd1_mbp_noninvasive_min', 'd1_resprate_max', 'd1_resprate_min',
       'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_invasive_max',
       'd1_sysbp_invasive_min', 'd1_sysbp_max', 'd1_sysbp_min',
       'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min',
       'd1_temp_max', 'd1_temp_min', 'h1_diasbp_invasive_max',
       'h1_diasbp_invasive_min', 'h1_diasbp_max', 'h1_diasbp_min',
       'h1_diasbp_noninvasive_max', 'h1_diasbp_noninvasive_min',
       'h1_heartrate_max', 'h1_heartrate_min', 'h1_mbp_invasive_max',
       'h1_mbp_invasive_min', 'h1_mbp_max', 'h1_mbp_min',
       'h1_mbp_noninvasive_max', 'h1_mbp_noninvasive_min',
       'h1_resprate_max', 'h1_resprate_min', 'h1_spo2_max', 'h1_spo2_min',
       'h1_sysbp_invasive_max', 'h1_sysbp_invasive_min', 'h1_sysbp_max',
       'h1_sysbp_min', 'h1_sysbp_noninvasive_max',
       'h1_sysbp_noninvasive_min', 'h1_temp_max', 'h1_temp_min',
       'd1_albumin_max', 'd1_albumin_min', 'd1_bilirubin_max',
       'd1_bilirubin_min', 'd1_bun_max', 'd1_bun_min', 'd1_calcium_max',
       'd1_calcium_min', 'd1_creatinine_max', 'd1_creatinine_min',
       'd1_glucose_max', 'd1_glucose_min', 'd1_hco3_max', 'd1_hco3_min',
       'd1_hemaglobin_max', 'd1_hemaglobin_min', 'd1_hematocrit_max',
       'd1_hematocrit_min', 'd1_inr_max', 'd1_inr_min', 'd1_lactate_max',
       'd1_lactate_min', 'd1_platelets_max', 'd1_platelets_min',
       'd1_potassium_max', 'd1_potassium_min', 'd1_sodium_max',
       'd1_sodium_min', 'd1_wbc_max', 'd1_wbc_min', 'h1_albumin_max',
       'h1_albumin_min', 'h1_bilirubin_max', 'h1_bilirubin_min',
       'h1_bun_max', 'h1_bun_min', 'h1_calcium_max', 'h1_calcium_min',
       'h1_creatinine_max', 'h1_creatinine_min', 'h1_glucose_max',
       'h1_glucose_min', 'h1_hco3_max', 'h1_hco3_min',
       'h1_hemaglobin_max', 'h1_hemaglobin_min', 'h1_hematocrit_max',
       'h1_hematocrit_min', 'h1_inr_max', 'h1_inr_min', 'h1_lactate_max',
       'h1_lactate_min', 'h1_platelets_max', 'h1_platelets_min',
       'h1_potassium_max', 'h1_potassium_min', 'h1_sodium_max',
       'h1_sodium_min', 'h1_wbc_max', 'h1_wbc_min',
       'd1_arterial_pco2_max', 'd1_arterial_pco2_min',
       'd1_arterial_ph_max', 'd1_arterial_ph_min', 'd1_arterial_po2_max',
       'd1_arterial_po2_min', 'd1_pao2fio2ratio_max',
       'd1_pao2fio2ratio_min', 'h1_arterial_pco2_max',
       'h1_arterial_pco2_min', 'h1_arterial_ph_max', 'h1_arterial_ph_min',
       'h1_arterial_po2_max', 'h1_arterial_po2_min',
       'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min']]
    
    # Keep the synthesized ones. 
    hd = hd[hd.columns.drop(hd.filter(regex = 'invasive'))]

    hd = hd.reindex(sorted(hd.columns), axis=1)
    
    # h1-d1
    cols = hd.iloc[:, 52:].columns
    cols = [i.strip('h1_') + '_change' for i in cols] 

    output = pd.DataFrame(hd.iloc[:, 52:].values - hd.iloc[:, :52].values, columns=cols)
    
    return output



# Apache diagnosis - The part before decimal
def grab_main_cat(col):
    """
    Return the main category for apache diagnosis. (apache_3j_diagnosis and apache_2_diagnosis)

    Parameters
    -------------------
    col                 pd.Series
                        The variables we want to extract the main category.

    Return
    -------------------
    output              pd.Series
                        Main categories of diagnosis. 

    """
    output = col.astype('str').str.split('.', n=1, expand=True)[0]
    output = output.astype(str)
    
    return output
