In [188]:
import time, matplotlib, sklearn

# visualization
import matplotlib.pyplot as plt  # for plotting general data
import seaborn as sns  # for statistical data visualization

# data wrangling
import pandas as pd
import numpy as np

import os

# The data intro was tested against specific library versions; print them next to
# the versions installed in this kernel so mismatches are easy to spot.
version_report = [
    ("pandas        Tested version: 2.0.3   Your version: %s", pd),
    ("numpy         Tested version: 1.21.5  Your version: %s", np),
    ("matplotlib    Tested version: 3.5.3   Your version: %s", matplotlib),
    ("scikit-learn  Tested version: 1.2.2   Your version: %s", sklearn),
    ("seaborn                               Your version: %s", sns),  # no tested version recorded
]
for line_template, library in version_report:
    print(line_template % library.__version__)

pandas        Tested version: 2.0.3   Your version: 2.0.3
numpy         Tested version: 1.21.5  Your version: 1.26.4
matplotlib    Tested version: 3.5.3   Your version: 3.5.3
scikit-learn  Tested version: 1.2.2   Your version: 1.2.2
seaborn                               Your version: 0.13.2


# Loading data


In [180]:
# Variable dictionary: one row per variable in the dataset.
description = pd.read_csv('./data/WiDS_Datathon_2020_Dictionary.csv')

# Same information keyed by variable name: {variable name -> {dictionary column -> value}}.
description_by_variable = description.set_index('Variable Name')
description_dict = description_by_variable.to_dict(orient='index')

# Training data.
df = pd.read_csv('./data/training_v2.csv')

df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [181]:
description

Unnamed: 0,Category,Variable Name,Unit of Measure,Data Type,Description,Example
0,identifier,encounter_id,,integer,Unique identifier associated with a patient un...,
1,identifier,hospital_id,,integer,Unique identifier associated with a hospital,
2,identifier,patient_id,,integer,Unique identifier associated with a patient,
3,demographic,hospital_death,,binary,Whether the patient died during this hospitali...,0
4,demographic,age,Years,numeric,The age of the patient on unit admission,
...,...,...,...,...,...,...
183,APACHE comorbidity,lymphoma,,binary,Whether the patient has been diagnosed with no...,1
184,APACHE comorbidity,solid_tumor_with_metastasis,,binary,Whether the patient has been diagnosed with an...,1
185,APACHE grouping,apache_3j_bodysystem,,string,Admission diagnosis group for APACHE III,Cardiovascular
186,APACHE grouping,apache_2_bodysystem,,string,Admission diagnosis group for APACHE II,Respiratory


# Data preprocessing

We aim to perform:
- Missing value handling by k-nearest neighbours
- Categorical data conversion to dummies

Later we will explore:
- Class imbalance handling techniques:
    - relabeling
    - over/undersampling
    - reweighing
    

In [182]:
# data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer

# -- preprocessing parameters -- #
# Fractions of the full dataset assigned to each split.
data_split = dict(
    train=0.7,     # used to fit the model
    test=0.1,      # evaluation after training (chapter 4)
    validate=0.2,  # evaluation during training / model selection (chapter 3)
)
seed = 42            # random state passed to the splits
max_missing = 0.75   # maximum fraction of missing values allowed in a column

# -- columns that must not be used as model inputs -- #
# the prediction target
target_feature = 'hospital_death'
# identifiers
identification_features = [
    'encounter_id',
    'patient_id',
    'hospital_id',
    'icu_id',
]
# outputs of other predictor models (kept out of X; used only as a baseline for comparison)
baseline_features = [
    'apache_4a_hospital_death_prob',
    'apache_4a_icu_death_prob',
    'apache_2_bodysystem',
]

In [183]:
# -- split the data into train, validation, and test datasets -- #

# Model inputs: every column except the target, the identifiers and the baseline predictors.
excluded_columns = [target_feature] + identification_features + baseline_features
X = df.drop(columns=excluded_columns)  # drop() already returns a new frame
y = df[target_feature].copy()          # actual outcomes
y_apache = df['apache_4a_hospital_death_prob'].copy()  # APACHE model predictions, kept for comparison

# First cut: training rows vs. everything that is held out (validation + test).
holdout_fraction = data_split['test'] + data_split['validate']
X_train, X_temp, y_train, y_temp, y_apache_train, y_apache_temp = train_test_split(
    X,
    y,
    y_apache,
    test_size=holdout_fraction,
    random_state=seed,
)

# Second cut: divide the held-out rows into validation and test.
test_fraction_of_holdout = data_split['test'] / holdout_fraction
X_val, X_test, y_val, y_test, y_apache_val, y_apache_test = train_test_split(
    X_temp,
    y_temp,
    y_apache_temp,
    test_size=test_fraction_of_holdout,
    random_state=seed,
)

# The intermediate hold-out objects must not be used from here on.
del X_temp, y_temp, y_apache_temp

## Available Variables

### Model Datasets:

- `X_train`: Training dataset features. Use this dataset to train your model.
- `y_train`: Training dataset target variable. Use this dataset to train your model.
- `X_val`: Validation dataset features. Use this dataset to validate your model during training.
- `y_val`: Validation dataset target variable. Use this dataset to validate your model during training.
- `X_test`: Test dataset features. Use this dataset to evaluate your model after training.
- `y_test`: Test dataset target variable. Use this dataset to evaluate your model after training.

### Apache Datasets:
- `y_apache_train`: Series containing the APACHE model predictions for the training dataset.
- `y_apache_val`: Series containing the APACHE model predictions for the validation dataset.
- `y_apache_test`: Series containing the APACHE model predictions for the test dataset.

### Variables:

- `target_feature`: The target variable for prediction, in this case, 'hospital_death'.
- `identification_features`: List of features used for identification purposes, not for model training.
- `baseline_features`: List of baseline features that should not be used in the model.
- `data_split`: Dictionary containing the proportions for splitting the data into training, validation, and test sets.
- `seed`: Random seed for reproducibility.
- `max_missing`: Maximum allowed proportion of missing values in a column.
- `description`: DataFrame containing the description of each variable in the dataset.
- `description_dict`: Dictionary containing the description of each variable in the dataset.

> Note: the unsplit datasets (`df`, `X`, `y`, `y_apache`) are still available, but there is no excuse for using them. Prevent data leakage!

The preprocessing is done in a pipeline that allows for easily repeatable steps.

In [184]:
def dropMissingValues(dataset: pd.DataFrame, threshhold: float) -> pd.DataFrame:
    """
    Return a copy of ``dataset`` without the columns that are too sparse.

    A column is removed when its number of missing values is strictly greater
    than ``threshhold * len(dataset)``. The removed columns are listed on
    stdout together with their percentage of missing values.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame to inspect; it is not modified.
    threshhold : float
        Maximum allowed fraction of missing values per column.

    Returns
    -------
    pd.DataFrame
        New frame containing only the columns that were kept.
    """
    n_rows = len(dataset)
    na_counts = dataset.isna().sum()        # missing values per column
    allowed_na = threshhold * n_rows        # absolute limit derived from the fraction
    too_sparse = na_counts[na_counts > allowed_na].index

    print(f"there are {len(too_sparse)} columns in the dataset that will be dropped:")
    for position, column in enumerate(too_sparse, start=1):
        print(f"\t{position} {column} ({100*na_counts[column]/n_rows:.2f}% missing)")

    return dataset.drop(columns=too_sparse)

In [185]:
# Missing value handling:
# - drop columns with more than max_missing missing values
# - impute missing values in the remaining columns

# Decide which columns are too sparse on the TRAINING split only, then apply that exact
# column set to the validation and test splits. Computing the drop list separately per
# split can leave the three splits with different columns (the preprocessor fitted on
# X_train would then fail on a missing column) and lets held-out data influence a
# preprocessing decision.
X_train = dropMissingValues(X_train, max_missing)
X_val = X_val[X_train.columns]
X_test = X_test[X_train.columns]

# impute missing values in the remaining columns, using the k-NN algorithm
# (the imputer is only defined here; it is fitted later as part of the preprocessing pipeline)
imputer = KNNImputer(n_neighbors=5)
numerical_transformer = Pipeline(steps=[('imputer', imputer)])

there are 45 columns in the dataset that will be dropped:
	1 fio2_apache (77.23% missing)
	2 paco2_apache (77.23% missing)
	3 paco2_for_ph_apache (77.23% missing)
	4 pao2_apache (77.23% missing)
	5 ph_apache (77.23% missing)
	6 h1_diasbp_invasive_max (81.68% missing)
	7 h1_diasbp_invasive_min (81.68% missing)
	8 h1_mbp_invasive_max (81.58% missing)
	9 h1_mbp_invasive_min (81.58% missing)
	10 h1_sysbp_invasive_max (81.67% missing)
	11 h1_sysbp_invasive_min (81.67% missing)
	12 h1_albumin_max (91.37% missing)
	13 h1_albumin_min (91.37% missing)
	14 h1_bilirubin_max (92.23% missing)
	15 h1_bilirubin_min (92.23% missing)
	16 h1_bun_max (81.92% missing)
	17 h1_bun_min (81.92% missing)
	18 h1_calcium_max (82.75% missing)
	19 h1_calcium_min (82.75% missing)
	20 h1_creatinine_max (81.78% missing)
	21 h1_creatinine_min (81.78% missing)
	22 h1_hco3_max (83.02% missing)
	23 h1_hco3_min (83.02% missing)
	24 h1_hemaglobin_max (79.85% missing)
	25 h1_hemaglobin_min (79.85% missing)
	26 h1_hematocrit

In [186]:
# -- Categorical re-encoding -- #
# Categorical variables are one-hot encoded into dummy columns; the first level of each
# variable is dropped, the result is a dense array, and categories not seen during
# fitting do not raise an error.
dummy_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown="ignore")

categorical_transformer = Pipeline(steps=[('encode_to_dummies', dummy_encoder)])

In [None]:
# -- Pipeline -- #

# Route columns by dtype: object columns go to the categorical branch, everything else
# to the numerical branch.
numeric_columns = make_column_selector(dtype_exclude=object)
categorical_columns = make_column_selector(dtype_include=object)

basic_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ]
)

# Fit on the training split only; the fitted transformer is then reused unchanged for
# the validation and test splits.
basic_preprocessor.fit(X_train)

# Apply the fitted transformer to every split (train first, then validation, then test).
X_train, X_val, X_test = [
    basic_preprocessor.transform(split) for split in (X_train, X_val, X_test)
]

print(f"X_train shape: {X_train.shape}")

X_train shape: (64199, 174)


In [196]:
# show the preprocessor object using the notebook's display()
display(basic_preprocessor)

In [197]:
# Convert the transformed arrays back to pandas DataFrames.

# Column names produced by the fitted preprocessor (computed once, reused for all splits).
feature_names = basic_preprocessor.get_feature_names_out()

# Reattach the row index of the matching target series. Without an explicit index the
# frames would get a fresh 0..n-1 RangeIndex, while y_train / y_val / y_test still carry
# the shuffled original index, so label-based alignment between X and y (boolean masks,
# concat, joins) would be wrong. Row order is unchanged, so the saved CSVs are unaffected.
X_train = pd.DataFrame(X_train, columns=feature_names, index=y_train.index)
X_val = pd.DataFrame(X_val, columns=feature_names, index=y_val.index)
X_test = pd.DataFrame(X_test, columns=feature_names, index=y_test.index)

print(f"columns: {basic_preprocessor.get_feature_names_out()}")

columns: ['num__age' 'num__bmi' 'num__elective_surgery' 'num__height'
 'num__pre_icu_los_days' 'num__readmission_status' 'num__weight'
 'num__albumin_apache' 'num__apache_2_diagnosis'
 'num__apache_3j_diagnosis' 'num__apache_post_operative' 'num__arf_apache'
 'num__bilirubin_apache' 'num__bun_apache' 'num__creatinine_apache'
 'num__gcs_eyes_apache' 'num__gcs_motor_apache' 'num__gcs_unable_apache'
 'num__gcs_verbal_apache' 'num__glucose_apache' 'num__heart_rate_apache'
 'num__hematocrit_apache' 'num__intubated_apache' 'num__map_apache'
 'num__resprate_apache' 'num__sodium_apache' 'num__temp_apache'
 'num__urineoutput_apache' 'num__ventilated_apache' 'num__wbc_apache'
 'num__d1_diasbp_invasive_max' 'num__d1_diasbp_invasive_min'
 'num__d1_diasbp_max' 'num__d1_diasbp_min'
 'num__d1_diasbp_noninvasive_max' 'num__d1_diasbp_noninvasive_min'
 'num__d1_heartrate_max' 'num__d1_heartrate_min'
 'num__d1_mbp_invasive_max' 'num__d1_mbp_invasive_min' 'num__d1_mbp_max'
 'num__d1_mbp_min' 'num__d1_mbp_

In [194]:
import os

# Folder that receives the exported splits; creating it is a no-op if it already exists.
output_dir = 'split data'
os.makedirs(name=output_dir, exist_ok=True)


In [195]:
# Save every split as "<name>.csv" inside output_dir, without the index column.
splits_to_save = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
    'y_apache_train': y_apache_train,
    'y_apache_val': y_apache_val,
    'y_apache_test': y_apache_test,
}

for file_stem, split in splits_to_save.items():
    split.to_csv(os.path.join(output_dir, f'{file_stem}.csv'), index=False)