In [20]:
import time, matplotlib, sklearn

# visualizatoin 
import matplotlib.pyplot as plt  # for plotting general data
import seaborn as sns  # for statistical data visualization

# data wrangling
import pandas as pd
import numpy as np

# the data intro has been tested with the following versions
print("pandas        Tested version: 2.0.3   Your version: %s" % pd.__version__)
print("numpy         Tested version: 1.21.5  Your version: %s" % np.__version__)
print("matplotlib    Tested version: 3.5.3   Your version: %s" % matplotlib.__version__)
print("scikit-learn  Tested version: 1.2.2   Your version: %s" % sklearn.__version__)
print("seaborn                               Your version: %s" % sns.__version__)

pandas        Tested version: 2.0.3   Your version: 2.0.3
numpy         Tested version: 1.21.5  Your version: 1.26.4
matplotlib    Tested version: 3.5.3   Your version: 3.5.3
scikit-learn  Tested version: 1.2.2   Your version: 1.2.2
seaborn                               Your version: 0.13.2


# Loading data


In [2]:
# description
description = pd.read_csv('./data/WiDS_Datathon_2020_Dictionary.csv')
description_dict = description.set_index('Variable Name').to_dict(orient='index')
# data
df = pd.read_csv('./data/training_v2.csv')

df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [12]:
description

Unnamed: 0,Category,Variable Name,Unit of Measure,Data Type,Description,Example
0,identifier,encounter_id,,integer,Unique identifier associated with a patient un...,
1,identifier,hospital_id,,integer,Unique identifier associated with a hospital,
2,identifier,patient_id,,integer,Unique identifier associated with a patient,
3,demographic,hospital_death,,binary,Whether the patient died during this hospitali...,0
4,demographic,age,Years,numeric,The age of the patient on unit admission,
...,...,...,...,...,...,...
183,APACHE comorbidity,lymphoma,,binary,Whether the patient has been diagnosed with no...,1
184,APACHE comorbidity,solid_tumor_with_metastasis,,binary,Whether the patient has been diagnosed with an...,1
185,APACHE grouping,apache_3j_bodysystem,,string,Admission diagnosis group for APACHE III,Cardiovascular
186,APACHE grouping,apache_2_bodysystem,,string,Admission diagnosis group for APACHE II,Respiratory


# Data preprocessing

We aim to perform:
- Missing value handling by k-nearest neighbours
- Categorical data conversion to dummies

Later we will explorer:
- Class imbalance handling techniques:
    - relabeling
    - over/undersampling
    - reweighing
    

In [22]:
# data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer

# -- preprocessing parameters -- #
data_split = {
    'train' :    .7,    # 70% of the data for training
    'test' :     .1,    # 10% of the data for testing (evaluation after training, chapter 4!)
    'validate' : .2     # 20% of the data for validation (evaluation during training / model selection, chapter 3!)
}
seed = 42
max_missing = .25 # maximum missing values in a column

# -- features that shouldn't be used in the model, as they're either identifiers or other predictor models -- #
target_feature = 'hospital_death'
identification_features = ['encounter_id', 'patient_id', 'hospital_id', 'icu_id']
baseline_features = ['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob', 'apache_2_bodysystem']

In [11]:
# -- split the data into train, validation, and test datasets -- #

# copy allowed data into the dataset X and target variable y
X = df.copy().drop(columns=[target_feature] + identification_features + baseline_features)
y = df[target_feature].copy() # contains actual y values
y_apache = df['apache_4a_hospital_death_prob'].copy() # contains prediction of y values from the apache model, for comparison

# split the data into train, validation, and test datasets
X_train, X_temp, y_train, y_temp, y_apache_train, y_apache_temp = train_test_split(
    X, y, y_apache, test_size=(data_split['test'] + data_split['validate']), random_state=seed
)
X_val, X_test, y_val, y_test, y_apache_val, y_apache_test = train_test_split(
    X_temp, y_temp, y_apache_temp, test_size=(data_split['test'] / (data_split['test'] + data_split['validate'])), random_state=seed
)
del X_temp, y_temp, y_apache_temp # remove temporary datasets (shouldn't be used anymore)

## Available Variables

### Model Datasets:

- `X_train`: Training dataset features. Use this dataset to train your model.
- `y_train`: Training dataset target variable. Use this dataset to train your model.
- `X_val`: Validation dataset features. Use this dataset to validate your model during training.
- `y_val`: Validation dataset target variable. Use this dataset to validate your model during training.
- `X_test`: Test dataset features. Use this dataset to evaluate your model after training.
- `y_test`: Test dataset target variable. Use this dataset to evaluate your model after training.

### Apache Datasets:
- `y_apache_train`: Series containing the APACHE model predictions for the training dataset.
- `y_apache_val`: Series containing the APACHE model predictions for the validation dataset.
- `y_apache_test`: Series containing the APACHE model predictions for the test dataset.

### Variables:

- `target_feature`: The target variable for prediction, in this case, 'hospital_death'.
- `identification_features`: List of features used for identification purposes, not for model training.
- `baseline_features`: List of baseline features that should not be used in the model.
- `data_split`: Dictionary containing the proportions for splitting the data into training, validation, and test sets.
- `seed`: Random seed for reproducibility.
- `max_missing`: Maximum allowed proportion of missing values in a column.
- `description`: DataFrame containing the description of each variable in the dataset.
- `description_dict`: Dictionary containing the description of each variable in the dataset.

> note: the unsplit datasets are still available, but there is no excuse for using them! PREVENT DATA SPILLING!

The preproccessing is done in a pipeline that allows for easily repeatable steps.

In [None]:
# Missing value handling:
# - drop columns with more than max_missing missing values
# - impute missing values in the remaining columns

size_train, size_val, size_test = len(X_train), len(X_val), len(X_test)

# drop columns with more than max_missing missing values
print(f'Dropping columns with more than {max_missing:.0%} missing values.')
X_train = X_train.drop(columns=X_train.isnull().mean()[X_train.isnull().mean() > max_missing].index)
X_val = X_val.drop(columns=X_val.isnull().mean()[X_val.isnull().mean() > max_missing].index)
X_test = X_test.drop(columns=X_test.isnull().mean()[X_test.isnull().mean() > max_missing].index)

dropped_train = size_train - len(X_train)
dropped_val = size_val - X_val.shape[0]
dropped_test = size_test - X_test.shape[0]

dropped_percentage_train = dropped_train / size_train
dropped_percentage_val = dropped_val / size_val
dropped_percentage_test = dropped_test / size_test

print(f'Dropped {dropped_train} columns ({dropped_percentage_train:.2%}) in the training data.')
print(f'Dropped {dropped_val} columns ({dropped_percentage_val:.2%}) in the validation data.')
print(f'Dropped {dropped_test} columns ({dropped_percentage_test:.2%}) in the test data.')

# impute missing values in the remaining columns, using the k-NN algorithm
imputer = KNNImputer(n_neighbors=5)
numerical_transformer = Pipeline(steps=[('imputer', imputer)])

Dropping columns with more than 25% missing values.
Dropped 0 columns (0.00%) in the training data.
Dropped 0 columns (0.00%) in the validation data.
Dropped 0 columns (0.00%) in the test data.
