In [1]:
import pandas as pd

# Get imputed data
mimic_complete = pd.read_csv("./impute_mimic.csv")
# Recode the boolean outcome as 0/1. The result is assigned back to the frame
# instead of calling replace(..., inplace=True) on the selected column: that
# chained in-place form acts on an intermediate object, triggers a pandas
# warning, and no longer updates the frame once Copy-on-Write is enabled.
mimic_complete['mort_28'] = mimic_complete['mort_28'].replace([False, True], [0, 1])

In [2]:
# Column names used by the cells below: outcome (y), treatment (T), covariates (X)
y = "mort_28"
T = "peep_regime"
X = [
    "age",
    "sex",
    "weight",
    "height",
    "pf_ratio",
    "po2",
    "pco2",
    "ph",
    "driving_pressure",
    "lung_compliance",
    "map",
    "bilirubin",
    "creatinine",
    "platelets",
    "urea",
    "fio2",
    "hco3",
    "heart_rate",
    "minute_volume",
    "peep",
    "plateau_pressure",
    "respiratory_rate",
    "syst_blood_pressure",
    "diastolic_blood_pressure",
]

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV


def _run_randomized_search(estimator, param_grid, features, target):
    """Fit a 3-fold, accuracy-scored RandomizedSearchCV and return the fitted search."""
    search = RandomizedSearchCV(
        estimator,
        param_grid,
        n_iter=25,
        random_state=123,
        n_jobs=-1,
        cv=3,
        scoring='accuracy',
    )
    search.fit(features, target)
    return search


def hyperparameter_tuning_perform(mimic_complete, X, T, random_state=None):
    """Tune three classifiers that predict the treatment column from the features.

    A 70/30 split is drawn from ``mimic_complete``; only the 70% training part
    is used. The features are standardised and a randomized hyperparameter
    search is run for a logistic regression, a random forest and a decision
    tree.

    Parameters
    ----------
    mimic_complete : pandas.DataFrame
        Frame containing the columns named in ``X`` and ``T``.
    X : list of str
        Names of the feature columns.
    T : str
        Name of the column to predict.
    random_state : int or None, default None
        Seed for the train/test split. ``None`` (the default) draws a
        different split on every call.

    Returns
    -------
    tuple
        ``(best_logistic_params, best_rf_params, best_dt_params,
        best_logistic_accuracy, best_rf_accuracy, best_dt_accuracy)`` where the
        params are the ``best_params_`` dicts and the accuracies are the
        ``best_score_`` values of the corresponding searches.

    Notes
    -----
    The scaler is fitted on the whole training split before cross-validation,
    so each validation fold has contributed to the scaling statistics.
    """
    # Only the training part is needed for tuning; the held-out part is dropped.
    train, _ = train_test_split(mimic_complete, test_size=0.3, random_state=random_state)

    # Scale into a fresh array instead of writing back into `train`: the split
    # result is a slice of the caller's frame, and assigning into it raises
    # SettingWithCopyWarning.
    features = StandardScaler().fit_transform(train[X])
    target = train[T]

    # Define hyperparameter grids
    logistic_param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'saga'],
        'max_iter': [100, 200, 300, 400, 500],
    }
    rf_param_grid = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_leaf': [1, 2, 4, 6, 8],
    }
    dt_param_grid = {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_leaf': [1, 2, 4, 6, 8],
    }

    logistic_search = _run_randomized_search(
        LogisticRegression(random_state=123), logistic_param_grid, features, target)
    rf_search = _run_randomized_search(
        RandomForestClassifier(random_state=123), rf_param_grid, features, target)
    dt_search = _run_randomized_search(
        DecisionTreeClassifier(random_state=123), dt_param_grid, features, target)

    return (
        logistic_search.best_params_,
        rf_search.best_params_,
        dt_search.best_params_,
        logistic_search.best_score_,
        rf_search.best_score_,
        dt_search.best_score_,
    )

In [4]:
def hyperparameter_tuning_iterate(mimic_complete, X, T, num_experiments):
    """Run the tuning experiment ``num_experiments`` times and gather the results.

    Each run calls ``hyperparameter_tuning_perform(mimic_complete, X, T)``.

    Returns a tuple of six lists, one entry per run, in the order: logistic
    params, random forest params, decision tree params, logistic accuracy,
    random forest accuracy, decision tree accuracy.
    """
    # One accumulator per value returned by a single run, in the same order.
    collected = tuple([] for _ in range(6))

    for _ in range(num_experiments):
        run_result = hyperparameter_tuning_perform(mimic_complete, X, T)
        for bucket, value in zip(collected, run_result):
            bucket.append(value)

    return collected

# Repeat the tuning experiment 10 times and keep the per-run results
(
    logistic_params_list,
    rf_params_list,
    dt_params_list,
    logistic_accuracy_list,
    rf_accuracy_list,
    dt_accuracy_list,
) = hyperparameter_tuning_iterate(mimic_complete, X, T, num_experiments=10)

In [5]:
# One line per tuning run: best parameters followed by the matching accuracy
print('Logistic regression:')
for run_idx, run_params in enumerate(logistic_params_list):
    print(run_params, logistic_accuracy_list[run_idx])

In [6]:
# One line per tuning run: best parameters followed by the matching accuracy
print('Random forest:')
for run_idx, run_params in enumerate(rf_params_list):
    print(run_params, rf_accuracy_list[run_idx])

In [7]:
# One line per tuning run: best parameters followed by the matching accuracy
print('Decision tree:')
for run_idx, run_params in enumerate(dt_params_list):
    print(run_params, dt_accuracy_list[run_idx])

In [8]:
# Train and test
# The split is seeded so that the test-set figures reported in the cells below
# are reproduced on every Restart & Run All.
train, test = train_test_split(mimic_complete, test_size=0.3, random_state=123)
# Work on independent copies: the split results are slices of mimic_complete,
# and assigning columns into them raises SettingWithCopyWarning.
train, test = train.copy(), test.copy()
# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
scaler = StandardScaler()
train[X] = scaler.fit_transform(train[X])
test[X] = scaler.transform(test[X])

### Evaluate on test set

In [27]:
# Accuracy of the logistic propensity model on the held-out test set
# (fitted on the training split, scored on the test split)
propensity_logistic = LogisticRegression(
    C=0.1,
    solver="lbfgs",
    max_iter=100,
    random_state=123,
).fit(train[X], train[T])
logistic_accuracy = propensity_logistic.score(test[X], test[T])
print('Accuracy logistic model: ', logistic_accuracy)

In [28]:
from matplotlib import pyplot as plt
# Propensity score = predicted probability of the second class in
# propensity_logistic.classes_ (presumably the treated group; confirm).
# assign() builds a new frame, so no column is written into a slice of
# mimic_complete (which would raise SettingWithCopyWarning).
test = test.assign(
    propensity_score=propensity_logistic.predict_proba(test[X])[:, 1],
    predicted_group=lambda frame: (frame['propensity_score'] >= 0.5).astype(int),
)
group_distribution = test['predicted_group'].value_counts()
print(group_distribution)
# Share of test rows predicted as group 1, computed from the data rather than
# from counts copied out of an earlier run's output.
print('Percentage treated: ', test['predicted_group'].mean() * 100)

In [29]:
# Accuracy on the held-out test set (not cross-validation)
# Fit the random forest propensity model on the training split and score it on
# the test split. The hyperparameters are fixed by hand here (presumably taken
# from the tuning runs; confirm).
propensity_rf = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_leaf=4, random_state=123)
propensity_rf.fit(train[X], train[T])
rf_accuracy = propensity_rf.score(test[X], test[T])
# Label corrected: this is the random forest, not the logistic model.
print('Accuracy random forest model: ', rf_accuracy)

In [30]:
from matplotlib import pyplot as plt
# Propensity score = predicted probability of the second class in
# propensity_rf.classes_ (presumably the treated group; confirm).
# assign() builds a new frame, so no column is written into a slice of
# mimic_complete (which would raise SettingWithCopyWarning).
test = test.assign(
    propensity_score=propensity_rf.predict_proba(test[X])[:, 1],
    predicted_group=lambda frame: (frame['propensity_score'] >= 0.5).astype(int),
)
group_distribution = test['predicted_group'].value_counts()
print(group_distribution)
# Share of test rows predicted as group 1, computed from the data rather than
# from counts copied out of an earlier run's output.
print('Percentage treated: ', test['predicted_group'].mean() * 100)

In [31]:
# Accuracy of the decision tree propensity model on the held-out test set
# (fitted on the training split, scored on the test split). The
# hyperparameters are fixed by hand here (presumably taken from the tuning
# runs; confirm).
propensity_dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=1, random_state=123)
propensity_dt.fit(train[X], train[T])
dt_accuracy = propensity_dt.score(test[X], test[T])
# Label corrected: this is the decision tree, not the logistic model.
print('Accuracy decision tree model: ', dt_accuracy)

In [32]:
from matplotlib import pyplot as plt
# Propensity score = predicted probability of the second class in
# propensity_dt.classes_ (presumably the treated group; confirm).
# assign() builds a new frame, so no column is written into a slice of
# mimic_complete (which would raise SettingWithCopyWarning).
test = test.assign(
    propensity_score=propensity_dt.predict_proba(test[X])[:, 1],
    predicted_group=lambda frame: (frame['propensity_score'] >= 0.5).astype(int),
)
group_distribution = test['predicted_group'].value_counts()
print(group_distribution)
# Share of test rows predicted as group 1, computed from the data rather than
# from counts copied out of an earlier run's output.
print('Percentage treated: ', test['predicted_group'].mean() * 100)