In [1]:
import random
import numpy as np
import pandas as pd
from joblib import dump, load
# Fix the global RNG state of both the stdlib and NumPy generators so that
# any code relying on them is repeatable across "Restart & Run All".
random.seed(42)
np.random.seed(42)

In [2]:
# Read the feature matrix and label vector out of the saved .npz archive;
# the context manager closes the archive once both arrays are extracted.
with np.load('output/Xy.npz') as archive:
    X, y = archive['X'], archive['y']

In [3]:
# Temporal train/test split: the population table is indexed by BMT_ID and
# the last 85 rows form the held-out test set.
pop = pd.read_csv('data/population.csv').set_index('BMT_ID')

split_idx = -85

# Sanity-check the split position against the ID prefixes: everything before
# the cut must be a training ID, everything from the cut onwards a test ID.
# NOTE(review): assumes rows of `pop` are in the same order as rows of X/y — confirm.
train_ids = pop.iloc[:split_idx].index
test_ids = pop.iloc[split_idx:].index
assert train_ids.str.startswith('train_').all()
assert test_ids.str.startswith('test_').all()

In [4]:
from sklearn import preprocessing, model_selection, metrics, utils
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.base import clone

In [5]:
# Model family and search space shared by every experiment below.
# The estimator is an L2-penalised, class-balanced logistic regression
# fitted with the liblinear solver.
base_estimator = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    class_weight='balanced',
)

# Grid over the inverse regularisation strength C: 13 values, 1e-6 ... 1e6.
param_grid = dict(
    C=[10. ** exponent for exponent in range(-6, 7)],
    penalty=['l2'],
)

## Train model with baseline + vitals features

In [6]:
# Combined model: use every feature column (baseline + vitals).
# Rows before `split_idx` are the training set, the remainder the test set.
Xtr, Xte = X[:split_idx], X[split_idx:]
ytr, yte = y[:split_idx], y[split_idx:]

# Repeated stratified k-fold CV (5 folds x 20 repeats) for model selection.
# n_splits / n_repeats are passed by keyword: recent scikit-learn releases
# made them keyword-only, so the positional form raises a TypeError there.
cv_splits, cv_repeat = 5, 20
cv = model_selection.RepeatedStratifiedKFold(
    n_splits=cv_splits, n_repeats=cv_repeat, random_state=0,
)

# Grid search over `param_grid`, selecting by cross-validated ROC AUC.
# With the default refit=True the best configuration is refitted on all of
# (Xtr, ytr), which is what makes clf.decision_function usable afterwards.
clf = model_selection.GridSearchCV(
    clone(base_estimator), param_grid,
    cv=cv, scoring='roc_auc', n_jobs=5,
)
clf.fit(Xtr, ytr)

# Point estimate of the held-out AUC (not printed in this cell; the next cell
# reports the bootstrap median and interval instead).
test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))

In [7]:
# Bootstrap the held-out predictions to report the test AUC with a 95% interval.
y_true, y_score = yte, clf.decision_function(Xte)

def boostrap_func(i, y_true, y_score):
    """Score one bootstrap replicate of the test set.

    Labels and scores are resampled together, with replacement, using ``i``
    as the random seed. Returns ``(roc_curve, roc_auc)`` for the replicate.

    NOTE(review): a replicate that happens to contain a single class would
    make the AUC undefined and raise — confirm this cannot occur here.
    """
    true_b, score_b = utils.resample(y_true, y_score, replace=True, random_state=i)
    curve_b = metrics.roc_curve(true_b, score_b)
    auc_b = metrics.roc_auc_score(true_b, score_b)
    return curve_b, auc_b

# 1000 replicates, seeded 0..999, evaluated on 4 workers.
replicates = Parallel(n_jobs=4)(
    delayed(boostrap_func)(seed, y_true, y_score) for seed in range(1000)
)
roc_curves, auc_scores = zip(*replicates)

# Report the median AUC with the 2.5th / 97.5th percentiles as the interval.
auc_median = np.median(auc_scores)
auc_lower = np.percentile(auc_scores, 2.5)
auc_upper = np.percentile(auc_scores, 97.5)
print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(auc_median, auc_lower, auc_upper))

Test AUC: 0.658 (0.536, 0.784)


In [8]:
# Persist the fitted grid-search object (baseline + vitals model) to disk.
# joblib files are pickle-based: only load them back from trusted sources.
dump(clf, 'output/model_combined.joblib')

['output/model_combined.joblib']

## Train model with baseline features only

In [9]:
# Baseline-only model: keep just the first 52 feature columns.
# NOTE(review): 52 is taken to be the number of baseline features (the
# remaining columns being vitals), per the section headings — confirm.
Xtr, Xte = X[:split_idx, :52], X[split_idx:, :52]
ytr, yte = y[:split_idx], y[split_idx:]

# Repeated stratified k-fold CV (5 folds x 20 repeats) for model selection.
# n_splits / n_repeats are passed by keyword: recent scikit-learn releases
# made them keyword-only, so the positional form raises a TypeError there.
cv_splits, cv_repeat = 5, 20
cv = model_selection.RepeatedStratifiedKFold(
    n_splits=cv_splits, n_repeats=cv_repeat, random_state=0,
)

# Grid search over `param_grid`, selecting by cross-validated ROC AUC.
# With the default refit=True the best configuration is refitted on all of
# (Xtr, ytr), which is what makes clf.decision_function usable afterwards.
clf = model_selection.GridSearchCV(
    clone(base_estimator), param_grid,
    cv=cv, scoring='roc_auc', n_jobs=5,
)
clf.fit(Xtr, ytr)

# Point estimate of the held-out AUC (not printed in this cell; the next cell
# reports the bootstrap median and interval instead).
test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))



In [10]:
# Bootstrap the held-out predictions to report the test AUC with a 95% interval.
y_true, y_score = yte, clf.decision_function(Xte)

def boostrap_func(i, y_true, y_score):
    """Score one bootstrap replicate of the test set.

    Labels and scores are resampled together, with replacement, using ``i``
    as the random seed. Returns ``(roc_curve, roc_auc)`` for the replicate.

    NOTE(review): a replicate that happens to contain a single class would
    make the AUC undefined and raise — confirm this cannot occur here.
    """
    true_b, score_b = utils.resample(y_true, y_score, replace=True, random_state=i)
    curve_b = metrics.roc_curve(true_b, score_b)
    auc_b = metrics.roc_auc_score(true_b, score_b)
    return curve_b, auc_b

# 1000 replicates, seeded 0..999, evaluated on 4 workers.
replicates = Parallel(n_jobs=4)(
    delayed(boostrap_func)(seed, y_true, y_score) for seed in range(1000)
)
roc_curves, auc_scores = zip(*replicates)

# Report the median AUC with the 2.5th / 97.5th percentiles as the interval.
auc_median = np.median(auc_scores)
auc_lower = np.percentile(auc_scores, 2.5)
auc_upper = np.percentile(auc_scores, 97.5)
print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(auc_median, auc_lower, auc_upper))

Test AUC: 0.512 (0.364, 0.643)


In [11]:
# Persist the fitted grid-search object (baseline-features model) to disk.
# joblib files are pickle-based: only load them back from trusted sources.
dump(clf, 'output/model_baseline.joblib')

['output/model_baseline.joblib']

## Train model with vitals features only

In [12]:
# Vitals-only model: drop the first 52 feature columns and keep the rest.
# NOTE(review): 52 is taken to be the number of baseline features (the
# remaining columns being vitals), per the section headings — confirm.
Xtr, Xte = X[:split_idx, 52:], X[split_idx:, 52:]
ytr, yte = y[:split_idx], y[split_idx:]

# Repeated stratified k-fold CV (5 folds x 20 repeats) for model selection.
# n_splits / n_repeats are passed by keyword: recent scikit-learn releases
# made them keyword-only, so the positional form raises a TypeError there.
cv_splits, cv_repeat = 5, 20
cv = model_selection.RepeatedStratifiedKFold(
    n_splits=cv_splits, n_repeats=cv_repeat, random_state=0,
)

# Grid search over `param_grid`, selecting by cross-validated ROC AUC.
# With the default refit=True the best configuration is refitted on all of
# (Xtr, ytr), which is what makes clf.decision_function usable afterwards.
clf = model_selection.GridSearchCV(
    clone(base_estimator), param_grid,
    cv=cv, scoring='roc_auc', n_jobs=5,
)
clf.fit(Xtr, ytr)

# Point estimate of the held-out AUC (not printed in this cell; the next cell
# reports the bootstrap median and interval instead).
test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))

In [13]:
# Bootstrap the held-out predictions to report the test AUC with a 95% interval.
y_true, y_score = yte, clf.decision_function(Xte)

def boostrap_func(i, y_true, y_score):
    """Score one bootstrap replicate of the test set.

    Labels and scores are resampled together, with replacement, using ``i``
    as the random seed. Returns ``(roc_curve, roc_auc)`` for the replicate.

    NOTE(review): a replicate that happens to contain a single class would
    make the AUC undefined and raise — confirm this cannot occur here.
    """
    true_b, score_b = utils.resample(y_true, y_score, replace=True, random_state=i)
    curve_b = metrics.roc_curve(true_b, score_b)
    auc_b = metrics.roc_auc_score(true_b, score_b)
    return curve_b, auc_b

# 1000 replicates, seeded 0..999, evaluated on 4 workers.
replicates = Parallel(n_jobs=4)(
    delayed(boostrap_func)(seed, y_true, y_score) for seed in range(1000)
)
roc_curves, auc_scores = zip(*replicates)

# Report the median AUC with the 2.5th / 97.5th percentiles as the interval.
auc_median = np.median(auc_scores)
auc_lower = np.percentile(auc_scores, 2.5)
auc_upper = np.percentile(auc_scores, 97.5)
print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(auc_median, auc_lower, auc_upper))

Test AUC: 0.633 (0.507, 0.757)


In [14]:
# Persist the fitted grid-search object (vitals-features model) to disk.
# joblib files are pickle-based: only load them back from trusted sources.
dump(clf, 'output/model_vitals.joblib')

['output/model_vitals.joblib']