# Build LR and RF models

In [1]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

In [2]:
# Label table: one row per record, with its label and train/val/test partition.
# The HADM_ID column is exposed under the shorter name ID.
df = pd.read_csv('input/label.csv')
df = df.rename(columns={'HADM_ID': 'ID'})

In [3]:
# Display the label table (columns: ID, mortality_day, label, partition).
df

Unnamed: 0,ID,mortality_day,label,partition
0,100001,0,0,train
1,100003,250,0,test
2,100009,0,0,train
3,100010,0,0,test
4,100011,0,0,train
...,...,...,...,...
19718,199972,920,0,train
19719,199979,0,0,val
19720,199984,0,0,val
19721,199992,0,0,train


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn import metrics, feature_selection, utils
import scipy.stats
from joblib import Parallel, delayed
from tqdm import tqdm#_notebook as tqdm
import random

In [12]:
def train_model(Xtr, ytr, Xte, yte, model_name, exp_name):
    """Tune, fit and evaluate one model, then write its artefacts under ``output/``.

    Fits the search object returned by ``helper(model_name)`` on the training
    split, prints the best hyper-parameters and cross-validation score, saves
    the winning estimator's coefficients when it has any, and prints the test
    ROC AUC as the median and 2.5/97.5 percentiles over 1000 bootstrap
    resamples of the test predictions. The raw test predictions are saved via
    ``save_test_predictions``.

    Parameters
    ----------
    Xtr, ytr
        Training features and labels, passed to ``clf.fit``.
    Xte, yte
        Held-out features and labels, used only for evaluation.
    model_name : str
        Model key understood by ``helper``; also embedded in output file names.
    exp_name : str
        Experiment tag used as the prefix of the output file names.

    Returns
    -------
    None
        Results are printed and written to disk.

    Raises
    ------
    AttributeError
        Re-raised when the fitted search cannot provide ``predict_proba``.
    """
    # Seed the global RNGs of this process before building the search.
    # NOTE(review): estimators fitted inside worker processes may not inherit
    # these seeds -- confirm if bit-for-bit reproducibility is required.
    np.random.seed(0)
    random.seed(0)

    clf = helper(model_name)

    clf.fit(Xtr, ytr)
    print('best_params_', clf.best_params_)
    print('best_score_ ', clf.best_score_)

    # NOTE(review): the ',' before 'coef.txt' looks like a typo for '.', but the
    # name is kept unchanged so anything that already reads these files still works.
    coef_path = 'output/{}.{},coef.txt'.format(exp_name, model_name)
    try:
        np.savetxt(coef_path, clf.best_estimator_.coef_, delimiter=',')
    except (AttributeError, OSError):
        # Best-effort step: estimators without ``coef_`` (the RF runs print this
        # message) or an unwritable path are skipped. Anything else, including
        # KeyboardInterrupt, now propagates instead of being silently swallowed.
        print('Coefficients not saved')

    ######
    # Eval
    # Bootstrapped 95% Confidence Interval
    try:
        yte_pred = clf.predict_proba(Xte)[:,1]
    except AttributeError:
        print('Cannot produce probabilistic estimates')
        raise

    def func(i):
        """Return the ROC AUC of one bootstrap resample of the test set, seeded by ``i``."""
        yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i)
        return metrics.roc_auc_score(yte_true_b, yte_pred_b)

    # Using the resample index as the seed makes every draw repeatable.
    test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False))
    ci_low, ci_high = np.percentile(test_scores, [2.5, 97.5])
    print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), ci_low, ci_high))

    save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.')

In [6]:
# Number of parallel workers given to each RandomizedSearchCV built in helper().
n_jobs=12
# Number of hyper-parameter candidates sampled per search (n_iter in helper()).
search_budget=50

def helper(model_type):
    """Build an unfitted randomized hyper-parameter search for ``model_type``.

    Parameters
    ----------
    model_type : str
        ``'LR'`` for logistic regression or ``'RF'`` for a random forest.

    Returns
    -------
    RandomizedSearchCV
        Unfitted search that samples ``search_budget`` candidates, scores them
        by ROC AUC with 5-fold stratified cross-validation and runs on
        ``n_jobs`` workers (both values are module-level globals).

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``'LR'`` nor ``'RF'``.
    """
    if model_type == 'LR':
        estimator = LogisticRegression(solver='lbfgs')
        # Log-uniform prior over the inverse regularisation strength.
        param_distributions = {'C': scipy.stats.reciprocal(1e-5, 1e5)}
    elif model_type == 'RF':
        estimator = RandomForestClassifier()
        param_distributions = {
            "criterion": ["gini", "entropy"],
            "max_depth": [4, 8, 16, 32, None],
            # NOTE(review): draws go up to 99, which can exceed the number of
            # columns in X (the ICD[0] matrix has 17) -- confirm how the
            # installed scikit-learn version treats such candidates.
            "max_features": scipy.stats.randint(1, 100),
            "min_samples_split": scipy.stats.randint(2, 11),
            "min_samples_leaf": scipy.stats.randint(1, 11),
            "n_estimators": scipy.stats.randint(50,500),
            "bootstrap": [True],
        }
    else:
        # An explicit exception survives ``python -O``, unlike ``assert False``.
        raise ValueError(
            "Unknown model_type {!r}; expected 'LR' or 'RF'".format(model_type)
        )

    return RandomizedSearchCV(
        estimator,
        param_distributions,
        n_iter=search_budget,
        cv=StratifiedKFold(5),
        scoring='roc_auc',
        n_jobs=n_jobs, verbose=2,
    )

def save_test_predictions(y_true, y_score, model_name, save_dir):
    """Write test labels and predicted scores to ``<save_dir><model_name>.test.npz``.

    ``save_dir`` is concatenated as a plain string prefix rather than joined as
    a path, so it must already end with the desired separator (``train_model``
    passes ``'output/' + exp_name + '.'``). The parent directory is not created
    here and has to exist.

    Parameters
    ----------
    y_true
        Ground-truth labels, stored under the key ``y_true``.
    y_score
        Predicted scores, stored under the key ``y_score``.
    model_name : str
        Model tag placed in the file name.
    save_dir : str
        Prefix prepended verbatim to ``'<model_name>.test.npz'``.

    Returns
    -------
    None
    """
    fname = save_dir + '{}.test.npz'.format(model_name)
    # The context manager closes the handle even if ``np.savez`` raises; the
    # original left the file object open until garbage collection.
    with open(fname, 'wb') as fout:
        np.savez(
            fout,
            y_score = y_score,
            y_true  = y_true,
        )
    print('Test predictions saved to', fname)

### ICD [0]

In [7]:
import sparse
# Load the ICD[0] feature matrix stored in sparse .npz form and densify it.
# NOTE(review): rows are assumed to be in the same order as df -- confirm.
X = sparse.load_npz('output.icd[0]/s.npz').todense()

In [8]:
# Check the matrix dimensions; the boolean masks built from df in the next
# cell need X to have exactly one row per row of df.
X.shape

(19723, 17)

In [9]:
# Row masks for the predefined split; rows tagged 'val' are not used here.
train_rows = df.partition == "train"
test_rows = df.partition == "test"

# Features come from X and labels from df, selected with the same mask.
Xtr, ytr = X[train_rows], df.loc[train_rows, 'label']
Xte, yte = X[test_rows], df.loc[test_rows, 'label']
print(Xtr.shape, ytr.shape, Xte.shape, yte.shape)

(13865, 17) (13865,) (2928, 17) (2928,)


In [10]:
# Logistic regression on the ICD[0] features: randomized search, bootstrapped
# test AUC, artefacts written with the prefix output/ICD[0].LR
train_model(Xtr, ytr, Xte, yte, 'LR', 'ICD[0]')

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    1.2s
[Parallel(n_jobs=12)]: Done 192 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 227 out of 250 | elapsed:    2.9s remaining:    0.3s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    3.0s finished
  0%|          | 0/1000 [00:00<?, ?it/s]

best_params_ {'C': 0.17240426024865194}
best_score_  0.7386671618536903


                                                  

Test AUC: 0.725 (0.694, 0.753)
Test predictions saved to output/ICD[0].LR.test.npz


In [13]:
# Random forest on the ICD[0] features; the forest exposes no coef_, so only
# the test predictions are saved (prefix output/ICD[0].RF).
train_model(Xtr, ytr, Xte, yte, 'RF', 'ICD[0]')

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    1.2s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:    5.3s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    9.3s finished
  0%|          | 0/1000 [00:00<?, ?it/s]

best_params_ {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 8, 'max_features': 2, 'min_samples_leaf': 7, 'min_samples_split': 10, 'n_estimators': 431}
best_score_  0.7386321364300639
Coefficients not saved


                                                  

Test AUC: 0.733 (0.703, 0.763)
Test predictions saved to output/ICD[0].RF.test.npz


### ICD [0,1]

In [14]:
import sparse
# Load the ICD[0,1] feature matrix stored in sparse .npz form and densify it.
# This rebinds X, replacing the matrix loaded for the previous experiment.
# NOTE(review): rows are assumed to be in the same order as df -- confirm.
X = sparse.load_npz('output.icd[0,1]/s.npz').todense()

In [15]:
# Check the matrix dimensions; the boolean masks built from df in the next
# cell need X to have exactly one row per row of df.
X.shape

(19723, 112)

In [16]:
# Row masks for the predefined split; rows tagged 'val' are not used here.
train_rows = df.partition == "train"
test_rows = df.partition == "test"

# Features come from X and labels from df, selected with the same mask.
Xtr, ytr = X[train_rows], df.loc[train_rows, 'label']
Xte, yte = X[test_rows], df.loc[test_rows, 'label']
print(Xtr.shape, ytr.shape, Xte.shape, yte.shape)

(13865, 112) (13865,) (2928, 112) (2928,)


In [17]:
# Logistic regression on the ICD[0,1] features: randomized search, bootstrapped
# test AUC, artefacts written with the prefix output/ICD[0,1].LR
train_model(Xtr, ytr, Xte, yte, 'LR', 'ICD[0,1]')

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    1.8s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:    5.9s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    9.1s finished
  0%|          | 0/1000 [00:00<?, ?it/s]

best_params_ {'C': 0.14015848675188294}
best_score_  0.7950248801285071


                                                  

Test AUC: 0.777 (0.748, 0.804)
Test predictions saved to output/ICD[0,1].LR.test.npz


In [18]:
# Random forest on the ICD[0,1] features; the forest exposes no coef_, so only
# the test predictions are saved (prefix output/ICD[0,1].RF).
train_model(Xtr, ytr, Xte, yte, 'RF', 'ICD[0,1]')

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    8.9s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:  1.4min
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:  2.8min finished


best_params_ {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 384}
best_score_  0.7911249803736851
Coefficients not saved


                                                  

Test AUC: 0.778 (0.751, 0.806)
Test predictions saved to output/ICD[0,1].RF.test.npz


### ICD [0,1,2]

In [19]:
import sparse
# Load the ICD[0,1,2] feature matrix stored in sparse .npz form and densify it.
# This rebinds X, replacing the matrix loaded for the previous experiment.
# NOTE(review): rows are assumed to be in the same order as df -- confirm.
X = sparse.load_npz('output.icd[0,1,2]/s.npz').todense()

In [20]:
# Check the matrix dimensions; the boolean masks built from df in the next
# cell need X to have exactly one row per row of df.
X.shape

(19723, 314)

In [21]:
# Row masks for the predefined split; rows tagged 'val' are not used here.
train_rows = df.partition == "train"
test_rows = df.partition == "test"

# Features come from X and labels from df, selected with the same mask.
Xtr, ytr = X[train_rows], df.loc[train_rows, 'label']
Xte, yte = X[test_rows], df.loc[test_rows, 'label']
print(Xtr.shape, ytr.shape, Xte.shape, yte.shape)

(13865, 314) (13865,) (2928, 314) (2928,)


In [22]:
# Logistic regression on the ICD[0,1,2] features: randomized search, bootstrapped
# test AUC, artefacts written with the prefix output/ICD[0,1,2].LR
train_model(Xtr, ytr, Xte, yte, 'LR', 'ICD[0,1,2]')

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    3.5s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:   15.9s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:   25.0s finished
  0%|          | 0/1000 [00:00<?, ?it/s]

best_params_ {'C': 0.039362167558821506}
best_score_  0.8176147686510381


                                                  

Test AUC: 0.795 (0.767, 0.820)
Test predictions saved to output/ICD[0,1,2].LR.test.npz


In [23]:
# Random forest on the ICD[0,1,2] features; the forest exposes no coef_, so only
# the test predictions are saved (prefix output/ICD[0,1,2].RF).
train_model(Xtr, ytr, Xte, yte, 'RF', 'ICD[0,1,2]')

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    8.9s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:  2.7min
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:  8.4min finished


best_params_ {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 59, 'min_samples_leaf': 8, 'min_samples_split': 7, 'n_estimators': 391}
best_score_  0.8153834076113868
Coefficients not saved


                                                  

Test AUC: 0.799 (0.771, 0.825)
Test predictions saved to output/ICD[0,1,2].RF.test.npz
