# Setup, matplotlib inline, automatically reload libraries on every evaluation

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
# Let pandas render up to 400 rows and 400 columns before truncating a displayed frame.
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload imported modules on every evaluation (mode 2).
%load_ext autoreload
%autoreload 2

In [3]:
from creon.loadcreon import LoadCreon, load_search, save_search
from creon.creonsklearn.creonmetrics import pu_scorer, prior_squared_error_scorer_015, brier_score_labeled_loss_scorer, \
    f1_assumed_scorer, f1_labeled_scorer, report_metrics, f1_assumed_beta10_scorer
from creon.semisuperhelper import SemiSupervisedHelper
from creon.creonsklearn.pnuwrapper import PNUWrapper
from creon.creonsklearn.jeffsearchcv import JeffRandomSearchCV, extract_score_grid
from creon.creonsklearn.nestedcross import NestedCV
from creon.creonsklearn.frankenscorer import FrankenScorer, extract_scores_from_nested
from creon.creonsklearn.repeatedsampling import RepeatedRandomSubSampler

In [3]:
# Location of the membership extract loaded below.
# Written as a raw string so the Windows backslashes are taken literally; the previous
# non-raw literal relied on the invalid escapes `\D` and `\m` being passed through,
# which newer Python versions warn about. The resulting value is unchanged.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Build the loader from the file at `path`; later cells read the features and labels from lc.X and lc.y.
lc = LoadCreon(path)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

In [5]:
# Hold out 20% of the rows as a test set, stratified on the labels, with a fixed seed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    stratify=lc.y,
    random_state=771,
)

# FIRST REPEATED SUB SAMPLER

## Set up randomized search parameters

In [6]:
# Search space for the randomized hyper-parameter search.
# Key prefixes follow the estimator nesting used in this notebook:
# `base_estimator__base_estimator__*` addresses the random forest's own parameters,
# while `base_estimator__sample_imbalance` addresses the sub-sampler wrapped around it.
# The two distributions use the explicitly imported `stats` module: `import scipy as sp`
# alone does not guarantee that the `sp.stats` attribute has been loaded.
rf_param_search = {
    'base_estimator__base_estimator__bootstrap': [True, False],
    'base_estimator__base_estimator__class_weight': [None, 'balanced', 'balanced_subsample'],
    'base_estimator__base_estimator__criterion': ['gini', 'entropy'],
    'base_estimator__base_estimator__max_depth': [None] + list(range(2, 100)),
    'base_estimator__base_estimator__max_features': ['sqrt', 'log2', None] + list(range(5, 100)),
    'base_estimator__base_estimator__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                                         15, 20, 25, 30, 35, 40, 45, 50, 75, 100],
    # The integer 2 is an absolute sample count; the floats are fractions of the sample count.
    'base_estimator__base_estimator__min_samples_split': [2, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04,
                                                          0.045, 0.05, 0.07, 0.09, 0.1, 0.12, 0.15, 0.17, 0.2, 0.25],
    # Integers drawn uniformly from 10..199 (the upper bound of randint is exclusive).
    'base_estimator__base_estimator__n_estimators': stats.randint(low=10, high=200),
    # Floats drawn uniformly from [0.1, 1.0] (loc .. loc + scale).
    'base_estimator__sample_imbalance': stats.uniform(loc=0.1, scale=0.9),
}

## Set up PNU Wrapper around a RepeatedRandomSubSampler of Random Forests, then JeffRandomSearchCV, then NestedCV

In [7]:
# Build the estimator stack from the inside out: random forest -> repeated random sub-sampler -> PNU wrapper.
# This nesting is what the `base_estimator__base_estimator__*` keys of rf_param_search address.
# NOTE(review): no random_state is passed to the forest, so fits are presumably not reproducible — confirm.
# NOTE(review): n_jobs=-1 here is nested inside a search that also uses n_jobs=-1 — check for oversubscription.
rf = RandomForestClassifier(n_jobs=-1)
rep = RepeatedRandomSubSampler(base_estimator=rf, verbose=1)
# NOTE(review): the meaning of num_unlabeled=1.0 is defined by PNUWrapper and is not visible here — confirm.
pnu = PNUWrapper(base_estimator=rep, num_unlabeled=1.0)

In [8]:
# Inner loop: randomized search over rf_param_search, 10 sampled candidates on 3 folds,
# with 'assumed_f1beta10' as the FrankenScorer decision score.
jeffsearch = JeffRandomSearchCV(
    pnu,
    rf_param_search,
    n_iter=10,
    scoring=FrankenScorer(decision_score='assumed_f1beta10'),
    n_jobs=-1,
    cv=3,
    verbose=1,
    pre_dispatch=8,
)

#### 3x3 (x10) nested cross validation: 3 outer folds x 3 inner folds x 10 sampled candidates

In [9]:
# Outer loop: 3-fold nested cross-validation around the randomized search, scored with the
# same FrankenScorer decision score as the inner search.
# NOTE(review): random_state=None presumably leaves the outer folds unseeded — confirm against NestedCV.
nested_cross = NestedCV(
    jeffsearch,
    scoring=FrankenScorer(decision_score='assumed_f1beta10'),
    cv=3,
    random_state=None,
)

## Score the nested cross - 90 search fits (3 outer folds x 3 inner folds x 10 candidates)!

In [10]:
# Run the nested cross-validation on the training split, passing plain arrays via `.values`.
# The recorded log for this cell reports roughly 506 minutes of elapsed time.
scores = nested_cross.score(X_train.values, y=y_train.values, verbose=100, pre_dispatch=8)

[CV]  ................................................................
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 133.1min finished


generating 167 samples of indices to use to train multiple estimators,               sized 2517 elements with last being 1972 elements


[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed:  2.5min finished
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed:  2.5min finished
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed:  5.3min finished
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed:  5.4min finished


[CV]  , score=0.716148567429 score_data={'SCORE': 0.71614856742898003, 'confusion_matrix_un': array([[107408,   9555],
       [    91,    466]]), 'labeled_avg_prec': 0.91851869489460025, 'labeled_prec': 0.89272030651340994, 'assumed_brier': 0.097044506923199325, 'labeled_roc_auc': 0.82109016556951919, 'labeled_f1': 0.86376274328081559, 'labeled_brier_neg': 0.20234212951346711, 'labeled_recall': 0.83662477558348292, 'labeled_acc': 0.82603550295857986, 'confusion_matrix_lab': array([[232,  56],
       [ 91, 466]]), 'labeled_brier': 0.13315288933266117, 'labeled_brier_pos': 0.097378201411526338, 'pr_one_unlabeled': 0.081414184701092782, 'assumed_brier_neg': 0.097042917806726622, 'pu_score': 8.2084690247396175, 'assumed_f1beta10': 0.71614856742898003, 'assumed_f1': 0.088107392701834}, total=140.4min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 151.1min remaining:    0.0s
[CV]  ................................................................
Fitting 3 folds for each of 10 candidates

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 117.1min finished


generating 49 samples of indices to use to train multiple estimators,               sized 5895 elements with last being 5501 elements


[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:   19.6s finished
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:   22.6s finished
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:   22.7s finished
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:   43.9s finished
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:   43.9s finished


[CV]  , score=0.702073170732 score_data={'SCORE': 0.70207317073170739, 'confusion_matrix_un': array([[107518,   9444],
       [   101,    456]]), 'labeled_avg_prec': 0.91188829693420581, 'labeled_prec': 0.88543689320388352, 'assumed_brier': 0.074279356550173842, 'labeled_roc_auc': 0.80654827066352219, 'labeled_f1': 0.85074626865671643, 'labeled_brier_neg': 0.17894291146431782, 'labeled_recall': 0.81867145421903054, 'labeled_acc': 0.81042654028436023, 'confusion_matrix_lab': array([[228,  59],
       [101, 456]]), 'labeled_brier': 0.1489406592760002, 'labeled_brier_pos': 0.13348168911792629, 'pr_one_unlabeled': 0.080437111634883221, 'assumed_brier_neg': 0.073997421398242114, 'pu_score': 7.9559526116705666, 'assumed_f1beta10': 0.70207317073170739, 'assumed_f1': 0.087214306206368941}, total=118.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 270.9min remaining:    0.0s
[CV]  ................................................................
Fitting 3 folds for each of 10 candidate

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 206.6min finished


generating 193 samples of indices to use to train multiple estimators,               sized 2330 elements with last being 1567 elements


[Parallel(n_jobs=1)]: Done 193 out of 193 | elapsed:  2.7min finished
[Parallel(n_jobs=1)]: Done 193 out of 193 | elapsed:  4.1min finished
[Parallel(n_jobs=1)]: Done 193 out of 193 | elapsed:  4.1min finished
[Parallel(n_jobs=1)]: Done 193 out of 193 | elapsed:  8.7min finished
[Parallel(n_jobs=1)]: Done 193 out of 193 | elapsed:  8.7min finished


[CV]  , score=0.701636277201 score_data={'SCORE': 0.70163627720100008, 'confusion_matrix_un': array([[107805,   9156],
       [   103,    453]]), 'labeled_avg_prec': 0.92976890349198771, 'labeled_prec': 0.92260692464358451, 'assumed_brier': 0.095216665542289872, 'labeled_roc_auc': 0.84117201012708998, 'labeled_f1': 0.86532951289398297, 'labeled_brier_neg': 0.16836826011406414, 'labeled_recall': 0.81474820143884896, 'labeled_acc': 0.83274021352313166, 'confusion_matrix_lab': array([[249,  38],
       [103, 453]]), 'labeled_brier': 0.12867981456520436, 'labeled_brier_pos': 0.10819315292397637, 'pr_one_unlabeled': 0.078149373467953448, 'assumed_brier_neg': 0.095154978937488116, 'pu_score': 8.1183790279020513, 'assumed_f1beta10': 0.70163627720100008, 'assumed_f1': 0.089129365469749131}, total=217.5min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 506.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 506.0min finished


## The scores of the 3 folds of the outer loop

In [11]:
# Display the value returned by nested_cross.score: one score per outer fold.
scores

(0.71614856742898003, 0.70207317073170739, 0.70163627720100008)

In [12]:
# Save the nested search object under ./res (presumably pickled, given the .pkl name — confirm in save_search).
# NOTE(review): the file name spells "repreated" (sic); it is kept as-is because it is a runtime path
# that other code may load by this exact name.
save_search(nested_cross, './res/nested_cross_repreated_rf_small_20170130.pkl')

In [21]:
# Average every extracted score across the outer folds and list the results alphabetically.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
    .sort_index()
)

assumed_brier                                                      0.0888468
assumed_brier_neg                                                  0.0887318
assumed_f1                                                         0.0881504
assumed_f1beta10                                                    0.706619
confusion_matrix_lab       [[236.333333333, 51.0], [98.3333333333, 458.33...
confusion_matrix_un        [[107577.0, 9385.0], [98.3333333333, 458.33333...
fn_confusion_matrix_lab                                              98.3333
fn_confusion_matrix_un                                               98.3333
fp_confusion_matrix_lab                                                   51
fp_confusion_matrix_un                                                  9385
labeled_acc                                                         0.823067
labeled_avg_prec                                                    0.920059
labeled_brier                                                       0.136924

# Let's dive in and see the best parameters found in each outer fold

In [14]:
# One best_params_ dict per fitted search held in nested_cross.estimators_.
[
    fold_search.best_params_
    for fold_search in nested_cross.estimators_
]

[{'base_estimator__base_estimator__bootstrap': False,
  'base_estimator__base_estimator__class_weight': 'balanced_subsample',
  'base_estimator__base_estimator__criterion': 'gini',
  'base_estimator__base_estimator__max_depth': 58,
  'base_estimator__base_estimator__max_features': 77,
  'base_estimator__base_estimator__min_samples_leaf': 1,
  'base_estimator__base_estimator__min_samples_split': 0.1,
  'base_estimator__base_estimator__n_estimators': 115,
  'base_estimator__sample_imbalance': 0.79242354330034359},
 {'base_estimator__base_estimator__bootstrap': True,
  'base_estimator__base_estimator__class_weight': 'balanced',
  'base_estimator__base_estimator__criterion': 'gini',
  'base_estimator__base_estimator__max_depth': 80,
  'base_estimator__base_estimator__max_features': 40,
  'base_estimator__base_estimator__min_samples_leaf': 1,
  'base_estimator__base_estimator__min_samples_split': 0.015,
  'base_estimator__base_estimator__n_estimators': 37,
  'base_estimator__sample_imbalanc

## Let's see what feature importance looks like for the best estimator of the first outer fold

In [26]:
# Stack the feature_importances_ vector of every sub-estimator inside the first
# outer fold's best estimator: one row per sub-estimator.
fi = np.array([
    member.feature_importances_
    for member in nested_cross.estimators_[0].best_estimator_.base_estimator.estimators_
])

In [27]:
# Sanity check of the stacked array: (number of sub-estimators, number of features).
fi.shape

(167, 288)

In [49]:
# Average each feature's importance over all sub-estimators, label the rows with the
# feature names, and rank from most to least important.
pd.DataFrame(
    fi.mean(axis=0),
    index=X_test.columns,
).sort_values(by=0, ascending=False)

Unnamed: 0,0
DIAG_FLAG4_Sum,0.4200402
DIAG_FLAG5_Sum,0.2363329
ndc_cat58_Sum,0.0607444
CPT_FLAG9_Sum,0.04394944
DIAG_FLAG75_Sum,0.03085083
ndc_cat87_Sum,0.02513798
ndc_cat54_Sum,0.01244384
age,0.01097083
DIAG_FLAG6_Sum,0.01028217
ndc_cat85_Sum,0.006728196
