# Setup, matplotlib inline, automatically reload libraries on every evaluation

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
# Allow up to 400 rows and 400 columns in pandas output.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 400)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 reloads all modules before each execution.
%load_ext autoreload
%autoreload 2

In [2]:
from loadcreon import LoadCreon
from creonmetrics import pu_scorer, prior_squared_error_scorer_015, brier_score_labeled_loss_scorer, \
    f1_assumed_scorer, f1_labeled_scorer, report_metrics, f1_assumed_beta10_scorer
from semisuperhelper import SemiSupervisedHelper
from pnuwrapper import PNUWrapper
from jeffsearchcv import JeffRandomSearchCV
from nestedcross import NestedCV
from frankenscorer import FrankenScorer, extract_scores_from_nested, extract_score_grid
from searchrf import save_search, load_search
from repeatedsampling import RepeatedRandomSubSampler

In [3]:
# Raw string: the Windows backslashes are taken literally, so the literal no longer
# relies on invalid escape sequences being passed through. The value is unchanged.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Build the loader from `path`; its X and y attributes are what the train/test split below consumes.
lc = LoadCreon(path)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.base import clone

In [5]:
# Hold out 20% of the rows as a test set, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    stratify=lc.y,
    test_size=0.2,
    random_state=771,
)

# FIRST REPEATED SUB SAMPLER

## Set up PNU Wrapper with Random Forest, then JeffSearchCV, then NestedCV

In [6]:
# Two candidate tree ensembles, both built with constructor defaults.
et = ExtraTreesClassifier()
rf = RandomForestClassifier()

# The random forest is the initial base estimator of the repeated sub-sampler,
# and the sub-sampler is in turn the base estimator of the PNU wrapper.
rep = RepeatedRandomSubSampler(verbose=1, base_estimator=rf)
pnu = PNUWrapper(num_unlabeled=1.0, base_estimator=rep)

## Set up randomized search parameters

In [7]:
# Randomized-search space. Double-underscore names address nested parameters:
# pnu -> base_estimator (the sub-sampler) -> base_estimator (the tree ensemble).
rf_param_search = dict(
    # which tree ensemble the sub-sampler wraps
    base_estimator__base_estimator=[rf, et],
    # hyper-parameters of that tree ensemble
    base_estimator__base_estimator__bootstrap=[True, False],
    base_estimator__base_estimator__class_weight=[None, 'balanced', 'balanced_subsample'],
    base_estimator__base_estimator__criterion=['gini', 'entropy'],
    base_estimator__base_estimator__max_depth=[None] + list(range(2, 100)),
    base_estimator__base_estimator__max_features=['sqrt', 'log2', None] + list(range(5, 100)),
    base_estimator__base_estimator__min_samples_leaf=[
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        15, 20, 25, 30, 35, 40, 45, 50,
        75, 100,
    ],
    base_estimator__base_estimator__min_samples_split=[
        2,
        0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04, 0.045, 0.05,
        0.07, 0.09, 0.1, 0.12, 0.15, 0.17, 0.2, 0.25,
    ],
    # integer drawn from [10, 300)
    base_estimator__base_estimator__n_estimators=sp.stats.randint(low=10, high=300),
    # sub-sampler parameter, drawn uniformly from [0.1, 1.0]
    base_estimator__sample_imbalance=sp.stats.uniform(loc=0.1, scale=0.9),
    # PNU wrapper parameter
    pu_learning=[True, False],
)

### Notice that random_state is set in jeffsearch so that the same random parameters are searched for each outer fold, sort of like a grid search

In [8]:
# Inner loop: 60 sampled parameter settings, 3-fold CV each, selected on the
# 'assumed_f1beta10' decision score. The fixed random_state pins the sampled settings.
jeffsearch = JeffRandomSearchCV(
    pnu,
    rf_param_search,
    scoring=FrankenScorer(decision_score='assumed_f1beta10'),
    n_iter=60,
    cv=3,
    random_state=77,
    n_jobs=-1,
    pre_dispatch=8,
    verbose=1,
)

#### 3x3 (x60) nested cross validation

In [9]:
# Outer loop: 3-fold CV around the randomized search, using the same decision score.
nested_cross = NestedCV(
    jeffsearch,
    cv=3,
    scoring=FrankenScorer(decision_score='assumed_f1beta10'),
    random_state=None,
)

## Score the nested cross

In [10]:
# Run the nested cross-validation on the training split (passed as plain arrays).
scores = nested_cross.score(
    X_train.values,
    y=y_train.values,
    pre_dispatch=8,
    verbose=100,
)

[CV]  ................................................................
Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 112.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 489.4min finished


generating 137 samples of indices to use to train multiple estimators,               sized 2830 elements with last being 1524 elements


[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  7.5min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  7.5min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed: 15.6min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed: 15.6min finished


[CV]  , score=0.702562296381 score_data={'labeled_acc': 0.82011834319526622, 'assumed_f1beta10': 0.70256229638108647, 'assumed_brier_neg': 0.095473052356552665, 'labeled_brier_pos': 0.16992182157136196, 'assumed_brier': 0.095825911141888376, 'labeled_roc_auc': 0.80570204967085568, 'pu_score': 6.8401854874045922, 'pr_one_unlabeled': 0.10198414398971502, 'confusion_matrix_un': array([[104995,  11968],
       [    83,    474]]), 'labeled_f1': 0.86181818181818171, 'labeled_brier': 0.17982954499036324, 'confusion_matrix_lab': array([[219,  69],
       [ 83, 474]]), 'assumed_f1': 0.072928686822063243, 'labeled_prec': 0.8729281767955801, 'labeled_brier_neg': 0.19899135729725112, 'SCORE': 0.70256229638108647, 'labeled_recall': 0.85098743267504484, 'labeled_avg_prec': 0.9110702307708155}, total=505.9min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 537.2min remaining:    0.0s
[CV]  ................................................................
Fitting 3 folds for each of 60 candidates,

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 117.1min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 529.8min finished


generating 137 samples of indices to use to train multiple estimators,               sized 2830 elements with last being 1525 elements


[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  4.2min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  7.4min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  7.4min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed: 15.3min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed: 15.4min finished


[CV]  , score=0.70122764265 score_data={'labeled_acc': 0.81872037914691942, 'assumed_f1beta10': 0.70122764265023685, 'assumed_brier_neg': 0.074222194234285888, 'labeled_brier_pos': 0.14428425002048584, 'assumed_brier': 0.074554264495885439, 'labeled_roc_auc': 0.82127687524631066, 'pu_score': 8.141926515967004, 'pr_one_unlabeled': 0.077523034068995078, 'confusion_matrix_un': array([[107868,   9094],
       [   104,    453]]), 'labeled_f1': 0.85552407932011321, 'labeled_brier': 0.14973095356007535, 'confusion_matrix_lab': array([[238,  49],
       [104, 453]]), 'assumed_f1': 0.089667458432304017, 'labeled_prec': 0.90239043824701193, 'labeled_brier_neg': 0.16030173360032404, 'SCORE': 0.70122764265023685, 'labeled_recall': 0.81328545780969475, 'labeled_avg_prec': 0.91944932243593624}, total=548.9min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 1116.8min remaining:    0.0s
[CV]  ................................................................
Fitting 3 folds for each of 60 candidate

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 118.9min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 564.8min finished


generating 137 samples of indices to use to train multiple estimators,               sized 2833 elements with last being 1255 elements


[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  4.2min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  7.4min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed:  7.5min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed: 15.4min finished
[Parallel(n_jobs=1)]: Done 137 out of 137 | elapsed: 15.4min finished


[CV]  , score=0.707446235318 score_data={'labeled_acc': 0.83867141162514824, 'assumed_f1beta10': 0.70744623531766204, 'assumed_brier_neg': 0.073830485701749118, 'labeled_brier_pos': 0.13907766980805419, 'assumed_brier': 0.074139185161087792, 'labeled_roc_auc': 0.8481970521144061, 'pu_score': 8.4090020631052518, 'pr_one_unlabeled': 0.076015221900337687, 'confusion_matrix_un': array([[108057,   8904],
       [   101,    455]]), 'labeled_f1': 0.86998087954110903, 'labeled_brier': 0.1389149549523232, 'confusion_matrix_lab': array([[252,  35],
       [101, 455]]), 'assumed_f1': 0.091780131114473015, 'labeled_prec': 0.9285714285714286, 'labeled_brier_neg': 0.13859973035376419, 'SCORE': 0.70744623531766204, 'labeled_recall': 0.81834532374100721, 'labeled_avg_prec': 0.93336347698658562}, total=584.0min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 1731.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 1731.7min finished


## The scores of the 3 folds of the outer loop

In [14]:
# Echo the value returned by nested_cross.score (one score per outer fold).
scores

(0.70256229638108647, 0.70122764265023685, 0.70744623531766204)

In [12]:
# Save the scored nested search under ./res so it can be reloaded later.
# NOTE(review): "repreated" in the file name looks like a typo for "repeated"; it is left
# unchanged because editing the path would change where the file is written.
save_search(nested_cross, './res/nested_cross_repreated_rf_large_20170131.pkl')

In [15]:
# Mean of the scores extracted from the nested CV's test_score_datas_, sorted by index label.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
    .sort_index()
)

assumed_brier                                                      0.0815065
assumed_brier_neg                                                  0.0811752
assumed_f1                                                         0.0847921
assumed_f1beta10                                                    0.703745
confusion_matrix_lab          [[236.333333333, 51.0], [96.0, 460.666666667]]
confusion_matrix_un        [[106973.333333, 9988.66666667], [96.0, 460.66...
fn_confusion_matrix_lab                                                   96
fn_confusion_matrix_un                                                    96
fp_confusion_matrix_lab                                                   51
fp_confusion_matrix_un                                               9988.67
labeled_acc                                                         0.825837
labeled_avg_prec                                                    0.921294
labeled_brier                                                       0.156158

In [56]:
# Score grid of the first outer fold's search.
t = extract_score_grid(nested_cross.estimators_[0])
# Combined score: mean PU score plus 100 times the mean assumed_f1beta10 score.
t['optimize_score'] = 100 * t['mean_assumed_f1beta10_test'] + t['mean_pu_score_test']
# Show the selected metrics for the candidate at position 34.
t.loc[:, [
    'mean_pu_score_test',
    'mean_assumed_f1beta10_test',
    'optimize_score',
    'mean_pr_one_unlabeled_test',
]].iloc[34]

mean_pu_score_test            11.311659
mean_assumed_f1beta10_test     0.689577
optimize_score                80.269381
mean_pr_one_unlabeled_test     0.046951
Name: 34, dtype: float64

# Let's dive in and see the parameters for one of the best models

In [43]:
# Parameter set stored at position 34 of cv_results_['params'] in each outer fold's search.
[
    search.cv_results_['params'][34]
    for search in nested_cross.estimators_
]

[{'base_estimator__base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=64, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False),
  'base_estimator__base_estimator__bootstrap': False,
  'base_estimator__base_estimator__class_weight': None,
  'base_estimator__base_estimator__criterion': 'gini',
  'base_estimator__base_estimator__max_depth': 64,
  'base_estimator__base_estimator__max_features': 87,
  'base_estimator__base_estimator__min_samples_leaf': 8,
  'base_estimator__base_estimator__min_samples_split': 0.01,
  'base_estimator__base_estimator__n_estimators': 79,
  'base_estimator__sample_imbalance': 0.44063408204723742,
  'pu_learning': True},
 {'base_estimator__base_estimator': RandomFores

## Let's see what feature importance looks like, averaged over the best estimator from each outer fold

In [47]:
# One row of feature importances per outer-fold search, taken from its best estimator.
feature_importances = np.array([
    search.best_estimator_.feature_importances_
    for search in nested_cross.estimators_
])

In [20]:
# Average the per-fold importance rows into a single vector (one value per feature).
feature_importance = feature_importances.mean(axis=0)

In [59]:
# Tabulate the averaged importances by feature name, largest first.
importance = (
    pd.DataFrame(feature_importance, index=X_test.columns, columns=['Importance'])
    .sort_values(by='Importance', ascending=False)
)
importance

Unnamed: 0,Importance
DIAG_FLAG4_Sum,0.3355714
DIAG_FLAG5_Sum,0.2041382
ndc_cat58_Sum,0.06525617
CPT_FLAG9_Sum,0.04299861
ndc_cat87_Sum,0.03694698
DIAG_FLAG75_Sum,0.03289905
ndc_cat54_Sum,0.01936213
CPT_FLAG43_Sum,0.0136835
DIAG_FLAG42_Sum,0.01265005
age,0.01173387
