# SETUP

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
# Raise both pandas display limits (rows and columns) to 400 in one statement.
pd.options.display.max_rows = pd.options.display.max_columns = 400
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression

In [7]:
from creon.loadcreon import LoadCreon, load_search, save_search
from creon.creonsklearn.pnuwrapper import PNUWrapper
from creon.creonsklearn.nestedcross import NestedCV
from creon.creonsklearn.jeffsearchcv import JeffRandomSearchCV, extract_score_grid
from creon.creonsklearn.frankenscorer import FrankenScorer, extract_scores_from_nested

In [11]:
# Location of the membership extract on the author's Windows machine.
# A raw string keeps every backslash literal; the previous non-raw literal
# relied on the invalid escapes "\D" and "\m" passing through unchanged,
# which newer Python versions warn about. The resulting value is identical.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Load the membership file through the project's LoadCreon helper; later cells
# read the feature table from `lc.X` and the labels from `lc.y`.
lc = LoadCreon(path)

## Stratified 80/20 train/test holdout split for future use

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows as a test set, stratified on the label column so the
# class mix matches between the two splits; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    random_state=771,
    stratify=lc.y,
)

## LASSO pipeline and model on labeled data (PN)

In [17]:
# Baseline pipeline: MaxAbsScaler followed by an L1-penalised (LASSO) logistic
# regression wrapped in the project's PNUWrapper.
# The solver is pinned to 'liblinear' -- the solver shown in the recorded fit
# output -- because newer scikit-learn releases default to 'lbfgs', which
# rejects penalty='l1'. 'liblinear' also supports the 'l2' penalty tried by the
# random search later in the notebook.
estimators = [
    ('scaler', MaxAbsScaler()),
    ('clf', PNUWrapper(base_estimator=LogisticRegression(
        penalty='l1', C=2, solver='liblinear', random_state=732))),
]
pipe = Pipeline(estimators)

In [18]:
# Fit the scaler + classifier pipeline on the training split; `.values` hands
# the underlying arrays (not the pandas objects) to the estimators.
pipe.fit(X_train.values, y_train.values)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=732, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
      num_unlabeled=0.0, pu_learning=False, random_state=None,
      threshold_set_pct=None))])

#### Scores of LASSO PN Baseline model

In [19]:
# Score the fitted pipeline on the held-out test split. As the output below
# shows, the call returns a (dict of all metrics, headline 'SCORE' value) tuple.
FrankenScorer()(pipe, X_test.values, y_test.values)

({'SCORE': 0.89976689976689972,
  'assumed_brier': 0.60031267154696111,
  'assumed_brier_neg': 0.60285852450719479,
  'assumed_f1': 0.0094622917866817001,
  'assumed_f1beta10': 0.3170392537956721,
  'confusion_matrix_lab': array([[161,  54],
         [ 32, 386]]),
  'confusion_matrix_un': array([[ 6939, 80783],
         [   32,   386]]),
  'labeled_acc': 0.86413902053712477,
  'labeled_avg_prec': 0.92563531297005985,
  'labeled_brier': 0.1040448755848036,
  'labeled_brier_neg': 0.17793964147052785,
  'labeled_brier_pos': 0.066036802222529167,
  'labeled_f1': 0.89976689976689972,
  'labeled_prec': 0.87727272727272732,
  'labeled_recall': 0.92344497607655507,
  'labeled_roc_auc': 0.83614109268944037,
  'pr_one_unlabeled': 0.92254333938999167,
  'pu_mix_assumed_f1beta10': 32.629912394133711,
  'pu_score': 0.92598701456650112},
 0.89976689976689972)

#### Coefficients of features for LASSO PN Baseline model

In [20]:
# One row per feature, holding its fitted coefficient (column 0), ordered from
# the largest coefficient to the smallest.
imp = (
    pd.DataFrame(
        pipe.named_steps['clf'].base_estimator.coef_,
        columns=X_test.columns,
    )
    .T
    .sort_values(by=0, ascending=False)
)
# Show only the features whose coefficient is non-zero.
imp[imp[0] != 0.0]

Unnamed: 0,0
DIAG_FLAG4_Sum,31.442457
ndc_cat54_Sum,7.552092
DIAG_FLAG69_Sum,5.673218
DIAG_FLAG5_Sum,5.406727
REVCODE_FLAG10_Sum,4.901706
REVCODE_FLAG19_Sum,4.670333
ndc_cat61_Sum,3.886519
DIAG_FLAG31_Sum,3.86993
DIAG_FLAG71_Sum,3.583528
ndc_cat58_Sum,3.45397


# Do 3-Fold Cross Validation of LASSO - PN

In [22]:
# 3-fold cross-validation of the baseline pipeline, scored on 'assumed_f1beta10'.
# NOTE(review): random_state=None together with use_same_random_state=False
# presumably leaves the fold split unseeded, so the scores below may not
# reproduce exactly -- confirm against NestedCV.
nested_lasso = NestedCV(
    pipe,
    scoring=FrankenScorer('assumed_f1beta10'),
    cv=3,
    random_state=None,
    use_same_random_state=False,
)
nested_lasso.score(X_train.values, y_train.values, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   20.1s finished


WARN: NestedCV.best_params_ set to None
WARN: NestedCV.best_idxs_ set to None


(0.32460658700734701, 0.32100177981184846, 0.31133339831375795)

In [28]:
# Average every recorded metric over the cross-validation results and list the
# metrics alphabetically (presumably one set of scores per outer fold -- confirm
# against extract_scores_from_nested).
extract_scores_from_nested(nested_lasso.test_score_datas_).mean().sort_index()

assumed_brier                                                       0.593557
assumed_brier_neg                                                   0.596055
assumed_f1                                                        0.00951951
assumed_f1beta10                                                    0.318981
confusion_matrix_lab       [[193.333333333, 94.0], [39.3333333333, 517.33...
confusion_matrix_un        [[9331.0, 107631.0], [39.3333333333, 517.33333...
fn_confusion_matrix_lab                                              39.3333
fn_confusion_matrix_un                                               39.3333
fp_confusion_matrix_lab                                                   94
fp_confusion_matrix_un                                                107631
labeled_acc                                                         0.842003
labeled_avg_prec                                                    0.911138
labeled_brier                                                       0.116227

# 3-fold nested cross-validation random search for the pipeline with the PNU wrapper, which will _undersample_ the unlabeled data by a massive amount

In [40]:
# Search space for the randomised hyper-parameter search over the pipeline.
# The distributions come from the explicitly imported `scipy.stats` module
# (`stats`) instead of `sp.stats`, which only resolves when some other import
# happens to have loaded the submodule already.
param_search = {
    # Regularisation setting C of the logistic regression: exponential, scale 4.
    'clf__base_estimator__C': stats.expon(scale=4),
    # Value passed to PNUWrapper's num_unlabeled: integer in [1000, 10000).
    'clf__num_unlabeled': stats.randint(low=1000, high=10000),
    'clf__base_estimator__class_weight': [None, 'balanced'],
    'clf__base_estimator__penalty': ['l1', 'l2'],
    'clf__pu_learning': [True, False],
}

In [41]:
# Randomised search over `param_search`: 20 sampled configurations, each
# evaluated with 5-fold CV and scored on 'assumed_f1beta10'.
jeffsearch = JeffRandomSearchCV(
    pipe,
    param_search,
    n_iter=20,
    scoring=FrankenScorer('assumed_f1beta10'),
    n_jobs=-1,
    cv=5,
    verbose=1,
    random_state=42,
)

In [42]:
# Outer 3-fold loop wrapped around the random search, giving nested
# cross-validation scored on the same 'assumed_f1beta10' metric.
nested_cross = NestedCV(
    jeffsearch,
    scoring=FrankenScorer('assumed_f1beta10'),
    cv=3,
    random_state=731,
)

In [43]:
# Run the nested cross-validation on the training split; per the output below
# the call returns one score per outer fold (three values for cv=3).
nested_cross.score(X_train.values, y_train.values, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 10.3min finished


(0.55586130985140336, 0.54713945910272099, 0.53723404255319152)

In [47]:
# Mean of every recorded metric for the nested random search, listed
# alphabetically (presumably averaged over the outer folds -- confirm).
extract_scores_from_nested(nested_cross.test_score_datas_).mean().sort_index()

assumed_brier                                                       0.129891
assumed_brier_neg                                                   0.129681
assumed_f1                                                         0.0438627
assumed_f1beta10                                                    0.546745
confusion_matrix_lab       [[169.666666667, 117.666666667], [160.33333333...
confusion_matrix_un        [[99822.3333333, 17139.6666667], [160.33333333...
fn_confusion_matrix_lab                                              160.333
fn_confusion_matrix_un                                               160.333
fp_confusion_matrix_lab                                              117.667
fp_confusion_matrix_un                                               17139.7
labeled_acc                                                         0.670631
labeled_avg_prec                                                    0.838328
labeled_brier                                                       0.212076

In [48]:
# Standard deviation of every numeric metric for the nested random search,
# listed alphabetically, to show how much the results vary.
extract_scores_from_nested(nested_cross.test_score_datas_).std().sort_index()

assumed_brier                 0.003854
assumed_brier_neg             0.003918
assumed_f1                    0.001308
assumed_f1beta10              0.009320
fn_confusion_matrix_lab      12.220202
fn_confusion_matrix_un       12.220202
fp_confusion_matrix_lab      39.954140
fp_confusion_matrix_un     1086.978074
labeled_acc                   0.040728
labeled_avg_prec              0.027969
labeled_brier                 0.026333
labeled_brier_neg             0.083501
labeled_brier_pos             0.009855
labeled_f1                    0.022572
labeled_prec                  0.059693
labeled_recall                0.022181
labeled_roc_auc               0.063812
pr_one_unlabeled              0.009194
pu_mix_assumed_f1beta10       0.949519
pu_score                      0.048419
tn_confusion_matrix_lab      39.803685
tn_confusion_matrix_un     1086.061846
tp_confusion_matrix_lab      12.662280
tp_confusion_matrix_un       12.662280
dtype: float64

In [46]:
# Winning hyper-parameters of each fitted search held in nested_cross.estimators_.
[fitted_search.best_params_ for fitted_search in nested_cross.estimators_]

[{'clf__base_estimator__C': 3.9037130019710689,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 2610,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 1.485616929764805,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 9623,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 2.1671955308749342,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 8386,
  'clf__pu_learning': False}]