In [1]:
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression

In [1]:
from creon.loadcreon import LoadCreon
from creon.creonsklearn.pnuwrapper import PNUWrapper
from creon.creonsklearn.nestedcross import NestedCV
from creon.creonsklearn.jeffsearchcv import JeffRandomSearchCV, extract_score_grid
from creon.creonsklearn.frankenscorer import FrankenScorer, extract_scores_from_nested

ImportError: cannot import name 'extract_score_grid'

In [4]:
path = "C:\Data\\010317\membership14_final_0103.txt"
lc = LoadCreon(path)

## Let's stratify 80/20 train test holdout for future use

In [5]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(lc.X, lc.y, test_size=0.2, random_state=771, stratify=lc.y)

## LASSO pipeline and model on labeled data (PN)

In [8]:
estimators = [('scaler', MaxAbsScaler()),
              ('clf',PNUWrapper(base_estimator=LogisticRegression(penalty='l1', C=2)))]
pipe = Pipeline(estimators)

In [92]:
pipe.fit(X_train.values, y_train.values)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
      num_unlabeled=0.0, pu_learning=False, random_state=None,
      threshold_set_pct=None))])

In [93]:
FrankenScorer()(pipe, X_test.values, y_test.values)

({'SCORE': 0.88551401869158874,
  'assumed_brier': 0.59597031989103544,
  'assumed_brier_neg': 0.59845958947742284,
  'assumed_f1': 0.0093217733505503286,
  'assumed_f1beta10': 0.31197991800940528,
  'confusion_matrix_lab': array([[156,  59],
         [ 39, 379]]),
  'confusion_matrix_un': array([[ 7204, 80518],
         [   39,   379]]),
  'labeled_acc': 0.84518167456556081,
  'labeled_avg_prec': 0.91680337132692624,
  'labeled_brier': 0.112377322679029,
  'labeled_brier_neg': 0.18782771255097075,
  'labeled_brier_pos': 0.07356910779274313,
  'labeled_f1': 0.88551401869158874,
  'labeled_prec': 0.86529680365296802,
  'labeled_recall': 0.90669856459330145,
  'labeled_roc_auc': 0.81613997997106935,
  'pr_one_unlabeled': 0.91945787194167328,
  'pu_score': 0.89570806802864955},
 0.88551401869158874)

In [94]:
imp = pd.DataFrame(pipe.named_steps['clf'].base_estimator.coef_, columns=X_test.columns).T.sort_values(by=0, ascending=False)
imp[imp[0] != 0.0]

Unnamed: 0,0
DIAG_FLAG4_Sum,31.614798
DIAG_FLAG2_Sum,8.195174
DIAG_FLAG71_Sum,5.882953
ndc_cat54_Sum,5.25443
DIAG_FLAG69_Sum,4.939295
ndc_cat61_Sum,4.02993
DIAG_FLAG31_Sum,3.957505
ndc_cat59_Sum,3.950633
ndc_cat87_Sum,3.608643
ndc_cat58_Sum,3.571072


# Let's do nested cross 3x validation on the 80%, although really only 3-fold

In [10]:
nested_lasso = NestedCV(pipe, scoring=FrankenScorer('assumed_f1beta10'), cv=3, random_state=None, use_same_random_state=False)
nested_lasso.score(X_train.values, y_train.values, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   17.2s finished


(0.32460658700734701, 0.32100760974192261, 0.31133719159305517)

In [13]:
extract_scores_from_nested(nested_lasso.test_score_datas_).mean().sort_index()

assumed_brier                                                       0.593542
assumed_brier_neg                                                   0.596041
assumed_f1                                                        0.00951965
assumed_f1beta10                                                    0.318984
confusion_matrix_lab       [[193.333333333, 94.0], [39.3333333333, 517.33...
confusion_matrix_un        [[9332.66666667, 107629.333333], [39.333333333...
fn_confusion_matrix_lab                                              39.3333
fn_confusion_matrix_un                                               39.3333
fp_confusion_matrix_lab                                                   94
fp_confusion_matrix_un                                                107629
labeled_acc                                                         0.842003
labeled_avg_prec                                                    0.911138
labeled_brier                                                       0.116226

## Nested Cross grid search for pipline with PNU wrapper

In [11]:
param_search = {'clf__base_estimator__C': sp.stats.expon(scale=4), 
               'clf__num_unlabeled': sp.stats.randint(low=1000, high=10000),
               'clf__base_estimator__class_weight': [None, 'balanced'],
               'clf__base_estimator__penalty': ['l1','l2'],
               'clf__pu_learning': [True, False]}

In [13]:
jeffsearch = JeffRandomSearchCV(pipe, param_search, n_iter=20, 
                            scoring=FrankenScorer('assumed_f1beta10'), n_jobs=-1, cv=5, verbose=100)

In [14]:
nested_cross = NestedCV(jeffsearch, scoring=FrankenScorer('assumed_f1beta10'), cv=5)

In [15]:
nested_cross.score(X_train.values, y_train.values, n_jobs=-1, verbose=100)

Pickling array (shape=(624,), dtype=uint32).
Memmaping (shape=(352556, 288), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191771704\4404-199370344-d47a562ce4fdf84a63c4c4a1b92e0b10.pkl
Memmaping (shape=(352556,), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191771704\4404-199370344-37e9d47f2d553d0c8c1a28b850ef16f7.pkl
Memmaping (shape=(282044,), dtype=int32) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191771704\4404-199370344-c9c654a2f7eb2665bd3c539dafc8097f.pkl
Pickling array (shape=(70512,), dtype=int32).
Pickling array (shape=(624,), dtype=uint32).
Memmaping (shape=(352556, 288), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191771704\4404-199370344-d47a562ce4fdf84a63c4c4a1b92e0b10.pkl
Memmaping (shape=(352556,), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191

(0.55428519162169576,
 0.57867621577913819,
 0.53450171821305836,
 0.55285273008085756,
 0.54992157741429537)

In [16]:
extract_scores_from_nested(nested_cross.test_score_datas_).mean()

labeled_brier                                         0.299873
assumed_brier                                         0.129452
tn_confusion_matrix_un                                 59952.8
labeled_brier_pos                                     0.170663
labeled_roc_auc                                       0.557804
assumed_f1beta10                                      0.554047
labeled_f1                                            0.718223
labeled_prec                                          0.736439
tp_confusion_matrix_lab                                  240.6
fp_confusion_matrix_lab                                  104.4
fp_confusion_matrix_un                                 10224.4
labeled_avg_prec                                      0.820621
fn_confusion_matrix_un                                    93.4
labeled_recall                                        0.720359
pu_score                                               3.50494
confusion_matrix_lab            [[68.0, 104.4], [93.4, 

In [18]:
[est.best_params_ for est in nested_cross.estimators_]

[{'clf__base_estimator__C': 13.376507992043008,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 4362,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 9.2583371272535455,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 5620,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 5.409346407713377,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 3618,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 4.8954480062572152,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 7026,
  'clf__pu_learning': False},
 {'clf__base_estimator__C': 1.3738660955586304,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 7233,
  'clf__pu_learning': False}]