In [1]:
import pandas as pd
import numpy as np
import scipy as sp
# Raise pandas' display limits so long/wide frames are shown in full
# instead of being truncated in the notebook output.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 400)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to imported project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC

In [3]:
from creon.loadcreon import LoadCreon
from creon.creonsklearn.pnuwrapper import PNUWrapper
from creon.creonsklearn.nestedcross import NestedCV
from creon.creonsklearn.jeffsearchcv import JeffRandomSearchCV, extract_score_grid
from creon.creonsklearn.frankenscorer import FrankenScorer, extract_scores_from_nested
from creon.creonsklearn.blagging import BlaggingClassifier

In [4]:
# Location of the membership extract on the local machine.
# Raw string: the previous non-raw literal relied on the invalid escapes "\D"
# and "\m" being passed through unchanged (a SyntaxWarning on recent Pythons,
# slated to become an error) and needed "\\0" to dodge the "\0" octal escape.
# The resulting value is unchanged.
# NOTE(review): absolute local path — consider a configurable data directory.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Load the dataset through the project's LoadCreon loader. Later cells read
# its `X` and `y` attributes as the inputs to the train/test split.
lc = LoadCreon(path)

## Stratified 80/20 train/test holdout split for future use

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 20% of the rows as a test set, stratified on the target so both
# splits keep the same class proportions. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    stratify=lc.y,
    random_state=771,
)

## SVC pipeline and model on labeled data (PN)

In [7]:
# Two-step pipeline: MaxAbsScaler followed by a linear, class-balanced SVC
# (with probability estimates enabled) wrapped in the project's PNUWrapper.
# PNUWrapper is left at its defaults here.
# NOTE(review): no random_state is passed to SVC or PNUWrapper, so results may
# not be exactly repeatable across runs — confirm whether a seed is wanted.
estimators = [
    ('scaler', MaxAbsScaler()),
    (
        'clf',
        PNUWrapper(
            base_estimator=SVC(
                kernel='linear',
                C=1.0,
                class_weight='balanced',
                probability=True,
            )
        ),
    ),
]
pipe = Pipeline(steps=estimators)

In [8]:
# Fit the scaler + wrapped SVC on the training split. `.values` passes the
# underlying arrays rather than the (presumably pandas) containers.
# The cell displays the fitted pipeline's repr.
pipe.fit(X_train.values, y_train.values)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
      num_unlabeled=0.0, pu_learning=False, random_state=None,
      threshold_set_pct=None))])

In [9]:
# Score the fitted pipeline on the held-out test split with a
# default-constructed FrankenScorer, called as scorer(estimator, X, y).
# The displayed result is a (dict of metrics, single headline score) pair.
FrankenScorer()(pipe, X_test.values, y_test.values)

({'SCORE': 0.9903381642512078,
  'assumed_brier': 0.98987546182088737,
  'assumed_brier_neg': 0.99451180725273935,
  'assumed_f1': 0.009284840798949227,
  'assumed_f1beta10': 0.31928017394254343,
  'confusion_matrix_lab': array([[215,   0],
         [  8, 410]]),
  'confusion_matrix_un': array([[  234, 87488],
         [    8,   410]]),
  'labeled_acc': 0.9873617693522907,
  'labeled_avg_prec': 0.99674973733342409,
  'labeled_brier': 0.012335177411497278,
  'labeled_brier_neg': 0.0034870615501299967,
  'labeled_brier_pos': 0.016886241789951739,
  'labeled_f1': 0.9903381642512078,
  'labeled_prec': 1.0,
  'labeled_recall': 0.98086124401913877,
  'labeled_roc_auc': 0.99043062200956933,
  'pr_one_unlabeled': 0.99978287451289616,
  'pu_score': 0.96473759438047069},
 0.9903381642512078)

## Nested cross-validation with randomized search for the pipeline with PNU wrapper

In [8]:
# Search space for the randomized search. Keys use the pipeline's
# `<step>__<param>` routing: `clf__...` addresses the PNUWrapper step and
# `clf__base_estimator__...` the SVC wrapped inside it.
#   - C: drawn from an exponential distribution with scale 4
#   - num_unlabeled: integer drawn uniformly from [1000, 8000)
#   - the remaining entries are discrete choice lists
# NOTE(review): in sklearn's SVC `gamma` is not used by the 'linear' kernel, so
# draws pairing kernel='linear' with different gammas are redundant — confirm
# whether the space should be split per kernel.
param_search = dict(
    clf__base_estimator__C=sp.stats.expon(scale=4),
    clf__num_unlabeled=sp.stats.randint(low=1000, high=8000),
    clf__base_estimator__class_weight=[None, 'balanced'],
    clf__base_estimator__kernel=['linear', 'rbf'],
    clf__pu_learning=[True, False],
    clf__base_estimator__gamma=['auto', 0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 5.0, 9.0],
)

In [9]:
# Inner randomized search over `param_search`: 20 sampled candidates, each
# evaluated with 3-fold CV and ranked by the 'assumed_f1beta10' FrankenScorer
# metric, using all available cores.
# NOTE(review): no seed is passed, so the sampled candidates may differ between
# runs — confirm whether JeffRandomSearchCV accepts a random_state.
jeffsearch = JeffRandomSearchCV(
    pipe,
    param_search,
    n_iter=20,
    cv=3,
    scoring=FrankenScorer('assumed_f1beta10'),
    n_jobs=-1,
    verbose=100,
)

In [10]:
# Outer 3-fold loop around the randomized search, scored with the same
# 'assumed_f1beta10' metric that the inner search optimizes.
nested_cross = NestedCV(
    jeffsearch,
    scoring=FrankenScorer('assumed_f1beta10'),
    cv=3,
)

In [11]:
# Run the nested procedure on the training split. The log shows the inner
# search being fitted as part of this call, and the displayed result holds one
# score per outer fold. The outer loop runs serially (n_jobs=1).
nested_cross.score(
    X_train.values,
    y_train.values,
    n_jobs=1,
    verbose=100,
)

[CV]  ................................................................
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Memmaping (shape=(235036, 288), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4176_189483440\4176-197259728-3854caa48e64fd7067f49caccec3d748.pkl
Memmaping (shape=(235036,), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4176_189483440\4176-197259728-1921f06a521ddc04a073cc069edefb84.pkl
Pickling array (shape=(156690,), dtype=int32).
Pickling array (shape=(78346,), dtype=int32).
Memmaping (shape=(235036, 288), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4176_189483440\4176-197259728-3854caa48e64fd7067f49caccec3d748.pkl
Memmaping (shape=(235036,), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4176_189483440\4176-197259728-1921f06a521ddc04a073cc069edefb84.pkl
Pickling array (shape=(156691,), dt

(0.52071160482285495, 0.52528292466437376, 0.5122838198099664)

In [26]:
# Average every recorded metric across the outer folds and list them
# alphabetically by metric name.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
    .sort_index()
)

assumed_brier                                                      0.0525847
assumed_brier_neg                                                  0.0511207
assumed_f1                                                         0.0426132
assumed_f1beta10                                                    0.519426
confusion_matrix_lab        [[12.3333333333, 275.0], [183.0, 373.666666667]]
confusion_matrix_un        [[100340.333333, 16621.6666667], [183.0, 373.6...
fn_confusion_matrix_lab                                                  183
fn_confusion_matrix_un                                                   183
fp_confusion_matrix_lab                                                  275
fp_confusion_matrix_un                                               16621.7
labeled_acc                                                         0.457358
labeled_avg_prec                                                    0.732197
labeled_brier                                                       0.398751

In [13]:
# Winning hyper-parameters of the inner search fitted in each outer fold
# (one dict per entry of `nested_cross.estimators_`).
[est.best_params_ for est in nested_cross.estimators_]

[{'clf__base_estimator__C': 5.5924501604319081,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__gamma': 0.1,
  'clf__base_estimator__kernel': 'rbf',
  'clf__num_unlabeled': 6159,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 18.46571573629733,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__gamma': 'auto',
  'clf__base_estimator__kernel': 'linear',
  'clf__num_unlabeled': 3410,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 7.3080519477862032,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__gamma': 1.5,
  'clf__base_estimator__kernel': 'linear',
  'clf__num_unlabeled': 2662,
  'clf__pu_learning': True}]