In [1]:
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [1]:
from creon.loadcreon import LoadCreon, load_search, save_search
from creon.creonsklearn.pnuwrapper import PNUWrapper
from creon.creonsklearn.nestedcross import NestedCV
from creon.creonsklearn.jeffsearchcv import JeffRandomSearchCV, extract_score_grid
from creon.creonsklearn.frankenscorer import FrankenScorer, extract_scores_from_nested
from creon.creonsklearn.blagging import BlaggingClassifier

In [4]:
path = "C:\Data\\010317\membership14_final_0103.txt"
lc = LoadCreon(path)

## Let's stratify 80/20 train test holdout for future use

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(lc.X, lc.y, test_size=0.2, random_state=771, stratify=lc.y)

## Blagging with Logistic Regressor from good grid search

In [7]:
estimators = [('scaler', MaxAbsScaler()),
              ('clf',PNUWrapper(base_estimator=BlaggingClassifier(LogisticRegression(penalty='l1', class_weight='balanced', C=9.0),
                                                   n_estimators=100, max_samples=10000, sample_imbalance=0.237,
                                                   bootstrap=True, n_jobs=-1, verbose=100),
                 num_unlabeled=1.0, random_state=None))]
pipe = Pipeline(estimators)

In [8]:
pipe.fit(X_train.values, y_train.values)

Pickling array (shape=(100,), dtype=int32).
Pickling array (shape=(2,), dtype=int64).
Memmaping (shape=(352556, 288), dtype=float64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_3704_189479456\3704-197206088-ba6e3f67a1fe87184b38113f415f36e9.pkl
Memmaping (shape=(352556,), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_3704_189479456\3704-197206088-9404432e1fced3ef7c8ae52f549dbd24.pkl
Pickling array (shape=(25,), dtype=int32).
Pickling array (shape=(100,), dtype=int32).
Pickling array (shape=(2,), dtype=int64).
Memmaping (shape=(352556, 288), dtype=float64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_3704_189479456\3704-197206088-ba6e3f67a1fe87184b38113f415f36e9.pkl
Memmaping (shape=(352556,), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_3704_189479456\3704-197206088-9404432e1fced3ef7c8ae52f549dbd24.pkl
Pickling array (shape=(25,), dtype=int32)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=BlaggingClassifier(base_estimator=LogisticRegression(C=9.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_...lse),
      num_unlabeled=1.0, pu_learning=False, random_state=None,
      threshold_set_pct=None))])

In [9]:
FrankenScorer()(pipe, X_test.values, y_test.values)

Pickling array (shape=(1,), dtype=int32).
Pickling array (shape=(1, 288), dtype=float64).
Pickling array (shape=(1,), dtype=float64).
Pickling array (shape=(2,), dtype=int64).
Pickling array (shape=(1,), dtype=int32).
Pickling array (shape=(1, 288), dtype=float64).
Pickling array (shape=(1,), dtype=float64).
Pickling array (shape=(2,), dtype=int64).
Pickling array (shape=(1,), dtype=int32).
Pickling array (shape=(1, 288), dtype=float64).
Pickling array (shape=(1,), dtype=float64).
Pickling array (shape=(2,), dtype=int64).
Pickling array (shape=(1,), dtype=int32).
Pickling array (shape=(1, 288), dtype=float64).
Pickling array (shape=(1,), dtype=float64).
Pickling array (shape=(2,), dtype=int64).
Pickling array (shape=(1,), dtype=int32).
Pickling array (shape=(1, 288), dtype=float64).
Pickling array (shape=(1,), dtype=float64).
Pickling array (shape=(2,), dtype=int64).
Pickling array (shape=(1,), dtype=int32).
Pickling array (shape=(1, 288), dtype=float64).
Pickling array (shape=(1,), dt

({'SCORE': 0.78669275929549909,
  'assumed_brier': 0.55346401816285584,
  'assumed_brier_neg': 0.55596226600824927,
  'assumed_f1': 0.011889797548098963,
  'assumed_f1beta10': 0.37248516095887274,
  'confusion_matrix_lab': array([[ 13, 202],
         [ 16, 402]]),
  'confusion_matrix_un': array([[20921, 66801],
         [   16,   402]]),
  'labeled_acc': 0.65560821484992104,
  'labeled_avg_prec': 0.82628093162049043,
  'labeled_brier': 0.31115868109582762,
  'labeled_brier_neg': 0.8593803862101721,
  'labeled_brier_pos': 0.029178617460459088,
  'labeled_f1': 0.78669275929549909,
  'labeled_prec': 0.66556291390728473,
  'labeled_recall': 0.96172248803827753,
  'labeled_roc_auc': 0.51109380215867362,
  'pr_one_unlabeled': 0.76107054292799436,
  'pu_score': 1.2130645967000115},
 0.78669275929549909)

# The BELOW sections don't work yet, just copied

## Nested Cross grid search for pipline with PNU wrapper

In [11]:
param_search = {'clf__base_estimator__C': sp.stats.expon(scale=4), 
               'clf__num_unlabeled': sp.stats.randint(low=1000, high=10000),
               'clf__base_estimator__class_weight': [None, 'balanced'],
               'clf__base_estimator__penalty': ['l1','l2'],
               'clf__pu_learning': [True, False]}

In [13]:
jeffsearch = JeffRandomSearchCV(pipe, param_search, n_iter=20, 
                            scoring=FrankenScorer('assumed_f1beta10'), n_jobs=-1, cv=5, verbose=100)

In [14]:
nested_cross = NestedCV(jeffsearch, scoring=FrankenScorer('assumed_f1beta10'), cv=5)

In [15]:
nested_cross.score(X_train.values, y_train.values, n_jobs=-1, verbose=100)

Pickling array (shape=(624,), dtype=uint32).
Memmaping (shape=(352556, 288), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191771704\4404-199370344-d47a562ce4fdf84a63c4c4a1b92e0b10.pkl
Memmaping (shape=(352556,), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191771704\4404-199370344-37e9d47f2d553d0c8c1a28b850ef16f7.pkl
Memmaping (shape=(282044,), dtype=int32) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191771704\4404-199370344-c9c654a2f7eb2665bd3c539dafc8097f.pkl
Pickling array (shape=(70512,), dtype=int32).
Pickling array (shape=(624,), dtype=uint32).
Memmaping (shape=(352556, 288), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191771704\4404-199370344-d47a562ce4fdf84a63c4c4a1b92e0b10.pkl
Memmaping (shape=(352556,), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_4404_191

(0.55428519162169576,
 0.57867621577913819,
 0.53450171821305836,
 0.55285273008085756,
 0.54992157741429537)

In [16]:
extract_scores_from_nested(nested_cross.test_score_datas_).mean()

labeled_brier                                         0.299873
assumed_brier                                         0.129452
tn_confusion_matrix_un                                 59952.8
labeled_brier_pos                                     0.170663
labeled_roc_auc                                       0.557804
assumed_f1beta10                                      0.554047
labeled_f1                                            0.718223
labeled_prec                                          0.736439
tp_confusion_matrix_lab                                  240.6
fp_confusion_matrix_lab                                  104.4
fp_confusion_matrix_un                                 10224.4
labeled_avg_prec                                      0.820621
fn_confusion_matrix_un                                    93.4
labeled_recall                                        0.720359
pu_score                                               3.50494
confusion_matrix_lab            [[68.0, 104.4], [93.4, 

In [18]:
[est.best_params_ for est in nested_cross.estimators_]

[{'clf__base_estimator__C': 13.376507992043008,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 4362,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 9.2583371272535455,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 5620,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 5.409346407713377,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 3618,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 4.8954480062572152,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 7026,
  'clf__pu_learning': False},
 {'clf__base_estimator__C': 1.3738660955586304,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 7233,
  'clf__pu_learning': False}]