```
Copyright 2021 IBM Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

# Boosting Machine on Credit Card Fraud Dataset

## Background 

This is a binary classification problem: distinguish fraudulent credit card transactions from legitimate ones. The two classes are highly imbalanced, so the notebook weights examples by class both when training and when scoring.

## Source

Daniel Whiteson daniel '@' uci.edu, Assistant Professor, Physics & Astronomy, Univ. of California Irvine.

In this example, we download the dataset from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets.php).

## Goal
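The goal of this notebook is to illustrate how Snap ML can accelerate training of a boosting machine model on this dataset. It runs the same successive-halving hyper-parameter search for XGBoost and for the Snap ML boosting machine, then compares search time and test score.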

## Code

In [1]:
cd ../../

/Users/tpa/Code/snapml-examples/examples


In [2]:
# Directory (relative to the working directory) passed to the dataset loader as its cache.
CACHE_DIR = "cache-dir"

In [3]:
import numpy as np
import time
from datasets import CreditCardFraud
from xgboost import XGBClassifier
from snapml import BoostingMachineClassifier as SnapBoostingMachineClassifier
from sklearn.metrics import log_loss

In [4]:
# Build the credit card fraud dataset helper, pointing it at the cache directory,
# then take the train/test partition that the helper provides.
dataset = CreditCardFraud(cache_dir=CACHE_DIR)
X_train, X_test, y_train, y_test = dataset.get_train_test_split()

Reading binary CreditCardFraud dataset (cache) from disk.


In [5]:
# Summarize the training split: row count, column count, and number of distinct labels.
print("Number of examples: %d" % X_train.shape[0])
print("Number of features: %d" % X_train.shape[1])
print("Number of classes:  %d" % np.unique(y_train).size)

Number of examples: 213605
Number of features: 28
Number of classes:  2


In [6]:
from sklearn.model_selection import train_test_split, PredefinedSplit

# Hold out 30% of the training rows as one fixed validation fold, so that every
# candidate in the hyper-parameter searches is scored on the same rows.
train_ind, val_ind = train_test_split(range(X_train.shape[0]), test_size=0.3, shuffle=True, random_state=42)

# PredefinedSplit convention: -1 marks rows that are never used for validation,
# a value k >= 0 assigns the row to validation fold k. Rows default to fold 0 and
# the training rows are switched to -1 in a single vectorized assignment.
test_fold = np.zeros(shape=(X_train.shape[0],))
test_fold[train_ind] = -1
splitter = PredefinedSplit(test_fold)

In [7]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.utils import parallel_backend
import pandas as pd

In [8]:
from sklearn.metrics import make_scorer
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

# Class weights of the form n_samples / (2 * count(label)): the summed weight of
# each class is then n_samples / 2, so both classes contribute equally.
class_weights = {
    label: y_train.shape[0] / 2.0 / np.sum(y_train == label)
    for label in (0, 1)
}
print(class_weights)

# Per-example training weights, looked up from each example's class.
w_train = compute_sample_weight(class_weights, y_train)

{0: 0.5008652385150725, 1: 289.43766937669375}


In [9]:
from sklearn.metrics import make_scorer

def weighted_neg_log_loss(y, p):
    """Return the negated, class-weighted log loss of ``p`` against labels ``y``.

    Every example is weighted by the notebook-level ``class_weights`` entry for
    its label. The loss is negated so that larger values mean better predictions.

    Parameters
    ----------
    y : true labels.
    p : predictions; must support ``astype`` and is cast to float64 before scoring.

    Returns
    -------
    The negative of ``log_loss(y, p, sample_weight=...)``.
    """
    per_example_weight = compute_sample_weight(class_weights, y)
    loss = log_loss(y, p.astype(np.float64), sample_weight=per_example_weight)
    return -loss

# Wrap the metric as a scikit-learn scorer that is fed predicted probabilities.
# greater_is_better=True because weighted_neg_log_loss already negates the loss.
# NOTE(review): recent scikit-learn releases deprecate `needs_proba` in favour of
# `response_method="predict_proba"` — confirm against the installed version.
scorer = make_scorer(weighted_neg_log_loss, greater_is_better=True, needs_proba=True)

In [10]:
# Keyword arguments shared by both HalvingRandomSearchCV runs, so that the
# XGBoost and Snap ML searches use the same budget, scorer and split.
sh_params = dict(
    n_candidates=256,          # configurations sampled for the first iteration
    min_resources=16,          # resource budget in the first iteration
    max_resources=1024,        # upper bound on the resource budget
    factor=4,                  # per-iteration scaling factor of the halving schedule
    scoring=scorer,            # negated class-weighted log loss
    random_state=42,
    verbose=40,
    n_jobs=4,                  # fits evaluated in parallel
    cv=splitter,               # the single predefined train/validation split
    return_train_score=False,
)

In [11]:
# Baseline: XGBoost with histogram-based tree construction. Each fit is
# single-threaded (n_jobs=1); parallelism comes from the search's own n_jobs.
clf = XGBClassifier(random_state=42,
                    n_jobs=1,
                    tree_method='hist',
                    use_label_encoder=False,
                    eval_metric='logloss',
                    max_bin=256)

# Candidate values sampled at random by the search. np.linspace yields 50 points
# by default; the `10 ** linspace` entries are therefore log-spaced.
xgb_distributions = {
    "max_depth": range(1, 20),
    "learning_rate": 10 ** np.linspace(-2.5, -1),
    "colsample_bytree": np.linspace(0.5, 1.0),
    "subsample": np.linspace(0.5, 1.0),
    "reg_lambda": 10 ** np.linspace(-2, 2)
}

# The number of boosting rounds (n_estimators) is the resource that the halving
# schedule increases from one iteration to the next.
search = HalvingRandomSearchCV(clf, xgb_distributions, resource='n_estimators', **sh_params)

# Time the whole search. perf_counter is a monotonic clock, so the elapsed time
# is not distorted by system clock adjustments during this multi-minute run.
t0 = time.perf_counter()
with parallel_backend("loky"):
    search.fit(X_train, y_train.astype(np.int32), sample_weight=w_train)
t_fit_xgb = time.perf_counter() - t0

print(search.best_params_)

# Score the model selected by the search on the held-out test set, using the
# same weighted metric as the search; column 1 holds the positive-class probability.
score_xgb = weighted_neg_log_loss(y_test, search.predict_proba(X_test)[:, 1])

# Search time (seconds), best validation score, test score.
print(t_fit_xgb, search.best_score_, score_xgb)

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 16
max_resources_: 1024
aggressive_elimination: False
factor: 4
----------
iter: 0
n_candidates: 256
n_resources: 16
Fitting 1 folds for each of 256 candidates, totalling 256 fits
----------
iter: 1
n_candidates: 64
n_resources: 64
Fitting 1 folds for each of 64 candidates, totalling 64 fits
----------
iter: 2
n_candidates: 16
n_resources: 256
Fitting 1 folds for each of 16 candidates, totalling 16 fits
----------
iter: 3
n_candidates: 4
n_resources: 1024
Fitting 1 folds for each of 4 candidates, totalling 4 fits
{'subsample': 0.6326530612244898, 'reg_lambda': 56.89866029018293, 'max_depth': 1, 'learning_rate': 0.09319395762340775, 'colsample_bytree': 0.7142857142857143, 'n_estimators': 1024}
274.5722529888153 -0.4541671767181547 -0.26632982394058535


In [12]:
# Snap ML boosting machine, single-threaded per fit (n_jobs=1) to mirror the
# XGBoost baseline; parallelism comes from the search's own n_jobs.
clf = SnapBoostingMachineClassifier(random_state=42, n_jobs=1, base_score=None)

# Candidate values sampled at random by the search. np.linspace yields 50 points
# by default; the `10 ** linspace` entries are therefore log-spaced.
# NOTE(review): tree_select_probability, regularizer, fit_intercept, gamma and
# n_components presumably configure the non-tree base learners of the boosting
# machine — confirm against the Snap ML documentation.
snap_distributions = {
    "max_depth": range(1, 20),
    "tree_select_probability": np.linspace(0.9, 1.0),
    "learning_rate": 10 ** np.linspace(-2.5, -1),
    "colsample_bytree": np.linspace(0.5, 1.0),
    "subsample": np.linspace(0.5, 1.0),
    "lambda_l2": 10 ** np.linspace(-2, 2),
    "regularizer": 10 ** np.linspace(-6, 3),
    "fit_intercept": [False, True],
    "gamma": 10 ** np.linspace(-3, 3),
    "n_components": range(1, 100)
}

# The number of boosting rounds (num_round) is the resource that the halving
# schedule increases from one iteration to the next.
search = HalvingRandomSearchCV(clf, snap_distributions, resource='num_round', **sh_params)

# Time the whole search. perf_counter is a monotonic clock, so the elapsed time
# is not distorted by system clock adjustments during this multi-minute run.
t0 = time.perf_counter()
with parallel_backend("loky"):
    search.fit(X_train, y_train, sample_weight=w_train)
t_fit_snapml = time.perf_counter() - t0

print(search.best_params_)

# Score the model selected by the search on the held-out test set, using the
# same weighted metric as the search; column 1 holds the positive-class probability.
score_snapml = weighted_neg_log_loss(y_test, search.predict_proba(X_test)[:, 1])

# Search time (seconds), best validation score, test score.
print(t_fit_snapml, search.best_score_, score_snapml)

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 16
max_resources_: 1024
aggressive_elimination: False
factor: 4
----------
iter: 0
n_candidates: 256
n_resources: 16
Fitting 1 folds for each of 256 candidates, totalling 256 fits




----------
iter: 1
n_candidates: 64
n_resources: 64
Fitting 1 folds for each of 64 candidates, totalling 64 fits
----------
iter: 2
n_candidates: 16
n_resources: 256
Fitting 1 folds for each of 16 candidates, totalling 16 fits
----------
iter: 3
n_candidates: 4
n_resources: 1024
Fitting 1 folds for each of 4 candidates, totalling 4 fits
{'tree_select_probability': 0.9346938775510204, 'subsample': 0.846938775510204, 'regularizer': 6.250551925273976, 'n_components': 58, 'max_depth': 1, 'learning_rate': 0.08094001216083124, 'lambda_l2': 56.89866029018293, 'gamma': 568.9866029018293, 'fit_intercept': True, 'colsample_bytree': 0.5408163265306123, 'num_round': 1024}
275.0033118724823 -0.45589279056285664 -0.25351774172662755


In [17]:
# Ratio of the two search times; a value above 1 means the Snap ML search was faster.
speed_up = t_fit_xgb / t_fit_snapml
# Test-score gap relative to the magnitude of the XGBoost score; positive means
# the Snap ML score is higher (scores are negated losses, so higher is better).
score_diff = (score_snapml - score_xgb) / np.abs(score_xgb)
print("Speed-up:                %.1f x" % speed_up)
print("Relative diff. in score: %.4f" % score_diff)

Speed-up:                1.0 x
Relative diff. in score: 0.0481
