In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import xgboost as xgb


sns.set(style="whitegrid")

In [2]:
df_train = pd.read_csv("../../data/extended_train.csv", sep="|")

In [3]:
def score_function(y_test, y_pred):
  accuracy = metrics.accuracy_score(y_test, y_pred)
  confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
  f2_score = (0 if all(y_pred == 0) else metrics.fbeta_score(y_test, y_pred, beta=2) )
  dmc_score =  np.sum(confusion_matrix *np.array([[0, -25],[ -5, 5]]))
  
  return accuracy, f2_score, dmc_score, confusion_matrix

In [4]:
df_cpy = df_train.copy()
scaler = StandardScaler()
df_cpy[['trustLevel','totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids', 'scansWithoutRegistration', 'quantityModifications', 'scannedLineItemsPerSecond', 'valuePerSecond', 'lineItemVoidsPerPosition', 'totalScannedLineItems']] = scaler.fit_transform(df_cpy[['trustLevel','totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids', 'scansWithoutRegistration', 'quantityModifications', 'scannedLineItemsPerSecond', 'valuePerSecond', 'lineItemVoidsPerPosition', 'totalScannedLineItems']])
y = df_cpy.fraud
X = df_cpy.drop(['fraud'], axis=1)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [5]:
X.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems
0,0.935189,0.229898,0.132567,0.443466,-1.562415,0.280068,-0.109983,-0.12067,-0.379855,1.564236
1,-0.235121,-1.554996,-0.812391,-0.136202,-0.925241,0.870031,0.256761,0.041543,-0.292611,-0.158893
2,-0.235121,1.10159,0.390409,-0.71587,1.623453,1.459994,-0.178001,-0.129443,-0.387851,-0.273769
3,1.520344,1.620455,1.43249,0.7333,-0.288068,0.870031,-0.150646,-0.120957,-0.353867,1.564236
4,0.935189,-0.947453,1.059898,-0.71587,0.667693,-0.309895,0.016712,-0.009777,-0.478031,1.334485


In [6]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: fraud, dtype: int64

In [7]:
xgb_model2 = xgb.XGBClassifier(nthread=8, objective="binary:logistic", random_state=42,eval_metric="auc")
params = {
    "eta": uniform(0.1, 0.7),
    "gamma": uniform(0, 1),
    "learning_rate": uniform(0.03, 0.3),  # default 0.1
    "max_depth": randint(2, 8),  # default 3
    "n_estimators": randint(100, 400),  # default 100
    "subsample": uniform(0.6, 0.4),
    "lambda" : uniform(1e-3, 1),
"colsample_bytree" : uniform(0.1,0.9)
}

In [8]:
def display_scores(scores):
    print("Scores: {0}\nMean: {1:.3f}\nStd: {2:.3f}".format(scores, np.mean(scores), np.std(scores)))



def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [9]:
ftwo_scorer = metrics.make_scorer(metrics.fbeta_score, beta=3)
search = RandomizedSearchCV(xgb_model2, scoring=ftwo_scorer, param_distributions=params, random_state=42, n_iter=200,
                            cv=3, verbose=1, n_jobs=12, return_train_score=True)
search.fit(X, y)

report_best_scores(search.cv_results_, 3)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  7.0min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed: 16.3min


Model with rank: 1
Mean validation score: 0.871 (std: 0.064)
Parameters: {'learning_rate': 0.18176660366533978, 'subsample': 0.8904382344249603, 'gamma': 0.1533514031160802, 'max_depth': 2, 'eta': 0.7630983377282706, 'colsample_bytree': 0.5158052232797194, 'lambda': 0.5872298320167972, 'n_estimators': 210}

Model with rank: 1
Mean validation score: 0.871 (std: 0.064)
Parameters: {'learning_rate': 0.10446793282714556, 'subsample': 0.7675701798018192, 'gamma': 0.03284667024343835, 'max_depth': 2, 'eta': 0.510947756895597, 'colsample_bytree': 0.9266092524453711, 'lambda': 0.9137494628791288, 'n_estimators': 279}

Model with rank: 3
Mean validation score: 0.870 (std: 0.042)
Parameters: {'learning_rate': 0.17543351014510383, 'subsample': 0.8966747776914662, 'gamma': 0.4710654526938144, 'max_depth': 2, 'eta': 0.7897571185692143, 'colsample_bytree': 0.6091989579699044, 'lambda': 0.18310713882158325, 'n_estimators': 113}



[Parallel(n_jobs=12)]: Done 600 out of 600 | elapsed: 22.7min finished


In [10]:
ftwo_scorer = metrics.make_scorer(metrics.fbeta_score, beta=2)
search = RandomizedSearchCV(xgb_model2, scoring=ftwo_scorer, param_distributions=params, random_state=42, n_iter=200,
                            cv=3, verbose=1, n_jobs=12, return_train_score=True)
search.fit(X, y)

report_best_scores(search.cv_results_, 3)


Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  1.2min
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  7.0min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed: 16.3min


Model with rank: 1
Mean validation score: 0.877 (std: 0.054)
Parameters: {'learning_rate': 0.18176660366533978, 'subsample': 0.8904382344249603, 'gamma': 0.1533514031160802, 'max_depth': 2, 'eta': 0.7630983377282706, 'colsample_bytree': 0.5158052232797194, 'lambda': 0.5872298320167972, 'n_estimators': 210}

Model with rank: 1
Mean validation score: 0.877 (std: 0.054)
Parameters: {'learning_rate': 0.10446793282714556, 'subsample': 0.7675701798018192, 'gamma': 0.03284667024343835, 'max_depth': 2, 'eta': 0.510947756895597, 'colsample_bytree': 0.9266092524453711, 'lambda': 0.9137494628791288, 'n_estimators': 279}

Model with rank: 3
Mean validation score: 0.876 (std: 0.034)
Parameters: {'learning_rate': 0.17543351014510383, 'subsample': 0.8966747776914662, 'gamma': 0.4710654526938144, 'max_depth': 2, 'eta': 0.7897571185692143, 'colsample_bytree': 0.6091989579699044, 'lambda': 0.18310713882158325, 'n_estimators': 113}



[Parallel(n_jobs=12)]: Done 600 out of 600 | elapsed: 22.8min finished
