In [2]:
import os
# The path may need to be adjusted: move the working directory one level up so
# that relative paths such as "data/..." resolve.
# Note: this is not idempotent - every re-run of the cell climbs one more level.
parent_dir = os.path.join(os.getcwd(), os.pardir)
os.chdir(parent_dir)
os.getcwd()


'/home/lukas/Projects/dmc2019'

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, metrics
from scipy.stats import uniform, randint
import xgboost as xgb
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split, cross_validate
#from scripts.utils import own_scorer, calc_scores

## Data

In [4]:
# Load the '|'-separated training data and separate the target column from the features.
df_train = pd.read_csv("data/extended_train.csv", sep="|")

# Class counts; reused further down for reporting and for XGBoost's scale_pos_weight.
sum_frauds, sum_non_frauds  = len(df_train[df_train.fraud == 1]), len(df_train[df_train.fraud == 0])
df_y = df_train.fraud
df_X = df_train.drop(['fraud'], axis=1)

# Hold out 20 % of the rows for validation.
# - random_state makes the split (and everything derived from it) reproducible.
# - stratify keeps the share of the rare fraud class the same in both parts.
train_x, val_x, train_y, val_y = train_test_split(
    df_X, df_y, test_size=0.2, random_state=42, stratify=df_y
)

print("Shape of train data: ", train_x.shape, "Shape of val data: ", val_x.shape)
df_train.head()

Shape of train data:  (1503, 10) Shape of val data:  (376, 10)


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,totalScannedLineItems
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0,29.0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0,14.0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0,13.0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0,29.0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0,27.0


## Data with new Features

In [71]:
# NOTE: `df` is an alias of `df_train`, not a copy - every column added below
# also appears on `df_train`.
df = df_train

# Features derived from the scan rate.
scan_rate = df['scannedLineItemsPerSecond']
df['totalScannedLineItems'] = scan_rate * df['totalScanTimeInSeconds']
df['avgTimePerScan'] = 1 / scan_rate
df['avgValuePerScan'] = df['avgTimePerScan'] * df['valuePerSecond']

# Plain ratio features: new column -> (numerator column, denominator column).
# The insertion order determines the column order of the resulting frame.
ratio_features = {
    # open question from the author: equivalent to lineItemVoidsPerPosition?
    'withoutRegisPerPosition': ('scansWithoutRegistration', 'totalScannedLineItems'),
    'quantiModPerPosition': ('quantityModifications', 'totalScannedLineItems'),
    'lineItemVoidsPerTotal': ('lineItemVoids', 'grandTotal'),
    'withoutRegisPerTotal': ('scansWithoutRegistration', 'grandTotal'),
    'quantiModPerTotal': ('quantityModifications', 'grandTotal'),
    'lineItemVoidsPerTime': ('lineItemVoids', 'totalScanTimeInSeconds'),
    'withoutRegisPerTime': ('scansWithoutRegistration', 'totalScanTimeInSeconds'),
    'quantiModPerTime': ('quantityModifications', 'totalScanTimeInSeconds'),
    # NOTE(review): algebraically the same ratio as avgValuePerScan above
    # (valuePerSecond / scannedLineItemsPerSecond).
    'valuePerScannedLineItem': ('valuePerSecond', 'scannedLineItemsPerSecond'),
}
for new_col, (num_col, den_col) in ratio_features.items():
    df[new_col] = df[num_col] / df[den_col]

# Target and feature matrix including the engineered columns.
df1_y = df['fraud']
df1_X = df.drop(columns=['fraud'])

In [49]:
# Report the class balance counted in the data-loading cell.
print("Frauds: ", sum_frauds, "Non Frauds: ", sum_non_frauds)

Frauds:  104 Non Frauds:  1775


### Scoring Methods

In [6]:
# F-beta with beta < 1 weights precision more heavily than recall.
# NOTE(review): 0.5172 is a magic number - presumably derived from the DMC
# pay-off scheme; confirm and document its origin.
fbeta_scorer = metrics.make_scorer(metrics.fbeta_score, beta=0.5172)

# Metrics evaluated for every candidate of the hyper-parameter search.
scoring = {'AUC': 'roc_auc', 'FBeta': fbeta_scorer}

In [19]:
# Search space for the randomized search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
# NOTE(review): "eta" and "lambda" are native XGBoost parameter names; the
# scikit-learn wrapper's own names are learning_rate / reg_lambda - confirm
# they are picked up as intended.
params = {
    "max_depth": randint(2, 6),  # default 3
    "n_estimators": randint(300, 450),  # default 100
    "eta": uniform(0.1, 0.7),  # i.e. 0.1 .. 0.8
    "gamma": uniform(0, 2),
    "min_child_weight": uniform(0, 50),
    "max_delta_step": uniform(0, 10),  # Set it to value of 1-10 might help control the update.
    "lambda": uniform(0.1, 2),  # i.e. 0.1 .. 2.1
    # either no re-weighting, or the ratio of non-fraud to fraud rows
    "scale_pos_weight": [1, sum_non_frauds / sum_frauds],
    "max_bin": randint(200, 300),
}

# Base estimator; the sampled parameters above are set on clones of it.
default_xgb = xgb.XGBClassifier(
    booster="gbtree",
    tree_method='gpu_hist',
    disable_default_eval_metric=1,
    objective='binary:logistic',
    eval_metric='aucpr',
    n_jobs=6,
    verbosity=2,
)

# 10 sampled candidates x 3 folds; the candidate with the best mean FBeta is
# refitted and exposed as search.best_estimator_.
search = RandomizedSearchCV(
    estimator=default_xgb,
    param_distributions=params,
    n_iter=10,
    scoring=scoring,
    refit='FBeta',
    cv=3,
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Fitted on the complete data set (df_X / df_y), not on the train split.
search.fit(df_X, df_y)
results = search.cv_results_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.8s finished


In [20]:
# Score contribution of each confusion-matrix cell, indexed
# [true label, predicted label] for the label order (0, 1), where 1 marks a fraud:
#   true 0 / predicted 0:   0      true 0 / predicted 1: -25
#   true 1 / predicted 0:  -5      true 1 / predicted 1:  +5
DMC_COST_MATRIX = np.array([[0, -25], [-5, 5]])


def _dmc_score(estimator, X_val, ground_truth):
    """Return the summed DMC score of ``estimator``'s predictions on ``X_val``.

    Shared implementation behind ``own_scorer`` and ``own_scorer_normalized``.
    """
    prediction = estimator.predict(X_val)
    # labels=[0, 1] pins the result to a 2x2 matrix. Without it, data in which
    # only one class occurs (in both truth and prediction) yields a 1x1 matrix
    # that silently broadcasts against the cost matrix and gives a wrong score.
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction, labels=[0, 1])
    return np.sum(confusion_matrix * DMC_COST_MATRIX)


def own_scorer_normalized(estimator, X_val, ground_truth):
    """DMC score divided by the number of samples, usable as a scikit-learn scorer.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict``.
    X_val : features to predict on.
    ground_truth : true 0/1 labels belonging to ``X_val``.

    Returns
    -------
    The summed DMC score divided by ``len(ground_truth)``, which makes folds of
    different size comparable.
    """
    return _dmc_score(estimator, X_val, ground_truth) / len(ground_truth)


# DMC Score for usage as scorer
def own_scorer(estimator, X_val, ground_truth):
    """Summed DMC score of the predictions, usable as a scikit-learn scorer.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict``.
    X_val : features to predict on.
    ground_truth : true 0/1 labels belonging to ``X_val``.

    Returns
    -------
    Sum over the confusion matrix weighted by ``DMC_COST_MATRIX``.
    """
    return _dmc_score(estimator, X_val, ground_truth)

# Re-evaluate the best configuration from the randomized search with both DMC scorers.
scorings = {"DMC" : own_scorer, "DMC_Norm" : own_scorer_normalized}
xgbo = search.best_estimator_
# return_train_score is requested explicitly: scikit-learn >= 0.21 defaults it to
# False, and the train_DMC / train_DMC_Norm entries are only reported when it is on.
res = cross_validate(xgbo, df_X, df_y, scoring=scorings, cv=5, n_jobs=-1,
                     return_train_score=True)

In [21]:
# Per-fold timings and DMC scores returned by cross_validate.
res



{'fit_time': array([0.97972083, 0.999892  , 1.00132322, 0.98844695, 0.96589613]),
 'score_time': array([0.00831389, 0.00643206, 0.00542617, 0.00634503, 0.00897479]),
 'test_DMC': array([-70, -30,  30,  20,  55]),
 'test_DMC_Norm': array([-0.18617021, -0.07978723,  0.07978723,  0.05319149,  0.14666667]),
 'train_DMC': array([190, 140, 190, 240, 320]),
 'train_DMC_Norm': array([0.12641384, 0.09314704, 0.12641384, 0.15968064, 0.21276596])}

In [22]:
# Complete cv_results_ dictionary of the randomized search.
results

{'mean_fit_time': array([1.65201481, 1.62268567, 1.76170095, 2.62661489, 1.40069795,
        1.81090871, 2.17810178, 1.37819846, 1.32529147, 1.06996719]),
 'mean_score_time': array([0.01336797, 0.0132997 , 0.01412503, 0.01442782, 0.01387755,
        0.01476526, 0.01299334, 0.01146142, 0.00770036, 0.00590452]),
 'mean_test_AUC': array([0.99654747, 0.9956154 , 0.96381256, 0.99829959, 0.99535658,
        0.99697113, 0.99839667, 0.99594602, 0.9968931 , 0.99779805]),
 'mean_test_FBeta': array([0.85629233, 0.71191035, 0.75662265, 0.84614022, 0.70670032,
        0.87588566, 0.851698  , 0.72341712, 0.75995774, 0.84383849]),
 'mean_train_AUC': array([0.99965954, 0.99831877, 0.9767591 , 1.        , 0.99772014,
        0.99988578, 1.        , 0.99880411, 0.99964534, 1.        ]),
 'mean_train_FBeta': array([0.97123925, 0.74744778, 0.79580985, 0.94313573, 0.71981586,
        0.98335283, 0.92742036, 0.74302401, 0.78754933, 0.99254765]),
 'param_eta': masked_array(data=[0.3621780831931537, 0.7063233