In [None]:
import os
# Move the working directory one level up; later cells read files via relative
# paths (presumably relative to the project root -- the path may need to be
# adjusted for your setup).
# The start directory is remembered on first execution so that re-running this
# cell does not climb one more directory level each time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
os.getcwd()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, metrics
from scipy.stats import uniform, randint
import xgboost as xgb
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split, cross_validate
#from scripts.utils import own_scorer, calc_scores

## Data

In [None]:
# Load the pipe-separated training data.
df_train = pd.read_csv("data/extended_train.csv", sep="|")

# Class counts of the binary `fraud` target.
sum_frauds = len(df_train[df_train.fraud == 1])
sum_non_frauds = len(df_train[df_train.fraud == 0])

# Target vector and feature matrix (everything except the target column).
df_y = df_train.fraud
df_X = df_train.drop(['fraud'], axis=1)

# Hold out 20% for validation. The split is seeded so that it is reproducible
# across kernel restarts, and stratified so that both parts keep the same
# fraud ratio as the full data set.
train_x, val_x, train_y, val_y = train_test_split(
    df_X, df_y, test_size=0.2, random_state=42, stratify=df_y
)

print("Shape of train data: ", train_x.shape, "Shape of val data: ", val_x.shape)
df_train.head()

## Data with new Features

In [None]:
# Work on a copy: assigning new columns through a plain alias (`df = df_train`)
# would silently add them to the raw frame `df_train` as well.
df = df_train.copy()

# NOTE(review): the ratios below yield inf/NaN wherever a denominator is 0
# -- TODO confirm that the data contains no zero values in these columns.

# Basket size and per-scan averages.
df['totalScannedLineItems'] = df['scannedLineItemsPerSecond'] * df['totalScanTimeInSeconds']
df['avgTimePerScan'] = 1 / df['scannedLineItemsPerSecond']
df['avgValuePerScan'] = df['avgTimePerScan'] * df['valuePerSecond']

# Event counts relative to the number of scanned positions.
df['withoutRegisPerPosition'] = df['scansWithoutRegistration'] / df['totalScannedLineItems']  # equivalent to lineItemVoidsPerPosition?
df['quantiModPerPosition'] = df['quantityModifications'] / df['totalScannedLineItems']

# Event counts relative to the grand total.
df['lineItemVoidsPerTotal'] = df['lineItemVoids'] / df['grandTotal']
df['withoutRegisPerTotal'] = df['scansWithoutRegistration'] / df['grandTotal']
df['quantiModPerTotal'] = df['quantityModifications'] / df['grandTotal']

# Event counts relative to the total scan time.
df['lineItemVoidsPerTime'] = df['lineItemVoids'] / df['totalScanTimeInSeconds']
df['withoutRegisPerTime'] = df['scansWithoutRegistration'] / df['totalScanTimeInSeconds']
df['quantiModPerTime'] = df['quantityModifications'] / df['totalScanTimeInSeconds']

# NOTE(review): algebraically the same quantity as `avgValuePerScan`
# (valuePerSecond / scannedLineItemsPerSecond) -- likely a redundant feature.
df['valuePerScannedLineItem'] = df['valuePerSecond'] / df['scannedLineItemsPerSecond']

# Target and feature matrix of the extended data set.
df1_y = df.fraud
df1_X = df.drop(['fraud'], axis=1)

In [None]:
# Report the class balance of the training data.
print(
    "Frauds: ", sum_frauds,
    "Non Frauds: ", sum_non_frauds,
)

### Scoring Methods

In [None]:
# Metrics evaluated for every candidate of the hyper-parameter search.
# NOTE(review): beta=0.5172 is a magic number whose derivation is not shown
# in this notebook -- TODO document where it comes from.
fbeta_scorer = metrics.make_scorer(metrics.fbeta_score, beta=0.5172)
scoring = {
    'AUC': 'roc_auc',
    'FBeta': fbeta_scorer,
}

In [None]:
# Search space for the randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], and
# randint(low, high) samples integers from [low, high - 1].
params = {
    "max_depth": randint(2, 6),           # 2..5, default 3
    "n_estimators": randint(300, 450),    # 300..449, default 100
    "eta": uniform(0.1, 0.7),             # [0.1, 0.8]
    "gamma": uniform(0, 2),               # [0, 2]
    "min_child_weight": uniform(0, 50),   # [0, 50]
    "max_delta_step": uniform(0, 10),     # Set it to value of 1-10 might help control the update.
    "lambda": uniform(0.1, 2),            # [0.1, 2.1]
    # Either no re-weighting or the ratio of negative to positive samples.
    "scale_pos_weight": [1, sum_non_frauds / sum_frauds],
    "max_bin": randint(200, 300),         # 200..299
}

# Base estimator; the sampled parameters above are set on top of it.
default_xgb = xgb.XGBClassifier(
    booster="gbtree",
    tree_method='gpu_hist',
    disable_default_eval_metric=1,
    objective='binary:logistic',
    eval_metric='aucpr',
    n_jobs=-1,
    verbosity=2,
)

# 10000 sampled candidates x 3 folds = 30000 fits, plus one final refit of the
# candidate with the best 'FBeta' score.
# NOTE(review): the search is fitted on the full df_X / df_y; the train/val
# split created earlier in this notebook is not used here.
search = RandomizedSearchCV(
    default_xgb,
    param_distributions=params,
    n_iter=10000,
    scoring=scoring,
    refit='FBeta',
    cv=3,
    random_state=42,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
search.fit(df_X, df_y)
results = search.cv_results_

In [None]:
# Pay-off per confusion-matrix cell (rows = true class, columns = predicted
# class, label order [0, 1]): true negative 0, false positive -25,
# false negative -5, true positive +5.
DMC_COST_MATRIX = np.array([[0, -25], [-5, 5]])


def _dmc_score(estimator, X_val, ground_truth):
    """Return the summed DMC pay-off of `estimator`'s predictions on `X_val`.

    The confusion matrix is always built for the labels [0, 1]. Without the
    explicit labels a fold containing a single class yields a 1x1 matrix,
    which would broadcast against the 2x2 cost matrix and produce a wrong
    score.
    """
    prediction = estimator.predict(X_val)
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction, labels=[0, 1])
    return np.sum(confusion_matrix * DMC_COST_MATRIX)


def own_scorer_normalized(estimator, X_val, ground_truth):
    """DMC score divided by the number of samples, for usage as scorer.

    Parameters
    ----------
    estimator : fitted object exposing ``predict``
    X_val : features passed to ``estimator.predict``
    ground_truth : true 0/1 labels belonging to ``X_val``

    Returns
    -------
    The average DMC pay-off per sample.
    """
    return _dmc_score(estimator, X_val, ground_truth) / len(ground_truth)


# DMC Score for usage as scorer
def own_scorer(estimator, X_val, ground_truth):
    """Total DMC pay-off of the predictions for ``X_val``.

    Parameters
    ----------
    estimator : fitted object exposing ``predict``
    X_val : features passed to ``estimator.predict``
    ground_truth : true 0/1 labels belonging to ``X_val``

    Returns
    -------
    The sum over all samples of the pay-off defined by ``DMC_COST_MATRIX``.
    """
    return _dmc_score(estimator, X_val, ground_truth)

# Evaluate the refitted best estimator of the search with both DMC scorers
# in a 5-fold cross-validation.
xgbo = search.best_estimator_
scorings = dict(DMC=own_scorer, DMC_Norm=own_scorer_normalized)
res = cross_validate(
    estimator=xgbo,
    X=df_X,
    y=df_y,
    scoring=scorings,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Per-fold output of the 5-fold cross-validation of the best estimator
# (timings and the DMC / DMC_Norm test scores).
res

In [None]:
# Show the ten best candidates (ranked by the 'FBeta' test score) as a table
# instead of dumping the raw cv_results_ dict, which holds one entry per
# sampled candidate.
pd.DataFrame(results).sort_values("rank_test_FBeta").head(10)

In [None]:
# Best configuration found by the randomized search (refit on the 'FBeta' metric).
search.best_estimator_