In [5]:
import os
# ggf muss Pfad angepasst werden
os.chdir("{}/..".format(os.getcwd()))
os.getcwd()


'/home/lukas/Projects/dmc2019'

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from scipy.stats import uniform, randint
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split, cross_validate
#from scripts.utils import own_scorer, calc_scores

In [7]:
# Calculate multiple scores
def calc_scores(y_test, y_pred):
    accuracy = metrics.accuracy_score(y_test, y_pred)
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
    f2_score = (0 if all(y_pred == 0) else metrics.fbeta_score(y_test, y_pred, beta=2))
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))

    return accuracy, f2_score, dmc_score/len(y_test), confusion_matrix

# Normalized DMC Score for usage as scorer
def own_scorer_normalized(estimator, X_val, ground_truth):
    prediction = estimator.predict(X_val)
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction)
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))
    return dmc_score/len(ground_truth)

# DMC Score for usage as scorer
def own_scorer(estimator, X_val, ground_truth):
    prediction = estimator.predict(X_val)
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction)
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))
    return dmc_score
# F2 Score for usage as scorer
def own_f2_score(estimator, X_val, ground_truth):
    prediction = estimator.predict(X_val)
    return 0 if all(prediction == 0) else metrics.fbeta_score(ground_truth, prediction, beta=2)


def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


## Loading train data and add feature

In [8]:
df_train = pd.read_csv("data/extended_train.csv", sep="|")
df_y = df_train.fraud
df_X = df_train.drop(['fraud'], axis=1)

train_x, val_x, train_y, val_y = train_test_split(df_X, df_y, test_size=0.2)

print("Shape of train data: ", train_x.shape, "Shape of val data: ", val_x.shape)

Shape of train data:  (1503, 10) Shape of val data:  (376, 10)


## Creating a quantile scaled dataset

In [9]:
df_quant_scaled_y = df_train.fraud
df_wo_frauds = df_train.drop(['fraud'], axis=1)
heads = list(df_wo_frauds.columns.values)

qt = QuantileTransformer(n_quantiles=10, random_state=0)
df_quant_scaled_x = pd.DataFrame(qt.fit_transform(df_wo_frauds), columns=heads)
#df_quant_scaled['fraud'] = df_s_y
train_qs_x, val_qs_x, train_qs_y, val_qs_y = train_test_split(df_quant_scaled_x, df_quant_scaled_y, test_size=0.2)

print("Shape of train data: ", train_qs_x.shape, "Shape of val data: ", val_qs_x.shape)
train_qs_x.head()

Shape of train data:  (1503, 10) Shape of val data:  (376, 10)


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems
547,0.444444,0.243764,0.241806,0.388889,0.7777778,0.6111111,0.821827,0.482265,0.260101,0.777778
1521,1.0,0.120172,0.937788,0.777778,1e-07,0.2222222,0.889656,0.889459,0.489475,0.851852
75,1.0,0.158798,0.925902,0.388889,0.4444444,0.6111111,0.748899,0.889105,0.555556,0.333333
1602,0.444444,0.501301,0.027299,0.888889,0.1111111,1e-07,0.549707,0.027109,0.694026,0.555556
234,0.444444,0.094837,0.205048,0.111111,0.3333333,0.9999999,0.889331,0.779881,0.122261,0.555556


In [10]:
default_xgb = None

In [14]:
default_xgb = xgb.XGBClassifier(n_jobs=6, verbosity=2)
res_train1 = cross_validate(default_xgb, train_x,train_y, scoring=own_scorer, cv=5,n_jobs=-1)['test_score']
res_train2 = cross_validate(default_xgb, train_qs_x,train_qs_y, scoring=own_scorer, cv=5,n_jobs=-1)['test_score']
res_n_train1 = cross_validate(default_xgb, train_x,train_y, scoring=own_scorer_normalized, cv=5,n_jobs=-1)['test_score']
res_n_train2 = cross_validate(default_xgb, train_qs_x,train_qs_y, scoring=own_scorer_normalized, cv=5,n_jobs=-1)['test_score']
print("Original Data: ", res_train1,"Scaled Data: ", res_train2)
print("Original Data: ", res_n_train1,"Scaled Data: ", res_n_train2)

Original Data:  [45 55 20 20 25] Scaled Data:  [ 20 -10   5  65  75]
Original Data:  [0.14900662 0.18272425 0.06666667 0.06666667 0.08333333] Scaled Data:  [ 0.06644518 -0.03322259  0.0166113   0.21666667  0.25      ]


In [15]:
params = {
    "learning_rate": uniform(0.1, 0.7),
    "gamma": uniform(0, 1),
    "max_depth": randint(2, 6),  # default 3
    "n_estimators": randint(300, 450),  # default 100
    "colsample_bytree" : uniform(0.1,0.9)
    
}

In [16]:
default_xgb = xgb.XGBClassifier(booster="gblinear",objective='binary:logistic',n_jobs=6, verbosity=2)
ftwo_scorer = metrics.make_scorer(metrics.fbeta_score, beta=0.5172)
search = RandomizedSearchCV(xgb_model, scoring=ftwo_scorer, param_distributions=params, random_state=42, n_iter=1000,
                            cv=3, verbose=1, n_jobs=-1, return_train_score=True,refit)
search.fit(X_train, y_train)