In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import xgboost as xgb

# Load data

In [2]:
# Read the four pipe-delimited CSV files; the paths are listed in the same
# order as the names they are unpacked into.
df_train, df_train_val, df_train_complete, df_test = (
    pd.read_csv(csv_path, sep="|")
    for csv_path in (
        "../data/train_new.csv",
        "../data/val_new.csv",
        "../data/train.csv",
        "../data/test.csv",
    )
)
df_train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,4,828,66.56,7,4,3,0.007246,0.080386,1.166667,0
1,1,1612,31.34,2,4,3,0.008685,0.019442,0.142857,0
2,3,848,52.37,2,4,0,0.022406,0.061757,0.105263,0
3,1,321,76.03,8,7,2,0.071651,0.236854,0.347826,0
4,1,660,6.06,3,7,1,0.027273,0.009182,0.166667,0


In [15]:
# Separate the "fraud" column (labels) from all remaining columns (values),
# first for the training frame and then for the validation frame.
df_train_labels, df_train_values = (
    df_train["fraud"],
    df_train.drop("fraud", axis="columns"),
)
df_train_val_labels, df_train_val_values = (
    df_train_val["fraud"],
    df_train_val.drop("fraud", axis="columns"),
)

# Create scoring functions

In [4]:
# Normalized DMC score for usage as scorer
def own_scorer_normalized(estimator, X_val, ground_truth):
    """Return the DMC score of ``estimator`` on ``X_val`` per validation sample.

    Each confusion-matrix cell (rows = true class, columns = predicted class,
    classes ordered 0 then 1) is weighted as follows:

    * true 0 / predicted 0 ->   0
    * true 0 / predicted 1 -> -25
    * true 1 / predicted 0 ->  -5
    * true 1 / predicted 1 ->  +5

    Parameters
    ----------
    estimator : fitted object exposing ``predict``
    X_val : samples passed to ``estimator.predict``
    ground_truth : true labels for ``X_val``
        (assumed to be encoded as 0/1, as in the loaded "fraud" column)

    Returns
    -------
    The weighted sum of the confusion matrix divided by ``len(ground_truth)``.
    """
    prediction = estimator.predict(X_val)
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # split in which only one class occurs yields a 1x1 matrix that silently
    # broadcasts against the 2x2 cost matrix and produces a wrong score.
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction, labels=[0, 1])
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))
    return dmc_score / len(ground_truth)

# DMC score for usage as scorer
def own_scorer(estimator, X_val, ground_truth):
    """Return the (unnormalized) DMC score of ``estimator`` on ``X_val``.

    Each confusion-matrix cell (rows = true class, columns = predicted class,
    classes ordered 0 then 1) is weighted as follows:

    * true 0 / predicted 0 ->   0
    * true 0 / predicted 1 -> -25
    * true 1 / predicted 0 ->  -5
    * true 1 / predicted 1 ->  +5

    Parameters
    ----------
    estimator : fitted object exposing ``predict``
    X_val : samples passed to ``estimator.predict``
    ground_truth : true labels for ``X_val``
        (assumed to be encoded as 0/1, as in the loaded "fraud" column)

    Returns
    -------
    The weighted sum of the confusion matrix.
    """
    prediction = estimator.predict(X_val)
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # split in which only one class occurs yields a 1x1 matrix that silently
    # broadcasts against the 2x2 cost matrix and produces a wrong score.
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction, labels=[0, 1])
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))
    return dmc_score

# Classification function

In [33]:
def classify(classifier, x_train, y_train, x_validation, y_validation):
    """Fit ``classifier`` on the training data and evaluate it on the validation data.

    Parameters
    ----------
    classifier : estimator exposing ``fit``, ``predict`` and ``predict_proba``
    x_train, y_train : samples and labels passed to ``classifier.fit``
    x_validation, y_validation : samples and labels used for scoring

    Returns
    -------
    tuple
        ``(dmc_score, dmc_score_normalized, probabilities, predictions)`` where
        the two scores come from ``own_scorer`` / ``own_scorer_normalized`` and
        the last two entries are the outputs of ``predict_proba`` and
        ``predict`` on ``x_validation``.
    """
    classifier.fit(x_train, y_train)

    # Tuple elements are evaluated left to right, so the scorers run before
    # predict_proba and predict.
    return (
        own_scorer(classifier, x_validation, y_validation),
        own_scorer_normalized(classifier, x_validation, y_validation),
        classifier.predict_proba(x_validation),
        classifier.predict(x_validation),
    )
    

# Train and evaluate classifiers

## XGBoost classifier

In [54]:
# NOTE(review): several arguments below look redundant or contradictory. They
# are kept exactly as they were (same values, same order) so the fitted model
# does not change; please confirm and clean up:
#   * `eta` and `learning_rate` are documented as aliases in the XGBoost
#     parameter reference but are given different values here.
#   * `nthread` / `n_jobs` and `seed` / `random_state` are passed in pairs.
#   * `silent=True` is passed together with `verbosity=2`.
classifier = xgb.XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=1,
    disable_default_eval_metric=1,
    eta=0.1769182150877735,
    eval_metric='aucpr',
    gamma=1.8285912697052542,
    learning_rate=0.1,
    max_bin=254,
    max_delta_step=7.2556696256684035,
    max_depth=3,
    min_child_weight=1.0317712458399741,
    missing=None,
    n_estimators=445,
    n_jobs=-1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=True,
    subsample=1,
    tree_method='hist',
    verbosity=2,
)

# Fit on the training split and score on the validation split.
dmc_score, dmc_score_normalized, probabilities, predictions = classify(
    classifier,
    df_train_values,
    df_train_labels,
    df_train_val_values,
    df_train_val_labels,
)

# Sanity check: the three lengths printed here are expected to be equal.
print(len(probabilities[:,0]))
print(len(predictions))
print(len(df_train_val_labels))

# Per-sample overview: both class probabilities, the predicted class and the
# true label. The frame takes its index from the `df_train_val_labels` Series.
df = pd.DataFrame({'probability_non_fraud':probabilities[:,0], 'probability_fraud':probabilities[:,1], 'predictions':predictions, 'labels':df_train_val_labels})

# Show the first 30 validation rows that were predicted as class 1.
df[df.predictions == 1].head(30)

376
376
376


Unnamed: 0,probability_non_fraud,probability_fraud,predictions,labels
28,0.308322,0.691678,1,0
34,0.497428,0.502572,1,0
51,0.460006,0.539994,1,1
87,0.39204,0.60796,1,0
113,0.393745,0.606255,1,1
135,0.080839,0.919161,1,1
137,0.058851,0.941149,1,1
200,0.233792,0.766208,1,1
210,0.450932,0.549068,1,0
262,0.170585,0.829415,1,1


## 