In [3]:
import time
import warnings

import numpy as np
import pandas as pd
from IPython.display import display
from scipy.stats import uniform
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import NearestNeighbors, KDTree
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier

from scores import scores
warnings.filterwarnings("ignore")

### 4.) Import Data

In [4]:
# Execute the shared utils notebook inside this kernel.
# NOTE(review): `DataTransformer` and `test_classification`, which are used
# further down, are not defined in this notebook — presumably they come from
# here; TODO confirm.
%run ../notebooks/utils.ipynb

In [5]:
# Load the raw data. All files are pipe-separated; each file is parsed only once
# and additional variants are derived as independent copies.

# Labelled pool, split 75/25 (fixed seed) into the `train_*` and `knn_*` parts.
trainandknn_Xy_original_df = pd.read_csv("../data/train_new.csv", sep="|")
train_Xy_original_df, knn_Xy_original_df = train_test_split(
    trainandknn_Xy_original_df, train_size=0.75, random_state=42
)

# The same pool without the train/knn split. A copy keeps it independent of
# `trainandknn_Xy_original_df`, exactly as a second read of the file would.
train_Xy_wo_knn_df = trainandknn_Xy_original_df.copy()

# Full (unlabelled) test set, plus a short prefix of it.
test_final_X_df = pd.read_csv("../data/test.csv", sep="|")
# TODO: For faster testing we use less data from the test set
test_X_original_df = test_final_X_df.iloc[0:500].copy()

# Only for test routines
val_Xy_original_df = pd.read_csv("../data/val_new.csv", sep="|")

# Complete training file, restricted to rows with trustLevel <= 2.
train_complete_Xy_original_df = (
    pd.read_csv("../data/train.csv", sep="|")
    .loc[lambda frame: frame.trustLevel <= 2]
)

In [6]:
#convention for variables names: datasetname_columntype_transformstatus_dataframeornot
train_y_original_df = train_Xy_original_df[["fraud"]].copy()
train_X_original_df = train_Xy_original_df.copy().drop("fraud", axis=1)

knn_y_original_df = knn_Xy_original_df[["fraud"]].copy()
knn_X_original_df = knn_Xy_original_df.copy().drop("fraud", axis=1)

# Only for test routie#nes
val_y_originial_df = val_Xy_original_df[["fraud"]].copy()
val_X_originial_df = val_Xy_original_df.copy().drop("fraud", axis=1)

train_y_wo_knn_df = train_Xy_wo_knn_df[["fraud"]].copy()
train_X_wo_knn_df = train_Xy_wo_knn_df.copy().drop("fraud", axis=1)

train_complete_y_originial_df = train_complete_Xy_original_df[["fraud"]].copy()
train_complete_X_originial_df = train_complete_Xy_original_df.copy().drop("fraud", axis=1)

In [7]:
# Feature engineering and scaling for every data set used in this notebook.
# `DataTransformer` is not defined here — presumably it is provided by the
# utils notebook that was run earlier; TODO confirm.
# (MinMaxScaler() is the alternative scaler that was tried here.)
scaler = StandardScaler()
transformer = DataTransformer(scaler)

# Adding new Features to train and test set
train_X_unscaled_df = transformer.add_features(train_X_original_df)
test_X_unscaled_df = transformer.add_features(test_X_original_df)
knn_X_unscaled_df = transformer.add_features(knn_X_original_df)

# Test rows with trustLevel <= 2 — the same restriction that was applied to
# the complete training frame when it was loaded.
test_X_lt = test_X_unscaled_df[test_X_unscaled_df.trustLevel <= 2]

val_X_unscaled_df = transformer.add_features(val_X_originial_df)
train_complete_X_unscaled_df = transformer.add_features(train_complete_X_originial_df)

# Fit the scaler on the complete training features plus the filtered test rows.
# Both inputs already carry the engineered features, so they are not passed
# through `add_features` a second time. `pd.concat` replaces
# `DataFrame.append`, which no longer exists in pandas >= 2.0.
# NOTE(review): the scaler statistics include (unlabelled) test rows — confirm
# this is intended.
scaler_fit_X_df = pd.concat([train_complete_X_unscaled_df, test_X_lt], sort=False)
transformer.fit_scaler(scaler_fit_X_df)

train_X_scaled_df = transformer.apply_scaler(train_X_unscaled_df)
knn_X_scaled_df = transformer.apply_scaler(knn_X_unscaled_df)
test_X_scaled_df = transformer.apply_scaler(test_X_unscaled_df)
val_X_scaled_df = transformer.apply_scaler(val_X_unscaled_df)
# Computed once; the original cell derived this frame twice from the same data.
train_complete_X_scaled_df = transformer.apply_scaler(train_complete_X_unscaled_df)

train_X_wo_knn_unscaled_df = transformer.add_features(train_X_wo_knn_df.copy())
train_X_wo_knn_scaled_df = transformer.apply_scaler(train_X_wo_knn_unscaled_df)

# labels
train_y_df = train_y_original_df.copy()
val_y_df = val_y_originial_df.copy()
knn_y_df = knn_y_original_df.copy()
# Label of the complete training frame as a Series (not a one-column frame).
train_complete_y_df = train_complete_Xy_original_df.fraud.copy()

# NOTE(review): this overwrites its own input, so re-running the cell feeds an
# already augmented frame back into `add_features`.
test_final_X_df = transformer.add_features(test_final_X_df)

display(train_complete_X_scaled_df.head(5))
display(train_complete_y_df.head(5))

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems
5,-1.037475,-0.252351,-1.251994,1.582052,0.014423,-0.297142,-0.106,-0.197456,-0.232764,1.241436
7,0.963879,1.197361,-0.852969,-1.578303,0.949711,0.881638,-0.226264,-0.197049,-0.581851,-0.630084
9,0.963879,-0.336527,-0.230067,1.294747,-0.920865,0.881638,-0.090687,-0.148996,-0.276253,1.358406
15,-1.037475,-0.065291,-0.524139,-0.716388,-1.232628,1.471027,-0.224396,-0.1712,-0.169293,-1.097964
23,0.963879,-1.458885,-0.760964,-0.141778,0.326186,-0.297142,0.591229,0.019959,-0.409952,1.007496


5     1
7     0
9     0
15    0
23    0
Name: fraud, dtype: int64

## Scoring functions
Defining multiple scores which should be tracked in the HyperParamSearch Object

In [8]:
# Instantiate the project-local `Scores` helper; its `average_precision` and
# `dmc_score` attributes are handed to the search as scorers below.
score = scores.Scores()

In [9]:
# Metrics tracked for every candidate of the hyper-parameter search.
# String values are scikit-learn's built-in scorer names; "AP" and "DMC" are
# taken from the project-local `score` object.
scoring = {
    'AUC': 'roc_auc',
    # F-beta with beta < 1 weights precision higher than recall.
    'FBeta': metrics.make_scorer(metrics.fbeta_score, beta=0.5172),
    "Precision": 'precision',
    "Recall": 'recall',
    "AP": score.average_precision,
    "DMC": score.dmc_score,
}

## Defining the parameters which should be tuned
To tune the hyperparameters, I looked into the [documentation here](https://xgboost.readthedocs.io/en/latest/parameter.html#parameters-for-tree-booster). Use `randint` for integer values and `uniform` for float values.

You can also use a grid search on single parameters to get a feeling for a good interval. If you want to try only a few fixed possibilities, you can pass a list of values, as is done for the `shrinking` parameter below.

**Note: For classifiers which work without GPU support, you can probably set the parameter `n_jobs=-1` to use all processors.**

In [10]:
# Search space for the randomised search.
# `uniform(loc, scale)` samples from the interval [loc, loc + scale].
params = dict(
    # Stopping tolerance, drawn from [1e-5, 1e-5 + 1e-1]
    # (scikit-learn's SVC default is 1e-3).
    tol=uniform(1e-5, 1e-1),
    # Regularisation strength, drawn from [0, 80].
    # NOTE(review): SVC requires C > 0; the lower bound itself is not a valid value.
    C=uniform(0.0, 80.0),
    # A plain list is sampled uniformly from its elements.
    shrinking=[True, False],
)

## Creating a classifier with some default values
Not all parameters of a classifier should be fine-tuned. For an SVM, for example, the `kernel` parameter should be set manually. In the case of XGBoost, things like the objective, the booster and the tree method should not be tuned. The choice of parameters depends on the specific classifier.

In [11]:
# Base estimator for the search: the settings below stay fixed, everything in
# `params` is tuned on top of them.
default_svm = SVC(
    kernel="linear",
    probability=True,   # enables probability estimates
    cache_size=8000,    # kernel cache size in MB
    verbose=0,
    random_state=42,
)

# List the names of all parameters the estimator exposes.
default_svm.get_params().keys()

dict_keys(['shrinking', 'kernel', 'cache_size', 'class_weight', 'probability', 'max_iter', 'decision_function_shape', 'degree', 'gamma', 'C', 'verbose', 'random_state', 'coef0', 'tol'])

In [12]:
# Summary statistics of the scaled training features, as a quick check of the
# scaling (the search below is fitted on this frame).
train_complete_X_scaled_df.describe()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems
count,679.0,679.0,679.0,679.0,679.0,679.0,679.0,679.0,679.0,679.0
mean,-0.014692,0.012632,0.023559,0.014357,-0.039756,0.015347,-0.04597,-0.03174,0.020799,-0.003201
std,1.00117,1.000917,0.997986,0.984335,0.993521,0.992125,0.576729,0.834192,1.026366,1.017604
min,-1.037475,-1.688968,-1.627507,-1.578303,-1.544391,-1.475921,-0.252355,-0.213921,-0.581851,-1.682813
25%,-1.037475,-0.846265,-0.814804,-0.716388,-0.920865,-0.886532,-0.216657,-0.18515,-0.444332,-0.864024
50%,0.963879,-0.009173,-0.071274,-0.141778,0.014423,0.292248,-0.18043,-0.154021,-0.272433,-0.045234
75%,0.963879,0.875618,0.923733,1.007442,0.949711,0.881638,-0.107333,-0.084219,0.044919,0.890526
max,0.963879,1.73048,1.771535,1.582052,1.573237,1.471027,6.844282,19.320347,7.669302,1.709315


In [13]:
# Randomised hyper-parameter search over `params` with 3-fold cross-validation.
# All metrics in `scoring` are recorded; the final model is refit with the
# candidate that ranks best on 'DMC'.
search = RandomizedSearchCV(
    estimator=default_svm,
    param_distributions=params,
    n_iter=10000,
    scoring=scoring,
    refit='DMC',
    cv=3,
    return_train_score=True,
    random_state=42,
    n_jobs=-1,   # use all available processors
    verbose=1,
)
search.fit(train_complete_X_scaled_df, train_complete_y_df)

# Per-candidate cross-validation results.
results = search.cv_results_

Fitting 3 folds for each of 10000 candidates, totalling 30000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 289 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 789 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1489 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 2389 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 3489 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 4789 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 6289 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 7989 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done 9889 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 11989 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 14289 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 16789 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 19489 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 24763 tasks 

In [14]:
# Re-attach the "fraud" label to the scaled feature frames.
# `assign` works on a copy, so the scaled feature frames stay label-free.
train_complete_Xy_scaled_df = train_complete_X_scaled_df.assign(fraud=train_complete_y_df)

train_Xy_wo_knn_scaled_df = train_X_wo_knn_scaled_df.assign(fraud=train_y_wo_knn_df)

val_Xy_scaled = val_X_scaled_df.assign(fraud=val_y_df)

In [15]:
# NOTE(review): `scorings` is not referenced again in the visible notebook —
# presumably `test_classification` reads it from the notebook namespace; verify
# before removing it.
scorings = {
    'AUC': 'roc_auc',
    'FBeta': metrics.make_scorer(metrics.fbeta_score, beta=0.5172),
    "Precision": 'precision',
    "Recall": 'recall',
    "AP": score.average_precision,
    "DMC": score.dmc_score,
}

# Despite its name, `xgbo` holds the refit SVC found by the search.
xgbo = search.best_estimator_

# Evaluate the tuned model with the helper from the utils notebook.
result_dict = test_classification(
    xgbo,
    df_train=train_Xy_wo_knn_scaled_df,
    df_val=val_Xy_scaled,
)

Results Fix Split: 
DMC Score: 30  ---  Normalized DMC Score: 0.0797872340425532, 

Results Cross Validation: 
DMC Score: 61.0  ---  Normalized DMC Score: 0.16225531914893618 


In [16]:
# Average every tracked test metric over *all* sampled candidates.
# These are means across the whole search, not the scores of the best candidate.
dmc, ap, precision, recall = (
    np.mean(search.cv_results_['mean_test_' + metric_name])
    for metric_name in ('DMC', 'AP', 'Precision', 'Recall')
)

In [17]:
# Report the metrics that were averaged over all search candidates.
print("DMC: {} --- Average Precision: {} --- Precision: {} --- Recall: {}".format(dmc, ap, precision, recall))

DMC: 73.35262444771723 --- Average Precision: 0.9734262972878711 --- Precision: 0.9110801968736305 --- Recall: 0.9454169871660002


In [18]:
# Estimator refit with the candidate that ranked best on 'DMC'.
search.best_estimator_

SVC(C=2.193624797812612, cache_size=8000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=42,
  shrinking=False, tol=0.06885050184921368, verbose=0)

In [38]:
# Parameter setting of the best candidate.
# NOTE(review): this cell carries execution count 38 and its recorded output
# does not match the parameters shown for `search.best_estimator_` — it looks
# like it was produced by a different run of the search. Re-run the notebook
# top to bottom to bring the outputs back in sync.
search.best_params_

{'C': 11.439334564226868, 'shrinking': False, 'tol': 0.08370638742373739}