In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score

In [2]:
df_train = pd.read_csv("../data/train_new.csv", sep="|")
df_val = pd.read_csv("../data/val_new.csv", sep="|")

In [24]:
def dmc_score_function(label, y_pred):
    loss = 0.0
    for index, item in enumerate(label):
        if label[index] == 0: 
            if y_pred[index] == 0:
                loss += 0.0
            else:
                loss -= 25.0
        else:
            if y_pred[index] == 0:
                loss -= 5.0
            else:
                loss += 5.0
    return loss

def calculate_metrics(x, y, model, tp="scikit"):
    scores = {}
    pred = None
    if tp is "scikit":
        scores['acc'] = model.score(x,y)
        pred = model.predict(x)
    elif tp is "keras":
        pred = model.predict_classes(x)
    scores['dmc'] = dmc_score_function(pred,y)
    scores['f2'] = fbeta_score(pred, y, 2, average='binary')
    return scores


In [5]:
df_wo_frauds_balanced = df_train[df_train.fraud!=1].sample(n=df_train[df_train.fraud==1].count()[0])
df_50_50 = df_wo_frauds_balanced.append(df_train[df_train.fraud==1],ignore_index=True).sample(frac=1)
df_50_50.count()
#Validation dataset
df_val_y = df_val.fraud
df_val_x = df_val.drop(['fraud'], axis=1)

#Train balanced
df_train_balanced_y = df_50_50.fraud
df_train_balanced_x = df_50_50.drop(['fraud'], axis=1)

#Train unbalanced
df_train_unbalanced_y = df_train.fraud
df_train_unbalanced_x = df_train.drop(['fraud'], axis=1)

## Logistic Regression 
`class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)`

In [6]:
res = dict()

log_reg_balanced = LogisticRegression().fit(df_train_balanced_x, df_train_balanced_y)
log_reg_unbalanced = LogisticRegression().fit(df_train_unbalanced_x, df_train_unbalanced_y)

res['logistic regression balanced'] = calculate_metrics(df_val_x, df_val_y, log_reg_balanced)
res['logistic regression unbalanced'] = calculate_metrics(df_val_x, df_val_y, log_reg_unbalanced)
print(res)

{'logistic regression unbalanced': {'dmc': -270.0, 'f2': 0.632183908045977, 'acc': 0.9547872340425532}, 'logistic regression balanced': {'dmc': -105.0, 'f2': 0.3951890034364261, 'acc': 0.8829787234042553}}




## Linear Discriminant Analysis
`class sklearn.discriminant_analysis.LinearDiscriminantAnalysis(solver=’svd’, shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001)`

In [7]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_balanced = LinearDiscriminantAnalysis().fit(df_train_balanced_x, df_train_balanced_y)
lda_unbalanced = LinearDiscriminantAnalysis().fit(df_train_unbalanced_x, df_train_unbalanced_y)
res['lda balanced'] = calculate_metrics(df_val_x, df_val_y, lda_balanced)
res['lda unbalanced'] = calculate_metrics(df_val_x, df_val_y, lda_unbalanced)

## KNN 
`class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, metric=’minkowski’, metric_params=None, n_jobs=None, **kwargs)`

In [8]:
from sklearn.neighbors import KNeighborsClassifier
knn_balanced = KNeighborsClassifier().fit(df_train_balanced_x, df_train_balanced_y)
knn_unbalanced = KNeighborsClassifier().fit(df_train_unbalanced_x, df_train_unbalanced_y)
res['knn balanced'] = calculate_metrics(df_val_x, df_val_y, knn_balanced)
res['knn unbalanced'] = calculate_metrics(df_val_x, df_val_y, knn_unbalanced)

## Support Vector Classifier
`class sklearn.svm.SVC(C=1.0, kernel=’rbf’, degree=3, gamma=’auto_deprecated’, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=’ovr’, random_state=None)`

In [9]:
from sklearn.svm import SVC
svc_balanced = SVC().fit(df_train_balanced_x, df_train_balanced_y)
svc_unbalanced = SVC().fit(df_train_unbalanced_x, df_train_unbalanced_y)
res['svc balanced'] = calculate_metrics(df_val_x, df_val_y, svc_balanced)
res['svc unbalanced'] = calculate_metrics(df_val_x, df_val_y, svc_unbalanced)


  'recall', 'true', average, warn_for)


In [10]:
from sklearn.ensemble import RandomForestClassifier
rfc_balanced = SVC().fit(df_train_balanced_x, df_train_balanced_y)
rfc_unbalanced = SVC().fit(df_train_unbalanced_x, df_train_unbalanced_y)
res['rfc balanced'] = calculate_metrics(df_val_x, df_val_y, rfc_balanced)
res['rfc unbalanced'] = calculate_metrics(df_val_x, df_val_y, rfc_unbalanced)

  'recall', 'true', average, warn_for)


## XGBoost 
[XGBoost Webseite](https://xgboost.readthedocs.io/en/latest/index.html)

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
#params = param = {'max_depth':3, 'eta':1, 'gamma': 1.0, 'min_child_weight' : 1, 'objective':'binary:logistic' }
xgb_balanced = XGBClassifier().fit(df_train_balanced_x,df_train_balanced_y)
xgb_unbalanced = XGBClassifier().fit(df_train_unbalanced_x,df_train_unbalanced_y)
#bst = xgb.train(param, xgb_train_x_balanced, 3)
res['xgb balanced'] = calculate_metrics(df_val_x, df_val_y, xgb_balanced)
res['xgb unbalanced'] = calculate_metrics(df_val_x, df_val_y, xgb_unbalanced)

In [15]:
import numpy as np
np.array(df_val_x).shape

(376, 9)

In [17]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import fbeta_score
from keras.models import Sequential
from keras.layers import Dense, Activation

df_train = pd.read_csv("../data/train_new.csv", sep="|")
df_val = pd.read_csv("../data/val_new.csv", sep="|")

# Data Preparation
df_wo_frauds_balanced = df_train[df_train.fraud != 1].sample(n=df_train[df_train.fraud == 1].count()[0])
df_50_50 = df_wo_frauds_balanced.append(df_train[df_train.fraud == 1], ignore_index=True).sample(frac=1)
df_50_50.count()

# Validation dataset
df_val_y = df_val.fraud
df_val_x = df_val.drop(['fraud'], axis=1)

# Train balanced
df_train_balanced_y = df_50_50.fraud
df_train_balanced_x = df_50_50.drop(['fraud'], axis=1)

# Train unbalanced
df_train_unbalanced_y = np.array(df_train.fraud)
df_train_unbalanced_x = np.array(df_train.drop(['fraud'], axis=1))

model = Sequential()
model.add(Dense(32, activation='relu', input_dim=9))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(df_train_unbalanced_x, df_train_unbalanced_y, epochs=15, batch_size=128)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f6524728048>

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 

Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 

Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


In [36]:
res

{'knn balanced': {'acc': 0.5345744680851063,
  'dmc': -1035.0,
  'f2': 0.08253094910591471},
 'knn unbalanced': {'acc': 0.9361702127659575, 'dmc': -580.0, 'f2': 0.0},
 'lda balanced': {'acc': 0.8138297872340425,
  'dmc': -235.0,
  'f2': 0.2911392405063291},
 'lda unbalanced': {'acc': 0.9414893617021277,
  'dmc': -545.0,
  'f2': 0.1851851851851852},
 'logistic regression balanced': {'acc': 0.8829787234042553,
  'dmc': -105.0,
  'f2': 0.3951890034364261},
 'logistic regression unbalanced': {'acc': 0.9547872340425532,
  'dmc': -270.0,
  'f2': 0.632183908045977},
 'mlp balanced': {'dmc': -320.0, 'f2': 0.257985257985258},
 'mlp unbalanced': {'dmc': -575.0, 'f2': 0.0},
 'rfc balanced': {'acc': 0.07446808510638298,
  'dmc': -1650.0,
  'f2': 0.0733822548365577},
 'rfc unbalanced': {'acc': 0.9388297872340425, 'dmc': -575.0, 'f2': 0.0},
 'svc balanced': {'acc': 0.07446808510638298,
  'dmc': -1650.0,
  'f2': 0.0733822548365577},
 'svc unbalanced': {'acc': 0.9388297872340425, 'dmc': -575.0, 'f2': 

In [31]:
mlp_unbalanced = model.fit(df_train_unbalanced_x,df_train_unbalanced_y)
res['mlp unbalanced'] = calculate_metrics(df_val_x, df_val_y, model, "keras")


Epoch 1/1


  'recall', 'true', average, warn_for)


In [32]:
res

{'knn balanced': {'acc': 0.5345744680851063,
  'dmc': -1035.0,
  'f2': 0.08253094910591471},
 'knn unbalanced': {'acc': 0.9361702127659575, 'dmc': -580.0, 'f2': 0.0},
 'lda balanced': {'acc': 0.8138297872340425,
  'dmc': -235.0,
  'f2': 0.2911392405063291},
 'lda unbalanced': {'acc': 0.9414893617021277,
  'dmc': -545.0,
  'f2': 0.1851851851851852},
 'logistic regression balanced': {'acc': 0.8829787234042553,
  'dmc': -105.0,
  'f2': 0.3951890034364261},
 'logistic regression unbalanced': {'acc': 0.9547872340425532,
  'dmc': -270.0,
  'f2': 0.632183908045977},
 'mlp balanced': {'dmc': -705.0, 'f2': 0.14685314685314685},
 'mlp unbalanced': {'dmc': -575.0, 'f2': 0.0},
 'rfc balanced': {'acc': 0.07446808510638298,
  'dmc': -1650.0,
  'f2': 0.0733822548365577},
 'rfc unbalanced': {'acc': 0.9388297872340425, 'dmc': -575.0, 'f2': 0.0},
 'svc balanced': {'acc': 0.07446808510638298,
  'dmc': -1650.0,
  'f2': 0.0733822548365577},
 'svc unbalanced': {'acc': 0.9388297872340425, 'dmc': -575.0, 'f2'