In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score

In [2]:
df_train = pd.read_csv("../data/train_new.csv", sep="|")
df_val = pd.read_csv("../data/val_new.csv", sep="|")

In [3]:
def dmc_score_function(label, y_pred):
    loss = 0.0
    for index, item in enumerate(label):
        if label[index] == 0:
            if y_pred[index] == 0:
                loss += 0.0
            else:
                loss -= 25.0
        else:
            if y_pred[index] == 0:
                loss -= 5.0
            else:
                loss += 5.0
    return loss


def calculate_metrics(x, y, model, tp="scikit"):
    scores = {}
    pred = None
    if tp is "scikit":
        scores['acc'] = model.score(x, y)
        pred = model.predict(x)
    elif tp is "keras":
        pred = model.predict_classes(x)
    scores['dmc'] = dmc_score_function(y, pred)
    scores['f2'] = fbeta_score(pred, y, 2, average='binary')
    return scores


In [4]:
df_train = pd.read_csv("../data/train_new.csv", sep="|")
df_val = pd.read_csv("../data/val_new.csv", sep="|")
df_train['totalScannedLineItems'] =  df_train['totalScanTimeInSeconds'] * df_train['scannedLineItemsPerSecond']
df_val['totalScannedLineItems'] =  df_val['totalScanTimeInSeconds'] * df_val['scannedLineItemsPerSecond']
df_train.head()

df_wo_frauds_balanced = df_train[df_train.fraud!=1].sample(n=df_train[df_train.fraud==1].count()[0])
df_50_50 = df_wo_frauds_balanced.append(df_train[df_train.fraud==1],ignore_index=True).sample(frac=1)
df_50_50.count()
#Validation dataset
df_val_y = df_val.fraud
df_val_x = df_val.drop(['fraud'], axis=1)

#Train balanced
df_train_balanced_y = df_50_50.fraud
df_train_balanced_x = df_50_50.drop(['fraud'], axis=1)

#Train unbalanced
df_train_unbalanced_y = df_train.fraud
df_train_unbalanced_x = df_train.drop(['fraud'], axis=1)

## Logistic Regression 
`class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)`

In [5]:
res = dict()

log_reg_balanced = LogisticRegression().fit(df_train_balanced_x, df_train_balanced_y)
log_reg_unbalanced = LogisticRegression().fit(df_train_unbalanced_x, df_train_unbalanced_y)

print("Balanced ",calculate_metrics(df_val_x, df_val_y, log_reg_balanced))
print("Unbalanced ",calculate_metrics(df_val_x, df_val_y, log_reg_unbalanced))

Balanced  {'f2': 0.4581673306772909, 'acc': 0.9095744680851063, 'dmc': -735.0}
Unbalanced  {'f2': 0.782608695652174, 'acc': 0.973404255319149, 'dmc': -60.0}




## Linear Discriminant Analysis
`class sklearn.discriminant_analysis.LinearDiscriminantAnalysis(solver=’svd’, shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001)`

In [6]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_balanced = LinearDiscriminantAnalysis().fit(df_train_balanced_x, df_train_balanced_y)
lda_unbalanced = LinearDiscriminantAnalysis().fit(df_train_unbalanced_x, df_train_unbalanced_y)
res['lda balanced'] = calculate_metrics(df_val_x, df_val_y, lda_balanced)
res['lda unbalanced'] = calculate_metrics(df_val_x, df_val_y, lda_unbalanced)

## KNN 
`class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, metric=’minkowski’, metric_params=None, n_jobs=None, **kwargs)`

In [7]:
from sklearn.neighbors import KNeighborsClassifier
knn_balanced = KNeighborsClassifier().fit(df_train_balanced_x, df_train_balanced_y)
knn_unbalanced = KNeighborsClassifier().fit(df_train_unbalanced_x, df_train_unbalanced_y)
print("Balanced ",calculate_metrics(df_val_x, df_val_y, knn_balanced))
print("Unbalanced ",calculate_metrics(df_val_x, df_val_y, knn_unbalanced))

Balanced  {'f2': 0.08939974457215837, 'acc': 0.5079787234042553, 'dmc': -4375.0}
Unbalanced  {'f2': 0.0, 'acc': 0.9388297872340425, 'dmc': -115.0}


  'recall', 'true', average, warn_for)


## Support Vector Classifier
`class sklearn.svm.SVC(C=1.0, kernel=’rbf’, degree=3, gamma=’auto_deprecated’, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=’ovr’, random_state=None)`

In [8]:
from sklearn.svm import SVC
svc_balanced = SVC().fit(df_train_balanced_x, df_train_balanced_y)
svc_unbalanced = SVC().fit(df_train_unbalanced_x, df_train_unbalanced_y)
print("Balanced ",calculate_metrics(df_val_x, df_val_y, svc_balanced))
print("Unbalanced ",calculate_metrics(df_val_x, df_val_y, svc_unbalanced))




Balanced  {'f2': 0.0, 'acc': 0.9281914893617021, 'dmc': -215.0}
Unbalanced  {'f2': 0.0, 'acc': 0.9388297872340425, 'dmc': -115.0}


  'recall', 'true', average, warn_for)


In [9]:
from sklearn.ensemble import RandomForestClassifier
rfc_balanced = SVC().fit(df_train_balanced_x, df_train_balanced_y)
rfc_unbalanced = SVC().fit(df_train_unbalanced_x, df_train_unbalanced_y)
print("Balanced ", calculate_metrics(df_val_x, df_val_y, rfc_balanced))
print("Balanced ",calculate_metrics(df_val_x, df_val_y, rfc_unbalanced))



Balanced  {'f2': 0.0, 'acc': 0.9281914893617021, 'dmc': -215.0}
Balanced  {'f2': 0.0, 'acc': 0.9388297872340425, 'dmc': -115.0}


  'recall', 'true', average, warn_for)


## XGBoost 
[XGBoost Webseite](https://xgboost.readthedocs.io/en/latest/index.html)

In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
#params = param = {'max_depth':3, 'eta':1, 'gamma': 1.0, 'min_child_weight' : 1, 'objective':'binary:logistic' }
xgb_balanced = XGBClassifier().fit(df_train_balanced_x,df_train_balanced_y)
xgb_unbalanced = XGBClassifier().fit(df_train_unbalanced_x,df_train_unbalanced_y)
#bst = xgb.train(param, xgb_train_x_balanced, 3)
print("Balanced ",calculate_metrics(df_val_x, df_val_y, xgb_balanced))
print("Unalanced ",calculate_metrics(df_val_x, df_val_y, xgb_unbalanced))

Balanced  {'f2': 0.628415300546448, 'acc': 0.9547872340425532, 'dmc': -310.0}
Unalanced  {'f2': 0.8585858585858586, 'acc': 0.9787234042553191, 'dmc': 5.0}


In [11]:
import numpy as np
np.array(df_val_x).shape

(376, 10)

In [22]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import fbeta_score
from keras.models import Sequential
from keras.layers import Dense, Activation

df_train = pd.read_csv("../data/train_new.csv", sep="|")
df_val = pd.read_csv("../data/val_new.csv", sep="|")

# Data Preparation
df_wo_frauds_balanced = df_train[df_train.fraud != 1].sample(n=df_train[df_train.fraud == 1].count()[0])
df_50_50 = df_wo_frauds_balanced.append(df_train[df_train.fraud == 1], ignore_index=True).sample(frac=1)
df_50_50.count()

# Validation dataset
df_val_y = df_val.fraud
df_val_x = df_val.drop(['fraud'], axis=1)

# Train balanced
df_train_balanced_y = df_50_50.fraud
df_train_balanced_x = df_50_50.drop(['fraud'], axis=1)

# Train unbalanced
df_train_unbalanced_y = np.array(df_train.fraud)
df_train_unbalanced_x = np.array(df_train.drop(['fraud'], axis=1))

model = Sequential()
model.add(Dense(5, activation='sigmoid', input_dim=9))
#model.add(Dense(500, activation='sigmoid'))

model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(df_train_unbalanced_x, df_train_unbalanced_y, epochs=2, batch_size=128)
calculate_metrics(df_val_x, df_val_y, model, "keras")


Epoch 1/2
Epoch 2/2


{'dmc': -8635.0, 'f2': 0.07590759075907591}

In [13]:
res

{'lda balanced': {'acc': 0.851063829787234,
  'dmc': -1285.0,
  'f2': 0.3392330383480826},
 'lda unbalanced': {'acc': 0.9601063829787234,
  'dmc': -50.0,
  'f2': 0.7142857142857143}}

In [18]:
mlp_unbalanced = model.fit(df_train_unbalanced_x,df_train_unbalanced_y)
calculate_metrics(df_val_x, df_val_y, model, "keras")


Epoch 1/1


{'dmc': -115.0, 'f2': 0.0}

In [15]:
res

{'lda balanced': {'acc': 0.851063829787234,
  'dmc': -1285.0,
  'f2': 0.3392330383480826},
 'lda unbalanced': {'acc': 0.9601063829787234,
  'dmc': -50.0,
  'f2': 0.7142857142857143}}