In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, roc_auc_score, recall_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
transactions = pd.read_csv('banksim_dataset/bs140513_032310.csv')
transactions.replace("'",'', regex=True, inplace=True) 

In [3]:
y = transactions['fraud']
X = transactions.drop(columns=['fraud'])

X.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
transformer = ColumnTransformer([('One Hot Encoder', OneHotEncoder(drop='first'), ['category', 'gender']),
                                 ('Age Pipe', Pipeline([('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)), ('scale', MinMaxScaler())]), ['age']),
                                 ('MinMaxScaler', MinMaxScaler(), ['amount', 'step']),
                                 ('drop', 'drop', ['zipcodeOri', 'zipMerchant', 'customer', 'merchant'])], remainder = 'passthrough')
transformer.fit(X_train)

In [6]:
def score_model(model, X_test, y_test, map = None):
    model_preds = model.predict(X_test)
    scores = {}
    if map:
        model_preds = [map[i] for i in model_preds]
    evaluation_funcs = {'accuracy':accuracy_score, 'precision':precision_score, 'recall':recall_score, 'f1':f1_score, 'MCC':matthews_corrcoef}
    for name, score in evaluation_funcs.items():
        scores[name] = score(y_pred = model_preds, y_true = y_test)
    scores['ROC-AUC'] = roc_auc_score(y_score = model_preds, y_true = y_test)
    scores['AUPR'] = average_precision_score(y_score = model_preds, y_true = y_test)
    return scores

In [7]:
decision_tree = Pipeline(steps=[('transformer', transformer), ('model', DecisionTreeClassifier(random_state=42))])
decision_tree.fit(X_train, y_train)
desc_tree_scores = score_model(decision_tree, X_test, y_test)
for name, score in desc_tree_scores.items():
    print(f'{name}: {score}')

accuracy: 0.9913090857723277
precision: 0.6225910064239829
recall: 0.6645714285714286
f1: 0.6428966279712548
MCC: 0.6388506653966829
ROC-AUC: 0.8298863023968837
AUPR: 0.41770477555463403


In [8]:
random_forest = Pipeline(steps=[('transformer', transformer), ('model', RandomForestClassifier(random_state=42))])
random_forest.fit(X_train, y_train)
random_forest_scores = score_model(random_forest, X_test, y_test)
for name, score in random_forest_scores.items():
    print(f'{name}: {score}')

accuracy: 0.9947464365233653
precision: 0.862378459237098
recall: 0.6588571428571428
f1: 0.747003563330094
MCC: 0.7512917590930227
ROC-AUC: 0.8288023419426922
AUPR: 0.5722000558522087


In [9]:
KNN = Pipeline(steps=[('transformer', transformer), ('model', KNeighborsClassifier())])
KNN.fit(X_train, y_train)
KNN_scores = score_model(KNN, X_test, y_test)
for name, score in KNN_scores.items():
    print(f'{name}: {score}')

accuracy: 0.9928024162355964
precision: 0.8393213572854291
recall: 0.4805714285714286
f1: 0.6111918604651163
MCC: 0.6320461710849047
ROC-AUC: 0.7397377634855701
AUPR: 0.4094684465440129


In [10]:
MLP = Pipeline(steps=[('transformer', transformer), ('model', MLPClassifier(hidden_layer_sizes=(15, 15, 15), random_state=42))])
MLP.fit(X_train, y_train)
MLP_scores = score_model(MLP, X_test, y_test)
for name, score in MLP_scores.items():
    print(f'{name}: {score}')

accuracy: 0.9949751447925145
precision: 0.9563239308462238
recall: 0.6005714285714285
f1: 0.7378027378027378
MCC: 0.7557678326203371
ROC-AUC: 0.8001223500720067
AUPR: 0.5790428022705177


In [11]:
SVC = Pipeline(steps=[('transformer', transformer), ('model', SVC(class_weight='balanced', random_state=42))])
SVC.fit(X_train, y_train)
SVC_scores = score_model(SVC, X_test, y_test)
for name, score in SVC_scores.items():
    print(f'{name}: {score}')

accuracy: 0.9153241267043811
precision: 0.12136668529904975
recall: 0.9925714285714285
f1: 0.21628688830780726
MCC: 0.33164564650216777
ROC-AUC: 0.9534876937154368
AUPR: 0.12055255148763741


In [12]:
ISO = Pipeline(steps=[('transformer', transformer), ('model', IsolationForest(contamination=sum(y_train)/len(y_train), random_state=42))])
ISO.fit(X_train, y_train)
ISO_scores = score_model(ISO, X_test, y_test, map={1:0, -1:1})
for name, score in ISO_scores.items():
    print(f'{name}: {score}')

accuracy: 0.983156308648536
precision: 0.29240088105726875
recall: 0.30342857142857144
f1: 0.297812675266405
MCC: 0.28934232118595815
ROC-AUC: 0.6473408895764879
AUPR: 0.09692264574404397


In [13]:

LOF = Pipeline(steps=[('transformer', transformer), ('model', LocalOutlierFactor(n_neighbors=10, novelty=True, contamination=sum(y_train)/len(y_train)))])
LOF.fit(X_train, y_train)
LOF_scores = score_model(LOF, X_test, y_test, map={1:0, -1:1})
for name, score in LOF_scores.items():
    print(f'{name}: {score}')

accuracy: 0.979053013231446
precision: 0.13214670981661272
recall: 0.14
f1: 0.13596004439511655
MCC: 0.1254204946321358
ROC-AUC: 0.5645238954196758
AUPR: 0.028624243641080342


In [14]:
paper_scores = {
    'Decision Tree': [99.150, 65.854, 76.416, 70.743, 70.120, 87.922, 68.231],
    'Random Forest': [98.463, 82.781, 71.897, 76.956, 76.779, 85.848, 70.232],
    'K-NN' : [98.789, 76.191, 64.263, 69.721, 70.988, 81.243, 68.007], 
    'MLP' : [98.789, 85.623, 54.722, 66.771, 65.215, 77.119, 71.435], 
    'SVM' : [98.192, 77.452, 51.123, 61.592, 62.822, 78.193, 74.657], 
    'LOF' : [98.189, 78.102, 81.495, 79.762, 66.318, 82.229, 78.228], 
    'IF' : [99.381, 81.162, 82.997, 82.069, 69.829, 80.071, 84.994]
}

paper_scores_df = pd.DataFrame(paper_scores).T
paper_scores_df.columns = ['accuracy', 'precision', 'recall', 'f1', 'MCC', 'ROC-AUC', 'AUPR']
paper_scores_df

Unnamed: 0,accuracy,precision,recall,f1,MCC,ROC-AUC,AUPR
Decision Tree,99.15,65.854,76.416,70.743,70.12,87.922,68.231
Random Forest,98.463,82.781,71.897,76.956,76.779,85.848,70.232
K-NN,98.789,76.191,64.263,69.721,70.988,81.243,68.007
MLP,98.789,85.623,54.722,66.771,65.215,77.119,71.435
SVM,98.192,77.452,51.123,61.592,62.822,78.193,74.657
LOF,98.189,78.102,81.495,79.762,66.318,82.229,78.228
IF,99.381,81.162,82.997,82.069,69.829,80.071,84.994


In [15]:
my_scores = {
    'Decision Tree': desc_tree_scores,
    'Random Forest': random_forest_scores,
    'K-NN': KNN_scores,
    'MLP': MLP_scores,
    'SVM': SVC_scores,
    'LOF': LOF_scores,
    'IF': ISO_scores
}

my_scores_df = pd.DataFrame(my_scores).T * 100
my_scores_df 

Unnamed: 0,accuracy,precision,recall,f1,MCC,ROC-AUC,AUPR
Decision Tree,99.130909,62.259101,66.457143,64.289663,63.885067,82.98863,41.770478
Random Forest,99.474644,86.237846,65.885714,74.700356,75.129176,82.880234,57.220006
K-NN,99.280242,83.932136,48.057143,61.119186,63.204617,73.973776,40.946845
MLP,99.497514,95.632393,60.057143,73.780274,75.576783,80.012235,57.90428
SVM,91.532413,12.136669,99.257143,21.628689,33.164565,95.348769,12.055255
LOF,97.905301,13.214671,14.0,13.596004,12.542049,56.45239,2.862424
IF,98.315631,29.240088,30.342857,29.781268,28.934232,64.734089,9.692265


In [16]:
score_diff_df = paper_scores_df - my_scores_df
score_diff_df.abs().mean(axis=1).sort_values()

Random Forest     4.337858
MLP               7.121152
Decision Tree     8.236430
K-NN             10.736116
SVM              38.498189
IF               41.351796
LOF              50.535737
dtype: float64