In [1]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, roc_auc_score, recall_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
transactions = pd.read_csv('banksim_dataset/bs140513_032310.csv')
network = pd.read_csv('banksim_dataset/bsNET140513_032310.csv')

In [3]:
y = transactions['fraud']
X = transactions.drop(columns=['fraud'])

X.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
transformer = ColumnTransformer([('One Hot Encoder', OneHotEncoder(drop='first'), ['category', 'gender']),
                                 ('Age Pipe', Pipeline([('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)), ('scale', MinMaxScaler())]), ['age']),
                                 ('MinMaxScaler', MinMaxScaler(), ['amount']),
                                 ('drop', 'drop', ['zipcodeOri', 'zipMerchant', 'customer', 'merchant'])], remainder = 'passthrough')
transformer.fit(X_train)

In [6]:
def score_model(model, X_test, y_test, map = None):
    model_preds = model.predict(X_test)
    if map:
        model_preds = [map[i] for i in model_preds]
    evaluation_funcs = {'accuracy':accuracy_score, 'precision':precision_score, 'recall':recall_score, 'f1':f1_score, 'MCC':matthews_corrcoef}
    for name, score in evaluation_funcs.items():
        print(f'{name}: {score(y_pred = model_preds, y_true = y_test)}')
    print(f'ROC-AUC: {roc_auc_score(y_score = model_preds, y_true = y_test)}')
    print(f'AUPR: {average_precision_score(y_score = model_preds, y_true = y_test)}')

In [7]:
decision_tree = Pipeline(steps=[('transformer', transformer), ('model', DecisionTreeClassifier())])
decision_tree.fit(X_train, y_train)
score_model(decision_tree, X_test, y_test)

accuracy: 0.9914638001896934
precision: 0.6297895304910955
recall: 0.6668571428571428
f1: 0.6477935054121565
MCC: 0.6437462129947279
ROC-AUC: 0.831093824541
AUPR: 0.42390132104916356


In [8]:
random_forest = Pipeline(steps=[('transformer', transformer), ('model', RandomForestClassifier())])
random_forest.fit(X_train, y_train)
score_model(random_forest, X_test, y_test)

accuracy: 0.9947195296681712
precision: 0.8587360594795539
recall: 0.66
f1: 0.7463651050080776
MCC: 0.7503297416597305
ROC-AUC: 0.8293533499874074
AUPR: 0.5707681939666179


In [9]:
KNN = Pipeline(steps=[('transformer', transformer), ('model', KNeighborsClassifier())])
KNN.fit(X_train, y_train)
score_model(KNN, X_test, y_test)

In [None]:
MLP = Pipeline(steps=[('transformer', transformer), ('model', MLPClassifier(hidden_layer_sizes=(15, 15, 15)))])
MLP.fit(X_train, y_train)
score_model(MLP, X_test, y_test)

In [None]:
SVC = Pipeline(steps=[('transformer', transformer), ('model', SVC())])
SVC.fit(X_train, y_train)
score_model(SVC, X_test, y_test)

In [None]:
ISO = Pipeline(steps=[('transformer', transformer), ('model', IsolationForest())])
ISO.fit(X_train, y_train)
score_model(ISO, X_test, y_test, map={1:0, -1:1})

In [None]:
LOF = Pipeline(steps=[('transformer', transformer), ('model', LocalOutlierFactor())])
LOF.fit(X_train, y_train)
score_model(LOF, X_test, y_test, map={1:0, -1:1})