In [37]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

from os import listdir
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, roc_auc_score, recall_score, average_precision_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [38]:
def score_model(model, X_test, y_test, map = None):
    model_preds = model.predict(X_test)
    scores = {}
    if map:
        model_preds = [map[i] for i in model_preds]
    evaluation_funcs = {'accuracy':accuracy_score, 'precision':precision_score, 'recall':recall_score, 'f1':f1_score, 'MCC':matthews_corrcoef}
    for name, score in evaluation_funcs.items():
        scores[name] = score(y_pred = model_preds, y_true = y_test)
    scores['ROC-AUC'] = roc_auc_score(y_score = model_preds, y_true = y_test)
    scores['AUPR'] = average_precision_score(y_score = model_preds, y_true = y_test)
    return scores

In [39]:
transaction_df = pd.read_csv('original_data.csv')
#Split data to x, y, train, test
y = transaction_df['fraud']
X = transaction_df.drop(columns=['fraud'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [40]:
#Fit transformer to data
transformer = ColumnTransformer([('One Hot Encoder', OneHotEncoder(drop='first'), ['category', 'gender']),
                                 ('Age Pipe', Pipeline([('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)), ('scale', MinMaxScaler())]), ['age']),
                                 ('MinMaxScaler', MinMaxScaler(), ['amount', 'step']),
                                 ('drop', 'drop', ['zipcodeOri', 'zipMerchant', 'customer', 'merchant'])], remainder = MinMaxScaler(), sparse_threshold=0)
transformer.fit(X_train)
X_train_transformed = transformer.transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [41]:
#Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_transformed, y_train)
dec_tree_scores = score_model(decision_tree, X_test_transformed, y_test)
dec_tree_scores

{'accuracy': 0.9913090857723277,
 'precision': 0.6225910064239829,
 'recall': 0.6645714285714286,
 'f1': 0.6428966279712548,
 'MCC': 0.6388506653966829,
 'ROC-AUC': 0.8298863023968837,
 'AUPR': 0.41770477555463403}

In [42]:
# Random Forest
random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)
random_forest.fit(X_train_transformed, y_train)
random_forest_scores = score_model(random_forest, X_test_transformed, y_test)
random_forest_scores

{'accuracy': 0.9947464365233653,
 'precision': 0.862378459237098,
 'recall': 0.6588571428571428,
 'f1': 0.747003563330094,
 'MCC': 0.7512917590930227,
 'ROC-AUC': 0.8288023419426922,
 'AUPR': 0.5722000558522087}

In [43]:
# KNN
KNN = KNeighborsClassifier(n_jobs=-1)
KNN.fit(X_train_transformed, y_train)
KNN_scores = score_model(KNN, X_test_transformed, y_test)
KNN_scores

{'accuracy': 0.9928024162355964,
 'precision': 0.8393213572854291,
 'recall': 0.4805714285714286,
 'f1': 0.6111918604651163,
 'MCC': 0.6320461710849047,
 'ROC-AUC': 0.7397377634855701,
 'AUPR': 0.4094684465440129}

In [44]:
#Multilayer perceptron
MLP = MLPClassifier(hidden_layer_sizes=(15, 15, 15), random_state=42)
MLP.fit(X_train_transformed, y_train)
MLP_scores = score_model(MLP, X_test_transformed, y_test)
MLP_scores

{'accuracy': 0.9949751447925145,
 'precision': 0.9563239308462238,
 'recall': 0.6005714285714285,
 'f1': 0.7378027378027378,
 'MCC': 0.7557678326203371,
 'ROC-AUC': 0.8001223500720067,
 'AUPR': 0.5790428022705177}

In [45]:
#Support Vector machine
SVM = SVC(class_weight='balanced', random_state=42)
SVM.fit(X_train_transformed, y_train)
SVM_scores = score_model(SVM, X_test_transformed, y_test)
SVM_scores

{'accuracy': 0.9153241267043811,
 'precision': 0.12136668529904975,
 'recall': 0.9925714285714285,
 'f1': 0.21628688830780726,
 'MCC': 0.33164564650216777,
 'ROC-AUC': 0.9534876937154368,
 'AUPR': 0.12055255148763741}

In [46]:
#Isolation forest
y_train_transformed = y_train.map({0:1, 1:-1})

ISO = IsolationForest(contamination=sum(y_train)/len(y_train), random_state=42, n_jobs=-1)
ISO.fit(X_train_transformed, y_train_transformed)

In [47]:
ISO_scores = score_model(ISO, X_test_transformed, y_test, map={1:0, -1:1})
ISO_scores

{'accuracy': 0.983156308648536,
 'precision': 0.29240088105726875,
 'recall': 0.30342857142857144,
 'f1': 0.297812675266405,
 'MCC': 0.28934232118595815,
 'ROC-AUC': 0.6473408895764879,
 'AUPR': 0.09692264574404397}

In [48]:
#Local Outlier Factor
LOF = LocalOutlierFactor(novelty=True, contamination=sum(y_train)/len(y_train), n_jobs=-1)
LOF.fit(X_train_transformed, y_train)
LOF_scores = score_model(LOF, X_test_transformed, y_test, map={1:0, -1:1})
LOF_scores

{'accuracy': 0.9784946959861699,
 'precision': 0.10356164383561643,
 'recall': 0.108,
 'f1': 0.10573426573426573,
 'MCC': 0.09487699071658544,
 'ROC-AUC': 0.5484320030494654,
 'AUPR': 0.021685057773717584}

In [49]:
scores = {
            'Decision Tree': dec_tree_scores,
            'Random Forest': random_forest_scores,
            'K-NN': KNN_scores,
            'MLP': MLP_scores,
            'SVM': SVM_scores,
            'LOF': LOF_scores,
            'IF': ISO_scores
            }
scores_df = pd.DataFrame(scores)

In [None]:
scores_df.T.to_csv('no_feature_scores.csv')