In [21]:
from os import listdir

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, roc_auc_score, recall_score, average_precision_score, make_scorer, get_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [10]:
def score_model(model, X_test, y_test, map = None):
    """Score a fitted model on held-out data and return the metrics as a dict.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must expose ``predict``. ``predict_proba`` or ``decision_function``
        is used for the ranking metrics when the model provides one.
    X_test : array-like
        Features, passed unchanged to the model's prediction methods.
    y_test : array-like
        True labels. The metric functions are called with their defaults, so
        labels are expected to be binary with 1 as the positive class.
    map : dict, optional
        Lookup applied to every raw prediction before scoring, e.g.
        ``{1: 0, -1: 1}`` to turn inlier/outlier predictions into 0/1 labels.
        A missing or empty map leaves the predictions untouched.
        (The name shadows the builtin ``map`` inside this function; it is kept
        because callers pass it by keyword.)

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1', 'MCC', 'ROC-AUC', 'AUPR'.
    """
    model_preds = model.predict(X_test)
    if map:
        model_preds = [map[i] for i in model_preds]

    # Threshold metrics: computed on the (possibly remapped) hard labels.
    evaluation_funcs = {'accuracy':accuracy_score, 'precision':precision_score, 'recall':recall_score, 'f1':f1_score, 'MCC':matthews_corrcoef}
    scores = {name: score(y_pred = model_preds, y_true = y_test)
              for name, score in evaluation_funcs.items()}

    # Ranking metrics need a continuous score per sample. Feeding them hard
    # 0/1 labels reduces the curve to a single operating point, so prefer the
    # model's own scores and fall back to the labels only when none exist.
    ranking_scores = model_preds
    if not map:
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_test)
            # Column 1 is the second class in sorted label order, i.e. the
            # positive class for 0/1 labels. Skip if only one class was seen.
            if proba.shape[1] == 2:
                ranking_scores = proba[:, 1]
        elif hasattr(model, 'decision_function'):
            ranking_scores = model.decision_function(X_test)
    elif set(map) == {-1, 1} and hasattr(model, 'decision_function'):
        # For -1/1 predictors, larger decision values go with a prediction of
        # 1. Flip the sign when the map sends -1 to the higher label so that
        # larger scores always mean "more likely positive".
        raw_scores = model.decision_function(X_test)
        ranking_scores = raw_scores if map[1] > map[-1] else -raw_scores

    scores['ROC-AUC'] = roc_auc_score(y_score = ranking_scores, y_true = y_test)
    scores['AUPR'] = average_precision_score(y_score = ranking_scores, y_true = y_test)
    return scores

In [11]:
# Load the transactions, separate the 'fraud' target from the feature columns,
# then hold out a test split (fixed seed so the split is repeatable).
transaction_df = pd.read_csv('original_data.csv')
X, y = transaction_df.drop(columns=['fraud']), transaction_df['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# Column-wise preprocessing, fitted on the training split only.
# 'age' is ordinal-encoded (unseen values become -1) and then scaled to [0, 1].
age_pipeline = Pipeline([
    ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scale', MinMaxScaler()),
])
transformer = ColumnTransformer(
    [
        ('One Hot Encoder', OneHotEncoder(drop='first'), ['category', 'gender']),
        ('Age Pipe', age_pipeline, ['age']),
        ('MinMaxScaler', MinMaxScaler(), ['amount', 'step']),
        # Identifier-like columns are discarded rather than encoded.
        ('drop', 'drop', ['zipcodeOri', 'zipMerchant', 'customer', 'merchant']),
    ],
    # Any column not listed above is min-max scaled.
    remainder = MinMaxScaler(),
)
transformer.fit(X_train)


In [27]:
# Scorers for multi-metric cross-validation, keyed by the name reported in
# cv_results_. The label-based metrics are wrapped with make_scorer, which
# scores the output of predict().
# ROC-AUC and average precision are ranking metrics, so they use the
# predefined scorers, which score predict_proba / decision_function output
# instead of hard labels.
scorer_dict = {
    'accuracy':make_scorer(accuracy_score),
    'precision':make_scorer(precision_score),
    'recall':make_scorer(recall_score),
    'f1':make_scorer(f1_score),
    'mcc':make_scorer(matthews_corrcoef),
    'roc-auc':get_scorer('roc_auc'),
    'aupr':get_scorer('average_precision')
}

In [13]:
# Baseline decision tree: shared preprocessing followed by a default-parameter
# tree, fitted on the training split and scored on the held-out split.
decision_tree = Pipeline(steps=[
    ('transformer', transformer),
    ('model', DecisionTreeClassifier(random_state=42)),
]).fit(X_train, y_train)
desc_tree_scores = score_model(decision_tree, X_test, y_test)
desc_tree_scores

{'accuracy': 0.9913090857723277,
 'precision': 0.6225910064239829,
 'recall': 0.6645714285714286,
 'f1': 0.6428966279712548,
 'MCC': 0.6388506653966829,
 'ROC-AUC': 0.8298863023968837,
 'AUPR': 0.41770477555463403}

In [28]:
#Decision Tree Grid Search
# Maps metric names to the raw metric functions (not scorers). The grid search
# below does not use it; it takes scorer_dict instead.
scoring_dict = {'accuracy':accuracy_score, 'precision':precision_score, 'recall':recall_score, 'f1':f1_score, 'MCC': matthews_corrcoef, 'ROC-AUC':roc_auc_score, 'AUPR':average_precision_score}
# The grid search is the final pipeline step, so it searches over the split
# criterion on already-transformed features and refits the best tree by accuracy.
decision_tree = Pipeline(steps=[('transformer', transformer), 
                                ('model', GridSearchCV(
                                    DecisionTreeClassifier(random_state=42), 
                                    {'criterion': ['gini', 'entropy', 'log_loss']},
                                    scoring = scorer_dict,
                                    refit='accuracy'))])
decision_tree.fit(X_train, y_train)
# Score the whole pipeline: a Pipeline has no best_estimator_ attribute (that
# lives on the GridSearchCV step), and X_test must go through the transformer
# first. GridSearchCV.predict already delegates to the refit best estimator.
desc_tree_scores = score_model(decision_tree, X_test, y_test)

In [31]:
# Score the fitted decision-tree pipeline on the held-out split and display
# the resulting metric dict.
desc_tree_scores = score_model(model=decision_tree, X_test=X_test, y_test=y_test)
desc_tree_scores

{'accuracy': 0.9918875831589994,
 'precision': 0.6501103752759382,
 'recall': 0.6731428571428572,
 'f1': 0.6614261650758001,
 'MCC': 0.6574240576933236,
 'ROC-AUC': 0.8344136595820405,
 'AUPR': 0.4414648357642067}

In [None]:
# Random forest: shared preprocessing + default forest with a fixed seed,
# fitted on the training split and scored on the held-out split.
random_forest = Pipeline(steps=[
    ('transformer', transformer),
    ('model', RandomForestClassifier(random_state=42)),
]).fit(X_train, y_train)
random_forest_scores = score_model(random_forest, X_test, y_test)

In [None]:
# k-nearest neighbours: shared preprocessing + default KNeighborsClassifier,
# fitted on the training split and scored on the held-out split.
KNN = Pipeline(steps=[
    ('transformer', transformer),
    ('model', KNeighborsClassifier()),
]).fit(X_train, y_train)
KNN_scores = score_model(KNN, X_test, y_test)

In [None]:
# Multilayer perceptron: shared preprocessing + an MLP with three hidden
# layers of 15 units each and a fixed seed.
MLP = Pipeline(steps=[
    ('transformer', transformer),
    ('model', MLPClassifier(hidden_layer_sizes=(15, 15, 15), random_state=42)),
]).fit(X_train, y_train)
MLP_scores = score_model(MLP, X_test, y_test)

In [None]:
# Support vector machine: shared preprocessing + SVC with balanced class
# weights and a fixed seed.
SVM = Pipeline(steps=[
    ('transformer', transformer),
    ('model', SVC(class_weight='balanced', random_state=42)),
]).fit(X_train, y_train)
SVM_scores = score_model(SVM, X_test, y_test)

In [None]:
# Isolation forest. The contamination is set to the mean of the training
# labels (the share of positives, assuming 0/1 labels).
iso_contamination = sum(y_train)/len(y_train)
ISO = Pipeline(steps=[
    ('transformer', transformer),
    ('model', IsolationForest(contamination=iso_contamination, random_state=42)),
]).fit(X_train, y_train)
# The map turns the detector's predictions into labels: 1 -> 0 and -1 -> 1.
ISO_scores = score_model(ISO, X_test, y_test, map={1:0, -1:1})

In [None]:
# Local outlier factor in novelty mode (so it can predict on unseen data),
# with 10 neighbours. The contamination is set to the mean of the training
# labels (the share of positives, assuming 0/1 labels).
lof_contamination = sum(y_train)/len(y_train)
LOF = Pipeline(steps=[
    ('transformer', transformer),
    ('model', LocalOutlierFactor(n_neighbors=10, novelty=True, contamination=lof_contamination)),
]).fit(X_train, y_train)
# The map turns the detector's predictions into labels: 1 -> 0 and -1 -> 1.
LOF_scores = score_model(LOF, X_test, y_test, map={1:0, -1:1})

In [None]:
# Gather each model's metric dict under its display name (insertion order is
# the order the models are listed here).
scores = dict([
    ('Decision Tree', desc_tree_scores),
    ('Random Forest', random_forest_scores),
    ('K-NN', KNN_scores),
    ('MLP', MLP_scores),
    ('SVM', SVM_scores),
    ('LOF', LOF_scores),
    ('IF', ISO_scores),
])