In [51]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

from os import listdir
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, roc_auc_score, recall_score, average_precision_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [52]:
# Function used to score the models, returns a dictionary of the scores
def score_model(model, X_test, y_test, map = None):
    model_preds = model.predict(X_test)
    scores = {}
    if map:
        model_preds = [map[i] for i in model_preds]
    evaluation_funcs = {'accuracy':accuracy_score, 'precision':precision_score, 'recall':recall_score, 'f1':f1_score, 'MCC':matthews_corrcoef}
    for name, score in evaluation_funcs.items():
        scores[name] = score(y_pred = model_preds, y_true = y_test)
    scores['ROC-AUC'] = roc_auc_score(y_score = model_preds, y_true = y_test)
    scores['AUPR'] = average_precision_score(y_score = model_preds, y_true = y_test)
    return scores

In [53]:
transaction_df = pd.read_csv('original_data.csv')
transaction_df.replace("'",'', regex=True, inplace=True) 
#Drop unused columns
transaction_df.drop(columns = ['step', 'age', 'gender', 'zipcodeOri', 'zipMerchant'], inplace=True)
#Split data to train, test
train, test = train_test_split(transaction_df, random_state=42)

In [54]:
# Create merchant fraud rate, 1 - 5 value of fraud rates in merchants in train set
mechant_fraud_rate = pd.cut(train.groupby('merchant').mean('fraud')['fraud'], bins = 5, labels=range(5))
train['merchant fraud rate'] = train['merchant'].apply(lambda x: mechant_fraud_rate.get(x))
test['merchant fraud rate'] = test['merchant'].apply(lambda x: mechant_fraud_rate.get(x))

In [55]:
# Create previous fraud per customer. 1 if there was fraud in the test set, 0 if not
customer_previous_fraud = (train.groupby('customer').sum('fraud')['fraud'] > 0).astype(int)
train['previous fraud'] = train['customer'].apply(lambda x: customer_previous_fraud.get(x))
test['previous fraud'] = test['customer'].apply(lambda x: customer_previous_fraud.get(x))

In [56]:
# Create category fraud rate, 1 - 5 value of fraud rate in train set
category_fraud_rate = pd.cut(train.groupby('category').mean('fraud')['fraud'], bins = 5, labels = range(5))
train['category fraud rate'] = train['category'].apply(lambda x: category_fraud_rate.get(x))
test['category fraud rate'] = test['category'].apply(lambda x: category_fraud_rate.get(x))

In [57]:
train.drop(columns=['customer', 'merchant', 'category'], inplace=True)
test.drop(columns=['customer', 'merchant', 'category'], inplace=True)
train.head()

Unnamed: 0,amount,fraud,merchant fraud rate,previous fraud,category fraud rate
458422,40.54,0,0,0,0
102312,38.63,0,0,0,0
253447,61.62,0,0,1,0
585032,41.24,0,0,0,0
136214,4.74,0,0,0,0


In [58]:
# Make feature and target vectors
y_train = train['fraud']
X_train = train.drop(columns = ['fraud'])
y_test = test['fraud']
X_test = test.drop(columns = ['fraud'])

In [59]:
#Fit transformer to data
MMscaler = MinMaxScaler()
X_train_transformed = MMscaler.fit_transform(X_train)
X_test_transformed = MMscaler.transform(X_test)

In [60]:
#Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_transformed, y_train)
dec_tree_scores = score_model(decision_tree, X_test_transformed, y_test)
dec_tree_scores

{'accuracy': 0.9935894417500218,
 'precision': 0.7455329636475663,
 'recall': 0.6914285714285714,
 'f1': 0.7174621998221168,
 'MCC': 0.7147432892156179,
 'ROC-AUC': 0.8443086727921764,
 'AUPR': 0.5191152174589402}

In [61]:
# Random Forest
random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)
random_forest.fit(X_train_transformed, y_train)
random_forest_scores = score_model(random_forest, X_test_transformed, y_test)
random_forest_scores

{'accuracy': 0.9936096218914174,
 'precision': 0.7442002442002442,
 'recall': 0.6965714285714286,
 'f1': 0.719598583234947,
 'MCC': 0.7167709718976021,
 'ROC-AUC': 0.8468596808368917,
 'AUPR': 0.521960512272778}

In [62]:
# KNN
KNN = KNeighborsClassifier(n_jobs=-1)
KNN.fit(X_train_transformed, y_train)
KNN_scores = score_model(KNN, X_test_transformed, y_test)
KNN_scores

{'accuracy': 0.9946522625301861,
 'precision': 0.8277282086479066,
 'recall': 0.6891428571428572,
 'f1': 0.7521047708138447,
 'MCC': 0.752640913994761,
 'ROC-AUC': 0.8437171698705824,
 'AUPR': 0.5740823149517458}

In [63]:
#Multilayer perceptron
MLP = MLPClassifier(hidden_layer_sizes=(15, 15, 15), random_state=42)
MLP.fit(X_train_transformed, y_train)
MLP_scores = score_model(MLP, X_test_transformed, y_test)
MLP_scores

{'accuracy': 0.9949616913649175,
 'precision': 0.802416918429003,
 'recall': 0.7588571428571429,
 'f1': 0.7800293685756241,
 'MCC': 0.7777928716069064,
 'ROC-AUC': 0.8783156527226883,
 'AUPR': 0.6117584833222367}

In [64]:
#Support Vector machine
SVM = SVC(random_state=42)
SVM.fit(X_train_transformed, y_train)
SVM_scores = score_model(SVM, X_test_transformed, y_test)
SVM_scores

{'accuracy': 0.9951567660650742,
 'precision': 0.8781204111600588,
 'recall': 0.6834285714285714,
 'f1': 0.7686375321336761,
 'MCC': 0.7723821554655222,
 'ROC-AUC': 0.8411493178085468,
 'AUPR': 0.6038591775857622}

In [65]:
#Isolation forest
y_train_transformed = y_train.map({0:1, 1:-1})

ISO = IsolationForest(contamination=sum(y_train)/len(y_train), random_state=42, n_jobs=-1)
ISO.fit(X_train_transformed, y_train_transformed)

In [66]:
ISO_scores = score_model(ISO, X_test_transformed, y_test, map={1:0, -1:1})
ISO_scores

{'accuracy': 0.9936768890294024,
 'precision': 0.7288135593220338,
 'recall': 0.7371428571428571,
 'f1': 0.7329545454545454,
 'MCC': 0.7297672799466504,
 'ROC-AUC': 0.8669377864343524,
 'AUPR': 0.5403339977904129}

In [67]:
#Local Outlier Factor
LOF = LocalOutlierFactor(novelty=True, contamination=sum(y_train)/len(y_train), n_jobs=-1)
LOF.fit(X_train_transformed, y_train_transformed)
LOF_scores = score_model(LOF, X_test_transformed, y_test, map={1:0, -1:1})
LOF_scores

{'accuracy': 0.9779027451719011,
 'precision': 0.0006506180871828237,
 'recall': 0.0005714285714285715,
 'f1': 0.0006084575600851841,
 'MCC': -0.010538893084049006,
 'ROC-AUC': 0.49505805944707054,
 'AUPR': 0.011765394215354621}

In [68]:
scores = {
            'Decision Tree': dec_tree_scores,
            'Random Forest': random_forest_scores,
            'K-NN': KNN_scores,
            'MLP': MLP_scores,
            'SVM': SVM_scores,
            'LOF': LOF_scores,
            'IF': ISO_scores
            }
scores_df = pd.DataFrame(scores)

In [70]:
(100 * scores_df).round(2).T.to_csv('engineered_features_binned.csv')