In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

from os import listdir
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, roc_auc_score, recall_score, average_precision_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [3]:
def score_model(model, X_test, y_test, map = None):
    model_preds = model.predict(X_test)
    scores = {}
    if map:
        model_preds = [map[i] for i in model_preds]
    evaluation_funcs = {'accuracy':accuracy_score, 'precision':precision_score, 'recall':recall_score, 'f1':f1_score, 'MCC':matthews_corrcoef}
    for name, score in evaluation_funcs.items():
        scores[name] = score(y_pred = model_preds, y_true = y_test)
    scores['ROC-AUC'] = roc_auc_score(y_score = model_preds, y_true = y_test)
    scores['AUPR'] = average_precision_score(y_score = model_preds, y_true = y_test)
    return scores

In [4]:
transaction_df = pd.read_csv('original_data.csv')
#Split data to x, y, train, test
y = transaction_df['fraud']
X = transaction_df.drop(columns=['fraud'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
#Fit transformer to data
transformer = ColumnTransformer([('One Hot Encoder', OneHotEncoder(drop='first'), ['category', 'gender']),
                                 ('Age Pipe', Pipeline([('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)), ('scale', MinMaxScaler())]), ['age']),
                                 ('MinMaxScaler', MinMaxScaler(), ['amount', 'step']),
                                 ('drop', 'drop', ['zipcodeOri', 'zipMerchant', 'customer', 'merchant'])], remainder = MinMaxScaler(), sparse_threshold=0)
transformer.fit(X_train)
X_train_transformed = transformer.transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [6]:
roc_auc_scorer = make_scorer(roc_auc_score)

In [7]:
param_grid = {'criterion': ['gini', 'entropy']}

decision_tree_grid = GridSearchCV(DecisionTreeClassifier(random_state=42), 
                                    param_grid,
                                    scoring = roc_auc_scorer,
                                    verbose=1,
                                    n_jobs=-1)
decision_tree_grid.fit(X_train_transformed, y_train)
dec_tree_grid_scores = score_model(decision_tree_grid, X_test_transformed, y_test)
dec_tree_grid_scores

Fitting 5 folds for each of 2 candidates, totalling 10 fits


{'accuracy': 0.9913090857723277,
 'precision': 0.6225910064239829,
 'recall': 0.6645714285714286,
 'f1': 0.6428966279712548,
 'MCC': 0.6388506653966829,
 'ROC-AUC': 0.8298863023968837,
 'AUPR': 0.41770477555463403}

In [8]:
# Random Forest
param_grid = {'criterion': ['gini', 'entropy'],
              'class_weight':['balanced', 'balanced_subsample', None],}
random_forest_grid = GridSearchCV(RandomForestClassifier(random_state=42), 
                                    param_grid,
                                    scoring = roc_auc_scorer,
                                    verbose=1,
                                    n_jobs=-1)
random_forest_grid.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




KeyboardInterrupt: 

In [8]:
random_forest_scores_grid = score_model(random_forest_grid, X_test_transformed, y_test)
random_forest_scores_grid

{'accuracy': 0.9947464365233653,
 'precision': 0.862378459237098,
 'recall': 0.6588571428571428,
 'f1': 0.747003563330094,
 'MCC': 0.7512917590930227,
 'ROC-AUC': 0.8288023419426922,
 'AUPR': 0.5722000558522087}

In [9]:
# KNN
param_grid = {'n_neighbors':range(5, 15, 2),
                            'weights':['uniform', 'distance'],
                            'p':[1, 2]}
KNN_grid = GridSearchCV(KNeighborsClassifier(n_jobs = -1), 
                        param_grid,
                        scoring = roc_auc_scorer,
                        verbose=2,
                        n_jobs=-1)
KNN_grid.fit(X_train_transformed, y_train)                               

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ................n_neighbors=5, p=2, weights=uniform; total time= 3.5min
[CV] END ................n_neighbors=5, p=2, weights=uniform; total time= 3.7min
[CV] END ................n_neighbors=5, p=2, weights=uniform; total time= 3.4min
[CV] END ................n_neighbors=5, p=2, weights=uniform; total time= 3.6min
[CV] END ................n_neighbors=5, p=2, weights=uniform; total time= 3.7min
[CV] END ...............n_neighbors=5, p=2, weights=distance; total time= 3.5min
[CV] END ...............n_neighbors=5, p=2, weights=distance; total time= 3.4min
[CV] END ...............n_neighbors=5, p=2, weights=distance; total time= 3.5min
[CV] END ...............n_neighbors=5, p=2, weights=distance; total time= 3.6min
[CV] END ...............n_neighbors=5, p=2, weights=distance; total time= 3.4min
[CV] END ................n_neighbors=5, p=1, weights=uniform; total time=18.1min
[CV] END ...............n_neighbors=5, p=1, wei

In [10]:
KNN_grid_scores = score_model(KNN_grid, X_test_transformed, y_test)
KNN_grid_scores   

{'accuracy': 0.9922171921351263,
 'precision': 0.743631881676253,
 'recall': 0.5171428571428571,
 'f1': 0.6100438153016514,
 'MCC': 0.6164682229734424,
 'ROC-AUC': 0.757509561182329,
 'AUPR': 0.39024798911231584}

In [11]:
#Multilayer perceptron
param_grid = {'hidden_layer_sizes': [(10, 10, 10), (15, 15, 15), (20, 20, 20),],
            'activation': ['tanh', 'relu'],
            'solver':['sgd', 'adam'],
            'alpha': [.001, .01, .1],
            'learning_rate':['constant', 'adaptive']}
MLP_grid = GridSearchCV(MLPClassifier(random_state=42), 
                        param_grid,
                        scoring = roc_auc_scorer,
                        verbose=2,
                        n_jobs=-1)
MLP_grid.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(10, 10, 10), learning_rate=constant, solver=adam; total time=  40.3s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(10, 10, 10), learning_rate=constant, solver=adam; total time=  40.7s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(10, 10, 10), learning_rate=constant, solver=adam; total time=  40.8s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(10, 10, 10), learning_rate=constant, solver=adam; total time=  40.9s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(10, 10, 10), learning_rate=constant, solver=adam; total time=  42.3s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(10, 10, 10), learning_rate=adaptive, solver=adam; total time=  39.9s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(10, 10, 10), learning_rate=adaptive, solver=adam; total time=  41.9s
[CV] END activation=tanh, alpha=0.001, hid

In [12]:
MLP_grid_scores = score_model(MLP_grid, X_test_transformed, y_test)
MLP_grid_scores

{'accuracy': 0.9950222317891041,
 'precision': 0.9279661016949152,
 'recall': 0.6257142857142857,
 'f1': 0.7474402730375427,
 'MCC': 0.7598212674065462,
 'ROC-AUC': 0.8125678520620356,
 'AUPR': 0.5850476440271268}

In [19]:
param_grid = {'C': [2**i for i in range(-5, 15, 2)], 
              'gamma': [2**i for i in range(-15, 3, 2)],
              'class_weight':['balanced', {0:1, 1:1}]}
#Support Vector machine
SVM_grid = GridSearchCV(SVC(kernel='rbf', random_state=42),
                    param_grid,   
                    scoring = roc_auc_scorer,
                    verbose=2,
                    n_jobs=-1)
SVM_grid.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[CV] END C=0.03125, class_weight=balanced, gamma=0.00048828125; total time=553.0min
[CV] END C=0.03125, class_weight=balanced, gamma=0.00048828125; total time=554.4min
[CV] END C=0.03125, class_weight=balanced, gamma=0.0001220703125; total time=640.4min
[CV] END C=0.03125, class_weight=balanced, gamma=3.0517578125e-05; total time=640.8min
[CV] END C=0.03125, class_weight=balanced, gamma=3.0517578125e-05; total time=643.2min
[CV] END C=0.03125, class_weight=balanced, gamma=3.0517578125e-05; total time=648.6min
[CV] END C=0.03125, class_weight=balanced, gamma=3.0517578125e-05; total time=652.2min
[CV] END C=0.03125, class_weight=balanced, gamma=0.0001220703125; total time=653.0min
[CV] END C=0.03125, class_weight=balanced, gamma=3.0517578125e-05; total time=656.1min
[CV] END C=0.03125, class_weight=balanced, gamma=0.0001220703125; total time=658.1min
[CV] END C=0.03125, class_weight=balanced, gamma=0.0001220703125; total time=664.3min
[CV] END C=0.03125, class_weight=balanced, gamma=0.00

In [None]:
SVM_grid_scores = score_model(SVM_grid, X_test_transformed, y_test)
SVM_grid_scores

{'accuracy': 0.93530246668595,
 'precision': 0.15167345493182222,
 'recall': 0.9788571428571429,
 'f1': 0.26264949402023924,
 'MCC': 0.3719260199574464,
 'ROC-AUC': 0.9568203936883068,
 'AUPR': 0.14871553315237993}

In [16]:
y_train_transformed = y_train.map({0:-1, 1:1})
y_test_transformed = y_test.map({0:-1, 1:1})

param_grid = {'n_estimators': range(100, 900, 100), 
              'max_samples': range(100, 500, 100), 
              'max_features': [.25, .5, .75, 1.0]}

ISO_grid = GridSearchCV(IsolationForest(random_state=42, n_jobs=-1),
                        param_grid,
                        scoring = roc_auc_scorer,
                        verbose=2,
                        n_jobs=-1)
ISO_grid.fit(X_train_transformed, y_train_transformed)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[CV] END max_features=0.25, max_samples=100, n_estimators=100; total time=  28.2s
[CV] END max_features=0.25, max_samples=100, n_estimators=100; total time=  28.2s
[CV] END max_features=0.25, max_samples=100, n_estimators=100; total time=  28.4s
[CV] END max_features=0.25, max_samples=100, n_estimators=100; total time=  28.4s
[CV] END max_features=0.25, max_samples=100, n_estimators=100; total time=  28.6s
[CV] END max_features=0.25, max_samples=100, n_estimators=200; total time=  52.4s
[CV] END max_features=0.25, max_samples=100, n_estimators=200; total time=  52.5s
[CV] END max_features=0.25, max_samples=100, n_estimators=200; total time=  52.5s
[CV] END max_features=0.25, max_samples=100, n_estimators=200; total time=  52.5s
[CV] END max_features=0.25, max_samples=100, n_estimators=200; total time=  52.7s
[CV] END max_features=0.25, max_samples=100, n_estimators=300; total time= 1.3min
[CV] END max_features=0.25, max_samples=100, n_estimators=300; total time= 1.4min
[CV] END max_fea



[CV] END max_features=0.5, max_samples=200, n_estimators=400; total time= 2.6min
[CV] END max_features=0.5, max_samples=200, n_estimators=400; total time= 2.6min
[CV] END max_features=0.5, max_samples=200, n_estimators=400; total time= 2.7min
[CV] END max_features=0.5, max_samples=200, n_estimators=500; total time= 3.2min
[CV] END max_features=0.5, max_samples=200, n_estimators=500; total time= 3.0min
[CV] END max_features=0.5, max_samples=200, n_estimators=500; total time= 3.0min
[CV] END max_features=0.5, max_samples=200, n_estimators=500; total time= 3.1min
[CV] END max_features=0.5, max_samples=200, n_estimators=500; total time= 3.2min
[CV] END max_features=0.5, max_samples=200, n_estimators=600; total time= 3.9min
[CV] END max_features=0.5, max_samples=200, n_estimators=600; total time= 3.6min
[CV] END max_features=0.5, max_samples=200, n_estimators=600; total time= 3.7min
[CV] END max_features=0.5, max_samples=200, n_estimators=600; total time= 3.7min
[CV] END max_features=0.5, m

In [17]:
ISO_grid_scores = score_model(ISO_grid, X_test_transformed, y_test, map={-1:1, 1:0})
ISO_grid_scores

{'accuracy': 0.8752060056100793,
 'precision': 0.08237224100218732,
 'recall': 0.9468571428571428,
 'f1': 0.15155949876520627,
 'MCC': 0.2589415920796484,
 'ROC-AUC': 0.9106048209946352,
 'AUPR': 0.07862032914933231}

In [None]:
# param_grid = {'n_neighbors':range(10, 100, 5),
#               'p':[1, 2],
#               'contamination':['auto', sum(y_train)/len(y_train)]}

# LOF_grid = GridSearchCV(LocalOutlierFactor(novelty=True, n_jobs=-1),
#                         param_grid,
#                         scoring = scorer_dict,
#                         refit='accuracy',
#                         verbose=2,
#                         n_jobs=-1)
# LOF_grid.fit(X_train_transformed, y_train)

In [None]:
# LOF_grid_scores = score_model(LOF_grid, X_test_transformed, y_test, map={1:0, -1:1})
# LOF_grid_scores

In [None]:
grid_scores = {
            'Decision Tree': dec_tree_grid_scores,
            'Random Forest': random_forest_scores_grid,
            'K-NN': KNN_grid_scores,
            'MLP': MLP_grid_scores,
            'SVM': SVM_grid_scores,
            # 'LOF': LOF_grid_scores,
            # 'IF': ISO_grid_scores
            }

In [None]:
grid_scores_df = pd.DataFrame(grid_scores)

In [1]:
grid_scores_df.to_csv('grid_search.csv')

NameError: name 'grid_scores_df' is not defined