In [2]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

# samplers
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE

# transformers
from sklearn.preprocessing import StandardScaler
from transformers import AmountCentsOnly
from transformers import Log1pAmount
from transformers import TimeToHour

# estimators
from sklearn.linear_model import LogisticRegression

# misc
from imblearn.pipeline import Pipeline

from sklearn.base import clone

from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.neighbors import LocalOutlierFactor

In [3]:
# Data locations are resolved relative to the directory the notebook runs from.
data_dir = Path.cwd().joinpath('data')
pkl_dir = data_dir.joinpath('pkl')

# 'Class' is the target column; every other column is used as a feature.
credit = pd.read_pickle(pkl_dir.joinpath('credit_train.pkl'))
y = credit['Class']
X = credit.drop(columns=['Class'])


# Experiment: remove outliers from non-fraud samples to see if it performs better.
is_fraud = y.eq(1)
fraud = credit.loc[is_fraud]
nonfraud = credit.loc[~is_fraud]

# NOTE: `loc` is the LocalOutlierFactor model (not pandas' .loc indexer).
# fit_predict labels inliers with 1, which is what the mask below keeps.
loc = LocalOutlierFactor(contamination='auto')
loc_pred = loc.fit_predict(nonfraud)
is_inlier = loc_pred == 1

# Filtered data set: the non-fraud inliers followed by every fraud row.
credit_in = pd.concat([nonfraud.loc[is_inlier], fraud])
y_in = credit_in['Class']
X_in = credit_in.drop(columns=['Class'])

## Pipelines

In [4]:
def pipeline_name(pipeline, sep=' -> ', method='keys'):
    """Build a one-line description of a pipeline's steps.

    Parameters
    ----------
    pipeline : object
        Anything exposing a dict-like ``named_steps`` attribute.
    sep : str
        Text placed between consecutive step labels.
    method : {'keys', 'values'}
        ``'keys'`` labels each step with its name in ``named_steps``;
        ``'values'`` labels each step with the class name of the step object.

    Returns
    -------
    str
        The step labels joined by ``sep``, in ``named_steps`` order.
    """
    assert method in ['keys', 'values']
    named = pipeline.named_steps
    if method == 'keys':
        labels = named.keys()
    else:
        labels = (type(step).__name__ for step in named.values())
    return sep.join(labels)

In [5]:
# transformers
hour = TimeToHour()
log1p = Log1pAmount()
cents = AmountCentsOnly()
scaler = StandardScaler()

transform_steps = [
    ('hour', hour),
    ('log1p', log1p),
    ('cents', cents),
    ('scaler', scaler),
]


# samplers
adasyn = ADASYN(random_state=0)
smote = SMOTE(random_state=0)


# classifier
logreg = LogisticRegression(solver='liblinear', random_state=0)

# pipelines
no_sampling_pipe = Pipeline(transform_steps + [('logreg', logreg)])
smote_pipe = Pipeline(transform_steps + [('smote', smote), ('logreg', logreg)])
adasyn_pipe = Pipeline(transform_steps + [('adasyn', adasyn), ('logreg', logreg)])

all_pipes = [no_sampling_pipe, smote_pipe, adasyn_pipe]

In [5]:
# Display the step names of every pipeline to confirm how each was assembled.
[pipeline_name(pipe) for pipe in all_pipes]

['hour -> log1p -> cents -> scaler -> logreg',
 'hour -> log1p -> cents -> scaler -> smote -> logreg',
 'hour -> log1p -> cents -> scaler -> adasyn -> logreg']

## Cross Validation

In [7]:
def pipeline_cv(X, y, pipelines):
    """Print per-fold recall and confusion matrices for each pipeline.

    Every pipeline is fitted and evaluated on the same 5 stratified folds of
    ``(X, y)``; the mean recall over the folds is printed last.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels aligned with ``X``. Recall is computed with
        ``recall_score``'s defaults.
    pipelines : iterable
        Pipelines exposing ``named_steps``, ``fit`` and ``predict``. Each one
        is fitted in place, so it is left fitted on the last training fold.

    Returns
    -------
    None
        Results are only printed.
    """
    # The folds are not shuffled, so they are already deterministic. Passing
    # `random_state` without `shuffle=True` has no effect and raises a
    # ValueError on scikit-learn >= 0.24, hence it is omitted. Keeping
    # shuffle off also yields the same folds as `GridSearchCV(cv=5)`.
    skf = StratifiedKFold(n_splits=5)
    for pipe in pipelines:
        steps = pipeline_name(pipe)
        scores = []
        print('=' * 100)
        print(steps, '\n')
        for train_idx, test_idx in skf.split(X, y):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
            # Refit from scratch on this fold's training part, then score on
            # the held-out part.
            y_pred = pipe.fit(X_train, y_train).predict(X_test)
            cm = confusion_matrix(y_test, y_pred)
            score = recall_score(y_test, y_pred)
            print(f"Recall: {score:.2%}")
            print(cm, '\n')
            scores.append(score)
        print(f"\nMean Recall: {np.mean(scores)}")

In [10]:
# Baseline comparison: cross-validate every pipeline on the unfiltered data.
pipeline_cv(X, y, all_pipes)

hour -> log1p -> cents -> scaler -> logreg 

Recall: 55.70%
[[45484     7]
 [   35    44]] 

Recall: 60.26%
[[45484     7]
 [   31    47]] 

Recall: 66.67%
[[45481    10]
 [   26    52]] 

Recall: 57.69%
[[45486     5]
 [   33    45]] 

Recall: 56.41%
[[45485     5]
 [   34    44]] 


Mean Recall: 0.5934436871145732
hour -> log1p -> cents -> scaler -> smote -> logreg 

Recall: 87.34%
[[44416  1075]
 [   10    69]] 

Recall: 92.31%
[[44312  1179]
 [    6    72]] 

Recall: 89.74%
[[44289  1202]
 [    8    70]] 

Recall: 88.46%
[[44308  1183]
 [    9    69]] 

Recall: 93.59%
[[44272  1218]
 [    5    73]] 


Mean Recall: 0.9028886725089256
hour -> log1p -> cents -> scaler -> adasyn -> logreg 

Recall: 89.87%
[[41708  3783]
 [    8    71]] 

Recall: 93.59%
[[41167  4324]
 [    5    73]] 

Recall: 89.74%
[[41546  3945]
 [    8    70]] 

Recall: 92.31%
[[41141  4350]
 [    6    72]] 

Recall: 96.15%
[[41304  4186]
 [    3    75]] 


Mean Recall: 0.9233365790327817


In [8]:
# Same comparison on the data with the LOF-flagged non-fraud outliers removed.
pipeline_cv(X_in, y_in, all_pipes)

hour -> log1p -> cents -> scaler -> logreg 

Recall: 64.56%
[[44726     6]
 [   28    51]] 

Recall: 62.82%
[[44727     4]
 [   29    49]] 

Recall: 74.36%
[[44724     7]
 [   20    58]] 

Recall: 66.67%
[[44728     3]
 [   26    52]] 

Recall: 62.82%
[[44727     4]
 [   29    49]] 


Mean Recall: 0.6624472573839661
hour -> log1p -> cents -> scaler -> smote -> logreg 

Recall: 87.34%
[[43647  1085]
 [   10    69]] 

Recall: 92.31%
[[43572  1159]
 [    6    72]] 

Recall: 89.74%
[[43540  1191]
 [    8    70]] 

Recall: 88.46%
[[43550  1181]
 [    9    69]] 

Recall: 93.59%
[[43524  1207]
 [    5    73]] 


Mean Recall: 0.9028886725089256
hour -> log1p -> cents -> scaler -> adasyn -> logreg 

Recall: 89.87%
[[40957  3775]
 [    8    71]] 

Recall: 94.87%
[[40558  4173]
 [    4    74]] 

Recall: 84.62%
[[40856  3875]
 [   12    66]] 

Recall: 89.74%
[[40588  4143]
 [    8    70]] 

Recall: 94.87%
[[40669  4062]
 [    4    74]] 


Mean Recall: 0.9079519636481661


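Interestingly, removing non-fraud outliers does not improve recall for the resampled pipelines: SMOTE is unchanged (~90.3%) and ADASYN drops slightly (~92.3% → ~90.8%). Only the no-sampling baseline improves (~59.3% → ~66.2%). I will therefore keep the full, unfiltered data for the final evaluation.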

While ADASYN achieved roughly 2 percentage points higher recall (my metric of choice) than SMOTE, SMOTE produced roughly 3.5x fewer false positives. Thus, depending on the cost of letting a fraudster go uncaught versus the cost of auditing people who were wrongly flagged as fraudsters, we would choose one variant over the other.

## Grid Search

### SMOTE - Logistic Regression

In [39]:
# Search space for the SMOTE pipeline. Keys use the `<step>__<parameter>`
# form to address parameters of the named steps in `smote_pipe`.
smote_params = dict(
    smote__k_neighbors=range(4, 6),          # 4 and 5 neighbours
    smote__random_state=[0],
    logreg__C=np.power(10.0, range(-2, 3)),  # 0.01, 0.1, 1, 10, 100
    logreg__penalty=['l1', 'l2'],
    logreg__solver=['liblinear'],            # liblinear supports both penalties
    logreg__random_state=[0],
)
# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises a TypeError there). Without it, the score
# is the plain mean over folds, i.e. the former `iid=False` behaviour.
smote_gridcv = GridSearchCV(smote_pipe, smote_params, scoring='recall', cv=5, verbose=2, n_jobs=1)

In [None]:
# Run the search on the unfiltered data: 20 parameter combinations x 5 folds.
smote_gridcv.fit(X, y)

In [51]:
# Mean cross-validated recall of the best parameter combination.
smote_gridcv.best_score_

0.9054527750730281

In [44]:
# Parameter combination that achieved the best mean recall.
smote_gridcv.best_params_

{'logreg__C': 0.01,
 'logreg__penalty': 'l1',
 'logreg__random_state': 0,
 'logreg__solver': 'liblinear',
 'smote__k_neighbors': 4,
 'smote__random_state': 0}

In [87]:
# Keep the pipeline that GridSearchCV refit with the best parameters.
smote_best = smote_gridcv.best_estimator_

In [88]:
# Serialize the best SMOTE pipeline into the pickle directory.
with (pkl_dir / 'smote_best.pkl').open('wb') as sink:
    pickle.dump(smote_best, sink)

### ADASYN - Logistic Regression

In [47]:
# Search space for the ADASYN pipeline. Keys use the `<step>__<parameter>`
# form to address parameters of the named steps in `adasyn_pipe`.
adasyn_params = dict(
    adasyn__n_neighbors=range(4, 6),         # 4 and 5 neighbours
    adasyn__random_state=[0],
    logreg__C=np.power(10.0, range(-2, 3)),  # 0.01, 0.1, 1, 10, 100
    logreg__penalty=['l1', 'l2'],
    logreg__solver=['liblinear'],            # liblinear supports both penalties
    logreg__random_state=[0],
)
# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (passing it raises a TypeError there). Without it, the score
# is the plain mean over folds, i.e. the former `iid=False` behaviour.
adasyn_gridcv = GridSearchCV(adasyn_pipe, adasyn_params, scoring='recall', cv=5, verbose=2, n_jobs=1)

In [None]:
# Run the search on the unfiltered data: 20 parameter combinations x 5 folds.
adasyn_gridcv.fit(X, y)

In [52]:
# Mean cross-validated recall of the best parameter combination.
adasyn_gridcv.best_score_

0.9233365790327817

In [49]:
# Parameter combination that achieved the best mean recall.
adasyn_gridcv.best_params_

{'adasyn__n_neighbors': 5,
 'adasyn__random_state': 0,
 'logreg__C': 1.0,
 'logreg__penalty': 'l1',
 'logreg__random_state': 0,
 'logreg__solver': 'liblinear'}

In [85]:
# Keep the pipeline that GridSearchCV refit with the best parameters.
adasyn_best = adasyn_gridcv.best_estimator_

In [86]:
# Serialize the best ADASYN pipeline into the pickle directory.
with (pkl_dir / 'adasyn_best.pkl').open('wb') as sink:
    pickle.dump(adasyn_best, sink)