In [44]:
from itertools import chain
from itertools import cycle
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from imblearn.metrics import specificity_score
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import NeighbourhoodCleaningRule

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import FeatureUnion
from sklearn.utils.multiclass import unique_labels

# from xgboost import XGBoostClassifier

from transformers import AmountCentsOnly, Log1pAmount, TimeToHour

In [38]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print(f'Recall: {recall_score(y_true, y_pred)}')
    print(f'Precision: {precision_score(y_true, y_pred)}')
    print(f'Specificity: {specificity_score(y_true, y_pred)}')
    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [39]:
data_dir = Path.cwd() / 'data'
pkl_dir = data_dir / 'pkl'

credit = pd.read_pickle(pkl_dir / 'credit_train.pkl')

In [52]:
hour = TimeToHour()
log1p = Log1pAmount()
cents = AmountCentsOnly()

# samplers
smote = SMOTE(random_state=0)
adasyn = ADASYN(random_state=0)
ncr = NeighbourhoodCleaningRule(random_state=0)

# classifiers
logreg = LogisticRegression(solver='liblinear', random_state=0)
gaussnb = GaussianNB()
randforest = RandomForestClassifier(random_state=0)

X, y = credit.drop(columns=['Class']), credit['Class']

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2)

class_names = np.array(['Non-Fraud', 'Fraud'])

In [82]:
is_fraud = y_tr == 1
X_tr_nf = X_tr[~is_fraud]
y_tr_nf = y_tr[~is_fraud]
X_tr_fr = X_tr[is_fraud]
y_tr_fr = y_tr[is_fraud]

data_tr_nf = pd.concat([X_tr_nf, y_tr_nf], axis=1)
data_tr_fr = pd.concat([X_tr_fr, y_tr_fr], axis=1)

loc = LocalOutlierFactor(contamination='auto')
pred = loc.fit_predict(data_tr_nf)
is_inlier = pred == 1

data_in = pd.concat([data_tr_nf[is_inlier], data_tr_fr])
X_tr_in, y_tr_in = data_in.drop(columns=['Class']), data_in['Class']

len(X_tr), len(X_tr_in)

(182276, 178751)

In [67]:
# unfortunately imblearn doesn't allow FeatureUnions or nested Pipelines
# so here is a workaround for trying to reduce code repetition

def construct_pipeline(transform_steps, resample_steps, classifier):
    return Pipeline(list(chain(transform_steps, resample_steps, [('classifier', classifier)])))


transform_steps = [
    ('hour', hour),
    ('log1p', log1p),
    ('cents', cents),
]

nosample_pipe = partial(construct_pipeline, transform_steps, [])
adasyn_pipe = partial(construct_pipeline, transform_steps, [('adasyn', adasyn)])
smote_pipe = partial(construct_pipeline, transform_steps, [('smote', smote)])
ncr_smote_pipe = partial(construct_pipeline, transform_steps, [('ncr', ncr), ('smote', smote)])

nosample_pipes = []
adasyn_pipes = []
smote_pipes = []
ncr_smote_pipes = []

classifiers = [logreg, gauss_nb, randforest]
for clf in classifiers:
    nosample_pipes.append(nosample_pipe(clf))
    adasyn_pipes.append(adasyn_pipe(clf))
    smote_pipes.append(smote_pipe(clf))
    ncr_smote_pipes.append(ncr_smote_pipe(clf))

all_pipes = list(chain(nosample_pipes, adasyn_pipes, smote_pipes, ncr_smote_pipes))

In [68]:
y_preds = []
for pipe in all_pipes:
    y_pred = pipe.fit(X_tr, y_tr).predict(X_val)
    y_preds.append(y_pred)



In [84]:
y_in_preds = []
for pipe in all_pipes:
    y_in_pred = pipe.fit(X_tr_in, y_tr_in).predict(X_val)
    y_in_preds.append(y_in_pred)

In [85]:
from itertools import cycle
sampler_names = cycle(['none', 'adasyn', 'smote', 'ncr smote'])
estimator_names = cycle(['log reg', 'gauss nb', 'rand forest'])

for idx, (y_pred) in enumerate(y_in_preds):
    cm = confusion_matrix(y_val, y_pred)
    if idx % len(classifiers) == 0:
        print('=' * 100)
        print(next(sampler_names).upper())
    print(next(estimator_names))
    print(idx)
    print(cm)
    print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
    print(f'Recall: {recall_score(y_val, y_pred)}')
    print(f'Precision: {precision_score(y_val, y_pred)}')
    print(f'Specificity: {specificity_score(y_val, y_pred)}')
    print('-' * 50)

NONE
log reg
0
[[45466    18]
 [   31    54]]
Accuracy: 0.9989247075862977
Recall: 0.6352941176470588
Precision: 0.75
Specificity: 0.9996042564418257
--------------------------------------------------
gauss nb
1
[[45022   462]
 [   26    59]]
Accuracy: 0.9892909653492505
Recall: 0.6941176470588235
Precision: 0.11324376199616124
Specificity: 0.9898425820068596
--------------------------------------------------
rand forest
2
[[45482     2]
 [   23    62]]
Accuracy: 0.9994513814215804
Recall: 0.7294117647058823
Precision: 0.96875
Specificity: 0.9999560284935362
--------------------------------------------------
ADASYN
log reg
3
[[44535   949]
 [   10    75]]
Accuracy: 0.9789549913318265
Recall: 0.8823529411764706
Precision: 0.0732421875
Specificity: 0.9791355201829215
--------------------------------------------------
gauss nb
4
[[44976   508]
 [   19    66]]
Accuracy: 0.988435120366916
Recall: 0.7764705882352941
Precision: 0.11498257839721254
Specificity: 0.9888312373581919
-------------

In [75]:
from itertools import cycle
sampler_names = cycle(['none', 'adasyn', 'smote', 'ncr smote'])
estimator_names = cycle(['log reg', 'gauss nb', 'rand forest'])

for idx, (y_pred) in enumerate(y_preds):
    cm = confusion_matrix(y_val, y_pred)
    if idx % len(classifiers) == 0:
        print('=' * 100)
        print(next(sampler_names).upper())
    print(next(estimator_names))
    print(idx)
    print(cm)
    print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
    print(f'Recall: {recall_score(y_val, y_pred)}')
    print(f'Precision: {precision_score(y_val, y_pred)}')
    print(f'Specificity: {specificity_score(y_val, y_pred)}')
    print('-' * 50)

NONE
log reg
0
[[45476     8]
 [   38    47]]
Accuracy: 0.998990541815708
Recall: 0.5529411764705883
Precision: 0.8545454545454545
Specificity: 0.9998241139741447
--------------------------------------------------
gauss nb
1
[[45157   327]
 [   28    57]]
Accuracy: 0.9922096161864425
Recall: 0.6705882352941176
Precision: 0.1484375
Specificity: 0.9928106586931669
--------------------------------------------------
rand forest
2
[[45481     3]
 [   26    59]]
Accuracy: 0.9993636024490333
Recall: 0.6941176470588235
Precision: 0.9516129032258065
Specificity: 0.9999340427403043
--------------------------------------------------
ADASYN
log reg
3
[[44720   764]
 [   10    75]]
Accuracy: 0.983014768812131
Recall: 0.8823529411764706
Precision: 0.08939213349225268
Specificity: 0.983202884530824
--------------------------------------------------
gauss nb
4
[[45103   381]
 [   21    64]]
Accuracy: 0.9911782132590138
Recall: 0.7529411764705882
Precision: 0.14382022471910114
Specificity: 0.9916234280