In [12]:
import pickle
from pathlib import Path

import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve

In [2]:
# Artifact directories, resolved relative to the current working directory.
data_dir = Path.cwd().joinpath('data')
pkl_dir = data_dir.joinpath('pkl')

# Test set: the 'Class' column is the label, every other column is a feature.
credit_test = pd.read_pickle(pkl_dir.joinpath('credit_test.pkl'))
y_test = credit_test['Class']
X_test = credit_test.drop(columns=['Class'])

# Unpickle the two stored models (presumably the best estimators found with
# SMOTE and ADASYN resampling -- confirm against the training notebook).
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with (pkl_dir / 'smote_best.pkl').open('rb') as fp:
    smote_best = pickle.load(fp)

with (pkl_dir / 'adasyn_best.pkl').open('rb') as fp:
    adasyn_best = pickle.load(fp)

## Evaluation
My primary metric is recall. I will also look at precision, because the number of false positives affects the business decision: auditing too many non-fraud accounts may be more costly than missing a few fraudulent ones.

In [21]:
# Predict on the test features with the model loaded as `smote_best`.
y_pred_smote = smote_best.predict(X_test)

# Confusion matrix and recall, both measured against the true test labels.
conf_mtx_smote = confusion_matrix(y_test, y_pred_smote)
recall_smote = recall_score(y_test, y_pred_smote)

# Report: header, recall as a percentage, then the raw confusion matrix
# (scikit-learn layout: rows are true classes, columns are predicted classes).
print('SMOTE', f'Recall: {recall_smote:.2%}', conf_mtx_smote, sep='\n')

SMOTE
Recall: 94.06%
[[55436  1425]
 [    6    95]]


In [20]:
# Predict on the test features with the model loaded as `adasyn_best`.
y_pred_adasyn = adasyn_best.predict(X_test)

# Confusion matrix and recall, both measured against the true test labels.
conf_mtx_adasyn = confusion_matrix(y_test, y_pred_adasyn)
recall_adasyn = recall_score(y_test, y_pred_adasyn)

# Report: header, recall as a percentage, then the raw confusion matrix
# (scikit-learn layout: rows are true classes, columns are predicted classes).
print('ADASYN', f'Recall: {recall_adasyn:.2%}', conf_mtx_adasyn, sep='\n')

ADASYN
Recall: 97.03%
[[51404  5457]
 [    3    98]]


Let $F$ be the cost of leaving a fraudulent account uncaught.

Let $f$ be the number of uncaught fraudulent accounts (FN).

Let $A$ be the cost of auditing an account to see if it is fraudulent.

Let $a$ be the number of accounts flagged as fraudulent (TP + FP).


The two models break even when their total costs are equal:

$$\text{Cost}_{\tiny{SMOTE}} = \text{Cost}_{\tiny{ADASYN}}$$

$$F \cdot f_{\tiny{SMOTE}} + A \cdot a_{\tiny{SMOTE}} = F \cdot f_{\tiny{ADASYN}} + A \cdot a_{\tiny{ADASYN}}$$

$$6F + 1520A = 3F + 5555A$$

$$3F = 4035A$$

$$F = 1345A$$


Thus, if the cost of leaving a fraudulent account uncaught is more than ~1,345 times the cost per audit, the ADASYN model should be used; below that ratio, the SMOTE model is cheaper.

In [37]:
# Illustrative unit costs (assumed figures for the what-if comparison).
audit = 50            # cost of auditing one flagged account
fraud = audit * 2000  # cost of one uncaught fraudulent account (2000x an audit)

# The ratio is rendered with the percent format, so 2000x prints as 200,000.00%.
print(f'F_cost/A_cost = {fraud}/{audit} = {fraud / audit:,.2%}')

# Derive the counts from the confusion matrices computed in the evaluation
# cells instead of hard-coding them, so they cannot drift out of sync when a
# model or the test set changes. scikit-learn layout is [[TN, FP], [FN, TP]]:
#   [1, 0]          -> uncaught fraudulent accounts (FN)
#   column 1 summed -> accounts flagged for audit (FP + TP)
# int() converts NumPy integers to plain Python ints for formatting.
smote_missed = int(conf_mtx_smote[1, 0])
smote_flagged = int(conf_mtx_smote[:, 1].sum())
smote_cost = smote_missed * fraud + smote_flagged * audit
print(f'SMOTE cost: ${smote_cost:,}')

adasyn_missed = int(conf_mtx_adasyn[1, 0])
adasyn_flagged = int(conf_mtx_adasyn[:, 1].sum())
adasyn_cost = adasyn_missed * fraud + adasyn_flagged * audit
print(f'ADASYN cost: ${adasyn_cost:,}')

# ADASYN is preferable only when its total cost is strictly lower.
print(f'Use ADASYN: {adasyn_cost < smote_cost}')

F_cost/A_cost = 100000/50 = 200,000.00%
SMOTE cost: $676,000
ADASYN cost: $577,750
Use ADASYN: True
