In [27]:
# Regular EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# we want our plots to appear inside the notebook
%matplotlib inline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

Load data

In [28]:
# Read the transaction table from the relative input path into `df`,
# which every later cell uses as the single source of features, labels and amounts.
df = pd.read_csv('../input/creditcardfraud/creditcard.csv')
# Show (rows, columns) first, then preview the first rows as the cell output.
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [29]:
# Names of the 27 features kept after our EDA: V1 through V28 with V25 left out
cols = [f'V{number}' for number in range(1, 29) if number != 25]
print(cols)


['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'V28']


In [30]:
# Feature matrix: only the columns listed in `cols` (V1..V28 without V25)
X = df[cols]

y = df['Class'] # target variable; the class counts shown later make 1 the rare class

# Displayed as the cell output to confirm the expected (rows, 27) shape
X.shape

(284807, 27)

In [31]:
# Number of rows per class label
val_count = df['Class'].value_counts()
# Inverse frequency: each class is weighted by 1 / (its row count), so the rarer
# class receives the larger weight. dict() yields {class label: weight}.
weights = dict(1 / val_count) # to be used as class weights
weights

{0: 3.51722561243691e-06, 1: 0.0020325203252032522}

In [32]:
# Cost charged for every false positive in cost_saving(); the same value is reused
# as the sample weight of non-fraud rows and in the BMR decision rule.
# Presumably in the same currency unit as df['Amount'] -- TODO confirm.
admin_cost = 2.5

In [33]:

# defining a function to calculate cost savings
def cost_saving(ytrue, ypred, amount, fp_cost=None):
    """Return the share of the total fraud amount that the predictions save.

    The incurred cost is ``fp_cost`` for each false positive plus the amount of
    each missed fraud (false negative). It is compared with the cost of catching
    nothing at all, i.e. the summed amount of every fraudulent row.

    Parameters
    ----------
    ytrue : array-like of 0/1
        True labels.
    ypred : array-like of 0/1
        Predicted labels, same length as ``ytrue``.
    amount : array-like of numbers
        Transaction amounts, aligned with the labels by position.
    fp_cost : float, optional
        Cost of one false positive. When omitted, the notebook-level
        ``admin_cost`` is used, which keeps existing three-argument calls working.

    Returns
    -------
    float
        ``1 - cost / max_cost``. NaN when the fraudulent rows sum to zero,
        because the ratio is undefined in that case.
    """
    if fp_cost is None:
        fp_cost = admin_cost

    # Coerce to plain arrays so pandas inputs are matched by position rather
    # than by whatever index labels they happen to carry.
    ytrue = np.asarray(ytrue)
    ypred = np.asarray(ypred)
    amount = np.asarray(amount)

    is_fraud = ytrue == 1
    n_false_pos = np.sum((ytrue == 0) & (ypred == 1))
    missed_amount = np.sum(amount[is_fraud & (ypred == 0)])
    max_cost = np.sum(amount[is_fraud])

    if max_cost == 0:
        # No fraud amount to save: avoid a divide-by-zero warning and inf/NaN noise.
        return float('nan')

    cost = n_false_pos * fp_cost + missed_amount
    return 1 - (cost / max_cost)

In [34]:
# Shared 4-fold stratified, shuffled splitter. The fixed random_state matters:
# later helpers call cv.split() again and pair fold i with scores['estimator'][i],
# which is only valid if every call reproduces the same folds.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

In [35]:
# defining a function to calculate cost saving per fold (splits) of our cv
def cost_saving_per_split(scores, x, y, cv_object, amount=None):
    """Compute the cost saving of each fitted fold estimator on its own test fold.

    Parameters
    ----------
    scores : dict
        Result of ``cross_validate(..., return_estimator=True)``; only
        ``scores['estimator']`` is read.
    x : numpy array
        Feature matrix, indexed by position with the test indices.
    y : array-like
        Labels aligned with ``x`` by position.
    cv_object : splitter
        Re-split here to recover the test indices. It must yield the same folds,
        in the same order, as when ``scores`` was produced (e.g. a fixed
        ``random_state``); otherwise estimator ``i`` is evaluated on rows it
        was trained on.
    amount : array-like, optional
        Transaction amounts aligned with ``x``. Defaults to the notebook-level
        ``df['Amount']``.

    Returns
    -------
    list of float
        One ``cost_saving`` value per fold.
    """
    if amount is None:
        amount = df['Amount'].values
    amount = np.asarray(amount)
    # Plain array so that test_ind is applied by position; indexing a Series
    # with [] would look the integers up as index labels instead.
    labels = np.asarray(y)

    results = []
    for i, (_, test_ind) in enumerate(cv_object.split(x, y)):
        ypred = scores['estimator'][i].predict(x[test_ind])
        results.append(cost_saving(labels[test_ind], ypred, amount[test_ind]))

    return results

In [36]:
# defining a function to return a dataframe of metrics results for each fold in our cv
def get_metric_scores(scores, x, y=y, cv_object=cv):
    """Gather the per-fold metrics of one cross-validation run into a table.

    `scores` is a cross_validate result holding 'test_f1',
    'test_average_precision' and the fitted estimators. The returned frame has
    one row per fold ('split_1' ... 'split_n') and the columns 'f1_score',
    'auc_pr' and 'cost_savings'.
    """
    fold_names = [f'split_{k}' for k in range(1, cv_object.n_splits + 1)]

    per_fold = {
        'f1_score': scores['test_f1'],
        'auc_pr': scores['test_average_precision'],
        'cost_savings': cost_saving_per_split(scores, x, y, cv_object),
    }

    return pd.DataFrame(per_fold, index=fold_names)

In [37]:
from sklearn.pipeline import Pipeline

Fraud Sensitive model (Not cost sensitive)

In [38]:
# build the lightgbm model
import lightgbm as lgb

# Class-weighted LightGBM behind a standard scaler; the weights come from the
# inverse class frequencies computed earlier.
fraud_sensitive_model = lgb.LGBMClassifier(class_weight=weights)
fraud_sensitive_scaler = StandardScaler()
fraud_sensitive_pipe = Pipeline(steps=[
    ('scaler', fraud_sensitive_scaler),
    ('model', fraud_sensitive_model),
])

# Keep the fitted estimator of every fold: the cost-saving metric is computed
# from their predictions afterwards.
fraud_sensitive_scores = cross_validate(
    fraud_sensitive_pipe,
    np.array(X),
    y,
    scoring=['f1', 'average_precision'],
    cv=cv,
    n_jobs=4,
    return_estimator=True,
    error_score='raise',
)


In [39]:
# Per-fold F1, average precision and cost savings of the class-weighted model.
# The same array form of X that was passed to cross_validate is reused so the
# regenerated fold indices address identical rows.
fraud_sensitive_results = get_metric_scores(fraud_sensitive_scores, np.array(X))
fraud_sensitive_results

Unnamed: 0,f1_score,auc_pr,cost_savings
split_1,0.510101,0.758377,0.718968
split_2,0.551899,0.849785,0.77706
split_3,0.662651,0.857076,0.826118
split_4,0.622754,0.80052,0.752039


In [40]:
# Average of each metric over the folds for the class-weighted model
fraud_sensitive_results.mean()

f1_score        0.586851
auc_pr          0.816440
cost_savings    0.768546
dtype: float64

Cost sensitive model

In [41]:
# Per-row training weight: the transaction amount for fraud rows, the flat
# admin_cost for all other rows. Built with one vectorised np.where instead of a
# Python loop doing a Series lookup for every row.
# NOTE(review): a fraud row whose Amount is 0 gets weight 0 and therefore has no
# influence on training -- confirm that this is intended.
sample_weights = np.where(np.asarray(y).astype(bool), df['Amount'].values, admin_cost)

In [42]:
# Unweighted LightGBM behind a scaler; cost sensitivity enters only through the
# per-row sample weights handed to the 'model' step at fit time.
cost_sensitive_model = lgb.LGBMClassifier()
scaler2 = StandardScaler()
cost_sensitive_pipe = Pipeline(steps=[
    ('scaler', scaler2),
    ('model', cost_sensitive_model),
])

# NOTE(review): newer scikit-learn releases replace `fit_params` with `params`;
# confirm against the installed version before upgrading.
cost_sensitive_scores = cross_validate(
    cost_sensitive_pipe,
    np.array(X),
    y,
    scoring=['f1', 'average_precision'],
    cv=cv,
    n_jobs=4,
    return_estimator=True,
    fit_params={'model__sample_weight': sample_weights},
    error_score='raise',
)

In [43]:
# Per-fold F1, average precision and cost savings of the sample-weighted model
cost_sensitive_results = get_metric_scores(cost_sensitive_scores, np.array(X))
cost_sensitive_results

Unnamed: 0,f1_score,auc_pr,cost_savings
split_1,0.839827,0.786108,0.726996
split_2,0.859504,0.700123,0.755516
split_3,0.858407,0.867033,0.747749
split_4,0.848485,0.830333,0.730411


In [44]:
# Average of each metric over the folds for the sample-weighted model
cost_sensitive_results.mean()

f1_score        0.851556
auc_pr          0.795899
cost_savings    0.740168
dtype: float64

Bayes Minimum Risk (BMR)


In [45]:
# Plain LightGBM behind a scaler. No `scoring` list is requested here: only the
# fitted per-fold estimators are needed, since the BMR decision rule is applied
# to their predicted probabilities afterwards.
bmr_model = lgb.LGBMClassifier()
scaler3 = StandardScaler()
bmr_pipe = Pipeline(steps=[
    ('scaler', scaler3),
    ('model', bmr_model),
])
bmr_scores = cross_validate(
    bmr_pipe,
    np.array(X),
    y,
    cv=cv,
    n_jobs=4,
    return_estimator=True,
    error_score='raise',
)

In [46]:
# defining a function that predicts, per transaction, the class with the lower expected cost
def bmr_predict(model, x, trans_cost, fp_cost=None):
    """Predict fraud wherever flagging has a lower expected cost than not flagging.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba``; column 1 is read as the fraud probability.
    x : array-like
        Feature rows to score.
    trans_cost : array-like or number
        Loss incurred when a fraud on that row is missed (the callers in this
        notebook pass the transaction amount).
    fp_cost : float, optional
        Cost of flagging a legitimate transaction. When omitted, the
        notebook-level ``admin_cost`` is used, which keeps existing
        three-argument calls working.

    Returns
    -------
    Integer 0/1 predictions, one per row of ``x``.

    Notes
    -----
    The rule compares raw ``predict_proba`` outputs against costs, so it
    presumably relies on those probabilities being reasonably calibrated --
    TODO confirm for the model in use.
    """
    if fp_cost is None:
        fp_cost = admin_cost

    prob = model.predict_proba(x)[:, 1]
    # Predicting 0 risks losing the transaction with probability `prob`.
    expected_cost_0 = prob * trans_cost
    # Predicting 1 risks paying fp_cost with probability `1 - prob`.
    expected_cost_1 = (1 - prob) * fp_cost
    # Strict inequality: ties are resolved as "not fraud".
    pred = (expected_cost_1 < expected_cost_0).astype(int)
    return pred

In [47]:
def get_bmr_metric_scores(scores, x, y=y, cv_object=cv):
    """Evaluate the Bayes-minimum-risk decision rule on every test fold.

    Parameters
    ----------
    scores : dict
        Result of ``cross_validate(..., return_estimator=True)``; only
        ``scores['estimator']`` is read.
    x : numpy array
        Feature matrix, indexed by position with the test indices.
    y : array-like
        Labels aligned with ``x`` by position.
    cv_object : splitter
        Re-split here to recover the test indices. It must reproduce the folds
        that ``scores`` was fitted on (e.g. a fixed ``random_state``).

    Returns
    -------
    pandas.DataFrame
        Indexed 'split_1' ... 'split_n' with the columns 'f1_score' and
        'cost_savings'.
    """
    # Plain arrays so that test_ind is applied by position; indexing a Series
    # with [] would look the integers up as index labels instead.
    labels = np.asarray(y)
    all_amounts = df['Amount'].values

    f1_results = []
    cs_results = []
    for i, (_, test_ind) in enumerate(cv_object.split(x, y)):
        amount = all_amounts[test_ind]
        ytrue = labels[test_ind]
        # The transaction amount doubles as the cost of missing a fraud.
        ypred = bmr_predict(scores['estimator'][i], x[test_ind], amount)

        f1_results.append(f1_score(ytrue, ypred))
        cs_results.append(cost_saving(ytrue, ypred, amount))

    ind = ['split_'+str(n) for n in range(1, cv_object.n_splits+1)]
    scores_df = pd.DataFrame(index=ind)
    scores_df['f1_score'] = f1_results
    scores_df['cost_savings'] = cs_results

    return scores_df

In [48]:
# Per-fold F1 and cost savings when the fold estimators' probabilities are
# turned into decisions with the minimum-expected-cost rule
bmr_results = get_bmr_metric_scores(bmr_scores, np.array(X))
bmr_results

Unnamed: 0,f1_score,cost_savings
split_1,0.306954,0.399387
split_2,0.178744,0.222781
split_3,0.257426,0.472736
split_4,0.155039,0.298381


In [49]:
# Average of each metric over the folds for the BMR decision rule
bmr_results.mean()

f1_score        0.224541
cost_savings    0.348321
dtype: float64

In [50]:
from sklearn.metrics import roc_auc_score

In [51]:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
# SMOTE baseline: oversample the minority class inside each training fold only,
# then score an unweighted LightGBM model on the untouched test fold.
smote = SMOTE(random_state=1)
xx = np.array(X)
for train_idx, test_idx in cv.split(xx, y):
    # .iloc keeps the label lookup positional whatever index `y` carries.
    X_train, y_train = xx[train_idx], y.iloc[train_idx]
    X_test, y_test = xx[test_idx], y.iloc[test_idx]
    # fit_resample is the supported sampler method; the older fit_sample alias
    # no longer exists in current imbalanced-learn releases.
    X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
    smote_gradient_model = lgb.LGBMClassifier()
    smote_gradient_model.fit(X_train_oversampled, y_train_oversampled)
    pred = smote_gradient_model.predict(X_test)
    # NOTE(review): roc_auc_score receives hard 0/1 predictions here rather than
    # predict_proba scores, so the value reflects a single operating point --
    # confirm whether a probability-based AUC was intended.
    print(f'roc_auc_score: {roc_auc_score(y_test, pred)}')
    print(f'f-score: {f1_score(y_test, pred)}')

roc_auc_score: 0.8895967352025692
f-score: 0.6173633440514469
roc_auc_score: 0.9301134876034532
f-score: 0.6235294117647059
roc_auc_score: 0.9300009367797218
f-score: 0.5955056179775281
roc_auc_score: 0.9140362116781358
f-score: 0.6580645161290323


In [52]:
# Re-prints the metrics of the LAST fold only: `y_test` and `pred` are the
# variables left over from the final iteration of the SMOTE loop, so this cell
# must be run after that loop and repeats its last two output lines.
print(f'roc_auc_score: {roc_auc_score(y_test, pred)}')
print(f'f-score: {f1_score(y_test, pred)}')

roc_auc_score: 0.9140362116781358
f-score: 0.6580645161290323
