In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

from sklearn.metrics import f1_score, auc, roc_auc_score

In [2]:
data = pd.read_csv('C:/Users/defaultuser0/Desktop/folders/Hamoye/g01-fraud-detection/data/creditcard.csv')
df = data.copy()

In [3]:
cols = ['V'+str(i) for i in range(1, 29) if i != 25]

X = df[cols]
y = df['Class']

### Using train_test_split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state=42, shuffle=True, stratify = y)

In [5]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)

In [6]:
gnb = GaussianNB()

In [7]:
gnb.fit(X_train_sc, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [8]:
pred = gnb.predict(X_test)

In [9]:
print('F1 score on validation: ', f1_score(y_test, pred))
print('f1 score on training: ', f1_score(y_train, gnb.predict(X_train_sc)))

F1 score on validation:  0.09191880505553426
f1 score on training:  0.1189300411522634


### Using StratifiedKFold

In [10]:
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
admin_cost = 2.5

In [11]:
# defining a function to calculate cost savings
def cost_saving(ytrue, ypred, amount):
    fp = np.sum((ytrue == 0) & (ypred == 1))
    cost = np.sum(fp*admin_cost) + np.sum((amount[(ytrue == 1) & (ypred == 0)]))
    max_cost = np.sum((amount[(ytrue == 1)]))
    savings = 1 - (cost/max_cost)
    
    return savings

# defining a function to calculate cost saving per fold (splits) of our cv
def cost_saving_per_split(scores, x, y, cv_object):
    results = []
    for i, (_, test_ind) in zip(range(cv_object.n_splits), cv_object.split(x, y)):
        ypred = scores['estimator'][i].predict(x[test_ind])
        ytrue = y[test_ind]
        amount = df['Amount'].values[test_ind]
        results.append(cost_saving(ytrue, ypred, amount))
        
    return results

# defining a function to return a dataframe of metrics results for each fold in our cv
def get_metric_scores(scores, x, y=y, cv_object=cv):
    #ind = ['split_'+str(n) for n in range(1, cv_object.n_splits+1)]
    
    scores_df = pd.DataFrame()
    
    scores_df['f1_score'] = scores['test_f1']
    scores_df['auc_pr'] = scores['test_average_precision']
    scores_df['cost_savings'] = cost_saving_per_split(scores, x, y, cv_object)

    return scores_df

In [12]:
sc0 = StandardScaler()
fraud_sensitive_pipe = Pipeline([('scaler', sc0), ('model', gnb)])

fraud_sensitive_scores = cross_validate(fraud_sensitive_pipe, np.array(X), y, \
                            scoring=['f1', 'average_precision'], cv=cv, n_jobs=4, \
                                        return_estimator=True, error_score='raise')

##### Fraud Sensitive Model

In [13]:
fraud_sensitive_results = get_metric_scores(fraud_sensitive_scores, np.array(X))
fraud_sensitive_results

Unnamed: 0,f1_score,auc_pr,cost_savings
0,0.110681,0.077734,0.540069
1,0.114127,0.083457,0.379573
2,0.119678,0.098892,0.557511
3,0.118129,0.089255,0.486035


In [14]:
fraud_sensitive_results.mean()

f1_score        0.115654
auc_pr          0.087334
cost_savings    0.490797
dtype: float64

In [15]:
sample_weights = np.array([df['Amount'][ind] if fraud else admin_cost for ind, fraud in enumerate(y)])

In [16]:
sc1 = StandardScaler()
cost_sensitive_pipe = Pipeline([('scaler', sc1), ('model', gnb)])

cost_sensitive_scores = cross_validate(cost_sensitive_pipe, np.array(X), y, \
                        scoring=['f1', 'average_precision'], cv=cv, n_jobs=4, return_estimator=True, \
                          fit_params={'model__sample_weight': sample_weights}, error_score='raise')

##### Cost Sensitive Model

In [17]:
cost_sensitive_results = get_metric_scores(cost_sensitive_scores, np.array(X))
cost_sensitive_results

Unnamed: 0,f1_score,auc_pr,cost_savings
0,0.098801,0.073916,0.613121
1,0.101171,0.08166,0.343395
2,0.103921,0.093017,0.516332
3,0.101879,0.08642,0.4333


In [18]:
cost_sensitive_results.mean()

f1_score        0.101443
auc_pr          0.083754
cost_savings    0.476537
dtype: float64

In [19]:
scaler3 = StandardScaler()
bmr_model = GaussianNB()

bmr_pipe = Pipeline([('scaler', scaler3), ('model', bmr_model)])

bmr_scores = cross_validate(bmr_pipe, np.array(X), y, cv=cv, n_jobs=4, return_estimator=True, \
                            error_score='raise')

In [20]:
# defining a function to predict based on the predicting that will minimize the expected cost.
def bmr_predict(model, x, trans_cost):
    prob = model.predict_proba(x)[:, 1]
        
    expected_cost_0 = prob * trans_cost
    expected_cost_1 = (1-prob) * admin_cost
        
    pred = (expected_cost_1 < expected_cost_0).astype(int)
    return pred

In [21]:
def get_bmr_metric_scores(scores, x, y=y, cv_object=cv):
    ind = ['split_'+str(n) for n in range(1, cv_object.n_splits+1)]
    scores_df = pd.DataFrame(index=ind)

    f1_results = []
    cs_results = []
    
    for i, (_, test_ind) in zip(range(cv_object.n_splits), cv_object.split(x, y)):
        amount = df['Amount'].values[test_ind]
        
        ypred = bmr_predict(scores['estimator'][i], x[test_ind], amount)
        ytrue = y[test_ind]
                
        f1_results.append(f1_score(ytrue, ypred))
        cs_results.append(cost_saving(ytrue, ypred, amount))
        
    scores_df['f1_score'] = f1_results
    #scores_df['auc_pr'] = scores['test_average_precision']
    scores_df['cost_savings'] = cs_results

    return scores_df    

##### BMR Model

In [22]:
bmr_results = get_bmr_metric_scores(bmr_scores, np.array(X))
bmr_results

Unnamed: 0,f1_score,cost_savings
split_1,0.100481,0.531192
split_2,0.104707,0.39149
split_3,0.109709,0.563708
split_4,0.10299,0.468298


In [23]:
bmr_results.mean()

f1_score        0.104472
cost_savings    0.488672
dtype: float64

### Using SMOTE

In [24]:
sm = SMOTE(random_state=1)
clf = GaussianNB()
X_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)
clf.fit(X_train_smote, y_train_smote)

GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
y_train_smote.value_counts()

1    199020
0    199020
Name: Class, dtype: int64

In [26]:
pred1 = clf.predict(X_test)

In [27]:
print('SMOTE F1 score on validation: ', f1_score(pred1, y_test))
print('SMOTE f1 score on training: ', f1_score(clf.predict(X_train_smote), y_train_smote))
print()
print('SMOTE roc score on validation: ', roc_auc_score(pred1, y_test))
print('SMOTE roc score on training: ', roc_auc_score(clf.predict(X_train_smote), y_train_smote))

SMOTE F1 score on validation:  0.10262828535669587
SMOTE f1 score on training:  0.9154286018245166

SMOTE roc score on validation:  0.5271952356630193
SMOTE roc score on training:  0.9253345651790054


##### Fraud sensitive model after doing SMOTE oversampling

In [28]:
ss = StandardScaler()
clf1 = GaussianNB()
fraud_sensitive_pipe = Pipeline([('scaler', ss), ('model', clf1)])

fraud_sensitive_scores = cross_validate(fraud_sensitive_pipe, np.array(X_train_smote), y_train_smote, \
                            scoring=['f1', 'average_precision'], cv=cv, n_jobs=4, \
                                        return_estimator=True, error_score='raise')

In [29]:
fraud_sensitive_results = get_metric_scores(fraud_sensitive_scores, np.array(X))
fraud_sensitive_results

Unnamed: 0,f1_score,auc_pr,cost_savings
0,0.915314,0.957256,0.508708
1,0.915051,0.956882,0.378705
2,0.915259,0.958436,0.546137
3,0.916047,0.957912,0.444216


In [30]:
fraud_sensitive_results.mean()

f1_score        0.915418
auc_pr          0.957621
cost_savings    0.469441
dtype: float64

### Observations
- The fraud sensitive model with stratifiedKFold performed better on cost savings in relation to all other models, but in general it still had a poor performance
- The F1 score greatly(from 0.1 to 0.9) imporved after oversampling, but this imrovement happened only on the training data and not on the testing data
- over sampling did not favour cost saving

### Challenge
- GaussianNB model takes only 2 parameters, so hyperparameter tuning is not supported here.