## Blended MLP and KNN

### Choosing the best ensemble method. WARNING! This notebook takes about 7 hours to run from start to finish.

In [1]:
# importing useful libraries
import numpy as np
import tensorflow as tf
import random as python_random

# setting random seeds for result reproducibility: NumPy, Python's built-in
# `random` module and TensorFlow are each seeded separately, before any model
# is built
np.random.seed(1)
python_random.seed(12)
tf.random.set_seed(123)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Metric
from keras.wrappers.scikit_learn import KerasClassifier

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

import warnings
# silences every warning for the rest of the session, which also hides
# deprecation notices raised by the libraries imported above
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# load the raw transactions; the relative path looks like a Kaggle input
# directory, so adjust it when running elsewhere
credit_card_df = pd.read_csv('../input/creditcardfraud/creditcard.csv')
# quick look at the size and the first rows of the loaded frame
print(credit_card_df.shape)
credit_card_df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# 27 most important features according to our EDA: V1..V28 with V25 left out
cols = ['V' + str(i) for i in (*range(1, 25), *range(26, 29))]
print(cols)

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'V28']


In [4]:
# work on a copy so the frame loaded above is left untouched
df = credit_card_df.copy()

In [5]:
# selecting the 27 features listed in `cols` (the most important ones according to our EDA)
X = df[cols]

y = df['Class'] # selecting the target variable

In [6]:
# sanity check: one label per row of the loaded frame
y.shape

(284807,)

In [7]:
# cost charged for every false positive; read as a global by cost_saving()
admin_cost = 2.5

In [8]:
# 4-fold stratified splitter shared by every experiment below; the fixed
# random_state matters because cost_saving_per_split() calls split() again and
# relies on getting the same folds that cross_validate() used
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

In [9]:
# defining a function to calculate cost savings
def cost_saving(ytrue, ypred, amount, threshold=0.5, fp_cost=None):
    """Return the share of the worst-case fraud cost that the predictions save.

    The cost of a set of predictions is ``fp_cost`` for every false positive
    plus the transaction amount of every false negative. The worst case
    (``max_cost``) is the summed amount of all rows with ``ytrue == 1``.
    The result is ``1 - cost / max_cost``.

    Parameters
    ----------
    ytrue : array-like
        Ground-truth labels (1 = positive class, 0 = negative class).
    ypred : array-like
        Predicted labels or scores, 1-D or a single column. Values strictly
        greater than ``threshold`` count as positive predictions, so hard
        0/1 labels behave exactly as before with the default threshold.
    amount : array-like
        Transaction amount for each row, aligned positionally with ``ytrue``.
    threshold : float, default 0.5
        Cut-off applied to ``ypred``.
    fp_cost : float, optional
        Cost of one false positive. When omitted, the notebook-level
        ``admin_cost`` is used.

    Returns
    -------
    float
        Savings ratio; 1.0 means no cost at all, and the value goes negative
        when false-positive costs exceed the worst case. If ``ytrue`` has no
        positive rows ``max_cost`` is zero and the result is not finite.
    """
    if fp_cost is None:
        fp_cost = admin_cost  # fall back to the notebook-level setting

    # np.asarray + ravel accepts lists, pandas Series and (n, 1) model outputs
    ytrue = np.asarray(ytrue).ravel()
    amount = np.asarray(amount).ravel()
    flagged = np.asarray(ypred).ravel() > threshold

    is_positive = ytrue == 1
    fp = np.sum((ytrue == 0) & flagged)
    missed = is_positive & ~flagged

    cost = fp * fp_cost + np.sum(amount[missed])
    max_cost = np.sum(amount[is_positive])
    savings = 1 - (cost / max_cost)

    return savings

In [10]:
# defining a function to calculate cost saving per fold (splits) of our cv
def cost_saving_per_split(scores, x, y, cv_object):
    """Compute the cost-saving ratio on the held-out part of every CV fold.

    The folds are regenerated with ``cv_object.split(x, y)``, and fold ``i``
    is scored with ``scores['estimator'][i]``. This only lines up with the
    estimators if ``cv_object`` produces the same splits as it did when
    ``scores`` was created (e.g. a fixed ``random_state``).

    Parameters
    ----------
    scores : dict
        Must contain an ``'estimator'`` sequence with one fitted estimator
        per fold.
    x : numpy.ndarray
        Feature matrix, indexed positionally by the fold indices.
    y : array-like
        Labels aligned row-by-row with ``x``.
    cv_object : splitter
        Object exposing ``split(x, y)``.

    Returns
    -------
    list
        One ``cost_saving`` value per fold, in fold order.
    """
    # split() yields positional indices, so index plain arrays; indexing a
    # pandas Series with them would be label-based and only happens to work
    # when the Series has a default RangeIndex
    labels = np.asarray(y)
    amounts = df['Amount'].values

    results = []
    for fold, (_, test_ind) in enumerate(cv_object.split(x, y)):
        ypred = scores['estimator'][fold].predict(x[test_ind])
        results.append(cost_saving(labels[test_ind], ypred, amounts[test_ind]))

    return results

In [11]:
# defining a function to return a dataframe of metrics results for each fold in our cv
def get_metric_scores(scores, x, y=y, cv_object=cv):
    """Collect the per-fold metrics of one cross-validation run in a DataFrame.

    ``scores`` is read for ``'test_f1'`` and ``'test_average_precision'``, and
    is handed to ``cost_saving_per_split`` for the cost-savings column. The
    defaults for ``y`` and ``cv_object`` are the notebook-level objects as
    they exist when this cell is executed.

    Returns a frame indexed ``split_1 .. split_n`` with the columns
    ``f1_score``, ``auc_pr`` and ``cost_savings``.
    """
    fold_labels = ['split_' + str(k + 1) for k in range(cv_object.n_splits)]

    return pd.DataFrame(
        {
            'f1_score': scores['test_f1'],
            'auc_pr': scores['test_average_precision'],
            'cost_savings': cost_saving_per_split(scores, x, y, cv_object),
        },
        index=fold_labels,
    )

In [12]:
# NOTE(review): no visible cell reads this global; cost_saving() has its own
# `threshold` default — confirm whether it is still needed
threshold = 0.5

In [14]:
# NOTE(review): this callback is not passed to any fit call in the visible
# cells, and build_model() compiles without metrics, so
# 'val_stateful_binary_fbeta' would not be logged. Presumably left over from a
# removed custom-metric cell — confirm before relying on it.
stopper = EarlyStopping(monitor='val_stateful_binary_fbeta', patience=10, mode='max',
    restore_best_weights=True)

In [15]:
# class counts: index 0 is the negative class, index 1 the positive class
neg, pos = np.bincount(y)
print('negative class is {} in number, while positive is {}'.format(neg, pos))
# log(pos/neg) is the log-odds of the positive class; build_model() uses it as
# the starting bias of the sigmoid output layer
initial_bias = np.log([pos/neg])
initial_bias

negative class is 284315 in number, while positive is 492


array([-6.35935934])

In [16]:
def build_model():
    """Build and compile the MLP wrapped by the KerasClassifier instances.

    Architecture: one hidden Dense layer (16 units, ReLU, 'uniform' kernel
    initializer), Dropout(0.2), and a single sigmoid output unit whose bias
    starts at the notebook-level ``initial_bias``. The model is compiled with
    binary cross-entropy and the Adam optimizer; no metrics are registered.
    """
    network = Sequential([
        Dense(16, kernel_initializer='uniform', activation='relu'),
        Dropout(0.2),
        Dense(
            1,
            activation='sigmoid',
            bias_initializer=tf.keras.initializers.Constant(initial_bias),
        ),
    ])

    # compiling model
    network.compile(loss='binary_crossentropy', optimizer='adam')

    return network

In [17]:
class ClipOutliers(BaseEstimator, TransformerMixin):
    """Clip every feature column to percentile bounds learned during fit.

    Parameters
    ----------
    lower_percentile, upper_percentile : float, default 1 and 99
        Percentiles (0-100) used as the lower and upper clipping bounds.

    Attributes set while in use
    ---------------------------
    a, b : per-column lower / upper bounds computed by ``fit``.
    Xt   : the array returned by the most recent ``transform`` call.
    """

    def __init__(self, lower_percentile=1, upper_percentile=99):
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile

    def fit(self, X, y=None):
        """Learn the column-wise clipping bounds from ``X``; ``y`` is ignored."""
        bounds = np.percentile(
            X, [self.lower_percentile, self.upper_percentile], axis=0
        )
        # row 0 holds the lower percentile, row 1 the upper percentile
        self.a, self.b = bounds[0], bounds[1]
        return self

    def transform(self, X):
        """Return ``X`` with each column limited to the fitted ``[a, b]`` range."""
        clipped = np.clip(X, self.a, self.b)
        # the result is also kept on the instance
        self.Xt = clipped
        return clipped

## Fraud Sensitive model (Not cost sensitive)

### (I) Using Majority Voting as our Ensembling strategy.

In [18]:
# hyperparameters shared by all three ensemble experiments below
epochs = 4       # training epochs for every KerasClassifier
n_neighbors = 5  # neighbours for every KNeighborsClassifier

In [19]:
# setting the _estimator_type attribute of sklearn's pipeline to 'classifier' to avoid errors when using
# VotingClassifier.
class ClassifierPipeline(Pipeline):
    """Pipeline subclass that always reports its estimator type as 'classifier'."""
    @property
    def _estimator_type(self):
        # fixed value, regardless of what the pipeline steps are
        return 'classifier'

In [20]:
# Soft-voting ensemble of KNN and the MLP, evaluated with the shared 4-fold cv.
scaler = StandardScaler()
clipper = ClipOutliers()

mlp = KerasClassifier(build_fn=build_model, epochs=epochs, batch_size=512, verbose=0)
knn =  KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

# outlier clipping is applied to the MLP branch only; KNN sees the scaled data as-is
clip_mlp = ClassifierPipeline([('clipper', clipper), ('mlp', mlp)])

# voting='soft' combines the predicted class probabilities of the two members
vote_ensemble = VotingClassifier(estimators=[('knn', knn), ('mlp', clip_mlp)], voting='soft')

# the scaler sits inside the pipeline, so it is re-fitted on each training fold
vote_pipe = Pipeline([('scaler', scaler), ('ensemble', vote_ensemble)])

# return_estimator=True keeps the fitted pipelines, which get_metric_scores()
# needs to compute cost savings per fold
vote_scores = cross_validate(vote_pipe, np.array(X), y, verbose=1, \
                         scoring=['f1', 'average_precision'], cv=cv, return_estimator=True, \
                         error_score='raise') # setting n_jobs greater than one seems not to work when using
                                              # deep learning models

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 53.2min finished


In [21]:
# per-fold F1, AUC-PR and cost savings of the voting ensemble
vote_results = get_metric_scores(vote_scores, np.array(X))
vote_results

Unnamed: 0,f1_score,auc_pr,cost_savings
split_1,0.826087,0.810348,0.726768
split_2,0.858369,0.870847,0.72803
split_3,0.857143,0.900577,0.794781
split_4,0.828194,0.826006,0.696086


In [22]:
# average of each metric over the four folds
vote_results.mean()

f1_score        0.842448
auc_pr          0.851945
cost_savings    0.736416
dtype: float64

### (II) Stacking Ensemble

In [23]:
# scikit-learn StackingClassifier with a logistic-regression blender on top of
# the same two base models used for voting.
scaler3 = StandardScaler()
clipper3 = ClipOutliers()

mlp3 = KerasClassifier(build_fn=build_model, epochs=epochs, batch_size=512, verbose=0)
knn3 =  KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

# outlier clipping is applied to the MLP branch only
clip_mlp3 = ClassifierPipeline([('clipper', clipper3), ('mlp', mlp3)])

blender3 = LogisticRegression()

# no `cv` argument is given, so StackingClassifier uses its own default inner
# cross-validation to produce the blender's training inputs (the log below
# shows "5 out of 5" tasks per base model) — this runs inside every outer fold
stack_ensemble3 = StackingClassifier(estimators=[('knn', knn3), ('mlp', clip_mlp3)], \
                                   final_estimator=blender3, verbose=1)

stack_pipe3 = Pipeline([('scaler', scaler3), ('ensemble', stack_ensemble3)])

# outer evaluation with the shared 4-fold cv; fitted pipelines are kept for
# the cost-savings computation
stack_scores3 = cross_validate(stack_pipe3, np.array(X), y, verbose=1, \
                         scoring=['f1', 'average_precision'], cv=cv, return_estimator=True, \
                         error_score='raise')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 16.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 16.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 17.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.8s finished
[Parallel(n_j

In [25]:
# per-fold F1, AUC-PR and cost savings of the StackingClassifier ensemble
stack_results3 = get_metric_scores(stack_scores3, np.array(X))
stack_results3

Unnamed: 0,f1_score,auc_pr,cost_savings
split_1,0.801762,0.80473,0.707262
split_2,0.845455,0.866117,0.498718
split_3,0.841629,0.900262,0.755974
split_4,0.83871,0.825492,0.565301


In [26]:
# average of each metric over the four folds
stack_results3.mean()

f1_score        0.831889
auc_pr          0.849150
cost_savings    0.631813
dtype: float64

### (III) Custom stacking ensemble (the base models are not cross-validated inside each stratified split)

In [27]:
# creating a custom transformer to disguise our models as scikit-learn transformers
class TransformerModel(BaseEstimator, TransformerMixin):
    """Adapter that lets a classifier be used as a transformer step.

    ``fit`` trains the wrapped ``model``; ``transform`` outputs column 1 of
    the model's ``predict_proba`` result as a single-column 2-D array, so the
    output can be fed to a downstream estimator as one feature.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        """Fit the wrapped model on ``(X, y)`` and return this transformer."""
        self.model.fit(X, y)
        return self

    def transform(self, X):
        """Return column 1 of ``predict_proba(X)`` with shape (n_samples, 1)."""
        proba = self.model.predict_proba(X)
        # the latest output is also kept on the instance as `pred`
        self.pred = proba[:, 1].reshape(-1, 1)
        return self.pred

In [28]:
# Hand-built stacking: the base models are wrapped as transformers, their
# probability outputs are concatenated, and a logistic regression blends them.
scaler2 = StandardScaler()
clipper2 = ClipOutliers()

mlp2 = KerasClassifier(build_fn=build_model, epochs=epochs, batch_size=512, verbose=0)
knn2 =  KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

# each wrapper emits one probability column
mlp_trans = TransformerModel(mlp2)
knn_trans = TransformerModel(knn2)

# outlier clipping is applied to the MLP branch only
clip_mlp2 = Pipeline([('clipper', clipper2), ('mlp', mlp_trans)])

# side-by-side concatenation of the MLP and KNN outputs -> two blender features
union = FeatureUnion([('mlp', clip_mlp2), ('knn', knn_trans)])

blender = LogisticRegression()

# NOTE(review): within a training fold the blender appears to be fitted on
# predictions the base models make for the same rows they were trained on
# (no inner cross-validation, as the section title says) — confirm this is
# intended, since in-sample KNN predictions are optimistic.
stack_pipe = Pipeline([('scaler', scaler2), ('union', union), ('blender', blender)])

# outer evaluation with the shared 4-fold cv; fitted pipelines are kept for
# the cost-savings computation
stack_scores = cross_validate(stack_pipe, np.array(X), y, verbose=1, \
                         scoring=['f1', 'average_precision'], cv=cv, return_estimator=True, \
                         error_score='raise')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 121.8min finished


In [30]:
# per-fold F1, AUC-PR and cost savings of the custom stacking ensemble
stack_results = get_metric_scores(stack_scores, np.array(X))
stack_results

Unnamed: 0,f1_score,auc_pr,cost_savings
split_1,0.808889,0.812319,0.719914
split_2,0.869565,0.875511,0.725794
split_3,0.850679,0.905657,0.777706
split_4,0.830357,0.824731,0.695133


In [31]:
# average of each metric over the four folds
stack_results.mean()

f1_score        0.839872
auc_pr          0.854554
cost_savings    0.729637
dtype: float64

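## Conclusion

We now see that the voting ensemble is the best ensemble strategy overall: it has the highest mean F1 score (0.842) and cost savings (0.736) of the three approaches, while the custom stacking ensemble is only marginally ahead on AUC-PR (0.855 vs 0.852). Compared with KNN alone, voting improved the F1 score by 0.01, AUC-PR by 0.05 and cost savings by 0.01. It also reduced overfitting.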