# Final Model V1 (for deployment)

In [1]:
# importing useful libraries
import numpy as np
import tensorflow as tf
import random as python_random
 
# Seed NumPy, Python's built-in `random` module and TensorFlow so that
# repeated runs of the notebook draw the same random numbers.
np.random.seed(1)
python_random.seed(12)
tf.random.set_seed(123)
 
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Metric
from keras.wrappers.scikit_learn import KerasClassifier

import dill
import gzip
 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, average_precision_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
 
import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook, including deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')
 
%matplotlib inline

In [2]:
# Load the credit-card dataset from a zipped CSV.
# NOTE(review): the path below is an absolute Google Drive mount path, so this cell only
# runs as-is where that mount exists; the commented alternative reads the same file by URL.
credit_card_df = pd.read_csv('/content/drive/MyDrive/credit_card_dataset.zip') 

# alternative source, if the dataset has not been downloaded:
# credit_card_df = pd.read_csv('https://raw.github.com/HamoyeHQ/g01-fraud-detection/master/data/credit_card_dataset.zip')

# show the (rows, columns) shape, then preview the first five rows
print(credit_card_df.shape)
credit_card_df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# the 27 features kept after our EDA: V1 ... V28 with V25 left out
cols = ['V{}'.format(n) for n in range(1, 29) if n != 25]
print(cols)

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'V28']


In [4]:
# work on a copy so the raw `credit_card_df` loaded above is left untouched
df = credit_card_df.copy()

In [5]:
# split off the target: `pop` removes the 'Class' column from `df` in place and returns it as `y`;
# the remaining columns are the features (`X` is the same object as `df`, not a copy)
y = df.pop('Class')
X = df

In [6]:
# cost charged for every false positive when `cost_saving` computes the total cost
admin_cost = 2.5

In [7]:
# defining a function to calculate cost savings
def cost_saving(ytrue, ypred, amount, threshold=0.5, epsilon=1e-7, admin_fee=None):
    """Return the fraction of the total fraud amount that the predictions save.

    The cost of a set of predictions is
        admin_fee * (number of false positives) + sum(amount of the frauds that were missed)
    and it is compared with the cost of catching nothing, i.e. the summed amount
    of every true fraud:
        savings = 1 - cost / (max_cost + epsilon)

    Args:
        ytrue: array-like of true labels (0 = not fraud, 1 = fraud).
        ypred: array-like of predictions, either hard 0/1 labels or fraud
            probabilities. It is flattened, and a transaction counts as flagged
            when its value is >= `threshold` (hard 0/1 labels are therefore
            unchanged under the default threshold).
        amount: array-like of transaction amounts, in the same order as `ytrue`.
        threshold: decision threshold applied to `ypred`.
        epsilon: small constant added to the denominator so that data with no
            fraud amount does not divide by zero.
        admin_fee: cost of one false positive. When None (the default) the
            module-level `admin_cost` is used.

    Returns:
        The savings as a float. 1.0 means no cost was incurred; the value is
        negative when the predictions cost more than flagging nothing.
    """
    if admin_fee is None:
        admin_fee = admin_cost

    # Work on plain 1-D arrays: masks are then combined by position, so pandas
    # inputs with differing indexes and column-shaped (n, 1) predictions all work.
    ytrue = np.asarray(ytrue).ravel()
    amount = np.asarray(amount).ravel()
    flagged = np.asarray(ypred).ravel() >= threshold
    is_fraud = ytrue == 1

    false_positives = np.sum((ytrue == 0) & flagged)
    missed_amount = np.sum(amount[is_fraud & ~flagged])
    cost = false_positives * admin_fee + missed_amount
    max_cost = np.sum(amount[is_fraud])

    return 1 - (cost / (max_cost + epsilon))

In [8]:
# Early-stopping callback: stop training once 'val_stateful_binary_fbeta' has not improved
# (higher is better, mode='max') for 10 consecutive epochs, and restore the best weights seen.
# NOTE(review): no cell in the visible notebook passes `stopper` to a fit call, and `build_model`
# compiles the network without any metrics, so the monitored metric is not produced here.
# Confirm whether this callback is still needed.
stopper = EarlyStopping(monitor='val_stateful_binary_fbeta', patience=10, mode='max',
    restore_best_weights=True)

In [9]:
# class counts: np.bincount(y) is unpacked as (count of label 0, count of label 1)
neg, pos = np.bincount(y)
# log(pos/neg) is the log-odds of the positive class; `build_model` uses it as the constant
# initial bias of the sigmoid output unit (sigmoid(log(pos/neg)) == pos / (pos + neg))
initial_bias = np.log([pos/neg])

In [10]:
def build_model():
    """Create and compile the MLP used as one member of the ensemble.

    Architecture: Dense(16, relu, 'uniform' kernel init) -> Dropout(0.2) ->
    Dense(1, sigmoid). The output bias starts at the module-level
    `initial_bias`. The network is compiled with binary cross-entropy loss and
    the 'adam' optimizer; no metrics are attached.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential([
        Dense(16, kernel_initializer='uniform', activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid',
              bias_initializer=tf.keras.initializers.Constant(initial_bias)),
    ])

    network.compile(loss='binary_crossentropy', optimizer='adam')

    return network

In [11]:
class ColumnSelector(BaseEstimator, TransformerMixin):
  """Transformer that keeps only the columns listed in `cols` and returns a 2-D numpy array.

  Accepted inputs for `transform`:
    * pd.DataFrame - columns are selected by name.
    * pd.Series    - entries are selected by name; the result is one row.
    * np.ndarray   - columns are selected by position, where the position of a
      column is the integer that follows the first character of its name
      (e.g. 'V3' -> index 3). A 1-D array is treated as a single row.
  """
  def __init__(self, cols=cols):
    self.cols = cols

  def fit(self, X, y=None):
    # nothing to learn; present so the object can be used inside a Pipeline
    return self

  def transform(self, X):
    # name-based selection for pandas inputs
    if isinstance(X, pd.DataFrame):
      return np.array(X[self.cols])

    if isinstance(X, pd.Series):
      return np.array(X[self.cols]).reshape(1, -1)

    if not isinstance(X, np.ndarray):
      raise TypeError('expected input type to be any of pd.Series, pd.DataFrame or np.ndarray but got {}'.format(type(X)))

    # position-based selection for numpy inputs: 'V<k>' is read from index k
    self.cols_ind = [int(name[1:]) for name in self.cols]
    if X.ndim == 1: # a single row given as a flat array
      return X[self.cols_ind].reshape(1, -1)
    return X[:, self.cols_ind]

In [12]:
class ClipOutliers(BaseEstimator, TransformerMixin):
    """Clip every feature to the percentile range learned during `fit`.

    Args:
        lower_percentile: percentile (0-100) used as the lower clipping bound.
        upper_percentile: percentile (0-100) used as the upper clipping bound.

    Attributes set by `fit`:
        a: per-column lower bounds (the `lower_percentile` of each column).
        b: per-column upper bounds (the `upper_percentile` of each column).
    """
    def __init__(self, lower_percentile=1, upper_percentile=99):
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from `X` (axis=0) and return self."""
        self.a = np.percentile(X, self.lower_percentile, axis=0)
        self.b = np.percentile(X, self.upper_percentile, axis=0)

        return self

    def transform(self, X):
        """Return `X` with every column clipped to the bounds learned in `fit`."""
        # The result is deliberately not stored on the instance: keeping it would make the
        # fitted (and later serialized) clipper carry a full copy of the last array it
        # transformed, e.g. the whole training matrix after fitting a pipeline.
        return np.clip(X, self.a, self.b)

## Fraud Sensitive model (Not cost sensitive)

### Using soft voting (averaging the predicted class probabilities) as our ensembling strategy.

In [13]:
# hyper-parameters used when the ensemble is built below:
# number of training epochs for the Keras MLP, and the number of neighbours for KNN
epochs = 4
n_neighbors = 5

In [14]:
# setting the _estimator_type attribute of sklearn's Pipeline to 'classifier' to avoid errors when using
# VotingClassifier.
class ClassifierPipeline(Pipeline):
    """A Pipeline that always reports its `_estimator_type` as 'classifier'."""
    @property
    def _estimator_type(self):
        # read-only override that returns the literal string 'classifier'
        return 'classifier'

In [15]:
# data preparation: keep the selected feature columns, then standardise them
cols_select = ColumnSelector()
scaler = StandardScaler()

data_prep = Pipeline([('columns', cols_select), ('scaler', scaler)]) # data preparation pipeline

# percentile clipper (default 1st/99th percentiles), used only in front of the MLP
clipper = ClipOutliers()

# the two base models of the ensemble
mlp = KerasClassifier(build_fn=build_model, epochs=epochs, batch_size=512, verbose=0) # model 1: Keras MLP
knn =  KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree', n_jobs=4) # model 2: KNN

# model 1 requires clipping, so it is wrapped in a pipeline together with the clipper
clip_mlp = ClassifierPipeline([('clipper', clipper), ('mlp', mlp)])

# soft-voting ensemble over KNN and the clipped MLP
vote_ensemble = VotingClassifier(estimators=[('knn', knn), ('mlp', clip_mlp)], voting='soft')

In [16]:
# fit the data-preparation pipeline on the full dataset and transform it in one step
X_prep = data_prep.fit_transform(X, y)

In [17]:
# saving the fitted data prep pipeline as a gzip-compressed dill file
# (read back by `get_predictions`)
with gzip.open('data_prep_pipe.gz.dill', 'wb') as f:
  dill.dump(data_prep, f)

In [18]:
# fitting the voting ensemble on the prepared data; the trailing `;` hides the cell output
vote_ensemble.fit(X_prep, y);

In [19]:
# saving the transformed training set (used by `get_predictions` to refit the KNN model)
with gzip.open('X_prep.gz.dill', 'wb') as f:
  dill.dump(X_prep, f)

In [20]:
# saving the y labels (used together with the transformed training set to refit the KNN model)
with gzip.open('y_labels.gz.dill', 'wb') as f:
  dill.dump(y, f)

In [21]:
# saving the fitted clipper object: estimators_[1] is the fitted clipper+mlp pipeline
# and [0] is its first step
with gzip.open('clipper.gz.dill', 'wb') as f:
  dill.dump(vote_ensemble.estimators_[1][0], f)

In [22]:
# saving the label encoder object of the fitted voting ensemble (its `le_` attribute)
with gzip.open('label_encoder.gz.dill', 'wb') as f:
  dill.dump(vote_ensemble.le_, f)

In [23]:
# saving the trained Keras model to 'mlp.h5': estimators_[1][1] is the fitted KerasClassifier
# (second step of the clipper+mlp pipeline) and `.model` is its underlying Keras model
vote_ensemble.estimators_[1][1].model.save('mlp.h5')

In [24]:
# helper: read one gzip-compressed dill artifact from disk
def _load_artifact(path):
  """Return the object stored in the gzip-compressed dill file at `path`."""
  # NOTE: dill, like pickle, can run arbitrary code while loading; only use it on
  # artifacts written by this notebook.
  with gzip.open(path, 'rb') as f:
    return dill.load(f)


# defining function to get predictions
def get_predictions(X, proba=False):
  """Rebuild the voting ensemble from the saved artifacts and predict for `X`.

  The artifacts are read from the current working directory on every call:
  'data_prep_pipe.gz.dill', 'X_prep.gz.dill', 'y_labels.gz.dill',
  'clipper.gz.dill', 'label_encoder.gz.dill' and 'mlp.h5'.

  Args:
    X: raw (unprepared) input in any form accepted by the saved data
      preparation pipeline.
    proba: when True return the probability of the positive class (1);
      otherwise return the predicted labels.

  Returns:
    A 1-D numpy array with one entry per input row.
  """
  # setting useful attributes and parameters
  n_neighbors = 5
  classes = np.array([0, 1])
  epochs = 4
  batch_size = 512

  data_prep = _load_artifact('data_prep_pipe.gz.dill')

  # due to the large file size of a serialized knn classifier, the transformed (preprocessed)
  # training dataset was serialized instead and the knn classifier is refitted on it here.
  # be sure to import KNeighborsClassifier when running this function outside of this notebook.
  knn = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree', n_jobs=4)
  X_prep = _load_artifact('X_prep.gz.dill')
  y = _load_artifact('y_labels.gz.dill')
  knn.fit(X_prep, y)

  clipper = _load_artifact('clipper.gz.dill')
  le = _load_artifact('label_encoder.gz.dill')

  # loader for the trained mlp; named so that it does not shadow the module-level `build_model`
  load_mlp = lambda: load_model('mlp.h5')

  # wrap the trained network so it exposes the scikit-learn classifier API without refitting
  mlp = KerasClassifier(build_fn=load_mlp, epochs=epochs, batch_size=batch_size, verbose=0)
  mlp.model = load_mlp() # attach the trained mlp model
  mlp.classes_ = classes # setting the classes_ attribute of the mlp model

  clip_mlp = ClassifierPipeline([('clipper', clipper), ('mlp', mlp)]) # clipping pipeline

  # reconstructing the voting classifier: the fitted attributes are set by hand instead of calling fit
  vote_ensemble = VotingClassifier(estimators=[('knn', knn), ('mlp', clip_mlp)], voting='soft')
  vote_ensemble.classes_ = classes
  vote_ensemble.estimators_ = [knn, clip_mlp]
  vote_ensemble.le_ = le

  Xt = data_prep.transform(X) # prepare (preprocess) the user's input

  if proba:
    # predict_proba returns one row per sample and one column per class, also for a
    # single sample, so column 1 is always the probability of the positive class
    return vote_ensemble.predict_proba(Xt)[:, 1]

  return vote_ensemble.predict(Xt) # predicted labels

### Testing our model's prediction on X_test (multi-input)

In [25]:
# stratified 75/25 split of the raw features and labels, with a fixed random_state.
# NOTE(review): the data preparation pipeline and the voting ensemble above were fitted on the
# full X and y, so X_test is a subset of the data the model was trained on; metrics computed on
# it are in-sample rather than held-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1)

In [26]:
def prediction_summary(user_input, ytrue=None):
  """
  Predict fraud for `user_input` and, when the true labels are known, also print evaluation metrics.

  Args:
    user_input:
      type: any of numpy array, pandas Series or dataframe.

      Expected to hold every feature apart from 'Class' (30 per transaction), arranged as in our
      dataset, so that the transaction amount is the last one. A 1-dim array / Series is treated
      as a single transaction.

    ytrue:
      type: any of numpy array or pandas Series (optional).
      The true labels for user_input, one per transaction and in the same order.

  Return:
    a dataframe of 'Class' and the probability of 'Class' being fraud. A 'Class' of 1 means fraud, while 0 means not fraud. If ytrue is given;
    f1_score and cost saving are also printed out.
  """
  pred = get_predictions(user_input, proba=True)
  is_fraud = (pred >= 0.5).astype(np.int64)
  pred_df = pd.DataFrame({'Class': is_fraud, 'Fraud_Probabilty': pred})

  if ytrue is not None: # if we know the true labels, it means we want to test the model and printing out metrics will be useful
    # The amount is the last feature. It is taken by position from a numpy view of the input so
    # that arrays, dataframes and Series (single or multi-input) are all handled the same way;
    # indexing a Series with [-1] would be a label lookup rather than a positional one.
    amount = np.atleast_2d(np.asarray(user_input))[:, -1]

    print('f1_score is {}'.format(f1_score(ytrue, is_fraud)))
    print('cost saving is {}'.format(cost_saving(ytrue, is_fraud, amount)))

  return pred_df # in any case, finally return the dataframe of predictions.

### Multi-output testing

In [27]:
# predict for every row of X_test, print f1_score and cost saving against y_test,
# then preview the first five predictions
result = prediction_summary(X_test, y_test)
result.head()

Instructions for updating:
Please use `model.predict()` instead.
f1_score is 0.8945147679324894
cost saving is 0.7310398178640722


Unnamed: 0,Class,Fraud_Probabilty
0,0,3.7e-05
1,0,0.000278
2,0,2.2e-05
3,0,4.4e-05
4,0,1.7e-05


In [28]:
# preview the first five rows that were predicted as fraud (Class == 1)
result[result['Class'] == 1].head()

Unnamed: 0,Class,Fraud_Probabilty
38,1,0.695852
317,1,0.955653
2399,1,0.917699
2471,1,0.796549
2851,1,0.997961


### Single input-output testing

In [29]:
# single transaction: the first row of X_test passed as a 1-dim numpy array.
# NOTE(review): the true label is hard-coded as 0 here rather than read from y_test.
prediction_summary(X_test.iloc[0].values, np.array([0]))

f1_score is 0.0
cost saving is 1.0


Unnamed: 0,Class,Fraud_Probabilty
0,0,3.7e-05
