In [None]:
#!python -m pip install --user --upgrade pip
#!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy scikit-learn==0.22 tensorflow==2.3 keras==2.4.3 --user

In [None]:
#!pip3 install kfp --upgrade --user

In [1]:
# Directory that receives the artifacts written by the local (non-pipeline)
# runs of the step functions in this notebook.
import os

output_dir = "/home/jovyan/data/"

# The step functions open their output files with gzip.open(..., "wb"), which
# does not create missing parent directories, so create the directory up front.
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [2]:
def data_download_n_class_declr(data_path):
    """Download the credit-card dataset and serialize it with the preprocessing objects.

    The function installs its dependencies with pip, seeds the random number
    generators, reads the dataset CSV, defines the ``ColumnSelector`` and
    ``ClipOutliers`` transformers, and writes unfitted instances of both
    (plus an unfitted ``StandardScaler`` and the raw DataFrame) into
    ``data_path`` as gzip-compressed dill files.

    Parameters
    ----------
    data_path : str
        Directory that receives ``columnSelector.gz.dill``, ``scaler.gz.dill``,
        ``clipper.gz.dill`` and ``data.gz.dill``. It must already exist;
        ``gzip.open`` does not create it.

    Returns
    -------
    None
    """
    # Imports are kept inside the function: the notebook also passes this
    # function to comp.func_to_container_op to run it as a container step.
    import sys, subprocess

    # Install the dependencies into the environment of the running interpreter.
    for requirement in (
        ["pandas"],
        ["numpy"],
        ["dill"],
        ["scikit-learn==0.22"],
        ["tensorflow==2.3", "keras==2.4.3"],
    ):
        subprocess.run([sys.executable, "-m", "pip", "install", *requirement])

    import dill
    import gzip
    import random as python_random
    import warnings

    import numpy as np
    import pandas as pd
    import tensorflow as tf

    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.preprocessing import StandardScaler

    warnings.filterwarnings('ignore')

    # setting random seed for result reproducibility
    np.random.seed(1)
    python_random.seed(12)
    tf.random.set_seed(123)

    # Data Download
    credit_card_df = pd.read_csv('https://raw.github.com/HamoyeHQ/g01-fraud-detection/master/data/credit_card_dataset.zip')

    print('=== DOWNLOAD DATA SUCCESSFUL ===')

    # CREATING THE COLUMN SELECTOR CLASS

    # 27 most important features according to our EDA (V1..V28 without V25)
    cols = ['V'+str(i) for i in range(1, 29) if i != 25]

    class ColumnSelector(BaseEstimator, TransformerMixin):
        """Select the configured ``V<n>`` columns and return them as a 2-D array."""

        def __init__(self, cols=cols):
            self.cols = cols

        def fit(self, X, y=None):
            # Stateless transformer: nothing to learn.
            return self

        def transform(self, X):
            """Return the selected columns of ``X`` as a 2-D ``np.ndarray``.

            Accepts a DataFrame (many rows), a Series (one row) or an ndarray
            (1-D for one row, 2-D for many rows).

            Raises
            ------
            TypeError
                If ``X`` is none of the supported types.
            """
            if isinstance(X, pd.DataFrame):
                return np.array(X[self.cols])

            if isinstance(X, pd.Series):
                return np.array(X[self.cols]).reshape(1, -1)

            if isinstance(X, np.ndarray):
                # 'V7' -> 7: the number in the column name is used directly as
                # the positional index. This assumes the array keeps the
                # layout of the source table with one column before V1 --
                # TODO confirm against the callers that pass raw arrays.
                self.cols_ind = [int(col[1:]) for col in self.cols]
                if len(X.shape) == 1:  # if one dimensional array
                    return X[self.cols_ind].reshape(1, -1)
                # Previously this return was nested after the 1-D return and
                # therefore unreachable, so 2-D arrays produced None.
                return X[:, self.cols_ind]

            raise TypeError('expected input type to be any of pd.Series, pd.DataFrame or np.ndarray but got {}'.format(type(X)))

    print('=== CREATED COLUMN SELECTOR ===')

    # CREATING THE OUTLIERS CLIPPER CLASS
    class ClipOutliers(BaseEstimator, TransformerMixin):
        """Clip every column to the percentile bounds learned in ``fit``."""

        def __init__(self, lower_percentile=1, upper_percentile=99):
            self.lower_percentile = lower_percentile
            self.upper_percentile = upper_percentile

        def fit(self, X, y=None):
            # Per-column lower (a) and upper (b) clipping bounds.
            self.a = np.percentile(X, self.lower_percentile, axis=0)
            self.b = np.percentile(X, self.upper_percentile, axis=0)

            return self

        def transform(self, X):
            # Return the clipped copy without keeping it on the instance:
            # storing it would pickle the whole transformed dataset together
            # with the fitted clipper.
            return np.clip(X, self.a, self.b)

    print('=== CREATED OUTLIER CLIPPER ===')

    cols_select = ColumnSelector()
    scaler = StandardScaler()
    clipper = ClipOutliers()

    print('=== SERIALIZING CLASSESS, AND DATA ===')

    with gzip.open(f"{data_path}/columnSelector.gz.dill", "wb") as f:
        dill.dump(cols_select, f)

    with gzip.open(f"{data_path}/scaler.gz.dill", "wb") as f:
        dill.dump(scaler, f)

    with gzip.open(f"{data_path}/clipper.gz.dill", "wb") as f:
        dill.dump(clipper, f)

    with gzip.open(f"{data_path}/data.gz.dill", "wb") as f:
        dill.dump(credit_card_df, f)

    print('=== DONE ===')

In [3]:
# Local run of the data-download / class-declaration step: writes its artifacts to output_dir.
data_download_n_class_declr(output_dir)

=== DOWNLOAD DATA SUCCESSFUL ===
=== CREATED COLUMN SELECTOR ===
=== CREATED OUTLIER CLIPPER ===
=== SERIALIZING CLASSESS, AND DATA ===
=== DONE ===


In [5]:
def fraud_sensitive_model(data_path):
    """Fit the KNN + MLP soft-voting ensemble and serialize the pieces needed to rebuild it.

    Reads the column selector, scaler, clipper and raw data from ``data_path``,
    fits the data-preparation pipeline and the voting ensemble on the whole
    dataset, and writes the fitted preparation pipeline, the fitted clipper,
    the features, the targets, the prepared features, the ensemble's label
    encoder (gzip-compressed dill files) and the Keras MLP (``mlp.h5``) back
    to ``data_path``.

    Parameters
    ----------
    data_path : str
        Directory holding the input artifacts; the outputs are written there too.

    Returns
    -------
    None
    """
    import sys, subprocess

    for requirement in (
        ["dill"],
        ["scikit-learn==0.22"],
        ["numpy"],
        ["pandas"],
        ["tensorflow==2.3", "keras==2.4.3"],
    ):
        subprocess.run([sys.executable, "-m", "pip", "install", *requirement])

    import tensorflow as tf
    import random as python_random

    from tensorflow.keras.models import load_model
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout

    # Seed Python's and TensorFlow's generators before any model is built.
    python_random.seed(12)
    tf.random.set_seed(123)

    import dill
    import gzip

    import numpy as np

    from keras.wrappers.scikit_learn import KerasClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import VotingClassifier

    def _load(stem):
        # Read one gzip-compressed dill artifact from data_path.
        with gzip.open(f"{data_path}/{stem}.gz.dill", "rb") as handle:
            return dill.load(handle)

    def _dump(obj, stem):
        # Write one gzip-compressed dill artifact to data_path.
        with gzip.open(f"{data_path}/{stem}.gz.dill", "wb") as handle:
            dill.dump(obj, handle)

    print('=== DE-SERIALIZING CLASSESS, AND DATA ===')

    columnselector = _load("columnSelector")
    scaler = _load("scaler")
    clipper = _load("clipper")
    data = _load("data")

    print('=== DONE ===')

    class ClassifierPipeline(Pipeline):
        """Pipeline that always reports itself as a classifier.

        VotingClassifier checks ``_estimator_type``; overriding it here
        avoids errors when the pipeline is used as one of its estimators.
        """

        @property
        def _estimator_type(self):
            return 'classifier'

    # BUILDING THE MLP MODEL FUNCTION

    # The output-layer bias starts at log(pos/neg), computed from the class counts.
    class_counts = np.bincount(data['Class'])
    neg, pos = class_counts
    initial_bias = np.log([pos/neg])

    print('=== BUILD MLP NETWORK ===')

    def build_model():
        """Build and compile the one-hidden-layer MLP used inside the ensemble."""
        network = Sequential()
        network.add(Dense(16, kernel_initializer='uniform', activation='relu'))
        network.add(Dropout(0.2))
        network.add(
            Dense(
                1,
                activation='sigmoid',
                bias_initializer=tf.keras.initializers.Constant(initial_bias),
            )
        )
        network.compile(loss='binary_crossentropy', optimizer='adam')

        return network

    print('=== DONE ===')

    print('=== CREATING VOTING ENSEMBLE MODEL ===')

    epochs = 4
    n_neighbors = 5

    # model 1: the Keras MLP wrapped for scikit-learn
    mlp = KerasClassifier(build_fn=build_model, epochs=epochs, batch_size=512, verbose=0)
    # model 2: k-nearest neighbours
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree', n_jobs=4)

    # model 1 requires clipping, so it is wrapped in a pipeline with the clipper
    clip_mlp = ClassifierPipeline([('clipper', clipper), ('mlp', mlp)])

    vote_ensemble = VotingClassifier(
        estimators=[('knn', knn), ('mlp', clip_mlp)],
        voting='soft',
    )

    print('=== DONE ===')

    print('=== CREATING DATA PREPARATION PIPELINE ===')
    data_prep = Pipeline([('columns', columnselector), ('scaler', scaler)])

    print('=== FITTING DATA PREPARATION PIPELINE TO DATA ===')

    # Split off the target; what remains of the frame is the feature table.
    y = data.pop('Class')
    X = data

    X_prep = data_prep.fit_transform(X, y)

    print('=== DONE ===')

    print('=== FITTING VOTE ENSEMBLED MODEL TO  PREPARED DATA ===')
    vote_ensemble.fit(X_prep, y)
    print('=== DONE===')

    print('=== SERIALIZING FUNCTIONS AND MODELS ===')

    # The fitted KNN estimator (estimators_[0]) is not serialized here.
    fitted_mlp_pipeline = vote_ensemble.estimators_[1]
    artifacts = [
        (data_prep, "data_prep_pipe"),             # fitted data-preparation pipeline
        (fitted_mlp_pipeline[0], "clipper2"),      # fitted clipper of the MLP pipeline
        (X, "features"),                           # features
        (y, "targets"),                            # targets
        (X_prep, "X_prep"),                        # transformed features
        (vote_ensemble.le_, "label_encoder"),      # label encoder of the ensemble
    ]
    for artifact, stem in artifacts:
        _dump(artifact, stem)

    # The Keras network is saved in its own HDF5 file.
    fitted_mlp_pipeline[1].model.save(f'{data_path}/mlp.h5')

    print('=== DONE ===')

In [6]:
# Local run of the model-fitting step against the artifacts in output_dir.
fraud_sensitive_model(output_dir)

=== DE-SERIALIZING CLASSESS, AND DATA ===
=== DONE ===
=== BUILD MLP NETWORK ===
=== DONE ===
=== CREATING VOTING ENSEMBLE MODEL ===
=== DONE ===
=== CREATING DATA PREPARATION PIPELINE ===
=== FITTING DATA PREPARATION PIPELINE TO DATA ===
=== DONE ===
=== FITTING VOTE ENSEMBLED MODEL TO  PREPARED DATA ===
=== DONE===
=== SERIALIZING FUNCTIONS AND MODELS ===
=== DONE ===


In [7]:
def train_predict(data_path):
    """Rebuild the voting ensemble from its serialized parts and predict on a test split.

    Loads the fitted data-preparation pipeline, fitted clipper, label encoder,
    features, targets, prepared features and the saved Keras MLP from
    ``data_path``; refits a KNN on the prepared features; reassembles a
    soft-voting ensemble from the two models without refitting it; and writes
    the positive-class probabilities for a 25% stratified test split, together
    with that split's features and targets, back to ``data_path``.

    NOTE(review): the KNN is fitted on all of ``X_prep`` and the MLP is loaded
    already fitted, while the evaluated rows are a split of the same ``X``.
    The predictions are therefore for rows the models have presumably already
    seen -- confirm this is intended before trusting the downstream metrics.

    Parameters
    ----------
    data_path : str
        Directory holding the input artifacts; the outputs are written there too.

    Returns
    -------
    None
    """
    import sys, subprocess

    for requirement in (
        ["pandas"],
        ["dill"],
        ["scikit-learn==0.22"],
        ["numpy"],
        ["tensorflow==2.3", "keras==2.4.3"],
    ):
        subprocess.run([sys.executable, "-m", "pip", "install", *requirement])

    import tensorflow as tf
    import random as python_random

    from tensorflow.keras.models import load_model
    from sklearn.model_selection import train_test_split

    python_random.seed(12)
    tf.random.set_seed(123)

    import dill
    import gzip

    import numpy as np

    from keras.wrappers.scikit_learn import KerasClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import VotingClassifier

    def _load(stem):
        # Read one gzip-compressed dill artifact from data_path.
        with gzip.open(f"{data_path}/{stem}.gz.dill", 'rb') as f:
            return dill.load(f)

    def _dump(obj, stem):
        # Write one gzip-compressed dill artifact to data_path.
        with gzip.open(f"{data_path}/{stem}.gz.dill", "wb") as f:
            dill.dump(obj, f)

    print('=== DE-SERIALIZING CLASSESS, AND DATA ===')

    # loading in useful objects
    data_prep = _load("data_prep_pipe")
    clipper2 = _load("clipper2")
    le = _load("label_encoder")
    X = _load("features")
    y = _load("targets")
    X_prep = _load("X_prep")

    def build_model():
        # Loads the already-fitted MLP instead of building a fresh network.
        return load_model(f"{data_path}/mlp.h5")

    print('=== DONE ===')

    print('=== FITTING THE TRANSFORMED DATA TO THE KNN MODEL ===')

    # The KNN is refitted here rather than loaded from a serialized artifact.
    n_neighbors = 5
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree', n_jobs=4)
    knn.fit(X_prep, y)

    print('=== DONE ===')

    # setting useful attributes and parameters
    classes = np.array([0, 1])
    epochs = 4
    batch_size = 512

    print('=== INITIALIZE THE MLP MODEL ===')

    class ClassifierPipeline(Pipeline):
        """Pipeline that always reports itself as a classifier.

        VotingClassifier checks ``_estimator_type``; overriding it here
        avoids errors when the pipeline is used as one of its estimators.
        """

        @property
        def _estimator_type(self):
            return 'classifier'

    # The wrapper is never fitted here: the attributes fit() would normally
    # set (model, classes_) are assigned by hand.
    mlp = KerasClassifier(build_fn=build_model, epochs=epochs, batch_size=batch_size, verbose=0)
    mlp.model = build_model()
    mlp.classes_ = classes

    clip_mlp = ClassifierPipeline([('clipper2', clipper2), ('mlp', mlp)])  # clipping pipeline

    print('=== DONE ===')

    print('=== RECONSTRUCTING THE VOTING CLASSIFIER ===')

    # Same approach for the ensemble: set the fitted attributes directly
    # instead of calling fit().
    vote_ensemble = VotingClassifier(estimators=[('knn', knn), ('mlp', clip_mlp)], voting='soft')
    vote_ensemble.classes_ = classes
    vote_ensemble.estimators_ = [knn, clip_mlp]
    vote_ensemble.le_ = le

    print('=== DONE ===')

    print('=== DATA SPLITTING AND PREPARATION ===')

    # Only the test part of the split is used below.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1)

    Xt = data_prep.transform(X_test)

    print('=== DONE ===')

    print('=== MODEL PREDICTION ===')

    # Probability of belonging to the positive class.
    class_proba = vote_ensemble.predict_proba(Xt)
    if len(class_proba.shape) > 1:  # 2-dim result (multi-input)
        fraud_proba = class_proba[:, 1]
    else:  # 1-dim result (single-input)
        fraud_proba = class_proba[1]

    print('=== DONE ===')

    print('=== SERIALIZING PREDICTIONS, TEST FEATURES AND TEST TARGETS ===')

    _dump(fraud_proba, "predictions")
    _dump(y_test, "testTargets")
    _dump(X_test, "testFeatures")

    print(' === DONE ===')

In [8]:
# Local run of the prediction step against the artifacts in output_dir.
train_predict(output_dir)

=== DE-SERIALIZING CLASSESS, AND DATA ===
=== DONE ===
=== FITTING THE TRANSFORMED DATA TO THE KNN MODEL ===
=== DONE ===
=== INITIALIZE THE MLP MODEL ===
=== DONE ===
=== RECONSTRUCTING THE VOTING CLASSIFIER ===
=== DONE ===
=== DATA SPLITTING AND PREPARATION ===
=== DONE ===
=== MODEL PREDICTION ===
Instructions for updating:
Please use `model.predict()` instead.
=== DONE ===
=== SERIALIZING PREDICTIONS, TEST FEATURES AND TEST TARGETS ===
 === DONE ===


In [9]:
def prediction_summary(data_path):
    """Print F1 score and cost saving for the stored test-set predictions.

    Loads the predicted fraud probabilities, the test targets and the test
    features from ``data_path`` and prints two summaries: one over the whole
    test set and one for its first row only.

    Parameters
    ----------
    data_path : str
        Directory holding ``predictions.gz.dill``, ``testTargets.gz.dill``
        and ``testFeatures.gz.dill``.

    Returns
    -------
    None
    """
    import sys, subprocess

    for requirement in (
        ["pandas"],
        ["dill"],
        ["scikit-learn==0.22"],
        ["numpy"],
        ["tensorflow==2.3", "keras==2.4.3"],
    ):
        subprocess.run([sys.executable, "-m", "pip", "install", *requirement])

    import tensorflow as tf
    import random as python_random

    from sklearn.metrics import f1_score

    python_random.seed(12)
    tf.random.set_seed(123)

    import dill
    import gzip

    import numpy as np
    import pandas as pd

    print('=== DE-SERIALIZING PREDICTIONS, TEST FEATURES, AND TEST TARGETS ===')

    with gzip.open(f"{data_path}/predictions.gz.dill", 'rb') as f:
        pred = dill.load(f)

    with gzip.open(f"{data_path}/testTargets.gz.dill", 'rb') as f:
        y_test = dill.load(f)

    with gzip.open(f"{data_path}/testFeatures.gz.dill", "rb") as f:
        x_test = dill.load(f)

    print('=== DONE ===')

    def cost_saving(ytrue, ypred, amount, threshold=0.5, admin_cost=2.5, epsilon=1e-7):
        """Return ``1 - cost / max_cost`` for a set of binary predictions.

        ``cost`` is ``admin_cost`` per false positive plus the summed
        ``amount`` of the frauds predicted as non-fraud; ``max_cost`` is the
        summed ``amount`` of all true frauds. ``epsilon`` guards the division
        when there are no true frauds. ``threshold`` is accepted but not used:
        ``ypred`` must already hold 0/1 labels.
        """
        ypred = ypred.flatten()
        fp = np.sum((ytrue == 0) & (ypred == 1))
        cost = np.sum(fp*admin_cost) + np.sum((amount[(ytrue == 1) & (ypred == 0)]))
        max_cost = np.sum((amount[(ytrue == 1)]))
        savings = 1 - (cost/(max_cost+epsilon))

        return savings

    print('=== MULTI-INPUT TESTING ===')

    is_fraud = (pred >= 0.5).astype(np.int64)
    pred_df = pd.DataFrame({'Class': is_fraud, 'Fraud_Probabilty': pred})

    print('f1_score is {}'.format(f1_score(y_test, is_fraud)))
    # The last feature column is used as the transaction amount.
    amount = x_test.iloc[:, -1]
    print('cost saving is {}'.format(cost_saving(y_test, is_fraud, amount)))

    print(pred_df.head())

    print('=== DONE ===')

    print('=== SINGLE-INPUT TESTING ===')

    # y_test keeps the index labels of the rows it was split from, so plain
    # y_test[0] is a label lookup that can return a different row than pred[0]
    # or raise KeyError. Every "first row" access below is positional.
    first_true = y_test.iloc[0]
    first_proba = pred[0]
    first_amount = np.array([x_test.iloc[0, -1]])

    is_fraud2 = (first_proba >= 0.5).astype(np.int64)

    pred_df2 = pd.DataFrame({'Class': is_fraud2, 'Fraud_Probabilty': first_proba}, index=[0])

    print(first_true)
    print(is_fraud2)

    print('f1_score is {}'.format(f1_score([first_true], [is_fraud2])))
    print('cost saving is {}'.format(cost_saving(first_true, is_fraud2, first_amount)))

    print(pred_df2)

    print('=== DONE ===')


In [10]:
# Local run of the summary step against the artifacts in output_dir.
prediction_summary(output_dir)

=== DE-SERIALIZING PREDICTIONS, TEST FEATURES, AND TEST TARGETS ===
=== DONE ===
=== MULTI-INPUT TESTING ===
f1_score is 0.8945147679324894
cost saving is 0.7310398178640722
   Class  Fraud_Probabilty
0      0          0.000037
1      0          0.000278
2      0          0.000022
3      0          0.000044
4      0          0.000017
=== DONE ===
=== SINGLE-INPUT TESTING ===
0
0
f1_score is 0.0
cost saving is 1.0
   Class  Fraud_Probabilty
0      0          0.000037
=== DONE ===


### CREATING COMPONENTS

In [11]:
import kfp
from kfp import dsl
import kfp.components as comp

In [12]:
# Check whether a `dsl-compile` executable is on PATH (presumably the KFP SDK's compiler CLI -- confirm).
!which dsl-compile

In [13]:
# Every step function is wrapped as a container op on the same base image.
# NOTE(review): the tag is a floating "latest" one, while the step functions
# pin tensorflow==2.3 with pip at run time -- consider pinning the image too.
BASE_IMAGE = "tensorflow/tensorflow:latest-gpu-py3"

data_download_n_class_declr_op = comp.func_to_container_op(
    data_download_n_class_declr, base_image=BASE_IMAGE
)
fraud_sensitive_model_op = comp.func_to_container_op(
    fraud_sensitive_model, base_image=BASE_IMAGE
)
train_predict_op = comp.func_to_container_op(
    train_predict, base_image=BASE_IMAGE
)
prediction_summary_op = comp.func_to_container_op(
    prediction_summary, base_image=BASE_IMAGE
)

### CREATING PIPELINES

In [14]:
# Create the client used to communicate with the Pipelines API server.
client = kfp.Client()

In [15]:
@dsl.pipeline(
    name="Fraud Detection",
    description="Fraud Detection Pipeline",
)
def fraud_detection(data_path: str):
    """Chain the four step ops over one shared volume mounted at ``data_path``.

    Each step mounts the volume handed on by the previous step's ``pvolume``,
    so the artifacts one step writes under ``data_path`` are what the next
    step reads.

    Parameters
    ----------
    data_path : str
        Mount point of the shared volume inside every step's container; it is
        also passed to each step as its working directory argument.
    """
    # 1Gi read-write-once volume shared by all steps.
    shared_volume = dsl.VolumeOp(
        name="data_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Step 1: download the data and declare/serialize the preprocessing classes.
    download_step = data_download_n_class_declr_op(data_path).add_pvolumes(
        {data_path: shared_volume.volume}
    )

    # Step 2: fit the preparation pipeline and the voting ensemble.
    model_step = fraud_sensitive_model_op(data_path).add_pvolumes(
        {data_path: download_step.pvolume}
    )

    # Step 3: rebuild the ensemble and predict on the test split.
    predict_step = train_predict_op(data_path).add_pvolumes(
        {data_path: model_step.pvolume}
    )

    # Step 4: print the prediction summary.
    summary_step = prediction_summary_op(data_path).add_pvolumes(
        {data_path: predict_step.pvolume}
    )

In [16]:
# Path handed to every pipeline step as `data_path`
# (the local runs earlier in the notebook used "/home/jovyan/data/" instead).
DATA_PATH = "/mnt"

pipeline_func = fraud_detection

experiment_name = 'fraud_detection_kubeflow'
run_name = f"{pipeline_func.__name__} run"

arguments = {"data_path": DATA_PATH}

# Compile the pipeline into a compressed definition named after the experiment.
kfp.compiler.Compiler().compile(pipeline_func, f"{experiment_name}.zip")

# Submit a run directly from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)