## Import Necessary Libraries

In [None]:
# Upgrade pip in the user site, then install the pinned library versions listed below.
# NOTE(review): `!python` / `!pip3` run in a subshell and may resolve to a different
# interpreter than the notebook kernel — confirm they target the same environment.
!python -m pip install --user --upgrade pip
!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.22 tensorflow==2.0 keras==1.2.2 --user


In [None]:
import numpy as np
import pandas as  pd
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV

# Evaluating the model
from sklearn.metrics import classification_report, confusion_matrix

### Install Kubeflow pipelines SDK

In [None]:
# Install (or upgrade to) the newest available `kfp` package in the user site.
# NOTE(review): unlike the other dependencies, this version is not pinned.
!pip3 install kfp --upgrade --user

In [None]:
# Print the location of the `dsl-compile` executable to confirm it is discoverable on PATH.
!which dsl-compile

## Build the Components

In [None]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [None]:
# Directory passed as `data_path` to the in-notebook train/predict calls; the model,
# the pickled test split and result.txt are written beneath it.
# NOTE(review): this is an absolute, environment-specific path — confirm it exists on your server.
out_dir = "/home/jovyan/02-heart-failure/data/out/"

In [None]:
def train(data_path, model_file):
    """Train the heart-failure random-forest classifier and persist its artifacts.

    All imports live inside the function body so that the function is
    self-contained when it is packaged as a Kubeflow lightweight component.

    Args:
        data_path: Directory the artifacts are written to. It is created if it
            does not exist yet.
        model_file: File name, inside ``data_path``, of the pickled classifier.

    Returns:
        The fitted ``RandomForestClassifier``.

    Side effects:
        Writes ``{data_path}/{model_file}`` (the classifier, serialized with
        ``pickle``) and ``{data_path}/test_data`` (a pickled ``(xtest, ytest)``
        tuple holding the held-out fold).
    """
    import os
    import pickle
    import subprocess
    import sys

    # Install the pinned dependencies before importing them.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import KFold

    # Download the dataset. The raw.githubusercontent.com URL serves the CSV itself;
    # the github.com/.../tree/... URL serves an HTML page that read_csv cannot parse.
    hf_data = pd.read_csv("https://raw.githubusercontent.com/HamoyeHQ/02-heart-failure/master/data/heart_failure_clinical_records_dataset.csv")

    target = 'DEATH_EVENT'
    # The subset of features the model is trained on.
    best_features = ['time', 'ejection_fraction', 'serum_creatinine', 'age',
                     'serum_sodium', 'high_blood_pressure', 'creatinine_phosphokinase']

    # Splitting the dataset into x and y
    x = hf_data.drop([target], axis=1)
    y = hf_data[target]

    # Instantiating the model with the chosen hyper-parameters.
    # random_state is fixed so repeated runs produce the same forest.
    clf = RandomForestClassifier(n_estimators=50, min_samples_split=7,
                                 min_samples_leaf=6, max_features='log2',
                                 random_state=42)

    # Shuffled 10-fold split (KFold is not stratified). Only the LAST fold is kept
    # as the train/test partition, which is what the original loop ended up with.
    split = KFold(n_splits=10, shuffle=True, random_state=42)
    train_index, test_index = list(split.split(x, y))[-1]

    # KFold yields positional indices, so select rows with iloc rather than loc.
    train_df = hf_data.iloc[train_index]
    test_df = hf_data.iloc[test_index]

    xtrain, ytrain = train_df[best_features], train_df[target]
    xtest, ytest = test_df[best_features], test_df[target]

    # Fit the forest (scikit-learn estimators have no notion of epochs).
    clf.fit(xtrain, ytrain)

    # Evaluate the model and print the results; score() returns mean accuracy.
    test_acc = clf.score(xtest, ytest)
    print('Test accuracy:', test_acc)

    # Make sure the output directory exists before writing into it.
    os.makedirs(data_path, exist_ok=True)

    # Save the model with pickle; scikit-learn estimators have no .save() method.
    with open(f'{data_path}/{model_file}', 'wb') as f:
        pickle.dump(clf, f)

    # Save the test data as a pickle file to be used by the predict component.
    with open(f'{data_path}/test_data', 'wb') as f:
        pickle.dump((xtest, ytest), f)

    return clf

In [None]:
# Run the training step inside the notebook kernel, with out_dir as the data path
# and "model" as the model file name.
clf = train(out_dir, "model")

In [None]:
def predict(data_path, model_file):
    """Load the trained model and the held-out split, predict, and write the result.

    All imports live inside the function body so that the function is
    self-contained when it is packaged as a Kubeflow lightweight component.

    Args:
        data_path: Directory holding the model file and the ``test_data`` pickle;
            ``result.txt`` is written into the same directory.
        model_file: File name, inside ``data_path``, of the pickled estimator.
            The loaded object must expose ``predict()``.

    Side effects:
        Writes ``{data_path}/result.txt`` containing the predictions and the
        actual labels (cast to bool).
    """
    import pickle
    import subprocess
    import sys

    # Unpickling a scikit-learn estimator and pandas objects requires both
    # packages to be importable, so install the same pinned versions as training.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])

    # Load the saved model. A random forest is not a Keras model, so it is read
    # with pickle instead of keras.models.load_model.
    # NOTE: pickle.load executes arbitrary code; only load files this pipeline produced.
    with open(f'{data_path}/{model_file}', 'rb') as f:
        clf = pickle.load(f)

    # Load and unpack the test data into features and labels.
    with open(f'{data_path}/test_data', 'rb') as f:
        xtest, ytest = pickle.load(f)

    # Make predictions.
    y_pred = clf.predict(xtest)

    # The builtin bool replaces np.bool, which modern NumPy no longer provides.
    with open(f'{data_path}/result.txt', 'w') as result:
        result.write(" Prediction: {}, Actual: {} ".format(y_pred, ytest.astype(bool)))

    print('Prediction has be saved successfully!')

In [None]:
# Run the prediction step inside the notebook kernel against the files under out_dir.
predict(out_dir, "model")

In [None]:
# Wrap both step functions as lightweight container components on the same base image.
train_op, predict_op = (
    comp.func_to_container_op(step_fn, base_image="tensorflow/tensorflow:latest-gpu-py3")
    for step_fn in (train, predict)
)

## Build Kubeflow Pipeline

In [None]:
# Create a client to enable communication with the Pipelines API server.
# No host is passed, so kfp.Client() falls back to its default connection settings.
client = kfp.Client()

In [None]:
# Pipeline definition: create a volume, train, predict, then print the prediction file.
# `data_path` is the path the shared volume is mounted at in every step;
# `model_file` is the model file name handed to the train and predict components.
@dsl.pipeline(
    name='Heart Failure Pipeline',
    description='An ML pipeline that performs Heart Failure model training and Death Event prediction.',
)
def heartfailure_container_pipeline(
    data_path: str,
    model_file: str
):
    # Volume through which the steps exchange files.
    volume_step = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Heart-failure training step, with the new volume mounted at data_path.
    training_step = (
        train_op(data_path, model_file)
        .add_pvolumes({data_path: volume_step.volume})
    )

    # Heart-failure prediction step; it mounts the volume as left by the training step.
    prediction_step = (
        predict_op(data_path, model_file)
        .add_pvolumes({data_path: training_step.pvolume})
    )

    # Final step: cat the result file written by the prediction step.
    print_step = dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: prediction_step.pvolume},
        arguments=['cat', f'{data_path}/result.txt'],
    )

## Run the Pipeline

In [None]:
# Value supplied as the pipeline's `data_path` argument (the path the shared volume is mounted at).
DATA_PATH = '/hfp'
# Value supplied as the pipeline's `model_file` argument (file name of the saved model).
MODEL_PATH='heartfailure_predictor.h5'

In [None]:
# Alias for the pipeline function that gets compiled and submitted.
pipeline_func = heartfailure_container_pipeline

In [None]:
# Experiment name, plus a run name derived from the pipeline function's name.
experiment_name = 'heartfailure_kubeflow'
run_name = f'{pipeline_func.__name__} run'

# Values bound to the pipeline's parameters for this run.
arguments = dict(data_path=DATA_PATH, model_file=MODEL_PATH)

# Compile the pipeline into a .zip package named after the experiment.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit a run directly from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name,
)