## Import Necessary Libraries

In [1]:
!python -m pip install --user --upgrade pip
!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.22 tensorflow==2.0 keras==1.2.2 --user


Requirement already up-to-date: pip in /home/jovyan/.local/lib/python3.6/site-packages (20.2.3)
Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [2]:
import numpy as np
import pandas as  pd
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# evaluating the model
from sklearn.metrics import classification_report, confusion_matrix

### Install the Kubeflow Pipelines SDK

In [3]:
!pip3 install kfp --upgrade --user

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Requirement already up-to-date: kfp in /home/jovyan/.local/lib/python3.6/site-packages (1.0.3)


In [4]:
# Check that the `dsl-compile` command is on PATH (prints its location if found).
!which dsl-compile

## Build the Components

In [5]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [6]:
# Directory where the outputs are stored.
# NOTE(review): absolute, user-specific path that no cell visible here
# references — confirm it is still needed.
out_dir: str = "/home/jovyan/02-heart-failure/data/out/"

## Preprocessing the data

In [7]:
def preprocess(data_path):
    """Download the heart-failure dataset, split it and save the arrays.

    The function is turned into a Kubeflow lightweight component, so it must be
    self-contained: its dependencies are installed and imported in the body.

    Args:
        data_path: Directory in which ``preprocessed-data.npz`` is written. The
            archive holds the arrays ``xtrain``, ``xtest``, ``ytrain``, ``ytest``.

    Raises:
        subprocess.CalledProcessError: If installing the dependencies fails.
    """
    import sys, subprocess
    # check=True: stop here if pip fails instead of hitting a confusing
    # ImportError a few lines further down.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas', 'scikit-learn'], check=True)
    from sklearn.model_selection import StratifiedShuffleSplit
    import numpy as np
    import pandas as pd

    # Download the dataset.
    hf_data = pd.read_csv("https://raw.githubusercontent.com/HamoyeHQ/02-heart-failure/master/data/heart_failure_clinical_records_dataset.csv")

    # Target column, used to stratify the split.
    y = hf_data['DEATH_EVENT']

    # One stratified 90/10 train/test split, so both parts keep the class
    # ratio of DEATH_EVENT. (Looping over a 10-fold KFold and keeping only the
    # last fold gave a split of similar size, but it was not stratified.)
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    train_index, test_index = next(split.split(hf_data, y))

    # The splitter yields positional indices, hence iloc rather than loc.
    train = hf_data.iloc[train_index]
    test = hf_data.iloc[test_index]

    # Taking only the best features obtained
    features = ['time', 'ejection_fraction', 'serum_creatinine', 'age', 'serum_sodium', 'high_blood_pressure', 'creatinine_phosphokinase']
    xtrain = train[features]
    ytrain = train['DEATH_EVENT']
    xtest = test[features]
    ytest = test['DEATH_EVENT']

    # Write the four arrays to a single compressed archive under data_path.
    np.savez_compressed(f'{data_path}/preprocessed-data.npz',
                        xtrain=xtrain,
                        xtest=xtest,
                        ytrain=ytrain,
                        ytest=ytest)
    print("Preprocessing Done")

## Training the Random Forest Classifier

In [8]:
def train(data_path, clf_file):
    """Fit a random-forest classifier on the preprocessed training split.

    The function is turned into a Kubeflow lightweight component, so it must be
    self-contained: its dependencies are installed and imported in the body.

    Args:
        data_path: Directory that contains ``preprocessed-data.npz``; the
            fitted model is written to the same directory.
        clf_file: File name (relative to ``data_path``) of the pickled model.

    Raises:
        subprocess.CalledProcessError: If installing the dependencies fails.
    """
    # Import libraries and dependencies.
    import pickle
    import sys, subprocess
    # check=True: stop here if pip fails instead of hitting a confusing
    # ImportError on the next line.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas', 'scikit-learn'], check=True)
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np

    # Load the training arrays; the with-block closes the archive afterwards.
    with np.load(f'{data_path}/preprocessed-data.npz') as preprocessed_data:
        x_train = preprocessed_data['xtrain']
        y_train = preprocessed_data['ytrain']

    # Instantiate the model with the best hyper-parameters obtained.
    # random_state is fixed so that repeated runs produce the same forest.
    clf = RandomForestClassifier(n_estimators=50,
                                 min_samples_split=7,
                                 min_samples_leaf=6,
                                 max_features='log2',
                                 random_state=42)

    # Fit the forest on the training split.
    clf.fit(x_train, y_train)

    # Save the fitted model next to the data.
    with open(f'{data_path}/{clf_file}', 'wb') as file:
        pickle.dump(clf, file)

    print("Model Trained")

## Prediction using the model

In [9]:
def predict(data_path, clf_file):
    """Evaluate the saved classifier on the test split and store the predictions.

    Prints the accuracy and F1 score and writes the predicted and actual labels
    to ``model_result.txt`` under ``data_path``.

    The function is turned into a Kubeflow lightweight component, so it must be
    self-contained: its dependencies are installed and imported in the body.

    Args:
        data_path: Directory that contains ``preprocessed-data.npz`` and the
            pickled model; the result file is written here as well.
        clf_file: File name (relative to ``data_path``) of the pickled model.

    Raises:
        subprocess.CalledProcessError: If installing the dependencies fails.
    """
    import pickle
    import numpy as np
    import sys, subprocess
    # check=True: stop here if pip fails instead of failing later when the
    # pickled scikit-learn model cannot be imported.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas', 'scikit-learn'], check=True)
    from sklearn.metrics import accuracy_score, f1_score

    # Load the saved random-forest classifier.
    # NOTE: pickle.load can execute arbitrary code; this is only acceptable
    # because the file is expected to be the one written by the training step.
    with open(f'{data_path}/{clf_file}','rb') as file:
        clf = pickle.load(file)

    # Load the test arrays; the with-block closes the archive afterwards.
    with np.load(f'{data_path}/preprocessed-data.npz') as preprocessed_data:
        x_test = preprocessed_data['xtest']
        y_test = preprocessed_data['ytest']

    # Evaluate the model and print the results.
    clf_pred = clf.predict(x_test)

    print('Model \nAccuracy score = {} \nF1_score = {}' .format(accuracy_score(y_test, clf_pred), f1_score(y_test, clf_pred)))

    # Persist predicted vs. actual labels for the final pipeline step to show.
    with open(f'{data_path}/model_result.txt', 'w') as result:
        result.write(" Prediction: {},\nActual: {} ".format(clf_pred,y_test))

    print('Prediction has been saved successfully!')

In [10]:
# Create the preprocess, train and predict lightweight components.
# All three functions are wrapped the same way, on the same base image.
# NOTE(review): a GPU TensorFlow image is used although the components only
# pip-install pandas/scikit-learn — confirm a lighter image would not do.
preprocess_op, train_op, predict_op = (
    comp.func_to_container_op(step_func, base_image="tensorflow/tensorflow:latest-gpu-py3")
    for step_func in (preprocess, train, predict)
)

## Build Kubeflow Pipeline

In [11]:
# Create a client to enable communication with the Pipelines API server.
# No arguments are passed, so the client uses its default connection settings.
client = kfp.Client()

In [12]:
# Define the pipeline
@dsl.pipeline(
    name='Heart Failure Pipeline',
    description='An ML pipeline that performs Heart Failure model training and Death Event prediction.'
)
def heartfailure_container_pipeline(data_path: str, clf_file: str):
    # Pipeline parameters:
    #   data_path - path at which the shared volume is mounted in every step
    #   clf_file  - file name of the pickled classifier on that volume

    # Volume (1Gi, RWO mode) used to share data between the steps.
    volume_op = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Preprocess step: mounts the freshly created volume at data_path.
    preprocess_step = preprocess_op(data_path)
    preprocess_step = preprocess_step.add_pvolumes({data_path: volume_op.volume})

    # Training step: mounts the volume as handed on by the preprocess step.
    train_step = train_op(data_path, clf_file)
    train_step = train_step.add_pvolumes({data_path: preprocess_step.pvolume})

    # Prediction step: mounts the volume as handed on by the training step.
    predict_step = predict_op(data_path, clf_file)
    predict_step = predict_step.add_pvolumes({data_path: train_step.pvolume})

    # Last step: print the beginning of the saved prediction file.
    result_step = dsl.ContainerOp(
        name="prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: predict_step.pvolume},
        arguments=['head', f'{data_path}/model_result.txt'],
    )

## Run the Pipeline

In [13]:
# Value for the pipeline's `data_path` parameter: the path at which the shared
# volume is mounted in each step.
DATA_PATH = '/mnt'
# Value for the pipeline's `clf_file` parameter: file name of the pickled
# classifier, stored under the data path.
CLF_PATH='heartfailure_predictor.pkl'

In [14]:
# Alias for the pipeline function that gets compiled and submitted.
pipeline_func = heartfailure_container_pipeline

In [15]:
experiment_name = 'heartfailure_kubeflow'
run_name = pipeline_func.__name__ + ' run'

# Values for the two pipeline parameters.
arguments = dict(data_path=DATA_PATH, clf_file=CLF_PATH)

# Compile the pipeline into a compressed YAML package named after the experiment.
package_path = '{}.zip'.format(experiment_name)
kfp.compiler.Compiler().compile(pipeline_func, package_path)

# Submit a run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)