In [None]:
#installing pip
!python -m pip install --user --upgrade pip --quiet

In [None]:
#installing kfp
!pip install kfp --upgrade --user --quiet

In [1]:
%%bash
pwd

/home/jovyan


In [2]:
cd maven_kubeflow_pipeline

/home/jovyan/maven_kubeflow_pipeline


In [3]:
%%bash
mkdir download_data
mkdir model_training
mkdir print_result

mkdir: cannot create directory ‘download_data’: File exists
mkdir: cannot create directory ‘model_training’: File exists
mkdir: cannot create directory ‘print_result’: File exists


CalledProcessError: Command 'b'mkdir download_data\nmkdir model_training\nmkdir print_result\n'' returned non-zero exit status 1.

In [4]:
download_data_file = "./download_data/download_data.py"
docker_data_file = "./download_data/Dockerfile"
yaml_data_file = "./download_data/download_data.yaml"

In [5]:
%%writefile $download_data_file
import pickle

import argparse
from pathlib import Path

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

def download_data(args):

    # Gets and split dataset
    x, y = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    data = x_train, x_test, y_train, y_test 
        
    #Save the train_data and test_data as a pickle file to be used by the next component.
    with open(args.data, 'wb') as f:
        pickle.dump(data, f)

if __name__ == '__main__':
    
    # This component does not receive any input
    # it only outpus one artifact which is `data`.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str)
    
    args = parser.parse_args()
    
    # Creating the directory where the output file will be created 
    # (the directory may or may not exist).
    Path(args.data).parent.mkdir(parents=True, exist_ok=True)

    download_data(args)
    

Overwriting ./download_data/download_data.py


In [6]:
%%writefile $docker_data_file

FROM python:3.9-slim-buster
WORKDIR /maven
RUN pip install sklearn
COPY download_data.py /maven

Overwriting ./download_data/Dockerfile


In [7]:
%%writefile $yaml_data_file
name: Download Data Function
description: Download toy data from sklearn datasets

outputs:
- {name: Data, type: LocalPath, description: 'Path where data will be stored.'}

implementation:
  container:
    image: mavencodev/breast_cancer_download_data:2.0
    command: [
      python, download_data.py,

      --data,
      {outputPath: Data},
    ]

Overwriting ./download_data/download_data.yaml


In [8]:
model_training_file = "./model_training/model_training.py"
docker_model_file = "./model_training/Dockerfile"
yaml_model_file = "./model_training/model_training.yaml"

In [9]:
%%writefile $model_training_file
import pickle

import argparse
from pathlib import Path

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

def model_training(args):

    # Load and unpack the train_data
    with open(args.data, 'rb') as f:
        data = pickle.load(f)

    x_train, x_test, y_train, y_test  = data 
    
    # Initialize and train the model
    model = RandomForestClassifier(random_state=1)
    model.fit(x_train, y_train)

    # Get predictions
    y_pred = model.predict(x_test)
    
    # Get accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Save output into file
    with open(args.accuracy, 'w') as accuracy_file:
        accuracy_file.write(str(accuracy))



if __name__ == '__main__':

    # Defining and parsing the command-line arguments
    parser = argparse.ArgumentParser(description='My program description')
    parser.add_argument('--data', type=str)
    parser.add_argument('--accuracy', type=str)

    args = parser.parse_args()

    # Creating the directory where the output file will be created (the directory may or may not exist).
    Path(args.accuracy).parent.mkdir(parents=True, exist_ok=True)
    
    model_training(args)

Overwriting ./model_training/model_training.py


In [10]:
%%writefile $docker_model_file

FROM python:3.9-slim-buster
WORKDIR /maven
RUN pip install sklearn
COPY model_training.py /maven

Overwriting ./model_training/Dockerfile


In [11]:
%%writefile $yaml_model_file
name: Random Forest Classifier
description: Trains a Random Forest Classifier

inputs:
- {name: Data, type: LocalPath, description: 'Path where data is stored.'}
outputs:
- {name: Accuracy, type: Float, description: 'Accuracy metric'}

implementation:
  container:
    image: mavencodev/breast_cancer_random_forest:2.0
    command: [
      python, model_training.py,

      --data,
      {inputPath: Data},

      --accuracy,
      {outputPath: Accuracy},

    ]

Overwriting ./model_training/model_training.yaml


In [12]:
print_result_file = "./print_result/print_result.py"
docker_acc_file = "./print_result/Dockerfile"
yaml_acc_file = "./print_result/print_result.yaml"

In [13]:
%%writefile $print_result_file
import argparse

def print_result(args):
    # Print results
    with open(args.accuracy, 'r') as f:
        score = f.read()

    print(f"Random forest (accuracy): {score}")
    
if __name__ == '__main__':

    # Defining and parsing the command-line arguments
    parser = argparse.ArgumentParser(description='My program description')
    parser.add_argument('--data', type=str)
    parser.add_argument('--accuracy', type=str)

    args = parser.parse_args()
    
    print_result(args)

Overwriting ./print_result/print_result.py


In [14]:
%%writefile $docker_acc_file

FROM python:3.9-slim-buster
WORKDIR /maven
RUN pip install sklearn
COPY print_result.py /maven

Overwriting ./print_result/Dockerfile


In [15]:
%%writefile $yaml_acc_file
name: Prints result
description: Prints Random Forest Classifier result

inputs:
- {name: Accuracy, type: Float, description: 'Accuracy metric'}

implementation:
  container:
    image: mavencodev/breast_cancer_accuracy:3.0
    command: [
      python, print_result.py,

      --accuracy,
      {inputPath: Accuracy},


    ]

Overwriting ./print_result/print_result.yaml


In [16]:
pipeline_file = "pipeline.py"

In [17]:
%%writefile $pipeline_file

import kfp
from kfp import dsl

@dsl.pipeline(name='First Pipeline', description='Applies Random forest for classification problem.')
def first_pipeline():

    # Loads the yaml manifest for each component
    download = kfp.components.load_component_from_file('download_data/download_data.yaml')
    model_training = kfp.components.load_component_from_file('model_training/model_training.yaml')
    print_result = kfp.components.load_component_from_file("print_result/print_result.yaml")
    
    # Run download_data task
    download_task = download()

    # Run task "model_training" given
    # the output generated by "download_task".
    model_training_task = model_training(download_task.output)

    # Given the output from "model_training"
    # the component "print_result" is called to print the results.
    print_result(model_training_task.output)



if __name__ == '__main__':
    kfp.compiler.Compiler().compile(first_pipeline, 'Pipeline.yaml')
    kfp.Client().create_run_from_pipeline_func(first_pipeline, arguments={})

Overwriting pipeline.py


In [18]:
%run $pipeline_file