In [1]:
#installing pip
!python -m pip install --user --upgrade pip --quiet

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
#installing kfp
!pip install kfp --upgrade --user --quiet

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [1]:
%%bash
# Print the current working directory of this shell cell.
pwd

/home/jovyan/maven_breast_cancer


In [2]:
%%bash
# Create one sub-directory per pipeline component.
# `-p` keeps the cell re-runnable: plain `mkdir` exits with an error
# when the directory already exists.
mkdir -p download_data logistic_regression

In [18]:
# Destinations of the three files that make up the "download data" component:
# the Python entry point, its Dockerfile and its component manifest.
download_data_file, docker_data_file, yaml_data_file = (
    "./download_data/download_data.py",
    "./download_data/Dockerfile",
    "./download_data/download_data.yaml",
)

In [19]:
%%writefile $download_data_file
import json

import argparse
from pathlib import Path

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

def download_data(args):
    """Load the breast-cancer dataset, split it and write it to `args.data`.

    Args:
        args: object exposing a `data` attribute, the path of the output file.

    The split keeps 20% of the samples for testing; no `random_state` is
    passed to `train_test_split`.
    """
    features, labels = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

    # Arrays are turned into plain lists so they can be serialized as JSON.
    # The key order below is the order in which the keys appear in the file.
    named_splits = (
        ('x_train', x_train),
        ('y_train', y_train),
        ('x_test', x_test),
        ('y_test', y_test),
    )
    payload = {name: values.tolist() for name, values in named_splits}

    # The payload is JSON-encoded twice: the file holds a single JSON string
    # whose content is itself the JSON document, so readers must decode twice.
    encoded_payload = json.dumps(payload)
    with open(args.data, 'w') as out_file:
        out_file.write(json.dumps(encoded_payload))

if __name__ == '__main__':

    # This component does not receive any input artifact;
    # it only outputs one artifact, the file located at `--data`.
    parser = argparse.ArgumentParser()
    # `--data` is mandatory: without it `args.data` would be None and the
    # script would fail later with an unhelpful TypeError in `Path(None)`.
    parser.add_argument('--data', type=str, required=True)

    args = parser.parse_args()

    # Make sure the directory that will hold the output file exists
    # (it may or may not have been created already).
    output_dir = Path(args.data).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    download_data(args)
    

Overwriting ./download_data/download_data.py


In [20]:
%%writefile $docker_data_file

# Image for the "download data" pipeline component.
FROM python:3.9-slim-buster
WORKDIR /maven
# The PyPI distribution is named `scikit-learn`; `sklearn` is only a
# deprecated placeholder package and must not be used with pip.
# `--no-cache-dir` keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir scikit-learn
COPY download_data.py /maven

Writing ./download_data/Dockerfile


In [21]:
%%writefile $yaml_data_file
# Component manifest for the "download data" step.
name: Download Data Function
description: Download toy data from sklearn datasets

# The component takes no inputs and produces a single output artifact.
outputs:
  - name: Data
    type: LocalPath
    description: 'Path where data will be stored.'

implementation:
  container:
    image: mavencodev/breast_cancer_download_data:1.0
    # The path chosen for the `Data` output is passed as the value of --data.
    command:
      - python
      - download_data.py
      - --data
      - {outputPath: Data}

Writing ./download_data/download_data.yaml


In [22]:
# Destinations of the three files that make up the "logistic regression"
# component: the Python entry point, its Dockerfile and its component manifest.
logistic_regression_file, docker_log_file, yaml_log_file = (
    "./logistic_regression/logistic_regression.py",
    "./logistic_regression/Dockerfile",
    "./logistic_regression/logistic_regression.yaml",
)

In [23]:
%%writefile $logistic_regression_file
import json

import argparse
from pathlib import Path

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

def logistic_regression(args):
    """Train a logistic-regression classifier and write its test accuracy.

    Args:
        args: object exposing two attributes:
            `data`     - path of the JSON file holding the keys
                         'x_train', 'y_train', 'x_test' and 'y_test';
            `accuracy` - path of the file the accuracy is written to.
    """
    # Open and read the file "data"
    with open(args.data) as data_file:
        data = json.load(data_file)

    # The expected data type is 'dict'. The file may have been JSON-encoded
    # twice (a JSON string wrapping the actual document); in that case the
    # first load yields a string that has to be decoded once more. A file
    # that was encoded only once is already a dict and is used as is.
    if isinstance(data, str):
        data = json.loads(data)

    x_train = data['x_train']
    y_train = data['y_train']
    x_test = data['x_test']
    y_test = data['y_test']

    # Initialize and train the model
    # NOTE(review): default solver settings on unscaled features may emit a
    # convergence warning - confirm whether scaling or more iterations is wanted.
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Get predictions
    y_pred = model.predict(x_test)

    # Get accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Save the metric as plain text into the output file
    with open(args.accuracy, 'w') as accuracy_file:
        accuracy_file.write(str(accuracy))



if __name__ == '__main__':

    # Defining and parsing the command-line arguments.
    # Both paths are mandatory: without them the script would fail later
    # with an unhelpful TypeError (`Path(None)` / `open(None)`).
    parser = argparse.ArgumentParser(description='My program description')
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--accuracy', type=str, required=True)

    args = parser.parse_args()

    # Make sure the directory that will hold the output file exists
    # (it may or may not have been created already).
    accuracy_dir = Path(args.accuracy).parent
    accuracy_dir.mkdir(parents=True, exist_ok=True)

    logistic_regression(args)

Writing ./logistic_regression/logistic_regression.py


In [24]:
%%writefile $docker_log_file

# Image for the "logistic regression" pipeline component.
FROM python:3.9-slim-buster
WORKDIR /maven
# The PyPI distribution is named `scikit-learn`; `sklearn` is only a
# deprecated placeholder package and must not be used with pip.
# `--no-cache-dir` keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir scikit-learn
COPY logistic_regression.py /maven

Writing ./logistic_regression/Dockerfile


In [25]:
%%writefile $yaml_log_file
# Component manifest for the "logistic regression" step.
name: Logistic Regression Classifier
description: Trains a Logistic Regression Classifier

inputs:
  - name: Data
    type: LocalPath
    description: 'Path where data is stored.'
outputs:
  - name: Accuracy
    type: Float
    description: 'Accuracy metric'

implementation:
  container:
    image: mavencodev/breast_cancer_logistic_regression:1.0
    # The `Data` input path is passed as --data and the path chosen for the
    # `Accuracy` output is passed as --accuracy.
    command:
      - python
      - logistic_regression.py
      - --data
      - {inputPath: Data}
      - --accuracy
      - {outputPath: Accuracy}

Writing ./logistic_regression/logistic_regression.yaml


In [26]:
# Destination of the pipeline definition script written by the next cell.
pipeline_file = "pipeline.py"

In [27]:
%%writefile $pipeline_file
import kfp
from kfp import dsl
from kfp.components import func_to_container_op

@func_to_container_op
def show_results(logistic_regression : float) -> None:
    # Final step of the pipeline: prints the accuracy value
    # it receives from the logistic regression component.
    report = f"Logistic regression (accuracy): {logistic_regression}"
    print(report)


@dsl.pipeline(name='First Pipeline', description='Applies Logistic Regression for classification problem.')
def first_pipeline():

    # Build one component factory per yaml manifest.
    load_component = kfp.components.load_component_from_file
    download_op = load_component('download_data/download_data.yaml')
    logistic_regression_op = load_component('logistic_regression/logistic_regression.yaml')

    # Step 1: produce the dataset artifact.
    data_step = download_op()

    # Step 2: train the classifier on the output of step 1.
    training_step = logistic_regression_op(data_step.output)

    # Step 3: print the output produced by step 2.
    show_results(training_step.output)



if __name__ == '__main__':
    # Compile the pipeline function into 'FirstPipeline.yaml' ...
    pipeline_compiler = kfp.compiler.Compiler()
    pipeline_compiler.compile(first_pipeline, 'FirstPipeline.yaml')

    # ... then submit a run of the same pipeline function, passing no arguments.
    client = kfp.Client()
    client.create_run_from_pipeline_func(first_pipeline, arguments={})

Writing pipeline.py
