In [1]:
#!python -m pip install --user --upgrade pip

Collecting pip
  Downloading https://files.pythonhosted.org/packages/54/eb/4a3642e971f404d69d4f6fa3885559d67562801b99d7592487f1ecc4e017/pip-20.3.3-py2.py3-none-any.whl (1.5MB)
     |████████████████████████████████| 1.5MB 7.9MB/s eta 0:00:01
Installing collected packages: pip
Successfully installed pip-20.3.3
You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
#!pip3 install kfp --upgrade --user

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Collecting kfp
  Downloading kfp-1.3.0.tar.gz (170 kB)
     |████████████████████████████████| 170 kB 4.9 MB/s eta 0:00:01
Collecting docstring-parser>=0.7.3
  Downloading docstring_parser-0.7.3.tar.gz (13 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
    Preparing wheel metadata ... done
Collecting kfp-pipeline-spec<0.2.0,>=0.1.0
  Downloading kfp_pipeline_spec-0.1.4-py3-none-any.whl (25 kB)
Collecting kfp-server-api<2.0.0,>=1.1.2
  Downloading kfp-server-api-1.3.0.tar.gz (54 kB)
     |████████████████████████████████| 54 kB 4.3 MB/s  eta 0:00:01
Collecting requests_toolbelt>=0.8.0
  Downloading requests_toolbelt-0.9.1-py2.py3-none-any.whl (54 kB)
     |████████████████████████████████| 54 kB 3.2 MB/s  eta 0:00:01

In [1]:
import os

import kfp
import kfp.components as comp
from kfp import dsl

In [2]:
pwd

'/home/jovyan'

In [3]:
# Directory that receives the files produced when the steps are run locally.
output_dir = "/home/jovyan/trial/"
# Actually create it: the functions below open files inside this directory
# and fail with FileNotFoundError if it is missing.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# create data-download function

def get_data(data_path):
    """Download the Churn_Modelling CSV and pickle it into ``data_path``.

    This function is later wrapped with ``comp.func_to_container_op``, so all
    imports are kept inside the body to keep the function self-contained.

    Args:
        data_path: Directory that receives the ``clean_data`` pickle file.
            It is created if it does not exist yet.

    Returns:
        None. Prints ``Done!`` once the file has been written.
    """
    import os
    import pickle
    import subprocess
    import sys

    # Install the pinned pandas version at run time. The return code is
    # deliberately not checked, so a failed install falls through to whatever
    # pandas is already importable.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    import pandas as pd

    # Fetch the raw data set.
    data = pd.read_csv("https://raw.githubusercontent.com/AdeloreSimiloluwa/Artificial-Neural-Network/master/data/Churn_Modelling.csv")

    # Make sure the target directory exists before opening a file inside it.
    os.makedirs(data_path, exist_ok=True)

    # Serialize the DataFrame so the next step can load it from the same directory.
    with open(os.path.join(data_path, 'clean_data'), 'wb') as f:
        pickle.dump(data, f)

    print('Done!')


In [5]:
# Run the download step locally; it writes the `clean_data` pickle under output_dir.
get_data(output_dir)

Done!


In [10]:
# create preprocessing function

def preprocess(data_path):
    """Encode, split and scale the pickled data set, then dump the test split.

    Reads the ``clean_data`` pickle from ``data_path``, drops the identifier
    columns, label-encodes ``Gender``, one-hot encodes ``Geography``, performs
    an 80/20 train/test split (``random_state=42``), standardizes the features
    and writes the scaled test features together with the test targets to
    ``results.txt`` in the same directory.

    This function is later wrapped with ``comp.func_to_container_op``, so all
    imports are kept inside the body to keep the function self-contained.

    Args:
        data_path: Directory containing ``clean_data``; ``results.txt`` is
            written here as well.

    Returns:
        None. Prints ``Done!`` once ``results.txt`` has been written.
    """
    import os
    import pickle
    import subprocess
    import sys

    # Install the pinned library versions at run time. Return codes are not
    # checked, so a failed install falls through to whatever is importable.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

    # NOTE(security): pickle.load can execute arbitrary code. This is only
    # acceptable because the file is expected to come from get_data; do not
    # point data_path at untrusted content.
    with open(os.path.join(data_path, 'clean_data'), 'rb') as f:
        data = pickle.load(f)

    # Drop identifier columns that carry no predictive signal.
    data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

    # Features are every column but the last; the last column is the target.
    # Copy the slice so the column assignment below does not write into a view
    # of `data` (avoids pandas' SettingWithCopyWarning).
    X = data.iloc[:, :-1].copy()
    y = data.iloc[:, -1:]

    # Encode the categorical columns.
    X['Gender'] = LabelEncoder().fit_transform(X['Gender'])

    ohe = OneHotEncoder()
    # Reuse X's index so the join below aligns rows even if the index is not
    # a default RangeIndex.
    geo_df = pd.DataFrame(ohe.fit_transform(X[['Geography']]).toarray(), index=X.index)

    # Newer scikit-learn releases replaced get_feature_names with
    # get_feature_names_out; prefer the new name when it exists and fall back
    # to the old one for the pinned 0.22 release.
    feature_namer = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
    geo_df.columns = feature_namer(['Geography'])

    # Swap the raw Geography column for its one-hot encoded columns.
    X = X.join(geo_df).drop(columns=['Geography'])

    # Hold out 20% of the rows for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only, then apply it to the test split.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Write the scaled test features and the matching targets to results.txt
    # (no model is trained here, so these are not predictions).
    with open(os.path.join(data_path, 'results.txt'), 'w') as result:
        result.write(f'X_test: {X_test} | Actual {y_test}')

    print('Done!')

In [11]:
# Run the preprocessing step locally; it reads `clean_data` from output_dir and writes results.txt there.
preprocess(output_dir)

Done!


In [12]:
# Wrap both Python functions as lightweight pipeline components, in order.
# (A different image could be selected by passing
#  base_image="tensorflow/tensorflow:latest-gpu-py3" to func_to_container_op.)
get_data_op, preprocess_op = map(comp.func_to_container_op, (get_data, preprocess))


In [13]:
# Create the client used to communicate with the Kubeflow Pipelines API server.
# No host is passed here.
# NOTE(review): presumably this relies on the notebook running inside the cluster — confirm.
client = kfp.Client()

In [14]:
# define pipeline
@dsl.pipeline(
    name="Road Safety ML Pipeline",
    description="Performs Preprocessing, training and prediction",
)
def road_safety_pipeline(data_path: str):
    # data_path is both the mount point of the shared volume in every step
    # and the directory argument handed to the two components.

    # Volume (1Gi, ReadWriteOnce) used to pass files from one step to the next.
    volume_step = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Step 1: download the data set onto the shared volume.
    download_step = get_data_op(data_path).add_pvolumes(
        {data_path: volume_step.volume}
    )

    # Step 2: preprocess; mounting the previous step's volume orders it after the download.
    preprocess_step = preprocess_op(data_path).add_pvolumes(
        {data_path: download_step.pvolume}
    )

    # Step 3: print the results.txt file written by the preprocessing step.
    print_step = dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: preprocess_step.pvolume},
        arguments=['cat', f'{data_path}/results.txt'],
    )

In [15]:
# Value passed as the pipeline's data_path parameter (the shared-volume mount point).
DATA_PATH = '/home/jovyan/data/clean_data'

pipeline_func = road_safety_pipeline
experiment_name = 'road_safety_kubeflow'
run_name = f'{pipeline_func.__name__} run'
arguments = {"data_path": DATA_PATH}

# Compile the pipeline into a compressed definition named after the experiment.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit a run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)


