In [1]:
#!python -m pip install --user --upgrade pip



In [2]:
#!pip3 install kfp --upgrade --user

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Collecting kfp
  Downloading kfp-1.4.0.tar.gz (159 kB)
[K     |████████████████████████████████| 159 kB 6.7 MB/s eta 0:00:01
Collecting fire>=0.3.1
  Downloading fire-0.4.0.tar.gz (87 kB)
[K     |████████████████████████████████| 87 kB 6.0 MB/s  eta 0:00:01
Building wheels for collected packages: kfp, fire
  Building wheel for kfp (setup.py) ... [?25ldone
[?25h  Created wheel for kfp: filename=kfp-1.4.0-py3-none-any.whl size=222156 sha256=de633a2d7bba9725d1d6ef5c0215a21cd9380aa6313e9c3e8b33a140277a379d
  Stored in directory: /home/jovyan/.cache/pip/wheels/88/63/63/f727a62aaba1e0fe13fe549e1b7538e9b8a2bc43dcae8138c8
  Building wheel for fire (setup.py) ... [?25ldone
[?25h  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=117174 sha256=21db6e5c95672c82b6e50037dbc89c4560a5d80a67968

In [1]:
import kfp
from kfp import dsl

## Using python function to create lightweight component

### Defining the python function with all its dependencies installed and imported within it

In [2]:
def preprocess(data_path,train_data,test_data):
    import pickle
    # import Library
    import sys, subprocess;
    subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn==0.22'])
    subprocess.run([sys.executable, '-m', 'pip', 'install','pandas==0.23.4'])
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler 

    #importing the data
    data = pd.read_csv("https://raw.githubusercontent.com/MavenCode/KubeflowTraining/master/Data/Churn_Modelling.csv")

    #dropping some columns that are not needed
    data = data.drop(columns=['RowNumber','CustomerId','Surname'], axis=1)
    #data features
    X = data.iloc[:,:-1]
    #target data
    y = data.iloc[:,-1:]   
    #encoding the categorical columns
    le = LabelEncoder()
    ohe = OneHotEncoder()
    X['Gender'] = le.fit_transform(X['Gender'])
    geo_df = pd.DataFrame(ohe.fit_transform(X[['Geography']]).toarray())

    #getting feature name after onehotencoding
    geo_df.columns = ohe.get_feature_names(['Geography'])

    #merging geo_df with the main data
    X = X.join(geo_df) 
    #dropping the old columns after encoding
    X.drop(columns=['Geography'], axis=1, inplace=True)

    #splitting the data 
    X_train,X_test,y_train,y_test = train_test_split( X,y, test_size=0.2, random_state = 42)
    #feature scaling
    sc =StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    #saving the values from the dataframe
    y_train = y_train.values
    y_test = y_test.values
    
    #Save the train_data as a pickle file to be used by the train component.
    with open(f'{data_path}/{train_data}', 'wb') as f:
        pickle.dump((X_train,  y_train), f)
        
    #Save the test_data as a pickle file to be used by the predict component.
    with open(f'{data_path}/{test_data}', 'wb') as f:
        pickle.dump((X_test,  y_test), f)
    
    return(print('Done!'))

### converting the python function to a component 

In [3]:
preprocess_op = kfp.components.create_component_from_func(preprocess,base_image="python:3.7.1")

Now, we have the preprocess_op as a kubeflow pipeline component

## Using python function to create reusable component

#### Note that the python script and docker image below, are not executed on this notebook but using your desired choice of text editor. VSCode was used to execute these ones

#### python script. Named "preprocess.py"

In [5]:
#importing libraries
def preprocess():
    #importing libraries
    import joblib
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler 
    
    #importing the data
    data = pd.read_csv("https://raw.githubusercontent.com/MavenCode/KubeflowTraining/master/Data/Churn_Modelling.csv")
    #dropping some columns that are not needed
    data = data.drop(columns=['RowNumber','CustomerId','Surname'], axis=1)
    #data features
    X = data.iloc[:,:-1]
    #target data
    y = data.iloc[:,-1:]   
    #encoding the categorical columns
    le = LabelEncoder()
    ohe = OneHotEncoder()
    X['Gender'] = le.fit_transform(X['Gender'])
    geo_df = pd.DataFrame(ohe.fit_transform(X[['Geography']]).toarray())

    #getting feature name after onehotencoding
    geo_df.columns = ohe.get_feature_names(['Geography'])

    #merging geo_df with the main data
    X = X.join(geo_df) 
    #dropping the old columns after encoding
    X.drop(columns=['Geography'], axis=1, inplace=True)

    #splitting the data 
    X_train,X_test,y_train,y_test = train_test_split( X,y, test_size=0.2, random_state = 42)
    #feature scaling
    sc =StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    #saving output file to path
    np.save('X_train.npy', X_train)
    np.save('X_test.npy', X_test)
    np.save('y_train.npy', y_train)
    np.save('y_test.npy', y_test)

#defining and parsing arguments
if __name__ == '__main__':
    preprocess()


#### Dockerfile
#### This Docker image was pushed to dockerhub. Steps to push to docker hub was explained in the slides and labs

In [None]:
FROM python:3.7.1
WORKDIR /preprocess_data
RUN pip install --upgrade pip \
&& pip install -U scikit-learn numpy pandas
COPY preprocess.py /preprocess_data
ENTRYPOINT ["python", "preprocess.py"]

##### writing the component python function using the Kubeflow Pipelines DSL to define your pipeline’s interactions with the component’s Docker container

In [None]:
def preprocess_op(data):
    return dsl.ContainerOp(
        name = 'Preprocess Data',
        image = 'mavencodev/preprocess-component:v.0.1',
        arguments = ['--data', data],
        file_outputs={
            'X_train': '/preprocess_data/X_train.npy',
            'X_test': '/preprocess_data/X_test.npy',
            'y_train': '/preprocess_data/y_train.npy',
            'y_test': '/preprocess_data/y_test.npy'     
        }
    )

### Using YAML file to create component. To create a yaml file from the python function, a few things are added

In [7]:
def preprocess(data_path,train_data,test_data):
    import pickle
    # import Library
    import sys, subprocess;
    subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn==0.22'])
    subprocess.run([sys.executable, '-m', 'pip', 'install','pandas==0.23.4'])
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler 

    #importing the data
    data = pd.read_csv("https://raw.githubusercontent.com/MavenCode/KubeflowTraining/master/Data/Churn_Modelling.csv")

    #dropping some columns that are not needed
    data = data.drop(columns=['RowNumber','CustomerId','Surname'], axis=1)
    #data features
    X = data.iloc[:,:-1]
    #target data
    y = data.iloc[:,-1:]   
    #encoding the categorical columns
    le = LabelEncoder()
    ohe = OneHotEncoder()
    X['Gender'] = le.fit_transform(X['Gender'])
    geo_df = pd.DataFrame(ohe.fit_transform(X[['Geography']]).toarray())

    #getting feature name after onehotencoding
    geo_df.columns = ohe.get_feature_names(['Geography'])

    #merging geo_df with the main data
    X = X.join(geo_df) 
    #dropping the old columns after encoding
    X.drop(columns=['Geography'], axis=1, inplace=True)

    #splitting the data 
    X_train,X_test,y_train,y_test = train_test_split( X,y, test_size=0.2, random_state = 42)
    #feature scaling
    sc =StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    #saving the values from the dataframe
    y_train = y_train.values
    y_test = y_test.values
    
    #Save the train_data as a pickle file to be used by the train component.
    with open(f'{data_path}/{train_data}', 'wb') as f:
        pickle.dump((X_train,  y_train), f)
        
    #Save the test_data as a pickle file to be used by the predict component.
    with open(f'{data_path}/{test_data}', 'wb') as f:
        pickle.dump((X_test,  y_test), f)
    
    return(print('Done!'))

#exporting function to YAML file
if __name__ == "__main__":
    kfp.components.create_component_from_func(
    preprocess, #function name
    output_component_file="preprocess-component.yaml") ,
    base_image="python:3.7.1",
    packages_to_install = ["pandas==0.23.4", "scikit-learn==0.22"]

The YAML file is created in the working directory of the notebook except if it is specified otherwise

#### a copy of the YAML file  shown below

In [None]:
name: Preprocess
inputs:
- {name: data_path}
- {name: train_data}
- {name: test_data}
implementation:
  container:
    image: python:3.7
    command:
    - sh
    - -ec
    - |
      program_path=$(mktemp)
      printf "%s" "$0" > "$program_path"
      python3 -u "$program_path" "$@"
    - "def preprocess(data_path,train_data,test_data):\n    import pickle\n    # import\
      \ Library\n    import sys, subprocess;\n    subprocess.run([sys.executable,\
      \ '-m', 'pip', 'install','scikit-learn==0.22'])\n    subprocess.run([sys.executable,\
      \ '-m', 'pip', 'install','pandas==0.23.4'])\n    import pandas as pd\n    import\
      \ numpy as np\n    from sklearn.preprocessing import LabelEncoder\n    from\
      \ sklearn.preprocessing import OneHotEncoder\n    from sklearn.model_selection\
      \ import train_test_split\n    from sklearn.preprocessing import StandardScaler\
      \ \n\n    #importing the data\n    data = pd.read_csv(\"https://raw.githubusercontent.com/MavenCode/KubeflowTraining/master/Data/Churn_Modelling.csv\"\
      )\n\n    #dropping some columns that are not needed\n    data = data.drop(columns=['RowNumber','CustomerId','Surname'],\
      \ axis=1)\n    #data features\n    X = data.iloc[:,:-1]\n    #target data\n\
      \    y = data.iloc[:,-1:]   \n    #encoding the categorical columns\n    le\
      \ = LabelEncoder()\n    ohe = OneHotEncoder()\n    X['Gender'] = le.fit_transform(X['Gender'])\n\
      \    geo_df = pd.DataFrame(ohe.fit_transform(X[['Geography']]).toarray())\n\n\
      \    #getting feature name after onehotencoding\n    geo_df.columns = ohe.get_feature_names(['Geography'])\n\
      \n    #merging geo_df with the main data\n    X = X.join(geo_df) \n    #dropping\
      \ the old columns after encoding\n    X.drop(columns=['Geography'], axis=1,\
      \ inplace=True)\n\n    #splitting the data \n    X_train,X_test,y_train,y_test\
      \ = train_test_split( X,y, test_size=0.2, random_state = 42)\n    #feature scaling\n\
      \    sc =StandardScaler()\n    X_train = sc.fit_transform(X_train)\n    X_test\
      \ = sc.transform(X_test)\n    #saving the values from the dataframe\n    y_train\
      \ = y_train.values\n    y_test = y_test.values\n\n    #Save the train_data as\
      \ a pickle file to be used by the train component.\n    with open(f'{data_path}/{train_data}',\
      \ 'wb') as f:\n        pickle.dump((X_train,  y_train), f)\n\n    #Save the\
      \ test_data as a pickle file to be used by the predict component.\n    with\
      \ open(f'{data_path}/{test_data}', 'wb') as f:\n        pickle.dump((X_test,\
      \  y_test), f)\n\n    return(print('Done!'))\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Preprocess',\
      \ description='')\n_parser.add_argument(\"--data-path\", dest=\"data_path\"\
      , type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\
      --train-data\", dest=\"train_data\", type=str, required=True, default=argparse.SUPPRESS)\n\
      _parser.add_argument(\"--test-data\", dest=\"test_data\", type=str, required=True,\
      \ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n\
      _outputs = preprocess(**_parsed_args)\n"
    args:
    - --data-path
    - {inputValue: data_path}
    - --train-data
    - {inputValue: train_data}
    - --test-data
    - {inputValue: test_data}


#### loading the yaml file from the preprocess function.
##### kfp.components.load_component_from_file or kfp.components.load_component_from_url  can be used to load the yaml file

In [9]:
kfp.components.load_component_from_file("preprocess-component.yaml")

<function Preprocess(data_path, train_data, test_data)>

### The different ways of creating pipeline components have been discussed above