Install and import packages

In [5]:
import os

# The Vertex AI Workbench Notebook product has specific requirementss
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install --upgrade google-cloud-aiplatform {USER_FLAG} -q
! pip3 install -U google-cloud-storage {USER_FLAG} -q
! pip3 install {USER_FLAG} kfp google-cloud-pipeline-components --upgrade -q

You should consider upgrading via the 'c:\python37\python.exe -m pip install --upgrade pip' command.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfp 1.8.14 requires google-cloud-storage<2,>=1.20.0, but you have google-cloud-storage 2.5.0 which is incompatible.
google-cloud-pipeline-components 1.0.25 requires google-cloud-storage<2,>=1.20.0, but you have google-cloud-storage 2.5.0 which is incompatible.
You should consider upgrading via the 'c:\python37\python.exe -m pip install --upgrade pip' command.
You should consider upgrading via the 'c:\python37\python.exe -m pip install --upgrade pip' command.


In [6]:
import kfp
from kfp.v2 import dsl
from kfp.v2.dsl import component
from kfp.v2.dsl import (
    Input,
    Output,
    Artifact,
    Dataset,
)

Project and pipeline configuration

In [15]:
# The Google Cloud project that this pipeline runs in.
project_id = "de-2022-ng"
# The region that this pipeline runs in.
region = "us-west1"
# Cloud Storage URI that the pipeline's service account can access. The artifacts of pipeline runs are stored within this pipeline root.
pipeline_root_path = "gs://data_de2022_ng"

Pipeline Component: Data Ingestion

In [16]:

def download_data(project_id: str, bucket: str, file_name: str):
    """Download a CSV object from Cloud Storage and load it with pandas.

    Args:
        project_id: Google Cloud project used to create the storage client.
        bucket: Name of the Cloud Storage bucket that holds the object.
        file_name: Name of the CSV object inside the bucket.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` on the downloaded file.
    """
    # Imports stay inside the function so that it remains self-contained when
    # it is wrapped with kfp.components.create_component_from_func.
    import logging
    import os
    import sys

    import pandas as pd
    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Download file from google bucket
    client = storage.Client(project=project_id)
    source_bucket = client.get_bucket(bucket)
    blob = source_bucket.blob(file_name)
    # Object names may carry '/'-separated prefixes; keep only the last
    # segment so the download never targets a missing sub-directory of /tmp.
    local_path = '/tmp/' + os.path.basename(file_name)
    blob.download_to_filename(local_path)
    logging.info('Downloaded Data!')

    # Create dataframe from downloaded data. The former ``squeeze=True``
    # argument is gone: it was deprecated in pandas 1.4 and removed in 2.0,
    # and it could turn a single-column file into a Series.
    data = pd.read_csv(local_path, index_col=None)
    logging.info('Downloaded data and created DF')
    return data

In [17]:
# Wrap download_data as the KFP data-ingestion component; the component
# definition is also written to data_ingestion.yaml.
data_ingestion_comp = kfp.components.create_component_from_func(
    download_data,
    output_component_file='data_ingestion.yaml',
    packages_to_install=['google-cloud-storage', 'pandas'],
)

Pipeline Component: Train RandomForestRegressor

In [18]:
from pandas import DataFrame
from typing import Dict

def train_rfr(data: DataFrame, project_id: str, model_repo: str) -> Dict:
    """Train a RandomForestRegressor on ``data`` and upload it to Cloud Storage.

    The ``MEDV`` column is used as the target and every other column as a
    feature. The fitted model is pickled to ``/tmp/local_rfr_model.pkl`` and
    uploaded to the ``model_repo`` bucket as ``rfr_model.pkl``.

    Args:
        data: Training frame that contains a ``MEDV`` column.
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the bucket that receives the pickled model.

    Returns:
        ``{"r2": [...]}`` with the per-fold R^2 scores of a 5-fold
        cross-validation, as plain floats so the dict is JSON-serialisable.

    NOTE(review): the ``DataFrame`` annotation on ``data`` is presumably what
    makes create_component_from_func fail with the recorded
    ``to_dict() missing 1 required positional argument`` error - confirm.
    Passing the data by file path would change this function's interface, so
    it is left untouched here.
    """
    # Imports stay inside the function so that it remains self-contained when
    # it is wrapped with kfp.components.create_component_from_func.
    import logging
    import sys

    import joblib
    from google.cloud import storage
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import cross_val_score

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    X = data.drop(['MEDV'], axis=1)
    y = data['MEDV']

    model = RandomForestRegressor()
    model.fit(X, y)

    # Evaluate the model. 'accuracy' is a classification metric and is not
    # defined for a regressor's continuous predictions, so only R^2 is scored.
    r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    # Convert the numpy array to a list of floats so the result can be
    # serialised as JSON.
    metrics = {
        "r2": r2.tolist(),
    }
    # Lazy %-formatting: concatenating a str with a dict raises TypeError.
    logging.info("RFR metrics: %s", metrics)

    # Save model locally
    local_file = '/tmp/local_rfr_model.pkl'
    joblib.dump(model, local_file)

    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('rfr_model.pkl')
    # Upload the locally saved model
    blob.upload_from_filename(local_file)

    print("Saved the model to GCP bucket : " + model_repo)
    return metrics



In [19]:
# Wrap train_rfr as the KFP training component for the random forest model;
# the component definition is also written to training_rfr.yaml.
# NOTE(review): the recorded output of this cell is a TypeError from
# to_dict(); presumably caused by the DataFrame annotation on train_rfr -
# confirm before relying on this component.
train_rfr_com = kfp.components.create_component_from_func(
    train_rfr,
    output_component_file='training_rfr.yaml',
    packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'],
)

TypeError: to_dict() missing 1 required positional argument: 'self'

Pipeline Component: Train LogisticRegression



In [None]:
from pandas import DataFrame
from typing import Dict

def train_lr(data: DataFrame, project_id: str, model_repo: str) -> Dict:
    """Train a LogisticRegression with default parameters and upload it.

    The ``MEDV`` column is used as the target and every other column as a
    feature. The fitted model is pickled to ``/tmp/local_lr_model.pkl`` and
    uploaded to the ``model_repo`` bucket as ``lr_model.pkl``.

    Args:
        data: Training frame that contains a ``MEDV`` column.
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the bucket that receives the pickled model.

    Returns:
        ``{"accuracy": [...], "r2": [...]}`` with the per-fold scores of a
        5-fold cross-validation, as plain floats so the dict is
        JSON-serialisable.

    NOTE(review): LogisticRegression is a classifier. If ``MEDV`` is a
    continuous target (the regressor trained on the same column suggests so),
    ``fit`` will reject it - confirm the intended model/target.
    """
    # Imports stay inside the function so that it remains self-contained when
    # it is wrapped with kfp.components.create_component_from_func.
    import logging
    import sys

    import joblib
    from google.cloud import storage
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    X = data.drop(['MEDV'], axis=1)
    y = data['MEDV']

    model = LogisticRegression()
    model.fit(X, y)

    # Evaluate the model
    accuracy = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    # Convert the numpy arrays to lists of floats so the result can be
    # serialised as JSON; the key is "accuracy" (no trailing colon) to match
    # the style of "r2".
    metrics = {
        "accuracy": accuracy.tolist(),
        "r2": r2.tolist(),
    }
    # Lazy %-formatting: concatenating a str with a dict raises TypeError.
    logging.info("LR metrics: %s", metrics)

    # Save the model locally
    local_file = '/tmp/local_lr_model.pkl'
    joblib.dump(model, local_file)

    # Upload the locally saved model to the bucket as lr_model.pkl
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('lr_model.pkl')
    blob.upload_from_filename(local_file)

    print("Saved the model to GCP bucket : " + model_repo)
    return metrics

In [None]:
# Wrap train_lr as the KFP training component for the logistic regression
# model; the component definition is also written to train_lr_model.yaml.
trail_lr_com = kfp.components.create_component_from_func(
    train_lr,
    output_component_file='train_lr_model.yaml',
    packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'],
)

Pipeline Component: Prediction RFR

In [None]:
from pandas import DataFrame

def predict_rfr(project_id: str, model_repo: str, test_data: DataFrame) -> Dict:
    """Predict with the random forest model stored in Cloud Storage.

    Downloads ``rfr_model.pkl`` from the ``model_repo`` bucket to
    ``/tmp/local_rfr_model.pkl``, loads it with joblib and calls ``predict``
    on ``test_data`` without its ``MEDV`` column.

    Args:
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the bucket that holds ``rfr_model.pkl``.
        test_data: Frame of feature columns; a ``MEDV`` column, if present,
            is dropped before predicting.

    Returns:
        ``{"predictions": [...]}`` - the model output converted to a plain
        list so the value matches the declared ``Dict`` return type and is
        JSON-serialisable.
    """
    # Imports stay inside the function so that it remains self-contained when
    # it is wrapped with kfp.components.create_component_from_func.
    import logging
    import sys

    import joblib
    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Extract independent variables. Data to predict on may not carry the
    # target column, so a missing 'MEDV' is not an error.
    X = test_data.drop(['MEDV'], axis=1, errors='ignore')

    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('rfr_model.pkl')
    blob.download_to_filename('/tmp/local_rfr_model.pkl')

    # Load RandomForestRegressor model.
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code - only point model_repo at a bucket you trust.
    model = joblib.load('/tmp/local_rfr_model.pkl')

    pred = model.predict(X)

    logging.info(pred)
    return {"predictions": pred.tolist()}

In [None]:
# Wrap predict_rfr as the KFP prediction component for the random forest
# model; the component definition is also written to prediction_rfr_com.yaml.
prediction_rfr_com = kfp.components.create_component_from_func(
    predict_rfr,
    output_component_file='prediction_rfr_com.yaml',
    packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'],
)

Pipeline Component: Prediction LR

In [None]:
from pandas import DataFrame

def predict_lr(project_id: str, model_repo: str, test_data: DataFrame) -> Dict:
    """Predict with the logistic regression model stored in Cloud Storage.

    Downloads ``lr_model.pkl`` from the ``model_repo`` bucket to
    ``/tmp/local_lr_model.pkl``, loads it with joblib and calls ``predict``
    on ``test_data`` without its ``MEDV`` column.

    Args:
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the bucket that holds ``lr_model.pkl``.
        test_data: Frame of feature columns; a ``MEDV`` column, if present,
            is dropped before predicting.

    Returns:
        ``{"predictions": [...]}`` - the model output converted to a plain
        list so the value matches the declared ``Dict`` return type and is
        JSON-serialisable.
    """
    # Imports stay inside the function so that it remains self-contained when
    # it is wrapped with kfp.components.create_component_from_func.
    import logging
    import sys

    import joblib
    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Extract independent variables. Data to predict on may not carry the
    # target column, so a missing 'MEDV' is not an error.
    X = test_data.drop(['MEDV'], axis=1, errors='ignore')

    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('lr_model.pkl')
    blob.download_to_filename('/tmp/local_lr_model.pkl')

    # Load LogisticRegression model.
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code - only point model_repo at a bucket you trust.
    model = joblib.load('/tmp/local_lr_model.pkl')

    pred = model.predict(X)

    logging.info(pred)
    return {"predictions": pred.tolist()}

In [None]:
# Wrap predict_lr as the KFP prediction component for the logistic regression
# model; the component definition is also written to prediction_lr_com.yaml.
prediction_lr_com = kfp.components.create_component_from_func(
    predict_lr,
    output_component_file='prediction_lr_com.yaml',
    packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'],
)