Install and import packages

In [157]:
import os

# The Vertex AI Workbench Notebook product has specific requirementss
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install --upgrade google-cloud-aiplatform {USER_FLAG} -q
! pip3 install -U google-cloud-storage {USER_FLAG} -q
! pip3 install {USER_FLAG} kfp google-cloud-pipeline-components --upgrade -q

You should consider upgrading via the 'c:\python37\python.exe -m pip install --upgrade pip' command.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfp 1.8.14 requires google-cloud-storage<2,>=1.20.0, but you have google-cloud-storage 2.5.0 which is incompatible.
google-cloud-pipeline-components 1.0.25 requires google-cloud-storage<2,>=1.20.0, but you have google-cloud-storage 2.5.0 which is incompatible.
You should consider upgrading via the 'c:\python37\python.exe -m pip install --upgrade pip' command.
You should consider upgrading via the 'c:\python37\python.exe -m pip install --upgrade pip' command.


In [1]:
import kfp
from kfp.v2 import dsl
from kfp.v2.dsl import component
from kfp.v2.dsl import (
    Input,
    Output,
    Artifact,
    Dataset,
)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2.components import importer_node
from google_cloud_pipeline_components.types import artifact_types

Project and pipeline configuration

In [2]:
#The Google Cloud project that this pipeline runs in.
project_id = "de-2022-ng"
# The region that this pipeline runs in
region = "us-west1"
# Specify a Cloud Storage URI that your pipelines service account can access. The artifacts of your pipeline runs are stored within the pipeline root.
pipeline_root_path = "gs://data_de2022_ng"

Pipeline Component: Data Ingestion

In [3]:
from typing import Dict

def download_data(project_id: str, bucket: str, file_name: str) -> Dict:
    from google.cloud import storage
    import pandas as pd
    import logging 
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Download file from google bucket
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(bucket)
    blob = bucket.blob(file_name)
    local_path = '/tmp/'+ file_name
    blob.download_to_filename(local_path)
    logging.info('Downloaded Data!')

    # Create dataframe from downloaded data
    data_dict = pd.read_csv(local_path, index_col=None, squeeze=True).to_dict()
    logging.info('Built dict')
    return data_dict

In [4]:
# create a KFP component for data ingestion
data_ingestion_comp = kfp.components.create_component_from_func(
    download_data, output_component_file='components/data_ingestion.yaml', packages_to_install=['google-cloud-storage', 'pandas'])

Pipeline Component: Train RandomForestRegressor

In [5]:
from typing import NamedTuple, Dict

def train_rfr(data: Dict, project_id: str, model_repo: str) -> Dict:
    import json
    import logging 
    import sys
    import os
    import joblib
    
    import pandas as pd
    from google.cloud import storage

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    data = pd.DataFrame.from_dict(data)  
    
    logging.info('Features:' + str(list(data.columns)))

    # Split dependent and independent variables
    X = data.drop(['MEDV'], axis=1)
    y = data['MEDV']

    # Split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, 
                                                        random_state=101)
    
    # Fit model
    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    # Predict on test data
    y_pred = model.predict(X_test)

    # Get r2 score
    metrics = {
        "r2": r2_score(y_pred, y_test)
    }
    logging.info("RFR r2:" + str(metrics['r2']))

    # Save model locally
    local_file = '/tmp/local_rfr_model.pkl'
    joblib.dump(model, local_file)

    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('rfr_model.pkl')
    # Upload the locally saved model
    blob.upload_from_filename(local_file)

    print("Saved the RFR model locally: " + model_repo)
    return metrics



In [6]:
# create a KFP component for training 
train_rfr_comp = kfp.components.create_component_from_func(
    train_rfr, output_component_file='components/train_rfr_model.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'])

Pipeline Component: Train LinearRegressor

In [7]:
from typing import Dict

def train_lr (data: Dict, project_id: str, model_repo: str) -> Dict:
    '''train a LinearRegression with default parameters'''
    import json
    import logging 
    import sys
    import os
    import joblib

    import pandas as pd
    from google.cloud import storage

    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score
        
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    
    data = pd.DataFrame.from_dict(data)  
    
    logging.info('Features:' + str(list(data.columns)))
    
    # Split dependent and independent variables
    X = data.drop(['MEDV'], axis=1)
    y = data['MEDV']
    
    # Split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, 
                                                        random_state=101)
    
    # Fit model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on test data
    y_pred = model.predict(X_test)

    # Get r2 score
    metrics = {
        "r2": r2_score(y_pred, y_test)
    }
    logging.info("LR r2:" + str(metrics['r2']))
   
    # Save the model localy
    local_file = '/tmp/local_lr_model.pkl'
    joblib.dump(model, local_file)
  
    print("Saved the LR model locally: " + model_repo)
    return metrics

In [8]:
# create a KFP component for training lr model
train_lr_comp = kfp.components.create_component_from_func(
    train_lr, output_component_file='components/train_lr_model.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'])

#### Pipeline Component: Current model metrics

In [9]:
from typing import Dict

def eval_current_model (data: Dict, project_id: str, model_repo: str) -> Dict:
    
    '''train a LinearRegression with default parameters'''
    import json
    import logging 
    import sys
    import os
    import joblib

    import pandas as pd
    from google.cloud import storage

    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score
        
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    
    data = pd.DataFrame.from_dict(data)  
    
    # Split dependent and independent variables
    X = data.drop(['MEDV'], axis=1)
    y = data['MEDV']
    
    # Split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, 
                                                        random_state=101)

    # Load current model
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('depl_model.pkl')
    filename = '/tmp/curr_model.pkl'
    blob.download_to_filename(filename)
        
    #Loading the saved model with joblib
    model = joblib.load(filename)

    # Predict on test data
    y_pred = model.predict(X_test)

    # Get r2 score
    metrics = {
        "r2": r2_score(y_pred, y_test)
    }
    logging.info("Current r2:" + str(metrics['r2']))


    return metrics

In [10]:
# create a KFP component for training lr model
eval_current_comp = kfp.components.create_component_from_func(
    eval_current_model, output_component_file='components/eval_current_model.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'])

#### Pipeline Component: Model Selection

In [11]:
def compare_model(rfr_metrics: Dict, lr_metrics: Dict, current_metrics: Dict) -> str:
    import logging
    import json
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.info(rfr_metrics)
    logging.info(lr_metrics)
    logging.info(current_metrics)
    

    # Add metrics to list
    metrics_dict = {
        "RFR": rfr_metrics['r2'], 
        "LR": lr_metrics['r2'], 
        "Curr": current_metrics['r2']
    }
    
    best_model = max(metrics_dict, key=metrics_dict.get)

    logging.info("Best model:", best_model)

    if best_model == "RFR":
        local_file = '/tmp/local_RFR_model.pkl'
    elif best_model == "LR":
        local_file = '/tmp/local_lr_model.pkl'
    
    # Save to GCS as lr_model.pkl
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('depl_model.pkl')
    # Upload the locally saved model
    blob.upload_from_filename(local_file)

In [12]:
# create a KFP component for selecting between RFR and LR
compare_model_comp = kfp.components.create_component_from_func(
    compare_model, output_component_file='components/model_selection_comp.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'])

#### Define Pipeline

In [13]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(
    name="house-pricing-prediction-pipeline",
    pipeline_root=pipeline_root_path)

def pipeline(project_id: str, data_bucket: str, dataset_filename: str, model_repo: str, testset_filename: str):
    
    di_op = data_ingestion_comp(
        project_id=project_id,
        bucket=data_bucket,
        file_name=dataset_filename
    )

 
    training_rfr_job_run_op = train_rfr_comp(
        project_id=project_id,
        model_repo=model_repo,       
        data=di_op.output
    )
    
     
    training_lr_job_run_op = train_lr_comp(
        project_id=project_id,
        model_repo=model_repo,       
        data=di_op.output
    )

    eval_current_model_job_run_op = eval_current_comp(
        project_id=project_id,
        model_repo=model_repo,       
        data=di_op.output
    )
    
    compare_model_op = compare_model_comp(
        training_rfr_job_run_op.output,
        training_lr_job_run_op.output,
        eval_current_model_job_run_op.output
    ).after(training_rfr_job_run_op, training_lr_job_run_op)  

    import_unmanaged_model_task = importer_node.importer(
            artifact_uri= "gs://model_repo_de2022_ng",
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                "containerSpec": {
                    "imageUri": "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest",  # see https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers  
                },
            },
        ).after(compare_model_op)

    model_upload_op = gcc_aip.ModelUploadOp(
            project=project_id,
            display_name="housing-prediction-model",
            unmanaged_container_model=import_unmanaged_model_task.outputs["artifact"],
        ).after(import_unmanaged_model_task)     
       

#### Compile Pipeline into JSON

In [14]:
from kfp.v2 import compiler
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='house_pricing_training_pipeline.json')



In [17]:
import google.cloud.aiplatform as aip

job = aip.PipelineJob(
    display_name="house-pricing",
    enable_caching=False,
    template_path="house_pricing_training_pipeline.json",
    pipeline_root=pipeline_root_path,
    parameter_values={
        'project_id': project_id, # makesure to use your project id 
        'data_bucket': 'test_data_de2022_ng',  # makesure to use your data bucket name 
        'dataset_filename': 'dataset.csv',     # makesure to upload these to your data bucket from DE2022/lab4/data
        'model_repo':'model_repo_de2022_ng' # makesure to use your model bucket name 
    }
)

job.run()

GoogleAuthError: Unable to find your project. Please provide a project ID by:
- Passing a constructor argument
- Using aiplatform.init()
- Setting a GCP environment variable

: 

: 

: 