### Installation
Install the packages required for executing this notebook.

## Parts of the source code are based on
https://towardsdatascience.com/how-to-set-up-custom-vertex-ai-pipelines-step-by-step-467487f81cad 

In [14]:
# Install the packages
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform



## Restart the kernel
Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [15]:
import os

# Restart the kernel so that the freshly installed packages become importable.
# The restart is skipped whenever the IS_TESTING environment variable is set
# to a non-empty value.
testing_flag = os.environ.get("IS_TESTING")
if not testing_flag:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

Check the versions of the packages you installed. The KFP SDK version should be 2.x, because the install cell requests `kfp>2`.

In [2]:
# Print the version of the installed KFP SDK.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# List the installed packages whose name contains "aiplatform" (name==version).
! pip3 freeze | grep aiplatform
# Print the version of the installed google_cloud_pipeline_components package.
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.3.0
google-cloud-aiplatform==1.34.0
google_cloud_pipeline_components version: 2.4.1


In [3]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,ModelDeployOp)
from google_cloud_pipeline_components.types import artifact_types

#### Project and Pipeline Configurations

In [18]:
# The Google Cloud project that this pipeline runs in.
PROJECT_ID = "your_project_id"
# The region that this pipeline runs in.
REGION = "us-central1"
# A Cloud Storage URI that your pipeline's service account can access.
# The artifacts of your pipeline runs are stored within this pipeline root.
PIPELINE_ROOT = "gs://your_temp_bucket"

#### Create Pipeline Components

We can create a component from Python functions (inline) and from a container. We will first try inline python functions. 
Refer to  https://www.kubeflow.org/docs/components/pipelines/v2/components/lightweight-python-components/ for more information.

#### Pipeline Component : Train and Test Split

In [5]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.10.7-slim"
)
def train_test_split(
    dataset: Input[Dataset],
    dataset_train: Output[Dataset],
    dataset_test: Output[Dataset],
    test_size: float = 0.3,
    random_state: int = 42,
):
    '''Split the input CSV dataset into a training and a test CSV file.

    Args:
        dataset: Input dataset artifact; the file at ``dataset.path`` is read as CSV.
        dataset_train: Output artifact; the training rows are written to
            ``dataset_train.path + ".csv"``.
        dataset_test: Output artifact; the test rows are written to
            ``dataset_test.path + ".csv"``.
        test_size: Fraction of the rows that goes into the test split.
        random_state: Seed of the shuffle, so that reruns yield the same split.
    '''
    import logging
    import sys

    import pandas as pd
    from sklearn.model_selection import train_test_split as tts

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    alldata = pd.read_csv(dataset.path, index_col=None)
    # A fixed seed keeps the split (and therefore the downstream metrics)
    # reproducible between pipeline runs.
    train, test = tts(alldata, test_size=test_size, random_state=random_state)
    logging.info(
        "Split %d rows into %d training rows and %d test rows",
        len(alldata), len(train), len(test),
    )

    # Downstream components read these files back by appending ".csv" to the
    # artifact path, so the suffix must stay in sync with them.
    train.to_csv(dataset_train.path + ".csv", index=False, encoding='utf-8-sig')
    test.to_csv(dataset_test.path + ".csv", index=False, encoding='utf-8-sig')

#### Pipeline Component : Training LogisticRegression

In [7]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.10.7-slim"
)
def train_lr(features: Input[Dataset], model: Output[Model]):
    '''Fit a LogisticRegression with default parameters and pickle it.

    Reads the training data from ``features.path + ".csv"``, uses the
    ``class`` column as the target and every other column as a predictor,
    and writes the fitted estimator to ``model.path + ".pkl"``.
    '''
    import pickle

    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    training_frame = pd.read_csv(features.path + ".csv")
    predictors = training_frame.drop('class', axis=1)
    target = training_frame['class']

    classifier = LogisticRegression()
    classifier.fit(predictors, target)

    # Record which kind of model this artifact holds.
    model.metadata["framework"] = "LR"

    with open(model.path + ".pkl", 'wb') as pickle_file:
        pickle.dump(classifier, pickle_file)


#### Pipeline Component : Model Evaluation

See https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/ for NamedTuple

In [9]:
@dsl.component(
    packages_to_install = [
       "pandas", "scikit-learn", "numpy"
    ], base_image="python:3.10.7-slim"
)
def lr_model_evaluation(
    test_set:  Input[Dataset],
    model_lr: Input[Model],
    thresholds_dict_str: str,
    metrics: Output[ClassificationMetrics],
    kpi: Output[Metrics]
) -> NamedTuple('outputs', approval=bool):
    '''Evaluate the pickled model on the test set and decide on approval.

    Logs the ROC curve and the confusion matrix to ``metrics``, logs the
    accuracy to ``kpi`` and to the model's metadata, and approves the model
    when the accuracy reaches the configured threshold.

    Args:
        test_set: Dataset artifact; ``test_set.path + ".csv"`` is read and its
            ``class`` column is used as the target.
        model_lr: Model artifact; ``model_lr.path + ".pkl"`` is unpickled.
        thresholds_dict_str: JSON object string. Its ``"roc"`` entry is the
            minimum accuracy required for approval (the key name is kept for
            compatibility with existing callers, although the compared metric
            is the accuracy).
        metrics: Receives the ROC curve and the confusion matrix.
        kpi: Receives the ``accuracy`` metric.

    Returns:
        A named tuple whose ``approval`` field is True when
        ``accuracy >= thresholds["roc"]``.
    '''
    import json
    import pickle
    from typing import NamedTuple

    import pandas as pd
    from numpy import nan_to_num
    from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score

    data = pd.read_csv(test_set.path + ".csv")
    features = data.drop(columns=["class"])
    target = data['class']

    # Load the pickled model. NOTE: unpickling executes arbitrary code, so this
    # must only ever be pointed at artifacts produced by a trusted step.
    with open(model_lr.path + ".pkl", 'rb') as model_file:
        model = pickle.load(model_file)

    y_pred = model.predict(features)
    # Probability of the positive class, used for the ROC curve.
    y_scores = model.predict_proba(features)[:, 1]

    fpr, tpr, thresholds = roc_curve(
        y_true=target, y_score=y_scores, pos_label=True
    )
    # Replace non-finite threshold values before logging them.
    thresholds = nan_to_num(thresholds)
    metrics.log_roc_curve(fpr, tpr, thresholds)

    metrics.log_confusion_matrix(
        ['Negative', 'Positive'],
        confusion_matrix(target, y_pred).tolist(),
    )

    accuracy = float(accuracy_score(target, y_pred.round()))
    thresholds_dict = json.loads(thresholds_dict_str)
    model_lr.metadata["accuracy"] = accuracy
    kpi.log_metric("accuracy", accuracy)

    # Compare as floats: converting the threshold with int() would truncate a
    # fractional value such as 0.8 to 0 and approve every model.
    approval_value = accuracy >= float(thresholds_dict['roc'])

    outputs = NamedTuple('outputs', approval=bool)
    return outputs(approval_value)

### Upload Model and Metrics to Google Bucket 

In [11]:
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model]):
    '''Upload the pickled model to a Cloud Storage bucket as ``model.pkl``.

    Args:
        project_id: Project used to create the storage client.
        model_repo: Name of the destination bucket.
        model: Model artifact; the file ``model.path + '.pkl'`` is uploaded.
    '''
    import logging
    import sys

    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    local_model_file = model.path + '.pkl'

    # The object name in the bucket is fixed, so each upload replaces the
    # previous 'model.pkl'.
    destination = storage.Client(project=project_id).bucket(model_repo).blob('model.pkl')
    destination.upload_from_filename(local_model_file)

    print(f"File {local_model_file} uploaded to {model_repo}.")

### Deploy the model to Vertex AI
We can use Google's pre-built Kubeflow components such as EndpointCreateOp, ModelUploadOp, and ModelDeployOp to deploy the model directly on Vertex AI.

***This is only for testing. In your assignment, please use custom serving applications and CI/CD pipelines to deploy models. We should be able to deploy a given model to any given production environment, and CI/CD pipelines are the best solution for that.***

<img src="imgs/EnterpriseMlOps_Model_Deployment.png">

source: https://cloud.redhat.com/blog/enterprise-mlops-reference-design

https://cloud.google.com/vertex-ai/docs/pipelines/components-introduction

https://cloud.google.com/vertex-ai/docs/pipelines/gcpc-list

#### Define the Pipeline

In [12]:
# Define the workflow of the pipeline.
#
# Steps: import the CSV at `dataset_uri` as a Dataset artifact, split it into
# training and test sets, train a logistic regression model, and evaluate it.
# Only when the evaluation component's `approval` output is True, the model is
# uploaded to the `model_repo` bucket, imported from `model_repo_uri` as an
# unmanaged container model, and passed to ModelUploadOp, EndpointCreateOp and
# ModelDeployOp.
#
# Parameters:
#   project_id          - project used by the upload and Vertex AI steps.
#   data_bucket         - accepted but not referenced in the pipeline body.
#   dataset_uri         - URI of the CSV dataset to import.
#   model_repo          - name of the bucket that receives the pickled model.
#   thresholds_dict_str - JSON string of thresholds for the evaluation step.
#   model_repo_uri      - URI from which the model is imported for serving.
@kfp.dsl.pipeline(
    name="diabetes-predictor-training-pipeline")
def pipeline(project_id: str, data_bucket: str, dataset_uri: str, model_repo: str, thresholds_dict_str:str, model_repo_uri:str):    
    
    # Bring the existing dataset file into the pipeline as a Dataset artifact.
    dataset_op = kfp.dsl.importer(
        artifact_uri=dataset_uri,
        artifact_class=Dataset,
        reimport=False,
    )
     
    train_test_split_op = train_test_split(dataset=dataset_op.output)
        
    training_lr_job_run_op = train_lr(features=train_test_split_op.outputs["dataset_train"])
    
    model_evaluation_op = lr_model_evaluation(
        test_set=train_test_split_op.outputs["dataset_test"],
        model_lr=training_lr_job_run_op.outputs["model"],
        thresholds_dict_str=thresholds_dict_str, # the model is deployed only if its performance is above the threshold
    )
    
    # Everything inside this block runs only for an approved model.
    with dsl.If(
        model_evaluation_op.outputs["approval"]== True,
        name="approve-model",
    ):
        upload_model_to_gc_op = upload_model_to_gcs(
            project_id=project_id,
            model_repo=model_repo,
            model=training_lr_job_run_op.outputs['model']
        )    
        
        # The importer has no data dependency on the upload step, so the
        # explicit .after() makes it wait until the model file is in the bucket.
        import_unmanaged_model_task = dsl.importer(
            artifact_uri=model_repo_uri,
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                "containerSpec": {
                    "imageUri": "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest",  # see https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers  
                },
            },
        ).after(upload_model_to_gc_op)      
       
        # Using Google's pre-built components for uploading and deploying the model.
       
        model_upload_op = ModelUploadOp(
            project=project_id,
            display_name="diabetes-prediction-model",
            unmanaged_container_model=import_unmanaged_model_task.outputs["artifact"],
        ).after(import_unmanaged_model_task)       
               
        # The endpoint is created only after the model upload has finished.
        create_endpoint_op = EndpointCreateOp(
            project=project_id,
            display_name="diabetes-prediction-service",
        ).after(model_upload_op)      
        
        # Deploy the uploaded model to the new endpoint on a single
        # n1-standard-4 replica.
        model_deploy_op = ModelDeployOp(
            model=model_upload_op.outputs["model"],
            endpoint=create_endpoint_op.outputs['endpoint'],
            deployed_model_display_name="diabetes-prediction-model",
            dedicated_resources_machine_type="n1-standard-4",
            dedicated_resources_min_replica_count=1,
            dedicated_resources_max_replica_count=1,
            traffic_split={"0": 100},
        ).after(create_endpoint_op)      

#### Compile the pipeline into a YAML file

In [13]:
from kfp import compiler

# Compile the pipeline function into a YAML package file. The file name must
# match the template_path used when the pipeline job is created.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=pipeline,
    package_path='diabetes_prdictor_training_pipeline.yaml',
)

#### Submit the pipeline run

In [14]:
import google.cloud.aiplatform as aip

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(project=PROJECT_ID, location=REGION)

# Runtime arguments for the pipeline parameters.
pipeline_parameters = {
    'project_id': PROJECT_ID,  # make sure to use your project id
    'data_bucket': 'data_de2023',  # make sure to use your data bucket name
    'dataset_uri': 'gs://data_de2023/training_set.csv',
    'model_repo': 'models_de2023',  # make sure to use your model bucket name
    'thresholds_dict_str': '{"roc":0.8}',
    'model_repo_uri': 'gs://models_de2023',  # make sure to use your model bucket name
}

# Prepare the pipeline job from the compiled template; caching is disabled.
job = aip.PipelineJob(
    display_name="diabetes-predictor",
    template_path="diabetes_prdictor_training_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    location=REGION,
    parameter_values=pipeline_parameters,
    enable_caching=False,
)

# Submit the job.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/448730450443/locations/us-central1/pipelineJobs/diabetes-predictor-training-pipeline-20231004145834
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/448730450443/locations/us-central1/pipelineJobs/diabetes-predictor-training-pipeline-20231004145834')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/diabetes-predictor-training-pipeline-20231004145834?project=448730450443
PipelineJob projects/448730450443/locations/us-central1/pipelineJobs/diabetes-predictor-training-pipeline-20231004145834 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/448730450443/locations/us-central1/pipelineJobs/diabetes-predictor-training-pipeline-20231004145834 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/448730450443/locations/us-central1/pipelineJobs/diabetes-predictor-training-pipeline-202310041458

### List all models

In [15]:
DISPLAY_NAME = "diabetes-prediction-model"
! gcloud ai models list --region={REGION} --filter={DISPLAY_NAME}

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
7377624066330460160  diabetes-prediction-model
163420413236346880   diabetes-prediction-model


### Make an online prediction request

In [16]:
ENDPOINT_NAME="diabetes-prediction-service"
instance = [[1,126,60,0,0,30.1,0.349,47]]  # Prediciton request inputs 
ENDPOINT_ID = !(gcloud ai endpoints list --region=$REGION \
              --format='value(ENDPOINT_ID)'\
              --filter=display_name=$ENDPOINT_NAME \
              --sort-by=creationTimeStamp | tail -1)
ENDPOINT_ID = ENDPOINT_ID[1]

def endpoint_predict(
    project: str, location: str, instances: list, endpoint: str
):
    """Send an online prediction request to an endpoint.

    Args:
        project: Project passed to ``aip.init``.
        location: Location passed to ``aip.init``.
        instances: Instances forwarded to ``Endpoint.predict``.
        endpoint: Identifier used to construct the ``aip.Endpoint``.

    Returns:
        Whatever ``Endpoint.predict`` returns for the given instances.
    """
    aip.init(project=project, location=location)
    return aip.Endpoint(endpoint).predict(instances=instances)

# Request a prediction for the sample instance from the endpoint found above.
endpoint_predict(
    project=PROJECT_ID,
    location=REGION,
    instances=instance,
    endpoint=ENDPOINT_ID,
)

Prediction(predictions=[0.0], deployed_model_id='548060366957969408', model_version_id='1', model_resource_name='projects/448730450443/locations/us-central1/models/163420413236346880', explanations=None)

### Test the batch prediction

In [17]:
# Define variables 
job_display_name = "diabetes-prediction-batch-prediction-job"
MODEL_NAME="diabetes-prediction-model"
ENDPOINT_NAME="diabetes-prediction-service"
BUCKET_URI="gs://data_de2023"
input_file_name="test_set_batch_prediction.csv"

# Get model id
MODEL_ID=!(gcloud ai models list --region=$REGION \
           --filter=display_name=$MODEL_NAME)
MODEL_ID=MODEL_ID[2].split(" ")[0]

model_resource_name = f'projects/{PROJECT_ID}/locations/{REGION}/models/{MODEL_ID}'
gcs_source= [f"{BUCKET_URI}/{input_file_name}"]
gcs_destination_prefix=f"{BUCKET_URI}/output"

def batch_prediction_job(
    project: str,
    location: str,
    model_resource_name: str,
    job_display_name: str,
    gcs_source: typing.Union[str, typing.Sequence[str]],
    gcs_destination_prefix: str,
    machine_type: str,
    starting_replica_count: int = 1, # The number of nodes for this batch prediction job.
    max_replica_count: int = 1,
):
    """Run a batch prediction job for a model and wait for it to finish.

    Args:
        project: Project passed to ``aip.init``.
        location: Location passed to ``aip.init``.
        model_resource_name: Resource name used to construct the ``aip.Model``.
        job_display_name: Display name of the batch prediction job.
        gcs_source: Cloud Storage URI, or list of URIs, of the CSV input.
        gcs_destination_prefix: Cloud Storage prefix that receives the output.
        machine_type: Machine type of the prediction nodes (must be present).
        starting_replica_count: Number of nodes the job starts with.
        max_replica_count: Maximum number of nodes the job may use.

    Returns:
        The batch prediction job object returned by ``Model.batch_predict``.
    """
    aip.init(project=project, location=location)

    model = aip.Model(model_resource_name)

    # Use the arguments given by the caller rather than notebook-level
    # globals, so the function honours its own signature.
    job = model.batch_predict(
        job_display_name=job_display_name,
        instances_format='csv',
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination_prefix,
        machine_type=machine_type, # must be present
        starting_replica_count=starting_replica_count,
        max_replica_count=max_replica_count,
    )
    job.wait()
    print(job.display_name)
    print(job.state)
    return job

# Run the batch prediction with the variables defined above.
batch_prediction_job(
    project=PROJECT_ID,
    location=REGION,
    model_resource_name=model_resource_name,
    job_display_name=job_display_name,
    gcs_source=gcs_source,
    gcs_destination_prefix=gcs_destination_prefix,
    machine_type="n1-standard-2",
)

Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/448730450443/locations/us-central1/batchPredictionJobs/8636203484405825536
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/448730450443/locations/us-central1/batchPredictionJobs/8636203484405825536')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/8636203484405825536?project=448730450443
BatchPredictionJob projects/448730450443/locations/us-central1/batchPredictionJobs/8636203484405825536 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/448730450443/locations/us-central1/batchPredictionJobs/8636203484405825536 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/448730450443/locations/us-central1/batchPredictionJobs/8636203484405825536 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/448730450443/locations/us-central1/batchPredictionJobs/

<google.cloud.aiplatform.jobs.BatchPredictionJob object at 0x7f01144c7a00> 
resource name: projects/448730450443/locations/us-central1/batchPredictionJobs/8636203484405825536