# An end-to-end Vertex Batch Prediction Pipeline Demonstration

First, check that the required packages are installed correctly. The KFP SDK version should be >=1.6:

In [1]:
# Report the KFP SDK version visible to *this* kernel. Importing in-process
# (instead of shelling out to `python3`) checks the same environment that the
# rest of the notebook runs in, rather than whichever `python3` is on PATH.
import kfp

print('KFP SDK version: {}'.format(kfp.__version__))

KFP SDK version: 1.8.2


In [2]:
import os
from functools import partial

import kfp
import pprint
import yaml
from jinja2 import Template
from kfp.v2 import dsl
from kfp.v2.compiler import compiler
from kfp.v2.dsl import Dataset
from kfp.v2.google.client import AIPlatformClient

In [3]:
# Project ID handed to the component templates, the API client and the
# pipeline parameters.
project_id = 'woven-rush-197905'
# Numeric project number; the same number appears in the endpoint resource
# name passed to the pipeline run.
project_number = '297370817971'

In [4]:
# Registry location and name substituted into the component templates
# (presumably an Artifact Registry repository -- confirm against the
# component.yaml.jinja files).
af_registry_location = 'asia-southeast1'
af_registry_name = 'mlops-vertex-kit'

In [5]:
# Root folder of the component definitions; a component is looked up at
# <components_dir>/<component_name>/component.yaml.jinja.
components_dir = '../components/'

In [6]:
def _load_custom_component(project_id: str,
                           af_registry_location: str,
                           af_registry_name: str,
                           components_dir: str,
                           component_name: str):
  """Renders a component's Jinja template and loads it as a KFP component.

  The template is read from
  `<components_dir>/<component_name>/component.yaml.jinja` and rendered with
  `project_id`, `af_registry_location` and `af_registry_name` as template
  variables.

  Args:
    project_id: Value substituted for `project_id` in the template.
    af_registry_location: Value substituted for `af_registry_location` in the
      template.
    af_registry_name: Value substituted for `af_registry_name` in the
      template.
    components_dir: Directory that contains one sub-folder per component.
    component_name: Name of the component's sub-folder inside
      `components_dir`.

  Returns:
    Whatever `kfp.components.load_component_from_text` returns for the
    rendered component text.

  Raises:
    OSError: If the template file cannot be opened (for example
      `FileNotFoundError` when the component folder does not exist).
  """
  component_path = os.path.join(components_dir,
                                component_name,
                                'component.yaml.jinja')
  # Decode as UTF-8 explicitly so the template is read the same way on every
  # platform instead of depending on the locale's default encoding.
  with open(component_path, 'r', encoding='utf-8') as f:
    template_text = f.read()

  component_text = Template(template_text).render(
    project_id=project_id,
    af_registry_location=af_registry_location,
    af_registry_name=af_registry_name)

  return kfp.components.load_component_from_text(component_text)

# Pre-bind the project, registry and directory settings so that a component
# can be loaded by passing only `component_name`.
load_custom_component = partial(
    _load_custom_component,
    project_id=project_id,
    af_registry_location=af_registry_location,
    af_registry_name=af_registry_name,
    components_dir=components_dir,
)

In [7]:
# Load the two custom components used by the pipeline, identified by their
# folder names under the components directory.
preprocess_op = load_custom_component(
    component_name='data_preprocess',
)
batch_prediction_op = load_custom_component(
    component_name='batch_prediction',
)

Next, set the pipeline region and storage locations, then define the pipeline using the following function:

In [8]:
# Region used when creating the API client, and the storage root handed to
# the pipeline run as `pipeline_root`.
pipeline_region = 'asia-southeast1'
pipeline_root = 'gs://vertex_pipeline_demo_root/pipeline_root'

In [9]:
# Values supplied as pipeline parameters when the run is launched.
data_region = 'asia-southeast1'
# Source dataset, given as a `bq://` URI.
input_dataset_uri = 'bq://woven-rush-197905.vertex_pipeline_demo.banknote_authentication_features'
# Output folders for the preprocessing step and for the prediction results.
gcs_data_output_folder = 'gs://vertex_pipeline_demo_root/datasets/prediction'
gcs_result_folder = 'gs://vertex_pipeline_demo_root/prediction'

# Root path forwarded to the batch prediction step as `data_pipeline_root`.
data_pipeline_root = 'gs://vertex_pipeline_demo_root/compute_root'

In [10]:
@dsl.pipeline(name='batch-prediction-pipeline-template')
def pipeline(project_id: str,
             data_region: str,
             gcs_data_output_folder: str,
             input_dataset_uri: str,
             data_pipeline_root: str,
             gcs_result_folder: str,
             model_resource_name: str = '',
             endpoint_resource_name: str = '',
             machine_type: str = 'n1-standard-8',
             accelerator_count: int = 0,
             accelerator_type: str = 'ACCELERATOR_TYPE_UNSPECIFIED',
             starting_replica_count: int = 1,
             max_replica_count: int = 2):
    """Batch prediction pipeline: import dataset, preprocess, then predict.

    The pipeline wires three steps together: an importer that exposes
    `input_dataset_uri` as a `Dataset` artifact, the preprocessing component
    that consumes it, and the batch prediction component that consumes the
    preprocessing step's `output_dataset` output.

    Args:
        project_id: Forwarded to both the preprocessing and the batch
            prediction components.
        data_region: Forwarded to both the preprocessing and the batch
            prediction components.
        gcs_data_output_folder: Forwarded to the preprocessing component as
            `gcs_output_folder`.
        input_dataset_uri: URI imported as the source `Dataset` artifact.
        data_pipeline_root: Forwarded to the batch prediction component.
        gcs_result_folder: Forwarded to the batch prediction component.
        model_resource_name: Forwarded to the batch prediction component;
            defaults to an empty string.
        endpoint_resource_name: Forwarded to the batch prediction component;
            defaults to an empty string.
        machine_type: Forwarded to the batch prediction component.
        accelerator_count: Forwarded to the batch prediction component.
        accelerator_type: Forwarded to the batch prediction component.
        starting_replica_count: Forwarded to the batch prediction component.
        max_replica_count: Forwarded to the batch prediction component.
    """
    # Step 1: expose the source URI to the pipeline as a Dataset artifact.
    source_dataset = kfp.dsl.importer(
        artifact_uri=input_dataset_uri,
        artifact_class=Dataset,
        reimport=False,
    )

    # Step 2: preprocess the imported dataset, requesting newline-delimited
    # JSON as the output format.
    preprocessing = preprocess_op(
        project_id=project_id,
        data_region=data_region,
        gcs_output_folder=gcs_data_output_folder,
        gcs_output_format='NEWLINE_DELIMITED_JSON',
        input_dataset=source_dataset.output,
    )

    # Step 3: run batch prediction on the preprocessed dataset, with JSON
    # Lines for both the instances and the predictions.
    batch_prediction_op(
        project_id=project_id,
        data_region=data_region,
        data_pipeline_root=data_pipeline_root,
        gcs_result_folder=gcs_result_folder,
        instances_format='jsonl',
        predictions_format='jsonl',
        input_dataset=preprocessing.outputs['output_dataset'],
        model_resource_name=model_resource_name,
        endpoint_resource_name=endpoint_resource_name,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        starting_replica_count=starting_replica_count,
        max_replica_count=max_replica_count,
    )

### Compile and run the end-to-end ML pipeline
With our full pipeline defined, it's time to compile it:

In [11]:
# Compile the pipeline function into a job spec file in the working directory.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='batch_prediction_pipeline_job.json',
)



Next, instantiate an API client:

In [12]:
# Client used to submit pipeline runs, bound to the project and pipeline region.
api_client = AIPlatformClient(project_id=project_id, region=pipeline_region)



Next, kick off a pipeline run:

In [13]:
# Values for the pipeline parameters. Parameters that are not listed here are
# left to the defaults declared on the pipeline function.
pipeline_params = dict(
    project_id=project_id,
    data_region=data_region,
    gcs_data_output_folder=gcs_data_output_folder,
    gcs_result_folder=gcs_result_folder,
    input_dataset_uri=input_dataset_uri,
    data_pipeline_root=data_pipeline_root,
    endpoint_resource_name='projects/297370817971/locations/asia-southeast1/endpoints/8843521555783745536',
)

# Submit the compiled job spec as a new run, with caching enabled.
response = api_client.create_run_from_job_spec(
    job_spec_path='batch_prediction_pipeline_job.json',
    pipeline_root=pipeline_root,
    parameter_values=pipeline_params,
    enable_caching=True,
)