# AI Platform Pipeline with TFX

### Install and Import Libraries

In [2]:
from aiplatform.pipelines import client

In [26]:
# Download the AI Platform Pipelines client wheel from the restricted release
# bucket into the current working directory.
!gsutil cp gs://cloud-aiplatform-pipelines/releases/20210209/aiplatform_pipelines_client-0.1.0.caip20210209-py3-none-any.whl .  
# Download the Metadata SDK source tarball (used to query the produced metadata)
# into the current working directory.
!gsutil cp gs://cloud-aiplatform-metadata/sdk/google-cloud-aiplatform-metadata-0.0.1.tar.gz .

Copying gs://cloud-aiplatform-pipelines/releases/20210209/aiplatform_pipelines_client-0.1.0.caip20210209-py3-none-any.whl...
/ [1 files][ 21.9 KiB/ 21.9 KiB]                                                
Operation completed over 1 objects/21.9 KiB.                                     


In [None]:
#Install both libraries
!python3 -m pip install google-cloud-aiplatform
!python3 -m pip install kfp==1.4 google-cloud-aiplatform-metadata-0.0.1.tar.gz aiplatform_pipelines_client-0.1.0.caip20210209-py3-none-any.whl --upgrade

In [3]:
from typing import Optional, Text, List
import absl
import os
import tensorflow as tf
import tensorflow_model_analysis as tfma
#from aiplatform.pipelines import client

from tfx.components.example_gen.import_example_gen.component import ImportExampleGen
from tfx.components import CsvExampleGen
from tfx.components import Evaluator
from tfx.components import ExampleValidator
from tfx.components import InfraValidator
from tfx.components import Pusher
from tfx.components import ResolverNode
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import Trainer
from tfx.components import Transform
from tfx.orchestration import metadata
from ml_metadata.proto import metadata_store_pb2
from tfx.components.trainer.executor import GenericExecutor
from tfx.dsl.components.base import executor_spec
from tfx.dsl.experimental import latest_artifacts_resolver ## demo
from tfx.dsl.experimental import latest_blessed_model_resolver ## demo
from tfx.orchestration import pipeline as tfx_pipeline
from tfx.orchestration.local.local_dag_runner import LocalDagRunner
from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner
from tfx.utils.dsl_utils import external_input
from tfx.proto import example_gen_pb2
from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.types import standard_artifacts
from tfx.types import channel



In [4]:
# Report the TensorFlow and TFX versions visible to this kernel.
print(f'TensorFlow version: {tf.__version__}')
print(f"TFX version: {__import__('tfx.version').__version__}")

TensorFlow version: 2.4.1
TFX version: 0.27.0


### Set Environment Variables for Local execution

In [54]:
# --- Google Cloud project ---------------------------------------------------
PROJECT_ID = "crazy-hippo-01"
REGION = "us-central1"
API_KEY = ""

# --- Pipeline identity and storage ------------------------------------------
PIPELINE_NAME = "earnings"
PIPELINE_ROOT = "gs://crazy-hippo-01/tfx/binary_classification/pipeline"
RAW_DATA = "gs://crazy-hippo-01/tfx/binary_classification/raw/"
SERVING_MODEL_DIR = "gs://crazy-hippo-01/tfx/binary_classification/serving_model/"

# --- User module files ------------------------------------------------------
# Local execution uses the module files next to the notebook; the AI Platform
# configuration cell further down points these at the copies in Cloud Storage.
TRANSFORM_FILE = "transform.py"
TRAINER_FILE = "trainer.py"

# --- ML Metadata store (path relative to the working directory) --------------
METADATA_PATH = os.path.join(".", "tfx_metadata", PIPELINE_NAME, "metadata.db")


### Pipeline Function

In [5]:
def create_tfx_pipeline(
    pipeline_name: Text,
    input_dir: Text,
    metadata_connection_config: Optional[metadata_store_pb2.ConnectionConfig] = None
    ) -> tfx_pipeline.Pipeline:
    """Assemble the TFX pipeline: ingest, validate, transform, train and push.

    The module-level settings TRANSFORM_FILE, TRAINER_FILE, SERVING_MODEL_DIR
    and PIPELINE_ROOT are read when this function is *called*, so they must be
    set to the desired (local or Cloud Storage) values before calling it.

    Args:
      pipeline_name: Name given to the resulting pipeline.
      input_dir: Location of the CSV files, passed to CsvExampleGen as
        `input_base`.
      metadata_connection_config: Optional ML Metadata connection config that
        is forwarded unchanged to the pipeline.

    Returns:
      A `tfx_pipeline.Pipeline` containing example_gen, statistics_gen,
      schema_gen, example_validator, transform, trainer and pusher.
    """
    # No output_config is supplied, so CsvExampleGen applies its default
    # train/eval split.
    example_gen = CsvExampleGen(input_base=input_dir)

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=True)

    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        module_file=TRANSFORM_FILE)

    # Fetch the latest trained model under the same context for warm-starting.
    # NOTE: this resolver is not wired into the Trainer and is not part of
    # `components` below, so it does not run.
    latest_model_resolver = ResolverNode(
        instance_name='latest_model_resolver',
        resolver_class=latest_artifacts_resolver.LatestArtifactsResolver,
        model=channel.Channel(type=standard_artifacts.Model))

    trainer = Trainer(
        module_file=TRAINER_FILE,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=3000),
        eval_args=trainer_pb2.EvalArgs(num_steps=3000))

    # Get the latest blessed model for model validation.
    # NOTE: not part of `components` below, so it does not run.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=channel.Channel(type=standard_artifacts.Model),
        model_blessing=channel.Channel(type=standard_artifacts.ModelBlessing))

    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='label')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.MetricThreshold(
                        # Accept models only if SparseCategoricalAccuracy > 0.8
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.8}),
                        # TODO: modify this
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-2})))
            ])
        ]
    )

    # NOTE: the Evaluator is constructed but not part of `components` below,
    # so no evaluation or blessing takes place in the returned pipeline.
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # The blessing input is left disabled, so the Pusher is not gated on the
    # Evaluator's result.
    pusher = Pusher(
        model=trainer.outputs['model'],
        #model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=SERVING_MODEL_DIR)))

    # Only these components are executed by the returned pipeline.
    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, pusher
    ]

    return tfx_pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=PIPELINE_ROOT,
        components=components,
        metadata_connection_config=metadata_connection_config
    )

### Create Pipeline

In [8]:
# Build the pipeline object, recording ML Metadata through the SQLite
# connection config created for METADATA_PATH.
mypipeline = create_tfx_pipeline(
    pipeline_name=PIPELINE_NAME,
    input_dir=RAW_DATA,
    metadata_connection_config=metadata.sqlite_metadata_connection_config(
        METADATA_PATH),
)



#### Run Pipeline in Local environment

In [None]:
# Execute the pipeline in this notebook's process using the local DAG runner.
local_runner = LocalDagRunner()
local_runner.run(mypipeline)

### Execute Pipeline in AI Platform

Install the Kubeflow Pipelines (kfp) SDK

In [None]:
!pip3 install kfp --upgrade --user

Copy the transform and trainer modules to Cloud Storage so that they are accessible from AI Platform

In [9]:
!gsutil cp transform.py gs://crazy-hippo-01/tfx/binary_classification
!gsutil cp trainer.py gs://crazy-hippo-01/tfx/binary_classification

Copying file://transform.py [Content-Type=text/x-python]...
/ [1 files][  2.8 KiB/  2.8 KiB]                                                
Operation completed over 1 objects/2.8 KiB.                                      
Copying file://trainer.py [Content-Type=text/x-python]...
/ [1 files][  5.2 KiB/  5.2 KiB]                                                
Operation completed over 1 objects/5.2 KiB.                                      


Set AI Platform environment variables

In [7]:
# Configuration for running on AI Platform: same project and storage layout as
# the local run, but the Transform/Trainer module files point at the copies in
# Cloud Storage.
PROJECT_ID         = 'crazy-hippo-01'
REGION             = 'us-central1'
PIPELINE_NAME      = 'earnings'
PIPELINE_ROOT      = 'gs://crazy-hippo-01/tfx/binary_classification/pipeline'
TRANSFORM_FILE     = 'gs://crazy-hippo-01/tfx/binary_classification/transform.py'
TRAINER_FILE       = 'gs://crazy-hippo-01/tfx/binary_classification/trainer.py'
RAW_DATA           = "gs://crazy-hippo-01/tfx/binary_classification/raw/"
SERVING_MODEL_DIR  = 'gs://crazy-hippo-01/tfx/binary_classification/serving_model/'
METADATA_PATH      = os.path.join('.', 'tfx_metadata', PIPELINE_NAME, 'metadata.db')

# Credentials must never be stored in the notebook. Export AIPLATFORM_API_KEY
# in the environment that starts the kernel; it defaults to '' when unset.
# NOTE: an API key was previously committed in this cell — treat that key as
# compromised and rotate it.
API_KEY            = os.environ.get('AIPLATFORM_API_KEY', '')

Create pipeline JSON file

In [None]:
# Runner settings: the target project and the display name for the pipeline.
config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
    project_id=PROJECT_ID,
    display_name=PIPELINE_NAME,
)

# The runner is configured to use 'pipeline01.json' as its output file.
runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
    config=config,
    output_filename='pipeline01.json',
)

# write_out=True asks the runner to write the compiled pipeline to that file.
runner.run(pipeline=mypipeline, write_out=True)

Import the AI Platform client and set up the run variables

In [11]:
import time
from aiplatform.pipelines import client

# Client bound to the configured project, region and API key.
api_client = client.Client(
    project_id=PROJECT_ID,
    region=REGION,
    api_key=API_KEY,
)

# Suffix the run name with the current Unix time in whole seconds.
DISPLAY_NAME = f'earnings{int(time.time())}'
print(DISPLAY_NAME)

earnings1614519655


Create run from job spec

In [12]:
# Submit the job spec in 'pipeline01.json' as a new run named DISPLAY_NAME.
result = api_client.create_run_from_job_spec(
    job_spec_path='pipeline01.json',
    name=DISPLAY_NAME,
)