# MNIST Digits Recognition

In [None]:
# Set the HTTPS proxy for this kernel. Edit or remove this line if the cluster does not need a proxy.
%env https_proxy=http://hpeproxy.its.hpecorp.net:80
# Install the required packages (pyarrow, Kubeflow Pipelines SDK and Katib SDK).
# pip is invoked through sys.executable so that the packages are installed for the
# Python interpreter that runs this kernel.
import sys
!{sys.executable} -m pip install "pyarrow>7.0.0" "kfp>=1.8.0,<=1.8.22" kubeflow-katib==0.16.0

In [None]:
import os
import uuid
import kfp
import kfp.dsl as dsl
from kfp import components
from kubeflow.katib import ApiClient
from kubeflow.katib import V1beta1ExperimentSpec
from kubeflow.katib import V1beta1AlgorithmSpec
from kubeflow.katib import V1beta1ObjectiveSpec
from kubeflow.katib import V1beta1ParameterSpec
from kubeflow.katib import V1beta1FeasibleSpace
from kubeflow.katib import V1beta1TrialTemplate
from kubeflow.katib import V1beta1TrialParameterSpec
from pathlib import Path

######################################### Storage Parameters ##############################################

# Mount point of the training data inside the pods.
mnt_path = "/mnt/data/"

# Short random suffix that keeps the names created by this run unique.
# NOTE: from here on `uuid` is this 4-character string, not the `uuid` module; later cells rely on that.
uuid = uuid.uuid4().hex[:4]

# A Spark job writes the initial training data to the user volume in the Apache Parquet format.
initial_training_data_dir = f"{Path.home()}/user/mnist-spark-data"

# Relative path: the final training data directory is expected to live in the same folder as the notebook.
final_training_data_dir = f"training-{uuid}"

# Create the directory on the user volume. The requested mode is still filtered by the process umask.
os.mkdir(f"/mnt/user/{final_training_data_dir}", 0o777)

######################################## KFP parameters ##################################################

# Name of the Katib experiment (also reused for the pipeline, the TFJob and the served model).
name = f"mnist-experiment-{uuid}"
print(f"Katib experiment name: {name}")

# Number of training steps passed to the model via --tf-train-steps.
training_steps = "100"


# Client for the Kubeflow Pipelines API and the namespace the pipeline will run in.
kfp_client = kfp.Client()
namespace = kfp_client.get_user_namespace()

In [None]:
# Transformation of the training data from Apache Parquet format to the format that is required for the MNIST example
import pandas as pd
import os

# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(final_training_data_dir, exist_ok=True)

# Output files, in the same order as the rows of the "content" column are consumed.
mnist_file_names = [
    "train-images-idx3-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz",
]

# Read and validate the input BEFORE opening any output file, so that a failed read
# does not leave empty .gz files behind for the training job to trip over.
mnist_parquet = pd.read_parquet(initial_training_data_dir)
mnist_blobs = list(mnist_parquet["content"])
if len(mnist_blobs) != len(mnist_file_names):
    raise ValueError(
        f"Expected {len(mnist_file_names)} rows in the 'content' column of "
        f"{initial_training_data_dir}, found {len(mnist_blobs)}"
    )

# Row order is significant: train images, test images, train labels, test labels.
# NOTE(review): this relies on the Spark job writing the rows in exactly this order - confirm.
x_train, x_test, y_train, y_test = mnist_blobs

for mnist_file_name, mnist_blob in zip(mnist_file_names, mnist_blobs):
    with open(os.path.join(final_training_data_dir, mnist_file_name), "wb") as mnist_file:
        mnist_file.write(mnist_blob)

In [None]:
# Proxy endpoint and proxy exclusions injected into every training container.
_proxy_url = "http://hpeproxy.its.hpecorp.net:80"
_no_proxy_hosts = "10.227.212.69,10.227.0.0/16,10.96.0.1,10.96.0.0/12,10.96.0.12,10.244.0.0/16,10.43.0.0/16,.external.hpe.local,localhost,.cluster.local,.svc,.default.svc,127.0.0.1,mip-bd-vcenter-dev.mip.storage.hpecorp.net,172.20.0.2"

# Container environment in the Kubernetes {"name": ..., "value": ...} form.
# Both the upper-case and lower-case spellings of each variable are set.
env = [
    {"name": variable, "value": _proxy_url}
    for variable in ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy")
] + [
    {"name": variable, "value": _no_proxy_hosts}
    for variable in ("no_proxy", "NO_PROXY")
]

In [None]:
def _build_katib_trial_replica_spec(training_steps):
    """Build the replica spec shared by the Chief and Worker of a Katib trial TFJob.

    The container reads the training data from, and exports the model to,
    ``{mnt_path}{final_training_data_dir}/`` on the ``user-pvc`` volume. The
    ``${trialParameters.*}`` placeholders are left verbatim for Katib to substitute.

    Reads the notebook-level names ``mnt_path``, ``final_training_data_dir`` and ``env``.

    Args:
        training_steps: Value passed to the model as ``--tf-train-steps``.

    Returns:
        A new dict describing one TFJob replica (a fresh object on every call).
    """
    data_dir = f"{mnt_path}{final_training_data_dir}/"
    return {
        "replicas": 1,
        "restartPolicy": "OnFailure",
        "template": {
            "metadata": {
                "annotations": {
                    "sidecar.istio.io/inject": "false"
                }
            },
            "spec": {
                "containers": [
                    {
                        "name": "tensorflow",
                        "image": "gcr.io/mapr-252711/kubeflow/kfexamples/docker.io/liuhougangxa/tf-estimator-mnist",
                        "command": [
                            "python",
                            "/opt/model.py",
                            f"--tf-data-dir={data_dir}",
                            f"--tf-train-steps={training_steps}",
                            f"--tf-export-dir={data_dir}",
                            "--tf-learning-rate=${trialParameters.learningRate}",
                            "--tf-batch-size=${trialParameters.batchSize}"
                        ],
                        "env": env,
                        "volumeMounts": [
                            {
                                "mountPath": mnt_path,
                                "name": "data-volume"
                            }
                        ]
                    }
                ],
                "imagePullSecrets": [
                    {
                        "name": "hpe-imagepull-secrets"
                    }
                ],
                "volumes": [
                    {
                        "name": "data-volume",
                        "persistentVolumeClaim": {
                            "claimName": "user-pvc"
                        }
                    }
                ]
            }
        }
    }


# You should define number of training steps in the arguments.
def create_katib_experiment_task(experiment_name, experiment_namespace, training_steps):
    """Create the KFP task that runs a Katib hyperparameter tuning Experiment.

    The Experiment runs at most 5 trials (2 in parallel, giving up after 3 failures),
    uses random search to minimize the ``loss`` metric (goal 0.001), and tunes
    ``learning_rate`` (double, 0.01-0.05) and ``batch_size`` (int, 80-100).
    Each trial is a TFJob with one Chief and one Worker replica.

    Args:
        experiment_name: Name of the Katib Experiment.
        experiment_namespace: Namespace in which the Experiment is created.
        training_steps: Number of training steps for every trial.

    Returns:
        The KFP task created from ``component/katib-launcher-component.yaml``.
    """
    # Trial count specification.
    max_trial_count = 5
    max_failed_trial_count = 3
    parallel_trial_count = 2

    # Objective specification.
    objective = V1beta1ObjectiveSpec(
        type="minimize",
        goal=0.001,
        objective_metric_name="loss"
    )

    # Algorithm specification.
    algorithm = V1beta1AlgorithmSpec(
        algorithm_name="random",
    )

    # Experiment search space.
    # In this example we tune learning rate and batch size.
    parameters = [
        V1beta1ParameterSpec(
            name="learning_rate",
            parameter_type="double",
            feasible_space=V1beta1FeasibleSpace(
                min="0.01",
                max="0.05"
            ),
        ),
        V1beta1ParameterSpec(
            name="batch_size",
            parameter_type="int",
            feasible_space=V1beta1FeasibleSpace(
                min="80",
                max="100"
            ),
        )
    ]

    # Experiment Trial template.
    # Chief and Worker use the same replica spec; each gets its own dict instance.
    trial_spec = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "TFJob",
        "spec": {
            "tfReplicaSpecs": {
                "Chief": _build_katib_trial_replica_spec(training_steps),
                "Worker": _build_katib_trial_replica_spec(training_steps)
            }
        }
    }

    # Configure parameters for the Trial template.
    # Each trial parameter maps a ${trialParameters.<name>} placeholder to a search-space parameter.
    trial_template = V1beta1TrialTemplate(
        primary_container_name="tensorflow",
        trial_parameters=[
            V1beta1TrialParameterSpec(
                name="learningRate",
                description="Learning rate for the training model",
                reference="learning_rate"
            ),
            V1beta1TrialParameterSpec(
                name="batchSize",
                description="Batch size for the model",
                reference="batch_size"
            ),
        ],
        trial_spec=trial_spec
    )

    # Create an Experiment from the above parameters.
    experiment_spec = V1beta1ExperimentSpec(
        max_trial_count=max_trial_count,
        max_failed_trial_count=max_failed_trial_count,
        parallel_trial_count=parallel_trial_count,
        objective=objective,
        algorithm=algorithm,
        parameters=parameters,
        trial_template=trial_template
    )

    # Create the KFP task for the Katib Experiment.
    # Experiment Spec should be serialized to a valid Kubernetes object.
    katib_experiment_launcher_op = components.load_component_from_file("component/katib-launcher-component.yaml")

    op = katib_experiment_launcher_op(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=ApiClient().sanitize_for_serialization(experiment_spec),
        experiment_timeout_minutes=60,
        delete_finished_experiment=False)

    return op

In [None]:
# This function converts Katib Experiment HP results to args.
def convert_katib_results(katib_results) -> str:
    """Turn the Katib Experiment result into command-line flags for the model.

    The function is kept self-contained (imports inside the body, no outside
    names) because it is passed to ``func_to_container_op``.

    Args:
        katib_results: JSON document with the Experiment result.

    Returns:
        The flags of the optimal trial joined by spaces, for example
        "--tf-learning-rate=0.01 --tf-batch-size=100".
    """
    import json
    import pprint

    # Katib parameter name -> flag prefix understood by the model.
    hp_flags = (
        ("learning_rate", "--tf-learning-rate="),
        ("batch_size", "--tf-batch-size="),
    )

    katib_results_json = json.loads(katib_results)
    print("Katib results:")
    pprint.pprint(katib_results_json)

    assignments = katib_results_json["currentOptimalTrial"]["parameterAssignments"]
    best_hps = []
    for assignment in assignments:
        for hp_name, flag in hp_flags:
            if assignment["name"] == hp_name:
                best_hps.append(flag + assignment["value"])
                break  # a parameter matches at most one flag

    print(f"Best Hyperparameters: {best_hps}")
    return " ".join(best_hps)

# You should define the TFJob name, namespace, number of training steps, output of Katib and model volume tasks in the arguments.
def create_tfjob_task(tfjob_name, tfjob_namespace, training_steps, katib_op, model_volume_op):
    """Create the KFP task that trains the model with a TFJob using the best hyperparameters.

    The Katib output is first converted into command-line flags by a small
    container task built from ``convert_katib_results``. The TFJob has one Chief and
    one Worker replica with the same spec: both read the data from the ``user-pvc``
    volume mounted at ``mnt_path`` and export the model to ``/mnt/export`` on the
    volume created by ``model_volume_op``.

    Reads the notebook-level names ``mnt_path``, ``final_training_data_dir`` and ``env``.

    Args:
        tfjob_name: Name of the TFJob.
        tfjob_namespace: Namespace in which the TFJob is created.
        training_steps: Number of training steps.
        katib_op: Katib Experiment task whose output holds the tuning result.
        model_volume_op: Volume task; its ``name`` output is used as the PVC claim name.

    Returns:
        The KFP task created from ``component/kubeflow-launcher-component.yaml``.
    """
    import json
    # Get parameters from the Katib Experiment.
    # Parameters are in the format "--tf-learning-rate=0.01 --tf-batch-size=100"
    convert_katib_results_op = components.func_to_container_op(convert_katib_results)
    best_hp_op = convert_katib_results_op(katib_op.output)
    best_hps = str(best_hp_op.output)

    model_volume_claim = str(model_volume_op.outputs["name"])

    def _replica_spec():
        """Return a fresh replica spec; Chief and Worker are configured identically."""
        return {
            "replicas": 1,
            "restartPolicy": "OnFailure",
            "template": {
                "metadata": {
                    "annotations": {
                        "sidecar.istio.io/inject": "false"
                    }
                },
                "spec": {
                    "containers": [
                        {
                            "name": "tensorflow",
                            "image": "gcr.io/mapr-252711/kubeflow/kfexamples/docker.io/liuhougangxa/tf-estimator-mnist",
                            # Run through a shell so that the space-separated flags in
                            # best_hps are split into separate arguments.
                            "command": [
                                "sh",
                                "-c"
                            ],
                            "args": [
                                f"python /opt/model.py --tf-data-dir={mnt_path}{final_training_data_dir}/ --tf-export-dir=/mnt/export --tf-train-steps={training_steps} {best_hps}"
                            ],
                            "env": env,
                            "volumeMounts": [
                                {
                                    "mountPath": "/mnt/export",
                                    "name": "model-volume"
                                },
                                {
                                    "mountPath": mnt_path,
                                    "name": "data-volume"
                                }
                            ]
                        }
                    ],
                    "imagePullSecrets": [
                        {
                            "name": "hpe-imagepull-secrets"
                        }
                    ],
                    "volumes": [
                        {
                            "name": "model-volume",
                            "persistentVolumeClaim": {
                                "claimName": model_volume_claim
                            }
                        },
                        {
                            "name": "data-volume",
                            "persistentVolumeClaim": {
                                "claimName": "user-pvc"
                            }
                        }
                    ]
                }
            }
        }

    # Create the TFJob Chief and Worker specification with the best Hyperparameters.
    tfjob_chief_spec = _replica_spec()
    tfjob_worker_spec = _replica_spec()

    # Create the KFP task for the TFJob.
    tfjob_launcher_op = components.load_component_from_file("component/kubeflow-launcher-component.yaml")

    op = tfjob_launcher_op(
        name=tfjob_name,
        namespace=tfjob_namespace,
        chief_spec=json.dumps(tfjob_chief_spec),
        worker_spec=json.dumps(tfjob_worker_spec),
        tfjob_timeout_minutes=60,
        delete_finished_tfjob=False)
    return op

In [None]:
def create_serving_task(model_name, model_namespace, tfjob_op, model_volume_op):
    """Create the KFP task that applies a KServe InferenceService for the trained model.

    The InferenceService uses the TensorFlow predictor and loads the model from the
    root of the volume created by ``model_volume_op`` (``pvc://<volume name>/``).

    Args:
        model_name: Name of the InferenceService.
        model_namespace: Namespace in which the InferenceService is created.
        tfjob_op: Training task; the serving task is scheduled after it.
        model_volume_op: Volume task; its ``name`` output identifies the model PVC.

    Returns:
        The serving task, like the other ``create_*_task`` helpers. Callers that
        ignore the return value keep working unchanged.
    """
    api_version = 'serving.kserve.io/v1beta1'
    serving_component_url = 'component/kserve-component.yaml'

    # Uncomment the following two lines if you are using KFServing v0.6.x or v0.5.x
    # api_version = 'serving.kubeflow.org/v1beta1'
    # serving_component_url = 'component/kfserving-component.yaml'

    inference_service = f'''
      apiVersion: "{api_version}"
      kind: "InferenceService"
      metadata:
        name: {model_name}
        namespace: {model_namespace}
        annotations:
          "sidecar.istio.io/inject": "false"
      spec:
        predictor:
          tensorflow:
            storageUri: "pvc://{str(model_volume_op.outputs["name"])}/"
      '''

    serving_launcher_op = components.load_component_from_file(serving_component_url)
    serving_op = serving_launcher_op(action="apply", inferenceservice_yaml=inference_service)
    # The model must be exported before it can be served.
    serving_op.after(tfjob_op)
    return serving_op

In [None]:
@dsl.pipeline(
    description="An end to end mnist example including hyperparameter tuning, train and inference",
    name=name,
)
def mnist_pipeline(name=name, namespace=namespace, training_steps=training_steps):
    # End-to-end pipeline: Katib tuning -> model volume -> TFJob training -> KServe serving.
    # The defaults come from the notebook-level configuration; `name` is reused for
    # the Katib experiment, the TFJob and the InferenceService.

    # Step 1: hyperparameter tuning with Katib.
    tuning_task = create_katib_experiment_task(name, namespace, training_steps)

    # Step 2: volume shared by training (model export) and serving (model load).
    model_volume_name = f"model-volume-{uuid}"
    volume_task = dsl.VolumeOp(
        name=model_volume_name,
        resource_name=model_volume_name,
        modes=dsl.VOLUME_MODE_RWO,
        size="1Gi",
    )

    # Step 3: distributed training with TFJob, using the tuned hyperparameters.
    training_task = create_tfjob_task(name, namespace, training_steps, tuning_task, volume_task)

    # Step 4: KServe inference service for the exported model.
    create_serving_task(name, namespace, training_task, volume_task)

    # Printed when the pipeline function is executed to build the pipeline.
    print("Volume: ", model_volume_name)
    
# Submit the pipeline as a run in the user's namespace, using the default pipeline arguments.
pipeline_run = kfp_client.create_run_from_pipeline_func(
    mnist_pipeline,
    arguments={},
    namespace=namespace,
)
run_id = pipeline_run.run_id

print(f"Run ID: {run_id}")

# Block until the run finishes, or until the timeout of 36000 (seconds, per the KFP client) expires.
kfp_client.wait_for_run_completion(run_id=run_id, timeout=36000)

In [None]:
import json

import numpy as np
from PIL import Image
import requests

# Pipeline Run should be succeeded.
kfp_run = kfp_client.get_run(run_id=run_id)
if kfp_run.run.status == "Succeeded":
    print(f"Run {run_id} has been Succeeded\n")

    # Specify the image URL here.
    image = Image.open("image/9.bmp")
    # Grayscale, 28x28, as a batch of one image with a single channel: shape (1, 28, 28, 1).
    data = np.array(image.convert('L').resize((28, 28))).astype(np.float64).reshape(-1, 28, 28, 1)
    # Serialize with json.dumps instead of np.array2string: array2string summarizes
    # arrays above its print threshold with "...", which would produce invalid JSON.
    json_request = json.dumps({"instances": data.tolist()})

    # Specify the prediction URL. If you are running this notebook outside of the Kubernetes cluster, you should set the Cluster IP.
    url = f"http://{name}-predictor.{namespace}.svc.cluster.local/v1/models/{name}:predict"
    response = requests.post(url, data=json_request)

    print("Prediction for the image")
    display(image)
    print(response.json())
else:
    # Report the run status instead of silently skipping the prediction.
    print(f"Run {run_id} has status {kfp_run.run.status!r}; skipping the prediction request")