In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Feedback or issues?

Let us know if you have any [feedback or questions](https://forms.gle/hXDnv1T4WanMwTi79). If you provide an email address, we will follow up with you.

# Visualizing a pipeline run's parameters and metrics using the Model Builder SDK

To use this Jupyter notebook, copy the notebook to an AI Platform (Unified) Notebooks instance with TensorFlow installed and open it. You can run each step, or cell, and see its results. To run a cell, use Shift+Enter. Jupyter automatically displays the return value of the last line in each cell. For more information about running notebooks in AI Platform (Unified) Notebooks, see the [AI Platform (Unified) Notebooks guide](https://cloud.google.com/ai-platform-unified/docs/general/notebooks).


This notebook demonstrates how to extract and visualize parameters and metrics for pipeline runs created using AI Platform (Unified) Pipelines.



Note: You might incur charges for training, prediction, storage or usage of other Google Cloud products in connection with running this example.

## Setting up

This notebook is intended to be run in the following environments:

* [AI Platform Notebooks](https://cloud.google.com/ai-platform-notebooks). 
* [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb)

If you haven't already enabled the AI Platform API, on the [AI Platform (Unified) Dashboard](https://console.cloud.google.com/ai/platform) page in the Google Cloud Console, click **Enable the AI Platform API**.


Set `gcloud` to use your project.  **Edit the following cell before running it**.

In [None]:
# Google Cloud project ID; interpolated into the `gcloud config set project`
# command that follows.
PROJECT_ID = 'your-project-id'  # <---CHANGE THIS

In [None]:
# Make PROJECT_ID the active project for the gcloud CLI.
!gcloud config set project {PROJECT_ID}

If you're running this notebook on Colab, authenticate with your user account:

In [None]:
import sys
# Colab is detected by the `google.colab` module being loaded; only then is
# the interactive user authentication triggered.
if 'google.colab' in sys.modules:
  from google.colab import auth
  auth.authenticate_user()

### Install the Model Builder SDK and the Kubeflow Pipelines SDK

Use the instructions in this section to install the Model Builder SDK and the Kubeflow Pipelines SDK.

After you install the SDKs, the kernel is automatically restarted.

In [None]:
# Copy the KFP SDK source archive and the AI Platform pipelines client wheel
# from Cloud Storage into the current working directory.
!gsutil cp gs://cloud-aiplatform-pipelines/releases/latest/kfp-1.5.0rc5.tar.gz .
!gsutil cp gs://cloud-aiplatform-pipelines/releases/latest/aiplatform_pipelines_client-0.1.0.caip20210415-py3-none-any.whl .

In [None]:
# Extra flag for the pip commands below: empty on Colab, `--user` elsewhere.
USER_FLAG = '' if 'google.colab' in sys.modules else '--user'

In [None]:
# Install (or upgrade to) the two downloaded packages; USER_FLAG is empty on
# Colab and `--user` otherwise.
!python3 -m pip install {USER_FLAG} kfp-1.5.0rc5.tar.gz --upgrade
!python3 -m pip install {USER_FLAG} aiplatform_pipelines_client-0.1.0.caip20210415-py3-none-any.whl --upgrade

Install the Model Builder SDK and restart the kernel.

In [None]:
%%capture
!pip3 uninstall -y google-cloud-aiplatform
!pip3 install --user git+https://github.com/googleapis/python-aiplatform.git@dev-test 
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

Check the version of the Kubeflow Pipelines SDK. It should be >= 1.6.



In [None]:
# Print the installed KFP SDK version by importing `kfp` in a separate
# python3 process.
!python3 -c "import kfp; print('KFP version: {}'.format(kfp.__version__))"

If you're on Colab, re-authorize after the kernel restart. **Edit the following cell for your project ID before running it.**

In [None]:
import sys
# Colab only: the kernel restart cleared earlier variables, so the project ID,
# gcloud project, user authentication and pip flag are set up again here.
if 'google.colab' in sys.modules:
  PROJECT_ID = 'your-project-id'  # <---CHANGE THIS
  !gcloud config set project {PROJECT_ID}
  from google.colab import auth
  auth.authenticate_user()
  # No extra pip flag is used on Colab.
  USER_FLAG = ''

### Set some variables

**Before you run the next cell**, **edit it** to set variables for your project. For `BUCKET_NAME`, enter the name of a Cloud Storage bucket in your project.  Don't include the `gs://` prefix.

In [None]:
# Required parameters -- edit every value marked "CHANGE THIS".
USER = 'your-user-name' # <---CHANGE THIS
BUCKET_NAME = 'your-bucket-name'  # <---CHANGE THIS
PROJECT_ID = 'your-project-id'  # <---CHANGE THIS
API_KEY = 'your-api-key'  # <---CHANGE THIS
REGION = 'us-central1'

# Cloud Storage path passed as the pipeline root when runs are submitted;
# namespaced by USER inside the bucket.
PIPELINE_ROOT = f'gs://{BUCKET_NAME}/pipeline_root/{USER}'

print(f'PIPELINE_ROOT: {PIPELINE_ROOT}')

### Initialize Model Builder SDK

Initialize the *client* for AI Platform (Unified).

In [None]:
from google.cloud import aiplatform

# Pass the region explicitly so the SDK queries the same location that the
# pipeline runs are submitted to (REGION), instead of relying on the SDK's
# default location.
aiplatform.init(project=PROJECT_ID, location=REGION)

## Define and run a pipeline that tracks metrics

In this section you will define and run a simple pipeline that tracks parameters and metrics.

In [None]:
from typing import NamedTuple

from kfp.v2 import dsl
from kfp.v2.dsl import (
    component,
    InputPath,
    OutputPath,
    InputArtifact,
    OutputArtifact,
    Output,
    Artifact,
    Dataset,
    Model,
    ClassificationMetrics,
    Metrics,
)

Define a simple Python function-based component that uses scikit-learn to train a model using some input parameters and produces an accuracy metric. The accuracy metric is logged in the `metrics` output artifact.

In [None]:
@component(
    # `scikit-learn` is the real distribution name; the `sklearn` alias on
    # PyPI is deprecated and no longer installs.
    packages_to_install=['scikit-learn'],
    base_image='python:3.9',
)
def digit_classification(input_seed: int, split_count: int, metrics: Output[Metrics]):
  """Trains a logistic regression classifier and logs accuracy metrics.

  Despite the component's name, the data comes from `datasets.load_iris()`.

  Args:
    input_seed: Random seed for the K-fold shuffle and the train/test split.
    split_count: Number of folds used for cross-validation.
    metrics: Output metrics artifact that receives the logged values.
  """
  from sklearn import datasets
  from sklearn import model_selection
  from sklearn.linear_model import LogisticRegression

  # Load the iris dataset as a feature matrix and a target vector.
  iris = datasets.load_iris()
  X = iris.data
  y = iris.target

  # Fraction of the data held out for the final accuracy measurement.
  test_size = 0.20

  model = LogisticRegression()

  # Cross-validated accuracy over `split_count` shuffled folds.
  kfold = model_selection.KFold(n_splits=split_count, random_state=input_seed, shuffle=True)
  cv_scores = model_selection.cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

  # Fit on the training split and score on the held-out split.
  X_train, X_test, y_train, y_test = model_selection.train_test_split(
      X, y, test_size=test_size, random_state=input_seed)
  model.fit(X_train, y_train)
  result = model.score(X_test, y_test)

  # Both metrics are logged as percentages. The cross-validation mean was
  # previously computed and discarded, which left `split_count` without any
  # observable effect on the run's metrics.
  # NOTE(review): `metrics.get()` matches the accessor used by the original
  # code -- confirm it against the installed KFP SDK version.
  metrics.get().log_metric('accuracy', (result*100.0))
  metrics.get().log_metric('cv_accuracy', float(cv_scores.mean()*100.0))


Define a pipeline that uses the `digit_classification` component.

In [None]:
@dsl.pipeline(
    # Pipeline name; used to determine the pipeline Context.
    name='metrics-pipeline-v2',
    # Default pipeline root; it can be overridden when the run is submitted.
    pipeline_root='gs://ml-pipeline-artifacts/v2-artifacts',
)
def pipeline(seed: int, splits: int):
  # Single-step pipeline: forward both pipeline parameters to the component.
  classification_task = digit_classification(split_count=splits, input_seed=seed)


if __name__ == '__main__':
  from aiplatform.pipelines import client
  from kfp.v2 import compiler

  # Compile the pipeline function into a job spec file.
  compiler.Compiler().compile(
      package_path='metrics_pipeline.json',
      pipeline_func=pipeline,
  )

  # Client for submitting runs, configured from the notebook variables.
  api_client = client.Client(
      api_key=API_KEY,
      project_id=PROJECT_ID,
      region=REGION,
  )

  # Submit one run of the compiled spec with concrete parameter values.
  response = api_client.create_run_from_job_spec(
      job_spec_path='metrics_pipeline.json',
      parameter_values={'seed': 8, 'splits': 11},
      pipeline_root=PIPELINE_ROOT,  # Override if needed.
  )

Try changing the `seed` and `splits` values and rerunning the cell above to create multiple pipeline runs.

## Comparing the parameters and metrics of pipeline runs

In this section, you use the Model Builder SDK to compare the parameters and metrics of the pipeline runs you created in the previous section.

### Extract metrics and parameters into a pandas dataframe for run comparison

In [None]:
# Fetch the runs of the "metrics-pipeline-v2" pipeline as a dataframe and
# display it as the cell output.
pipeline_df = aiplatform.get_pipeline_df(pipeline="metrics-pipeline-v2")
pipeline_df

### Parallel coordinates plot of parameters and metrics

With the metrics and parameters in a dataframe, you can perform further analysis to extract useful information. The following example compares data from each run using a parallel coordinates plot.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


plt.rcParams["figure.figsize"] = [15, 5]

# Cast the parameter columns to a numeric dtype for plotting. float64 is used
# instead of float16 because float16 cannot represent integers above 2048
# exactly and overflows to inf above 65504, which would distort larger seeds.
pipeline_df['param.input:seed'] = pipeline_df['param.input:seed'].astype(np.float64)
pipeline_df['param.input:splits'] = pipeline_df['param.input:splits'].astype(np.float64)

# One line per run: parameters and the accuracy metric on parallel axes.
ax = pd.plotting.parallel_coordinates(
    pipeline_df.reset_index(level=0),
    'run_name', cols=['param.input:seed','param.input:splits', 'metric.accuracy'],
    )
# Symmetric log scale keeps values of different magnitudes readable together.
ax.set_yscale('symlog')
# Trailing semicolon suppresses the Legend repr in the cell output.
ax.legend(bbox_to_anchor=(1.0, 0.5));

## Define and run a pipeline that tracks complex metrics

In addition to basic key/value pair metrics, you can also track more complex metrics and use Model Builder SDK to visualize those metrics.

The following example defines a Python function-based component that uses scikit-learn to train a classifier and produce evaluations that can be visualized. This example shows how to visualize a receiver operating characteristic (ROC) curve.


In [None]:
@component(
    # `scikit-learn` is the real distribution name; the `sklearn` alias on
    # PyPI is deprecated and no longer installs.
    packages_to_install=['scikit-learn'],
    base_image='python:3.9',
)
def wine_classification(metrics: Output[ClassificationMetrics]):
    """Trains a random forest on the wine dataset and logs its ROC curve.

    The labels are reduced to a binary problem (label 1 vs. the rest). The ROC
    points are computed from 3-fold cross-validated probabilities on the
    training split.

    Args:
        metrics: Output classification-metrics artifact that receives the ROC
            curve.
    """
    from sklearn.datasets import load_wine
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_curve
    from sklearn.model_selection import train_test_split, cross_val_predict

    X, y = load_wine(return_X_y=True)
    # Binary classification problem for label 1.
    y = y == 1

    # Only the training split is used below; the held-out split is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=42)
    rfc = RandomForestClassifier(n_estimators=10, random_state=42)
    rfc.fit(X_train, y_train)

    # Cross-validated class probabilities; column 1 is the score for the
    # positive (True) label. The former `method='predict'` call was dropped
    # because its result was never used.
    y_scores = cross_val_predict(rfc, X_train, y_train, cv=3, method='predict_proba')
    fpr, tpr, thresholds = roc_curve(y_true=y_train, y_score=y_scores[:,1], pos_label=True)
    # NOTE(review): `metrics.get()` matches the accessor used by the original
    # code -- confirm it against the installed KFP SDK version.
    metrics.get().log_roc_curve(fpr, tpr, thresholds)

In [None]:
@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root='gs://ml-pipeline-artifacts/v2-artifacts',
    # A name for the pipeline. Used to determine the pipeline Context.
    name='metrics-roc-pipeline-v2')
def pipeline():
  # Single-step pipeline that runs the ROC-producing component.
  wine_classification_op = wine_classification()

if __name__ == '__main__':
  from kfp.v2 import compiler
  from aiplatform.pipelines import client

  # Compile the pipeline function into a job spec file.
  compiler.Compiler().compile(pipeline_func=pipeline,
                              package_path='metrics_pipeline.json')

  # Use the shared REGION variable rather than a hard-coded region so this
  # cell stays consistent with the first pipeline cell and the notebook
  # configuration.
  api_client = client.Client(
    project_id=PROJECT_ID,
    region=REGION,
    api_key=API_KEY)

  # This pipeline takes no parameters, hence the empty parameter_values.
  response = api_client.create_run_from_job_spec(
    job_spec_path='metrics_pipeline.json',
    pipeline_root=PIPELINE_ROOT,  # Override if needed.
    parameter_values={})
  
    

### Plot ROC curve and calculate AUC number

In addition to basic metrics, you can extract complex metrics and perform further analysis using the `get_pipeline_df` method.

In [None]:
# Fetch the runs of the "metrics-roc-pipeline-v2" pipeline as a dataframe and
# display it as the cell output.
pipeline_df = aiplatform.get_pipeline_df(pipeline="metrics-roc-pipeline-v2")
pipeline_df

In [None]:
# Take the ROC points of the first run by position. `.iloc[0]` works whatever
# the dataframe's index labels are, whereas `[0]` depends on a label/positional
# fallback.
df = pd.DataFrame(pipeline_df['metric.confidenceMetrics'].iloc[0])

# Area under the ROC curve via the trapezoidal rule (recall is the true
# positive rate, integrated over the false positive rate).
auc = np.trapz(df['recall'],df['falsePositiveRate'])

plt.plot(df['falsePositiveRate'],df['recall'], label="auc="+str(auc))
# Label the figure so it can be read on its own.
plt.xlabel('False positive rate')
plt.ylabel('True positive rate (recall)')
plt.title('ROC curve')
plt.legend(loc=4)
plt.show()