In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fraudfinder - ML Pipeline

<table align="left">
  <td>
    <a href="https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/fraudfinder/raw/main/06_model_training_pipeline.ipynb">
       <img src="https://www.gstatic.com/cloud/images/navigation/vertex-ai.svg" alt="Google Cloud Notebooks">Open in Cloud Notebook
    </a>
  </td> 
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/fraudfinder/blob/main/06_model_training_pipeline.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Open in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/fraudfinder/blob/main/06_model_training_pipeline.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

[Fraudfinder](https://github.com/googlecloudplatform/fraudfinder) is a series of labs on how to build a real-time fraud detection system on Google Cloud. Throughout the Fraudfinder labs, you will learn how to read historical bank transaction data stored in a data warehouse, read from a live stream of new transactions, perform exploratory data analysis (EDA), do feature engineering, ingest features into a feature store, train a model using the feature store, register your model in a model registry, evaluate your model, deploy your model to an endpoint, do real-time inference on your model with the feature store, and monitor your model.

### Objective

This notebook shows how to use Feature Store, Pipelines and Model Monitoring for building an end-to-end demo using both components defined in `google_cloud_pipeline_components` and custom components. 

This lab uses the following Google Cloud services and resources:

- [Vertex AI](https://cloud.google.com/vertex-ai/)
- [BigQuery](https://cloud.google.com/bigquery/)

Steps performed in this notebook:

    * Create a Feature Store for storing and sharing features
    * Create a Pipeline to deploy the model
    * Create a Model Monitoring Job to check the status of the model

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* BigQuery

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Load configuration settings from the setup notebook

Set the constants used in this notebook and load the config settings from the `00_environment_setup.ipynb` notebook.

In [None]:
# Ask gcloud for the active project; `!cmd` captures the command's output lines.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
# The bucket name is derived from the project ID.
BUCKET_NAME = f"{PROJECT_ID}-fraudfinder"
# Fetch the settings file from the bucket's config/ folder.
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
# `.n` is the captured output joined into one newline-separated string.
print(config.n)
# NOTE(review): exec() runs whatever Python the bucket object contains, so only
# use this with a bucket you control. Names used by later cells (e.g. `ID`) are
# presumably defined by this file; confirm against 00_environment_setup.ipynb.
exec(config.n)

### Import libraries and define constants

#### Libraries

In [None]:
import os
import random
from datetime import datetime

In [None]:
# Model and pipeline names. `ID` is not defined in this cell; presumably it
# comes from the configuration executed earlier in the notebook.
# NOTE(review): "fruad" looks like a typo for "fraud". The value is kept as-is
# in case other resources are looked up by this exact name; confirm before
# renaming.
MODEL_DISPLAY_NAME = "xgboost-fruad-finder"
PIPELINE_NAME = f"fraud-finder-xgb-pipeline-{ID}"
PIPELINE_STORE_URI = f"{BUCKET_NAME}/pipeline-store/"

# Container image coordinates in the us-central1 Artifact Registry host.
IMAGE_REPOSITORY = f"fraudfinder-{ID}"
IMAGE_NAME = "dask-xgb-classificator"
IMAGE_TAG = "v1"
IMAGE_URI = (
    f"us-central1-docker.pkg.dev/{PROJECT_ID}"
    f"/{IMAGE_REPOSITORY}/{IMAGE_NAME}:{IMAGE_TAG}"
)

# Exported so that scripts launched from this notebook can read the project
# through the PROJECT_ID environment variable.
os.environ["PROJECT_ID"] = PROJECT_ID

In [None]:
!mkdir ./pipelines/components/

### Define Custom Components

#### Define feature store component

Note that the component assumes that the table containing the entity-timestamp "query" (the read instances) has already been created.

In [None]:
%%writefile deploy_kfp_pipeline/pipeline/components/batch_serve.py


"""
Component for interacting with the feature store
"""

from kfp.v2.dsl import Dataset, Input, Output, Artifact, component
from typing import NamedTuple

# COMPONENTS_DIR=os.path.join(os.curdir, 'pipelines', 'components')
# COMPONENT_URI=f"{COMPONENTS_DIR}/features_to_gcs.yaml"

@component(output_component_file='./pipelines/components/features_to_gcs.yaml',
           base_image='python:3.7',
           packages_to_install=["git+https://github.com/googleapis/python-aiplatform.git@main"])
def features_to_gcs(project_id:str, region:str, bucket_name:str, feature_store_id: str, read_instances_uri:str) -> NamedTuple("Outputs", [("snapshot_uri_paths", str),],):
    """Batch-serve feature values from a feature store to CSV files on Cloud Storage.

    Args:
        project_id: Project that owns the feature store.
        region: Location of the feature store.
        bucket_name: Bucket URI (e.g. ``gs://my-bucket``) that receives the export.
        feature_store_id: ID of the feature store to read from.
        read_instances_uri: URI of the read-instances table passed to the batch
            serving request.

    Returns:
        A named tuple whose ``snapshot_uri_paths`` field is a JSON-encoded list
        of the ``gs://`` URIs of the exported CSV files.

    Raises:
        Exception: Any error raised by the batch serving request is printed and
            re-raised, so the step fails instead of returning an empty list.
    """

    # Libraries --------------------------------------------------------------------------------------------------------------------------
    from datetime import datetime
    import glob
    import json
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` is available.
    import urllib.parse

    #Feature Store
    from google.cloud.aiplatform import Featurestore

    # Variables --------------------------------------------------------------------------------------------------------------------------
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    bucket = urllib.parse.urlsplit(bucket_name).netloc
    export_uri = f'{bucket_name}/data/snapshots/{timestamp}' #format as new gsfuse requires
    # Same export location expressed in the /gcs/ path form so it can be globbed.
    export_uri_path = f'/gcs/{bucket}/data/snapshots/{timestamp}'
    customer_entity = 'customer'
    terminal_entity = 'terminal'
    # '*' requests every feature of the entity type.
    serving_feature_ids = {customer_entity: ['*'], terminal_entity: ['*']}

    # Main -------------------------------------------------------------------------------------------------------------------------------
    feature_store_resource_path = f"projects/{project_id}/locations/{region}/featurestores/{feature_store_id}"
    print("Feature Store: \t", feature_store_resource_path)

    ## Run batch job request
    try:
        ff_feature_store = Featurestore(feature_store_resource_path)
        ff_feature_store.batch_serve_to_gcs(
            gcs_destination_output_uri_prefix = export_uri,
            gcs_destination_type = 'csv',
            serving_feature_ids = serving_feature_ids,
            read_instances_uri = read_instances_uri,
            pass_through_fields = ['tx_fraud','tx_amount']
        )
    except Exception as error:
        # Log and propagate: swallowing the error would let the step finish
        # "successfully" with no snapshot files for the downstream steps.
        print(error)
        raise

    #Store metadata
    snapshot_pattern = f'{export_uri_path}/*.csv'
    snapshot_files = glob.glob(snapshot_pattern)
    snapshot_files_fmt = [p.replace('/gcs/', 'gs://') for p in snapshot_files]
    snapshot_files_string = json.dumps(snapshot_files_fmt)

    component_outputs = NamedTuple("Outputs",
                                [("snapshot_uri_paths", str),],)

    return component_outputs(snapshot_files_string)


#### Define an evaluate custom component

In [None]:
%%writefile deploy_kfp_pipeline/pipeline/components/evaluate_model.py

from kfp.v2.dsl import Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, Metrics, ClassificationMetrics, Condition, component
from typing import NamedTuple


@component(output_component_file='./pipelines/components/evaluate.yaml')
def evaluate_model(
    model_in: Input[Artifact],
    metrics_uri: str,
    meta_metrics: Output[Metrics],
    graph_metrics: Output[ClassificationMetrics],
    model_out: Output[Model],
) -> NamedTuple("Outputs", [("metrics_thr", float),],):
    """Publish stored evaluation metrics and tag the output model artifact.

    Reads the JSON metrics file at ``metrics_uri``, logs the scalar scores to
    ``meta_metrics``, logs the ROC curve and confusion matrix to
    ``graph_metrics``, and writes descriptive metadata onto ``model_out``.

    Args:
        model_in: Input model artifact (not read by this function).
        metrics_uri: ``gs://`` URI of the JSON metrics file.
        meta_metrics: Output artifact receiving the scalar metrics.
        graph_metrics: Output artifact receiving the ROC curve and confusion matrix.
        model_out: Output model artifact receiving the metadata entries.

    Returns:
        A named tuple whose ``metrics_thr`` field is the average precision
        score as a float.
    """
    import json

    # The metrics file is opened through the /gcs/ path form of its gs:// URI.
    local_metrics_path = metrics_uri.replace('gs://', '/gcs/')
    class_names = ['not fraud', 'fraud']

    with open(local_metrics_path, mode='r') as handle:
        report = json.load(handle)

    # Pull every value out first so a missing key fails before anything is logged.
    roc_fpr, roc_tpr, roc_thresholds = report['fpr'], report['tpr'], report['thrs']
    confusion = report['confusion_matrix']
    scalar_names = (
        'avg_precision_score',
        'f1_score',
        'log_loss',
        'precision_score',
        'recall_score',
    )
    scalars = {name: report[name] for name in scalar_names}

    # Scalar metrics, then the two graphical metrics.
    for name, value in scalars.items():
        meta_metrics.log_metric(name, value)
    graph_metrics.log_roc_curve(roc_fpr, roc_tpr, roc_thresholds)
    graph_metrics.log_confusion_matrix(class_names, confusion)

    # Descriptive metadata attached to the output model artifact.
    model_metadata = (
        ("framework", 'xgb.dask'),
        ("type", 'DaskXGBClassifier'),
        ("model function", 'classification'),
        ("modified by", 'inardini'),
    )
    for key, value in model_metadata:
        model_out.metadata[key] = value

    outputs_type = NamedTuple("Outputs", [("metrics_thr", float),],)
    return outputs_type(float(scalars['avg_precision_score']))

### Define the Pipeline

#### Assemble the components into a Vertex AI pipeline

The pipeline exports features with the feature store component, which assumes that the table containing the entity-timestamp "query" (the read instances) has already been created.

In [None]:
%%writefile deploy_kfp_pipeline/pipeline/kfp_pipeline.py

#General
import os
import sys
import random
import json
from datetime import datetime, timedelta

#Vertex Pipelines
from typing import NamedTuple
import kfp
from kfp.v2 import dsl
from kfp.v2.dsl import Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, Metrics, ClassificationMetrics, Condition, component
from kfp.v2 import compiler
import google_cloud_pipeline_components
from google_cloud_pipeline_components import aiplatform as vertex_ai_components
from google.cloud import aiplatform as vertex_ai
from pipeline.components.batch_serve import features_to_gcs
from pipeline.components.evaluate_model import evaluate_model
from google.cloud import storage





TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# PROJECT_ID is read from the environment so that CI/CD (e.g. Cloud Build) can
# target a different project, such as test, eval or prod.
PROJECT_ID = os.getenv("PROJECT_ID", "")
if not PROJECT_ID:
    # Fail early: an empty project would yield the invalid bucket name
    # "-fraudfinder" and an obscure storage error further down.
    raise RuntimeError(
        "The PROJECT_ID environment variable must be set before loading the pipeline definition."
    )
BUCKET_NAME = f"{PROJECT_ID}-fraudfinder"

# Load the shared settings from the bucket. The names ID, REGION and
# FEATURESTORE_ID used below are not defined in this module and are expected
# to come from this file.
# NOTE(review): exec() runs whatever Python the bucket object contains; only
# use this with a bucket you control.
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)
blob = bucket.get_blob('config/notebook_env.py')
if blob is None:
    # get_blob() returns None when the object does not exist.
    raise FileNotFoundError(
        f"gs://{BUCKET_NAME}/config/notebook_env.py was not found; run 00_environment_setup.ipynb first."
    )
config = blob.download_as_string()
exec(config)

# TODO to load it from config file
IMAGE_REPOSITORY = f'fraudfinder-{ID}'
IMAGE_NAME = 'dask-xgb-classificator'
IMAGE_TAG = 'v1'
IMAGE_URI = f"us-central1-docker.pkg.dev/{PROJECT_ID}/{IMAGE_REPOSITORY}/{IMAGE_NAME}:{IMAGE_TAG}"

#Components
BASE_IMAGE = 'python:3.7'
COMPONENTS_DIR = os.path.join(os.curdir, 'pipelines', 'components')
INGEST_FEATURE_STORE = f"{COMPONENTS_DIR}/ingest_feature_store_{TIMESTAMP}.yaml"
EVALUATE = f"{COMPONENTS_DIR}/evaluate_{TIMESTAMP}.yaml"

#Pipeline
PIPELINE_NAME = f'fraud-finder-xgb-pipeline2-{ID}'
PIPELINE_DIR = os.path.join(os.curdir, 'pipelines')
PIPELINE_ROOT = f"gs://{BUCKET_NAME}/pipelines"
PIPELINE_PACKAGE_PATH = f"{PIPELINE_DIR}/pipeline_{TIMESTAMP}.json"

#Feature Store component
# NOTE(review): start and end date are both "yesterday"; confirm this
# single-day window is intended.
START_DATE_TRAIN = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
END_DATE_TRAIN = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
BQ_DATASET = "tx"
READ_INSTANCES_TABLE = "ground_truth"
READ_INSTANCES_URI = f"bq://{PROJECT_ID}.{BQ_DATASET}.{READ_INSTANCES_TABLE}"

#Dataset component
DATASET_NAME = f'fraud_finder_dataset_{END_DATE_TRAIN}'

#Training component
JOB_NAME = f'fraudfinder-train-xgb-{TIMESTAMP}'
MODEL_NAME = f'fraudfinder-xgb-{ID}'
TRAIN_MACHINE_TYPE = 'n2-standard-4'
CONTAINER_URI = 'us-docker.pkg.dev/vertex-ai/training/xgboost-cpu.1-1:latest'
MODEL_SERVING_IMAGE_URI = 'us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-1:latest'
PYTHON_MODULE = 'trainer.train_model'
ARGS = ["--bucket", f"gs://{BUCKET_NAME}"]

#Evaluation component
METRICS_URI = f"gs://{BUCKET_NAME}/deliverables/metrics.json"
AVG_PR_THRESHOLD = 0.8
AVG_PR_CONDITION = 'avg_pr_condition'

#endpoint
ENDPOINT_NAME = 'fraudfinder_xgb_prediction'


@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name=PIPELINE_NAME,
)
def pipeline(project_id:str = PROJECT_ID,
             region:str = REGION,
             bucket_name:str = f"gs://{BUCKET_NAME}",
             feature_store_id:str = FEATURESTORE_ID,
             read_instances_uri:str = READ_INSTANCES_URI,
             replica_count:int = 1,
             machine_type:str = "n1-standard-4",
             train_split:float = 0.8,
             test_split:float = 0.1,
             val_split:float = 0.1,
             metrics_uri: str = METRICS_URI,
             thold: float = AVG_PR_THRESHOLD,
            ):
    """Export features, train, evaluate and conditionally deploy the model.

    Steps: export feature values to Cloud Storage, create a tabular dataset
    from the exported files, run the custom-container training job, publish
    the evaluation metrics, and create an endpoint and deploy the model only
    when the evaluation score reaches ``thold``.

    Args:
        project_id: Project in which the pipeline resources are created.
        region: Location used for the feature store, dataset, training job
            and endpoint.
        bucket_name: ``gs://`` bucket URI used for the feature export, the
            staging bucket and the training output directory.
        feature_store_id: ID of the feature store to export from.
        read_instances_uri: URI of the read-instances table for the export.
        replica_count: Training replica count, also used as the minimum
            replica count of the deployed model.
        machine_type: Machine type for training and for serving.
        train_split: Fraction of the dataset used for training.
        test_split: Fraction of the dataset used for testing.
        val_split: Fraction of the dataset used for validation.
        metrics_uri: ``gs://`` URI of the JSON metrics file read by the
            evaluation step.
        thold: Minimum value of the ``metrics_thr`` output required to deploy.
    """

    #Export data from featurestore
    features_to_gcs_op = features_to_gcs(project_id=project_id, region=region, bucket_name=bucket_name,
                                         feature_store_id=feature_store_id, read_instances_uri=read_instances_uri)

    #create dataset
    # location is passed explicitly so the dataset lives in the same region as
    # the training job instead of the component's default location.
    dataset_create_op = vertex_ai_components.TabularDatasetCreateOp(
        project=project_id,
        location=region,
        display_name=DATASET_NAME,
        gcs_source=features_to_gcs_op.outputs['snapshot_uri_paths']).after(features_to_gcs_op)

    #custom training job component - script
    train_model_op = vertex_ai_components.CustomContainerTrainingJobRunOp(
        display_name=JOB_NAME,
        model_display_name=MODEL_NAME,
        container_uri=IMAGE_URI,
        staging_bucket=bucket_name,
        dataset=dataset_create_op.outputs['dataset'],
        base_output_dir=bucket_name,
        args=ARGS,
        replica_count=replica_count,
        machine_type=machine_type,
        training_fraction_split=train_split,
        validation_fraction_split=val_split,
        test_fraction_split=test_split,
        model_serving_container_image_uri=MODEL_SERVING_IMAGE_URI,
        project=project_id,
        location=region).after(dataset_create_op)

    #evaluate component
    evaluate_model_op = evaluate_model(model_in=train_model_op.outputs["model"],
                                       metrics_uri=metrics_uri).after(train_model_op)

    #deploy only if the score reaches the threshold
    # The gate previously used `<`, which deployed the model only when it
    # scored below the threshold.
    with Condition(evaluate_model_op.outputs['metrics_thr'] >= thold, name=AVG_PR_CONDITION):

        #create endpoint
        # Created in the same region as the trained model so it can be deployed there.
        create_endpoint_op = vertex_ai_components.EndpointCreateOp(
            display_name=ENDPOINT_NAME,
            project=project_id,
            location=region).after(evaluate_model_op)

        #deploy the model
        custom_model_deploy_op = vertex_ai_components.ModelDeployOp(
            model=train_model_op.outputs["model"],
            endpoint=create_endpoint_op.outputs["endpoint"],
            deployed_model_display_name=MODEL_NAME,
            dedicated_resources_machine_type=machine_type,
            dedicated_resources_min_replica_count=replica_count
        ).after(create_endpoint_op)


## Compile your pipeline into a JSON file

Please check the `deploy_kfp_pipeline` directory. We have already added the code for building the pipeline and its components in the `pipeline` directory. Please feel free to change it.

After the workflow of your pipeline is defined, you can proceed to compile the pipeline into a JSON format. The JSON file will include all the information for executing your pipeline on Vertex AI Pipelines.

In [None]:
# Run the compile script; IPython substitutes the notebook variable
# PIPELINE_NAME for $PIPELINE_NAME before the command is executed.
!python deploy_kfp_pipeline/pipeline_compile.py  --pipeline-name=$PIPELINE_NAME

## Submit your pipeline run
Once the workflow of your pipeline is compiled into the JSON format, you can use the Vertex AI Python client to submit and run your pipeline.

In [None]:
# Turn on uniform bucket-level access ("ubla") for the project bucket.
!gsutil ubla set on gs://{BUCKET_NAME}

In [None]:
# Path of the pipeline JSON file: ./pipelines/<PIPELINE_NAME>.json
pipelines_file_location = os.path.join('./pipelines/', f'{PIPELINE_NAME}.json')
# Hand that path to the run script; IPython expands $pipelines_file_location.
!python ./deploy_kfp_pipeline/pipeline_run.py --pipelines-file-location=$pipelines_file_location