In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# FraudFinder - Model Training XGBoost and model formalization 

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/notebook_template.ipynb"">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/notebook_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook shows how to get your training dataset from Vertex AI Feature Store, train your model using Vertex AI managed training pipeline, and deploy it as a Vertex AI endpoint. You will learn how to use your own custom code for ML training on Vertex AI.

### Objective

In the following notebook, you will learn how to:

* build a container to run your own custom code on Vertex AI
* use Vertex AI to train your model at scale
* use Vertex AI to create an endpoint

This tutorial uses the following Google Cloud data analytics and services:

- [BigQuery](https://cloud.google.com/bigquery/)
- [BigQuery ML](https://cloud.google.com/bigquery-ml/)
- [Vertex AI](https://cloud.google.com/vertex-ai/)

### Costs 

This tutorial uses billable components of Google Cloud:

* BigQuery
* BigQuery ML
* Vertex AI

Learn about [BigQuery Pricing](https://cloud.google.com/bigquery/pricing), [BigQuery ML pricing](https://cloud.google.com/bigquery-ml/pricing), [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

### Load configuration settings from the setup notebook

Set the constants used in this notebook and load the config settings from the `00_environment_setup.ipynb` notebook.

In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
BUCKET_NAME = f"{PROJECT_ID}-fraudfinder"
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
print(config.n)
exec(config.n)


BUCKET_NAME          = "ff04-dryrun-fraudfinder"
PROJECT              = "ff04-dryrun"
REGION               = "us-central1"
ID                   = "698au"
FEATURESTORE_ID      = "fraudfinder_698au"
MODEL_NAME           = "fraudfinder_logreg_model"
ENDPOINT_NAME        = "fraudfinder_logreg_endpoint"
TRAINING_DS_SIZE     = "1000"



### Import libraries

In [2]:
#General
import os
import sys
from typing import Union, List
import json
from datetime import datetime, timedelta

#Data Preprocessing
import numpy as np
import pandas as pd

#Model Training with Vertex AI
from google.cloud import bigquery
from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform_v1 import ModelServiceClient
from google.cloud.aiplatform_v1.types import ListModelEvaluationsRequest
from google.protobuf.json_format import MessageToDict
from google.cloud.aiplatform import gapic as aip
from google.cloud import storage

#Model Deployment and Evaluation
from sklearn.metrics import precision_recall_fscore_support
import xgboost as xgb


#Feature Store
from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform import Featurestore, EntityType, Feature

### Define constants

In [3]:
# General
DATA_DIR = os.path.join(os.pardir, "data")
TRAIN_DATA_DIR = os.path.join(DATA_DIR, "train")
DATA_URI = f"gs://{BUCKET_NAME}/data"
TRAIN_DATA_URI = f"{DATA_URI}/train"

#Feature Store
START_DATE_TRAIN = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d") 
CUSTOMER_ENTITY = "customer"
TERMINAL_ENTITY = "terminal"
SERVING_FEATURE_IDS = {CUSTOMER_ENTITY: ["*"], TERMINAL_ENTITY: ["*"]}
READ_INSTANCES_TABLE = f"ground_truth_{ID}"
READ_INSTANCES_URI = f"bq://{PROJECT_ID}.tx.{READ_INSTANCES_TABLE}"

# Training
EXPERIMENT_NAME=f"fraudfinder-xgb-experiment-{ID}"
TARGET = "tx_fraud"

## Custom Training
DATASET_NAME=f"sample_train-{ID}"
TRAIN_JOB_NAME=f"fraudfinder_xgb_train_frmlz-{ID}"
MODEL_NAME=f"fraudfinder_xgb_model_frmlz-{ID}"
DEPLOYED_NAME = f"fraudfinder_xgb_prediction_frmlz-{ID}"
MODEL_SERVING_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-1:latest"
IMAGE_REPOSITORY = f"fraudfinder-{ID}"
IMAGE_NAME="dask-xgb-classificator"
IMAGE_TAG="v1"
IMAGE_URI=f"us-central1-docker.pkg.dev/{PROJECT_ID}/{IMAGE_REPOSITORY}/{IMAGE_NAME}:{IMAGE_TAG}"
TRAIN_COMPUTE="e2-standard-4"
DEPLOY_COMPUTE="n1-standard-4"

### Initialize Vertex AI SDK and BigQuery Client for Python
Next you will initialize the Vertex AI SDK and BigQuery Client for Python for your project and corresponding bucket. 

In [4]:
bq_client = bigquery.Client(project=PROJECT_ID, location=REGION)

In [5]:
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME, experiment=EXPERIMENT_NAME)

### Helper Functions
You will now run some helper functions that we will use throughout the notebook.

In [6]:
def create_gcs_dataset(client,
                       display_name: str, 
                       gcs_source: Union[str, List[str]]):
    
    dataset = client.TabularDataset.create(
        display_name=display_name, gcs_source=gcs_source,
    )

    dataset.wait()
    return dataset
    
def get_evaluation_metrics(client, model_resource_name):
    model_evalution_request = ListModelEvaluationsRequest(parent=model_resource_name)
    model_evaluation_list = client.list_model_evaluations(request=model_evalution_request)
    metrics_strlist = []
    for evaluation in model_evaluation_list:
        metrics = MessageToDict(evaluation._pb.metrics)
    return metrics

def gcs_list(gcs_uri):
    obj_list=[]
    storage_client = storage.Client()
    bucket, key = gcs_uri.replace("gs://", "").split("/", 1)
    for blob in storage_client.list_blobs(bucket, prefix=key):
        obj_list.append("gs://"+bucket+"/"+str(blob.name))
    return obj_list

We're also using the BigQuery helper function. 

In [7]:
# Wrapper to use BigQuery client to run query/job, return job ID or result as DF
def run_bq_query(sql: str) -> Union[str, pd.DataFrame]:
    """
    Run a BigQuery query and return the job ID or result as a DataFrame
    Args:
        sql: SQL query, as a string, to execute in BigQuery
    Returns:
        df: DataFrame of results from query,  or error, if any
    """

    bq_client = bigquery.Client()

    # Try dry run before executing query to catch any errors
    job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    bq_client.query(sql, job_config=job_config)

    # If dry run succeeds without errors, proceed to run query
    job_config = bigquery.QueryJobConfig()
    client_result = bq_client.query(sql, job_config=job_config)

    job_id = client_result.job_id

    # Wait for query/job to finish running. then get & return data frame
    df = client_result.result().to_arrow().to_pandas()
    print(f"Finished job_id: {job_id}")
    return df

## Fetching feature values for model training

To fetch training data, we have to specify the following inputs to batch serving:

- a file containing a "query", with the entities and timestamps for each label
- a list of feature values to fetch
- the destination location and format


### Read-instance list

In our case, we need a csv file with content formatted like the table below:

|event|customer                     |terminal|timestamp                                    |
|-----|-----------------------------|--------|---------------------------------------------|
|xxx57538|xxx3859                         |xxx8811    |2021-07-07 00:01:10 UTC                      |
|xxx57539|xxx4165                         |xxx8810    |2021-07-07 00:01:55 UTC                      |
|xxx57540|xxx2289                         |xxx2081    |2021-07-07 00:02:12 UTC                      |
|xxx57541|xxx3227                         |xxx3011    |2021-07-07 00:03:23 UTC                      |
|xxx57542|xxx2819                         |xxx6263    |2021-07-07 00:05:30 UTC                      |

where the column names are the name of entities in Feature Store and the timestamps represents the time an event occured.

In [8]:
sql_query = f"""
CREATE OR REPLACE TABLE {PROJECT_ID}.tx.{READ_INSTANCES_TABLE} as (
    SELECT
        raw_tx.TX_TS AS timestamp,
        raw_tx.CUSTOMER_ID AS customer,
        raw_tx.TERMINAL_ID AS terminal,
        raw_tx.TX_AMOUNT AS tx_amount,
        raw_lb.TX_FRAUD AS tx_fraud,
    FROM 
        tx.tx as raw_tx
    LEFT JOIN 
        tx.txlabels as raw_lb
    ON raw_tx.TX_ID = raw_lb.TX_ID
    WHERE
        DATE(raw_tx.TX_TS) = "{START_DATE_TRAIN}"
    LIMIT {TRAINING_DS_SIZE}
);
"""

print(sql_query)

run_bq_query(sql_query)


CREATE OR REPLACE TABLE ff04-dryrun.tx.ground_truth_698au as (
    SELECT
        raw_tx.TX_TS AS timestamp,
        raw_tx.CUSTOMER_ID AS customer,
        raw_tx.TERMINAL_ID AS terminal,
        raw_tx.TX_AMOUNT AS tx_amount,
        raw_lb.TX_FRAUD AS tx_fraud,
    FROM 
        tx.tx as raw_tx
    LEFT JOIN 
        tx.txlabels as raw_lb
    ON raw_tx.TX_ID = raw_lb.TX_ID
    WHERE
        DATE(raw_tx.TX_TS) = "2023-01-24"
    LIMIT 1000
);

Finished job_id: 847ff94c-9315-4224-b99a-28a43069fad9


### Get Feature Store ID
Initiate the feature store you created in the `02_feature_engineering_batch.ipynb` notebook.

In [9]:
try:
    ff_feature_store = Featurestore(FEATURESTORE_ID)
except NameError:
    print(f"""The feature store {FEATURESTORE_ID} does not exist!""") 

### Fetch a sample of data and dump it into a bucket 
In this section, we will use the Batch Serving of the Vetex AI Feature Store to prepare a dataset for training. You will do this by calling the BatchReadFeatureValues API. Batch Serving is used to fetch large feature values with high throughput, typically for training a model or batch prediction.


In [10]:
!gsutil uniformbucketlevelaccess set on gs://{BUCKET_NAME}

Enabling Uniform bucket-level access for gs://ff04-dryrun-fraudfinder...


In [11]:
ff_feature_store.batch_serve_to_gcs(
    gcs_destination_output_uri_prefix = TRAIN_DATA_URI,
    gcs_destination_type = "csv",
    serving_feature_ids = SERVING_FEATURE_IDS, 
    read_instances_uri = READ_INSTANCES_URI,
    pass_through_fields=["tx_amount", "tx_fraud"],  
)

Serving Featurestore feature values: projects/1003399910665/locations/us-central1/featurestores/fraudfinder_698au
Serve Featurestore feature values backing LRO: projects/1003399910665/locations/us-central1/featurestores/fraudfinder_698au/operations/5462959600330342400
Featurestore feature values served. Resource name: projects/1003399910665/locations/us-central1/featurestores/fraudfinder_698au


<google.cloud.aiplatform.featurestore.featurestore.Featurestore object at 0x7fce263f5910> 
resource name: projects/1003399910665/locations/us-central1/featurestores/fraudfinder_698au

In [12]:
!gsutil uniformbucketlevelaccess set off gs://{BUCKET_NAME}

Disabling Uniform bucket-level access for gs://ff04-dryrun-fraudfinder...


Now you will create a copy of the training data in your local notebook instance. You will need it later for testing our model.

In [13]:
!gsutil ls $TRAIN_DATA_URI
!sudo gsutil cp -r $TRAIN_DATA_URI $TRAIN_DATA_DIR

gs://ff04-dryrun-fraudfinder/data/train/000000000000.csv
Copying gs://ff04-dryrun-fraudfinder/data/train/000000000000.csv...
/ [1 files][169.9 KiB/169.9 KiB]                                                
Operation completed over 1 objects/169.9 KiB.                                    


Exporting the features into cloud storage will generate a csv file. Let's list the file:

In [14]:
obj_list = gcs_list(TRAIN_DATA_URI)
print(obj_list)

['gs://ff04-dryrun-fraudfinder/data/train/000000000000.csv']




## Builing a custom fraud detection model

### Fixing an imbalanced dataset
In the real world, you will need to deal with an imbalance in the dataset. For example, you might randomly delete some of the non-fraudulent transactions to approximately match the number of fraudulent transactions. This technique is called undersampling.

For this workshop, we will skip the data balance process because our sample data is small, and the further reduction will compromise the quality of our results.


#### Builing a Vertex AI dataset
In this section, you will create a managed [Vertex AI dataset](https://cloud.google.com/vertex-ai/docs/training/using-managed-datasets). Vertex AI datasets can be used to train AutoML models or custom-trained models.  

In [15]:
dataset = create_gcs_dataset(client=vertex_ai, display_name=DATASET_NAME, gcs_source=obj_list[0]) #obj_list

print("Dataset:", f"{dataset.display_name}")
print("Name: \t", f"{dataset.resource_name}")

Creating TabularDataset
Create TabularDataset backing LRO: projects/1003399910665/locations/us-central1/datasets/5573162757429133312/operations/2249641266201493504
TabularDataset created. Resource name: projects/1003399910665/locations/us-central1/datasets/5573162757429133312
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/1003399910665/locations/us-central1/datasets/5573162757429133312')
Dataset: sample_train-698au
Name: 	 projects/1003399910665/locations/us-central1/datasets/5573162757429133312


### Train a custom model
In this section, you will use the xgboost algorithm. Specifically, you will do custom training using a xgboost container.

#### Create the training application
To perform custom training, you can use either a pre-built container or buy your container. In this section, we will build a container for xgboost and use it to train a model with the Vertex AI Managed Training service.


The first step is to write your training code. After, you have to write a Dockerfile and build a container image based on it. The following cell writes our code into `train_gb.py`, the module for training an XGBClassifier. We will copy this code into our container to run through the Vertex Training service.

In [16]:
!mkdir -p -m 777 build_training

In [16]:
%%writefile build_training/train_xgb.py

"""
train_gb.py is the module for training a XGBClassifier pipeline
"""

# Libraries --------------------------------------------------------------------------------------------------------------------------

import argparse
import numpy as np
import os
import json
import logging
from pathlib import Path
import dask.dataframe as dask_df
from dask.distributed import LocalCluster, Client
import xgboost as xgb
from sklearn.metrics import roc_curve, confusion_matrix, average_precision_score, f1_score, log_loss, precision_score, recall_score

# Variables --------------------------------------------------------------------------------------------------------------------------
## Read environmental variables
TRAINING_DATA_PATH = os.environ["AIP_TRAINING_DATA_URI"].replace("gs://", "/gcs/")
TEST_DATA_PATH = os.environ["AIP_TEST_DATA_URI"].replace("gs://", "/gcs/")
MODEL_DIR = os.environ["AIP_MODEL_DIR"].replace("gs://", "/gcs/")
MODEL_PATH = MODEL_DIR + "model.bst"


## Training variables
LABEL_COLUMN = "tx_fraud"
UNUSED_COLUMNS = ["timestamp","entity_type_customer","entity_type_terminal"]
DATA_SCHEMA = {
"timestamp" : "object",
"tx_amount": "float64",
"tx_fraud": "Int64",
"entity_type_customer": "Int64",
"customer_id_nb_tx_1day_window": "Int64",
"customer_id_nb_tx_7day_window": "Int64",
"customer_id_nb_tx_14day_window": "Int64",
"customer_id_avg_amount_1day_window": "float64",
"customer_id_avg_amount_7day_window": "float64",
"customer_id_avg_amount_14day_window": "float64",
"customer_id_nb_tx_15min_window": "Int64",
"customer_id_avg_amount_15min_window": "float64",
"customer_id_nb_tx_30min_window": "Int64",
"customer_id_avg_amount_30min_window": "float64",
"customer_id_nb_tx_60min_window": "Int64",
"customer_id_avg_amount_60min_window": "float64",
"entity_type_terminal": "Int64",
"terminal_id_nb_tx_1day_window": "Int64",
"terminal_id_nb_tx_7day_window": "Int64",
"terminal_id_nb_tx_14day_window": "Int64",
"terminal_id_risk_1day_window": "float64",
"terminal_id_risk_7day_window": "float64",
"terminal_id_risk_14day_window": "float64",
"terminal_id_nb_tx_15min_window": "Int64",
"terminal_id_avg_amount_15min_window": "float64",
"terminal_id_nb_tx_30min_window": "Int64",
"terminal_id_avg_amount_30min_window": "float64",
"terminal_id_nb_tx_60min_window": "Int64",
"terminal_id_avg_amount_60min_window": "float64"
}

# Helpers -----------------------------------------------------------------------------------------------------------------------------
def get_args():
    parser = argparse.ArgumentParser()

    # Data files arguments
    parser.add_argument("--bucket", dest="bucket", type=str,
                        required=True, help="Bucket uri")
    parser.add_argument("--max_depth", dest="max_depth",
                        default=6, type=int,
                        help="max_depth value.")
    parser.add_argument("--eta", dest="eta",
                        default=0.4, type=float,
                        help="eta.")
    parser.add_argument("--gamma", dest="gamma",
                        default=0.0, type=float,
                        help="eta value")
    parser.add_argument("-v", "--verbose", 
                        help="increase output verbosity", 
                        action="store_true")
    
    return parser.parse_args()

def set_logging():
    #TODO
    pass

def resample(df, replace, frac=1, random_state = 8):
    shuffled_df = df.sample(frac=frac, replace=replace, random_state=random_state)
    return shuffled_df

def preprocess(df):
    
    df = df.drop(columns=UNUSED_COLUMNS)

    # Drop rows with NaN"s
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point
    numeric_columns = df.select_dtypes(["float32", "float64"]).columns
    numeric_format = {col:"float32" for col in numeric_columns}
    df.astype(numeric_format)

    return df

def evaluate_model(model, x_true, y_true):
    
    y_true = y_true.compute()
    
    #calculate metrics
    metrics={}
    
    y_score =  model.predict_proba(x_true)[:, 1]
    y_score = y_score.compute()
    fpr, tpr, thr = roc_curve(
         y_true=y_true, y_score=y_score, pos_label=True
    )
    fpr_list = fpr.tolist()[::1000]
    tpr_list = tpr.tolist()[::1000]
    thr_list = thr.tolist()[::1000]

    y_pred = model.predict(x_true)
    y_pred.compute()
    c_matrix = confusion_matrix(y_true, y_pred)
    
    avg_precision_score = round(average_precision_score(y_true, y_score), 3)
    f1 = round(f1_score(y_true, y_pred), 3)
    lg_loss = round(log_loss(y_true, y_pred), 3)
    prec_score = round(precision_score(y_true, y_pred), 3)
    rec_score = round(recall_score(y_true, y_pred), 3)
    
    
    metrics["fpr"] = [round(f, 3) for f in fpr_list]
    metrics["tpr"] = [round(f, 3) for f in tpr_list]
    metrics["thrs"] = [round(f, 3) for f in thr_list]
    metrics["confusion_matrix"] = c_matrix.tolist()
    metrics["avg_precision_score"] = avg_precision_score
    metrics["f1_score"] = f1
    metrics["log_loss"] = lg_loss
    metrics["precision_score"] = prec_score
    metrics["recall_score"] = rec_score
    
    return metrics


def main():
    args = get_args()
    if args.verbose:
        set_logging()
        
    #variables
    bucket = args.bucket.replace("gs://", "/gcs/")
    deliverable_uri = (Path(bucket)/"deliverables")
    metrics_uri = (deliverable_uri/"metrics.json")

    #read data
    train_df = dask_df.read_csv(TRAINING_DATA_PATH, dtype=DATA_SCHEMA)
    test_df = dask_df.read_csv(TEST_DATA_PATH, dtype=DATA_SCHEMA)
    
    #preprocessing
    preprocessed_train_df = preprocess(train_df)
    preprocessed_test_df = preprocess(test_df)
    
    #downsampling
    train_nfraud_df = preprocessed_train_df[preprocessed_train_df[LABEL_COLUMN]==0]
    train_fraud_df = preprocessed_train_df[preprocessed_train_df[LABEL_COLUMN]==1]
    train_nfraud_downsample = resample(train_nfraud_df,
                          replace=True, 
                          frac=len(train_fraud_df)/len(train_df))
    ds_preprocessed_train_df = dask_df.multi.concat([train_nfraud_downsample, train_fraud_df])
    
    #target, features split
    x_train = ds_preprocessed_train_df[ds_preprocessed_train_df.columns.difference([LABEL_COLUMN])]
    y_train = ds_preprocessed_train_df.loc[:, LABEL_COLUMN].astype(int)
    x_true = preprocessed_test_df[preprocessed_test_df.columns.difference([LABEL_COLUMN])]
    y_true = preprocessed_test_df.loc[:, LABEL_COLUMN].astype(int)
    
    #train model
    cluster =  LocalCluster()
    client = Client(cluster)
    model = xgb.dask.DaskXGBClassifier(objective="reg:logistic", eval_metric="logloss")
    model.client = client  # assign the client
    model.fit(x_train, y_train, eval_set=[(x_true, y_true)])
    if not Path(MODEL_DIR).exists():
        Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)
    model.save_model(MODEL_PATH)
    
    #generate metrics
    metrics = evaluate_model(model, x_true, y_true)
    if not Path(deliverable_uri).exists():
        Path(deliverable_uri).mkdir(parents=True, exist_ok=True)
    with open(metrics_uri, "w") as file:
        json.dump(metrics, file, sort_keys = True, indent = 4)
    file.close()
    
if __name__ == "__main__":
    main()

Overwriting build_training/train_xgb.py


#### Define a custom image for dask model training

Now you will build a custom container. By running your machine learning (ML) training job in a custom container, you can use any ML framework, non-ML dependencies, libraries, and binaries that are not otherwise supported on Vertex AI. Finally, you will package your training code into a Docker container image, push the container image to Container Registry, and create a custom job on Vertex AI.

For the ML framework, we will use xgboost. You will also use dask and scikit libraries. Dask is an open-source library for parallel computing written in Python. You will use Dask to speed up pre-processing of our dataset.

Let's first check if the repository already exists. If it already exists, then you can skip the cell that creates the repository.

In [None]:
# Check if the repository already exists
!gcloud artifacts repositories describe $IMAGE_REPOSITORY --location=us-central1

In [18]:
# Create image repositorie
!gcloud artifacts repositories create $IMAGE_REPOSITORY \
    --repository-format=docker \
    --location=us-central1 \
    --description="Fraud Finder Docker Image repository"

# List repositories under the project
!gcloud artifacts repositories list

Create request issued for: [fraudfinder-698au]
Waiting for operation [projects/ff04-dryrun/locations/us-central1/operations/f9
1d24f4-2c41-43ca-ba20-5c94c74cbebb] to complete...done.                        
Created repository [fraudfinder-698au].
Listing items under project ff04-dryrun, across all locations.

                                                                                ARTIFACT_REGISTRY
REPOSITORY         FORMAT  MODE                 DESCRIPTION                           LOCATION     LABELS  ENCRYPTION          CREATE_TIME          UPDATE_TIME          SIZE (MB)
fraudfinder-698au  DOCKER  STANDARD_REPOSITORY  Fraud Finder Docker Image repository  us-central1          Google-managed key  2023-01-25T06:48:15  2023-01-25T06:48:15  0


In [19]:
!gcloud auth configure-docker us-central1-docker.pkg.dev -q


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


Next you need to write your docker file in order to create your container. 

In [20]:
%%writefile build_training/Dockerfile
# Specifies base image and tag
FROM python:3.7
WORKDIR /root

# Installs additional packages
RUN pip install gcsfs numpy pandas scikit-learn dask distributed xgboost --upgrade

# Copies the trainer code to the docker image.
COPY ./train_xgb.py /root/train_xgb.py

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python3", "train_xgb.py"]

Overwriting build_training/Dockerfile


Next you will have to build the docker container. 

In [21]:
# Build and push docker file
!docker build -t $IMAGE_URI ./build_training/
!docker push $IMAGE_URI

Sending build context to Docker daemon  10.24kB
Step 1/5 : FROM python:3.7
3.7: Pulling from library/python

[1Bf03cda1f: Pulling fs layer 
[1Bf75f014e: Pulling fs layer 
[1B1d0e6b05: Pulling fs layer 
[1B50679dbd: Pulling fs layer 
[1B2ee9da04: Pulling fs layer 
[1B27d5e312: Pulling fs layer 
[1Bc5733757: Waiting fs layer 
[1Bd6676584: Pulling fs layer 
[1Bc4ff9b48: Pull complete 886MB/2.886MBB[8A[2K[7A[2K[9A[2K[9A[2K[6A[2K[9A[2K[5A[2K[9A[2K[5A[2K[9A[2K[5A[2K[4A[2K[6A[2K[5A[2K[5A[2K[3A[2K[5A[2K[3A[2K[3A[2K[9A[2K[5A[2K[1A[2K[1A[2K[9A[2K[5A[2K[5A[2K[9A[2K[5A[2K[9A[2K[5A[2K[9A[2K[5A[2K[9A[2K[5A[2K[9A[2K[5A[2K[9A[2K[9A[2K[9A[2K[9A[2K[9A[2K[9A[2K[8A[2K[8A[2K[8A[2K[8A[2K[7A[2K[7A[2K[7A[2K[7A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[5A[2K[5A[2K[5A[2K[5A[2K

#### Start a custom training job on Vertex AI
In this section, you will create a training pipeline. This will create custom training jobs, load our dataset and upload the model to Vertex AI after the training job is successfully completed. Learn more about creating of custom jobs [here](https://cloud.google.com/vertex-ai/docs/training/create-custom-job).

In [22]:
job = vertex_ai.CustomContainerTrainingJob(
    display_name=TRAIN_JOB_NAME,
    container_uri=IMAGE_URI,
    model_serving_container_image_uri=MODEL_SERVING_IMAGE_URI,
)

parameters = {"MAX_DEPTH": 4, "ETA": 0.3, "GAMMA": 0.1}

CMDARGS = [ f"""--bucket={BUCKET_NAME}""",
    "--max_depth=" + str(parameters["MAX_DEPTH"]),
    "--eta=" + str(parameters["ETA"]),
    "--gamma=" + str(parameters["GAMMA"]),
    "--verbose"
]

model = job.run(
    dataset=dataset,
    model_display_name=MODEL_NAME,
    args=CMDARGS,
    replica_count=1,
    machine_type=TRAIN_COMPUTE,
    accelerator_count=0)

Training Output directory:
gs://ff04-dryrun-fraudfinder/aiplatform-custom-training-2023-01-25-06:51:32.715 
No dataset split provided. The service will use a default split.
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6647510000019177472?project=1003399910665
CustomContainerTrainingJob projects/1003399910665/locations/us-central1/trainingPipelines/6647510000019177472 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/1003399910665/locations/us-central1/trainingPipelines/6647510000019177472 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/1003399910665/locations/us-central1/trainingPipelines/6647510000019177472 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/1003399910665/locations/us-central1/trainingPipelines/6647510000019177472 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud

#### Evaluate the model locally

Before you can run the model via an endpoint, you need to pre-process the data to match the format that your custom model expects.

In [23]:
LABEL_COLUMN = "tx_fraud"
UNUSED_COLUMNS = ["timestamp","entity_type_customer","entity_type_terminal"]
NA_VALUES = ["NA", "."]

def preprocess(df):
    """Converts categorical features to numeric. Removes unused columns.

    Args:
      df: Pandas df with raw data

    Returns:
      df with preprocessed data
    """
    df = df.drop(columns=UNUSED_COLUMNS)

    # Drop rows with NaN's
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    df[numeric_columns] = df[numeric_columns].astype("float32")

    dummy_columns = list(df.dtypes[df.dtypes == "category"].index)
    df = pd.get_dummies(df, columns=dummy_columns)

    return df

#test set
train_sample_path = os.path.join(TRAIN_DATA_DIR, "000000000000.csv")
df_test = pd.read_csv(train_sample_path)
preprocessed_test_Data = preprocess(df_test)

x_test = preprocessed_test_Data[preprocessed_test_Data.columns.drop(LABEL_COLUMN).to_list()].values
y_test = preprocessed_test_Data.loc[:,LABEL_COLUMN].astype(int)

Next you will copy the model artifact to the local directory to evaluate the model localy before deploying the model:

In [24]:
!gsutil cp -r $model.uri .

Copying gs://ff04-dryrun-fraudfinder/aiplatform-custom-training-2023-01-25-06:51:32.715/model/model.bst...
/ [1 files][ 30.0 KiB/ 30.0 KiB]                                                
Operation completed over 1 objects/30.0 KiB.                                     


Now it's time to test the model.

In [25]:
bst = xgb.Booster()  
bst.load_model("./model/model.bst") 
xgtest = xgb.DMatrix(x_test)
y_pred_prob = bst.predict(xgtest)
y_pred = y_pred_prob.round().astype(int)
y_pred_prob[0:10]
precision_recall_fscore_support(y_test.values, y_pred, average="weighted")

  _warn_prf(average, modifier, msg_start, len(result))


(0.956484, 0.978, 0.9671223458038423, None)

#### Deploy the model
Before you use your model to make predictions, you need to deploy it to an Endpoint. You can do this by calling the deploy function on the Model resource. This will do two things:

- create an Endpoint resource
- deploy the Model resource to the Endpoint resource


In [29]:
DEPLOY_COMPUTE="n1-standard-4"
TRAFFIC_SPLIT = {"0": 100}

MIN_NODES = 1
MAX_NODES = 1


endpoint = model.deploy(
    deployed_model_display_name=DEPLOYED_NAME,
    traffic_split=TRAFFIC_SPLIT,
    machine_type=DEPLOY_COMPUTE,
    accelerator_count=0,
    min_replica_count=MIN_NODES,
    max_replica_count=MAX_NODES,
)

Creating Endpoint
Create Endpoint backing LRO: projects/1003399910665/locations/us-central1/endpoints/5600564786216566784/operations/8713432631384997888
Endpoint created. Resource name: projects/1003399910665/locations/us-central1/endpoints/5600564786216566784
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/1003399910665/locations/us-central1/endpoints/5600564786216566784')
Deploying model to Endpoint : projects/1003399910665/locations/us-central1/endpoints/5600564786216566784
Deploy Endpoint model backing LRO: projects/1003399910665/locations/us-central1/endpoints/5600564786216566784/operations/1473896230386925568
Endpoint model deployed. Resource name: projects/1003399910665/locations/us-central1/endpoints/5600564786216566784


#### Test the deployed model (Make an online prediction request)
Send an online prediction request to your deployed model. To make sure your deployed model is working, test it out by sending a request to the endpoint.

Let's first get a test data.

In [30]:
payload = {
  "instances": x_test[:2].tolist()
}

# In case you want to test it in the console
import json
with open("predictions.json", "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=4)

In [31]:
endpoint.predict(instances = payload["instances"])

Prediction(predictions=[0.0277096051722765, 0.0277096051722765], deployed_model_id='7220567626395680768', model_version_id='1', model_resource_name='projects/1003399910665/locations/us-central1/models/6986228713167781888', explanations=None)

## (DO NOT RUN) Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:



In [None]:
# Delete endpoint resource
#! gcloud ai endpoints delete $ENDPOINT_NAME --quiet --region $REGION_NAME

# Delete model resource
#! gcloud ai models delete $MODEL_NAME --quiet

# Delete Cloud Storage objects that were created
#! gsutil -m rm -r $JOB_DIR

Now that we understand we packaged our XGBoost model and started a custom training job on Vertex AI we can take the ML workflow and formalize it into a Vertex AI Pipeline.

You can continue with the next Notebook: `06_formalization.ipynb`.