In [1]:
# pip install kfp
# !pip install watermark

In [2]:
# import os

# # Set the environment variable for Google Cloud credentials
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""

In [3]:
# IMPORT REQUIRED LIBRARIES
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        Markdown,
                        HTML,
                        component, 
                        OutputPath, 
                        InputPath)
from kfp import compiler
from google.cloud.aiplatform import pipeline_jobs

import kfp
import google.cloud.aiplatform 
# Record the SDK versions this notebook was run with (one line per library).
print(
    f"kfp version: {kfp.__version__}",
    f"google.cloud.aiplatform version: {google.cloud.aiplatform.__version__}",
    sep="\n",
)

kfp version: 2.9.0
google.cloud.aiplatform version: 1.71.1


In [4]:
# Container image passed as `base_image` to the components that import the
# project's `src` package. It is referenced by sha256 digest, not by a tag.
BASE_IMAGE = (
    "eu.gcr.io/norse-voice-440615-v3/practice_gcp"
    "@sha256:506a11f1f5ded3bc2c4fb97f560e7aedf91a89dcc53f4b730c9ca4b8de6a777d"
)

# NOTE(review): the truncated warning captured under this cell points at this
# decorator; kfp 2.x appears to deprecate `output_component_file` in favour of
# compiling the component with `compiler.Compiler().compile(...)` -- confirm.
@component(
    output_component_file="data_cleaning.yaml",
    base_image=BASE_IMAGE,
)
def data_cleaning(output_dir: Output[Dataset]):
    """Run the project's cleaning routine and store its result as a CSV dataset.

    Args:
        output_dir: Output dataset artifact. The object returned by
            ``src.preprocess.data_cleaning()`` is written to its ``path`` with
            ``to_csv(..., index=False)``.
    """
    # Aliased so the helper does not shadow this component's own name in the body.
    from src.preprocess import data_cleaning as run_data_cleaning

    # presumably a pandas DataFrame (it exposes `to_csv`) -- verify in src.preprocess
    cleaned = run_data_cleaning()
    print("Data is cleaned")

    cleaned.to_csv(output_dir.path, index=False)

  @component(
  def data_cleaning(


In [5]:
# NOTE(review): kfp 2.x appears to deprecate `output_component_file` -- confirm.
@component(
    output_component_file="data_processing.yaml",
    base_image=BASE_IMAGE,
)
def data_processing(
    dataset_full: Input[Dataset],
    dataset_processed: Output[Dataset],
):
    """Load the upstream CSV, apply the project preprocessing, and write a CSV.

    Args:
        dataset_full: Input dataset artifact; its ``path`` is read with
            ``pandas.read_csv``.
        dataset_processed: Output dataset artifact; the result of
            ``src.preprocess.data_preprocessing`` is written to its ``path``
            with ``to_csv(..., index=False)``.
    """
    import pandas as pd
    from src.preprocess import data_preprocessing

    # Read and transform in one step; the message below is the original
    # author's description of what `data_preprocessing` does.
    processed = data_preprocessing(pd.read_csv(dataset_full.path))
    print("Performed Feature Engineering and Label Encoding")

    processed.to_csv(dataset_processed.path, index=False)

  @component(
  def data_processing(


In [6]:
# NOTE(review): kfp 2.x appears to deprecate `output_component_file` -- confirm.
@component(
    output_component_file="train_model.yaml",
    base_image=BASE_IMAGE,
)
def train_model(
    dataset_processed: Input[Dataset],
    model: Output[Model],
):
    """Train on the processed CSV and persist the evaluation scores next to the model.

    Args:
        dataset_processed: Input dataset artifact; its ``path`` is read with
            ``pandas.read_csv``.
        model: Output model artifact. ``model.path`` is handed to
            ``src.train.train_and_evaluate``; the scores are written to
            ``model.path + "_scores.json"``.
    """
    import json

    import pandas as pd
    from src.train import train_and_evaluate

    # Sidecar file that sits beside the model artifact (not inside it).
    scores_path = model.path + "_scores.json"

    training_frame = pd.read_csv(dataset_processed.path)

    # `model.path` is passed so the helper can save the trained model there
    # (per the original author's note -- verify in src.train). Only the
    # 'scores' entry of the returned mapping is used here.
    scores = train_and_evaluate(training_frame, model.path)['scores']

    # Persist the evaluation scores as JSON.
    with open(scores_path, 'w') as scores_file:
        json.dump(scores, scores_file)

    # Echo both locations so they can be found in the step logs.
    print(f"Model artifact saved at: {model.path}")
    print(f"Scores saved at: {model.path}_scores.json")

  @component(
  def train_model(


In [7]:
# NOTE(review): kfp 2.x appears to deprecate `output_component_file` -- confirm.
@component(
    output_component_file="compute_metrics.yaml",
    base_image="python:3.9",
)
def compute_metrics(
    model: Input[Model],
    train_metric: Output[Metrics],
    test_metric: Output[Metrics],
    cv_metric: Output[Metrics],
):
    """Load the scores JSON stored beside the model and log one metric per artifact.

    Args:
        model: Input model artifact; scores are read from
            ``model.path + "_scores.json"``.
        train_metric: Receives ``train_auc`` from the ``'train'`` entry.
        test_metric: Receives ``test_auc`` from the ``'test'`` entry.
        cv_metric: Receives ``cv_auc`` from the
            ``'cross_validation (5 folds)'`` entry.
    """
    import json

    # Same sidecar location the training step writes to.
    with open(model.path + "_scores.json", 'r') as scores_file:
        scores = json.load(scores_file)

    # (metrics artifact, logged metric name, key in the scores JSON)
    metric_plan = (
        (train_metric, 'train_auc', 'train'),
        (test_metric, 'test_auc', 'test'),
        (cv_metric, 'cv_auc', 'cross_validation (5 folds)'),
    )
    for artifact, metric_name, score_key in metric_plan:
        artifact.log_metric(metric_name, scores[score_key])


  @component(
  def compute_metrics(


In [8]:
import os

# Point Google client libraries at a service-account key file.
# `setdefault` keeps any GOOGLE_APPLICATION_CREDENTIALS already exported in the
# environment (CI, another machine, a different key) instead of overwriting it;
# the author's local key file is only used as a fallback.
# NOTE(review): this fallback is an absolute path on one workstation -- prefer
# exporting the variable outside the notebook and dropping the literal.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/Hezron Ling/Desktop/Practice_GCP/norse-voice-440615-v3-e1e314d6a20e.json",
)

In [9]:
# Run configuration. The timestamp makes the display name unique per execution.
import datetime as dt

# Target Google Cloud project and region.
PROJECT_ID = "norse-voice-440615-v3"
REGION = "asia-southeast2"

# Cloud Storage location used as the pipeline root.
BUCKET_NAME = 'norse-voice-440615-v3_cloudbuild'
PIPELINE_ROOT = "gs://" + BUCKET_NAME + "/pipeline_root/"

# Local wall-clock time formatted as YYYYMMDDHHMMSS.
TIMESTAMP = f"{dt.datetime.now():%Y%m%d%H%M%S}"
DISPLAY_NAME = f"pipeline-practice_gcp-{TIMESTAMP}"

# Define the pipeline. Each step consumes the output artifact of the step before it.
@dsl.pipeline(
    # A name for the pipeline. Use to determine the pipeline Context.
    name="pipeline-practiceGCP",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    project: str = PROJECT_ID,
    region: str = REGION,
    display_name: str = DISPLAY_NAME,
):
    """Chain the four components: clean -> process -> train -> log metrics.

    Args:
        project: Pipeline parameter defaulting to ``PROJECT_ID``; not
            referenced inside the pipeline body.
        region: Pipeline parameter defaulting to ``REGION``; not referenced
            inside the pipeline body.
        display_name: Pipeline parameter defaulting to ``DISPLAY_NAME``; not
            referenced inside the pipeline body.
    """
    cleaning_task = data_cleaning()

    processing_task = data_processing(
        dataset_full=cleaning_task.outputs["output_dir"],
    )

    # Training is the only step given explicit CPU and memory limits.
    training_task = train_model(
        dataset_processed=processing_task.outputs["dataset_processed"],
    )
    training_task.set_cpu_limit('8').set_memory_limit('32G')

    # Final step: creating the task is enough, its handle is not needed.
    compute_metrics(model=training_task.outputs["model"])

# Compile the pipeline as JSON. The same path is reused below as the job
# template, so it is named once instead of repeating the literal.
PIPELINE_PACKAGE_PATH = 'practiceGCP_pipeline.json'
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=PIPELINE_PACKAGE_PATH,
)

# Create the Vertex AI pipeline job from the compiled template.
# The timestamped DISPLAY_NAME is used so each submission is distinguishable;
# previously a fixed literal was passed and DISPLAY_NAME went unused here.
start_pipeline = pipeline_jobs.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=PIPELINE_PACKAGE_PATH,
    enable_caching=False,  # every step re-executes on each run
    location=REGION,
    project=PROJECT_ID,
)

# Run the pipeline under the given service account. As the captured output
# shows, `run` keeps polling and printing the job state while it executes.
start_pipeline.run(service_account="120166958921-compute@developer.gserviceaccount.com")

Creating PipelineJob
PipelineJob created. Resource name: projects/120166958921/locations/asia-southeast2/pipelineJobs/pipeline-practicegcp-20241113223058
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/120166958921/locations/asia-southeast2/pipelineJobs/pipeline-practicegcp-20241113223058')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/asia-southeast2/pipelines/runs/pipeline-practicegcp-20241113223058?project=120166958921
PipelineJob projects/120166958921/locations/asia-southeast2/pipelineJobs/pipeline-practicegcp-20241113223058 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/120166958921/locations/asia-southeast2/pipelineJobs/pipeline-practicegcp-20241113223058 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/120166958921/locations/asia-southeast2/pipelineJobs/pipeline-practicegcp-20241113223058 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob proje