### Load Config File

In [1]:
import yaml
from box import ConfigBox


# Parse config.yaml and wrap the result in a ConfigBox so that settings can be
# read with attribute access (e.g. config.pipeline.name) in the cells below.
with open("config.yaml", "r") as ymlfile:
    config = ConfigBox(yaml.safe_load(ymlfile))

### Imports

In [2]:
from google.cloud import aiplatform
from google.cloud.aiplatform import TabularDataset
from google.cloud.aiplatform import pipeline_jobs

from typing import NamedTuple

from kfp.dsl import component
from kfp.dsl import pipeline
from kfp import compiler
from kfp.dsl import (Artifact, Model, Dataset, Input, Output, Metrics)
from kfp import dsl

### Components

#### Preprocess

In [3]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.BIGQUERY,
        config.packages.PANDAS,
        config.packages.DB_TYPES,
    ],
)
def fetch_big_data_table(
    project_id: str,
    dataset_id: str,
    table_id: str,
    dataset_artifact: Output[Dataset],
):
    """Dump every row of a BigQuery table into the CSV file behind ``dataset_artifact``.

    Args:
        project_id: Project used for the BigQuery client and as the first part
            of the fully qualified table name.
        dataset_id: BigQuery dataset that holds the table.
        table_id: Name of the table to read.
        dataset_artifact: Output artifact; the CSV (without the index column)
            is written to its ``path``.
    """
    # The import lives inside the function because lightweight KFP components
    # have to be self-contained.
    from google.cloud import bigquery

    query = f"""
        SELECT *
        FROM `{project_id}.{dataset_id}.{table_id}`
        """

    bq_client = bigquery.Client(project=project_id)
    table_frame = bq_client.query(query).to_dataframe()

    table_frame.to_csv(dataset_artifact.path, index=False)
    

In [4]:
@component(
    base_image="southamerica-east1-docker.pkg.dev/pebolas-sandbox/sample-model/prepare_data:latest"
)
def prepare_data(
    input_data: Input[Dataset],
    scaler_artifact: Output[Artifact],
    dataset_artifact: Output[Dataset],
):
    """Clean and encode the raw CSV, scale numeric columns and persist both results.

    Steps: fill missing ``Age`` with the column mean and missing ``Embarked``
    with its most frequent value, drop ``Cabin``/``Name``/``Ticket``, map
    ``Sex`` to 0/1, one-hot encode ``Embarked`` (first level dropped) and
    standardise ``Age`` and ``Fare``.

    Args:
        input_data: Dataset artifact whose ``path`` points at the CSV to read.
        scaler_artifact: Output artifact that receives the pickled, fitted
            ``StandardScaler``.
        dataset_artifact: Output artifact that receives the processed CSV.
    """
    import pickle as pkl
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    features = (
        pd.read_csv(input_data.path, index_col=False)
        .drop(columns=["Cabin", "Name", "Ticket"])
        .assign(
            Age=lambda df: df["Age"].fillna(df["Age"].mean()),
            Embarked=lambda df: df["Embarked"].fillna(df["Embarked"].mode()[0]),
            Sex=lambda df: df["Sex"].map({"male": 0, "female": 1}),
        )
        .pipe(pd.get_dummies, columns=["Embarked"], drop_first=True)
    )

    # Fit the scaler on the numeric columns and overwrite them with the
    # standardised values.
    scaled_columns = ["Age", "Fare"]
    scaler = StandardScaler().fit(features[scaled_columns])
    features[scaled_columns] = scaler.transform(features[scaled_columns])

    # The fitted scaler is saved as its own artifact, separate from the data.
    with open(scaler_artifact.path, "wb") as scaler_file:
        pkl.dump(scaler, scaler_file)

    features.to_csv(dataset_artifact.path, index=False)

    
    

#### Train

In [5]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.AIPLATFORM
    ]
)
def train_model(
    display_name: str,
    train_image_uri: str,
    staging_bucket: str,
    model_serving_container_image_uri: str,
    model_serving_container_health_route: str,
    model_serving_container_predict_route: str,
    dataset: Input[Dataset],
    evaluation_gcs_source_uri: str = 'gs://sample-model-kubeflow-pipeline/titanic-pipeline/staging/aiplatform-custom-training-2024-10-11-18:58:57.367/dataset-867752168826863616-tables-2024-10-11T18:58:58.637536Z/test-*',
) -> str:
    """Train a model with a custom container on Vertex AI and start an evaluation.

    Registers the prepared CSV as a Vertex AI ``TabularDataset``, runs a
    ``CustomContainerTrainingJob`` on it and calls ``evaluate`` on the
    resulting model.

    Args:
        display_name: Display name of the tabular dataset; the training job is
            named ``<display_name>_training``.
        train_image_uri: Container image that runs the training code.
        staging_bucket: Bucket passed as ``staging_bucket`` to both the
            training job and the evaluation.
        model_serving_container_image_uri: Serving image attached to the model.
        model_serving_container_health_route: Health-check route of the
            serving container.
        model_serving_container_predict_route: Prediction route of the serving
            container.
        dataset: Prepared dataset artifact; its ``uri`` is used as the source
            of the tabular dataset.
        evaluation_gcs_source_uri: Source URI pattern handed to the model
            evaluation. NOTE(review): the default is the path that used to be
            hard-coded here; it points into the staging folder of one specific
            earlier training run, so callers should normally override it.

    Returns:
        The resource name of the trained model. The model object itself is not
        returned because a component may only return values of the types KFP
        can serialise (``str``, ``int``, ``float``, ``bool``, ``dict``,
        ``list``, artifacts); returning it without a return annotation makes
        the step fail after training has already completed.
    """
    from google.cloud import aiplatform

    tabular_dataset = aiplatform.TabularDataset.create(
        display_name=display_name,
        gcs_source=dataset.uri
    )

    job = aiplatform.CustomContainerTrainingJob(
        display_name=display_name + "_training",
        container_uri=train_image_uri,
        model_serving_container_image_uri=model_serving_container_image_uri,
        model_serving_container_health_route=model_serving_container_health_route,
        model_serving_container_predict_route=model_serving_container_predict_route,
        staging_bucket=staging_bucket
    )

    model = job.run(
        dataset=tabular_dataset,
        machine_type="n1-standard-4",
        # Forwarded unchanged to the training container's command line.
        args=["--max_iter=1000"]
    )

    # NOTE(review): the target "Survived" looks like a class label, yet the
    # evaluation is configured as "regression" - confirm this is intended.
    model.evaluate(
        prediction_type="regression",
        target_field_name="Survived",
        staging_bucket=staging_bucket,
        gcs_source_uris=[evaluation_gcs_source_uri]
    )

    return model.resource_name

#### Deploy

In [6]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[config.packages.AIPLATFORM],
)
def deploy():
    """Placeholder deployment step: for now it only prints a marker message."""
    # TODO: replace the print with a real action. Batch prediction call
    # sketched so far (no `model` is available inside this component yet):
    # model.batch_predict(
    #     job_display_name="test_batch_predict",
    #     gcs_source="gs://pebolas-sandbox-vertex-staging-southamerica-east1/test.jsonl",
    #     gcs_destination_prefix="gs://pebolas-sandbox-vertex-staging-southamerica-east1/",
    #     machine_type="n1-standard-2",
    #     sync=True
    # )
    print("deploy :)")

### Pipeline

In [7]:
@pipeline(
    name=config.pipeline.name,
    description=config.pipeline.description,
    # NOTE(review): plain string concatenation - presumably config.pipeline.root
    # already ends with "/"; confirm against config.yaml.
    pipeline_root=config.pipeline.root + config.pipeline.name
)
def titaninc_pipeline(
    project_id: str,
    dataset_id: str,
    table_id: str,
    dataset_display_name: str,
    train_image_uri: str,
    staging_bucket: str,
    model_serving_container_image_uri: str,
    model_serving_container_health_route: str,
    model_serving_container_predict_route: str,
):
    """Chain the components: fetch table -> prepare data -> train model -> deploy.

    Args:
        project_id: Project that owns the BigQuery table.
        dataset_id: BigQuery dataset holding the source table.
        table_id: Source table name.
        dataset_display_name: Passed to the training step as ``display_name``.
        train_image_uri: Training container image.
        staging_bucket: Staging bucket for the training step.
        model_serving_container_image_uri: Serving image for the trained model.
        model_serving_container_health_route: Health route of the serving image.
        model_serving_container_predict_route: Predict route of the serving image.
    """
    fetch_data = fetch_big_data_table(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id
    )

    # fetch_big_data_table has a single output, so `.output` is enough here.
    prepare_data_task = prepare_data(
        input_data=fetch_data.output
    )

    # prepare_data has two outputs, so the dataset is selected by name.
    train_model_task = train_model(
        display_name=dataset_display_name,
        train_image_uri=train_image_uri,
        staging_bucket=staging_bucket,
        model_serving_container_image_uri=model_serving_container_image_uri,
        model_serving_container_health_route=model_serving_container_health_route,
        model_serving_container_predict_route=model_serving_container_predict_route,
        dataset=prepare_data_task.outputs["dataset_artifact"]
    )

    deploy_task = deploy()
    # deploy consumes no upstream output, so without an explicit ordering it
    # would be scheduled in parallel with the other steps instead of after
    # training has finished.
    deploy_task.after(train_model_task)
    

In [8]:
# Point the Vertex AI SDK at the project and location from the config file.
aiplatform.init(project=config.project_id, location=config.location)

In [9]:
# Compile the pipeline function into the package file at
# config.pipeline.package_path.
compiler.Compiler().compile(
    pipeline_func=titaninc_pipeline,
    package_path=config.pipeline.package_path,
)

In [10]:
# Runtime arguments for the pipeline: the project id is a top-level setting,
# every other value is stored under config.parameters with the same name as
# the pipeline parameter it feeds.
pipeline_parameters = {'project_id': config.project_id}
for parameter_name in (
    'dataset_id',
    'table_id',
    'dataset_display_name',
    'train_image_uri',
    'staging_bucket',
    'model_serving_container_image_uri',
    'model_serving_container_health_route',
    'model_serving_container_predict_route',
):
    pipeline_parameters[parameter_name] = getattr(config.parameters, parameter_name)

job = pipeline_jobs.PipelineJob(
    display_name=config.pipeline.name,
    template_path=config.pipeline.package_path,
    parameter_values=pipeline_parameters,
)

In [None]:
# Submit the pipeline job to Vertex AI; the captured output shows the job being
# created and its state being reported repeatedly.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241011214257
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241011214257')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/southamerica-east1/pipelines/runs/titanic-pipeline-20241011214257?project=470842673491
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241011214257 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241011214257 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241011214257 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241011214257 current state:
3
