### Load Config File

In [26]:
import yaml
from box import ConfigBox


# Parse config.yaml, then wrap the result in a ConfigBox so the rest of the
# notebook can read settings with attribute access (config.pipeline.name, ...).
with open("config.yaml", "r") as config_file:
    raw_config = yaml.safe_load(config_file)

config = ConfigBox(raw_config)

### Imports

In [27]:
from google.cloud import aiplatform
from google.cloud.aiplatform import TabularDataset
from google.cloud.aiplatform import pipeline_jobs

from typing import NamedTuple

from kfp.dsl import component
from kfp.dsl import pipeline
from kfp import compiler
from kfp.dsl import (Artifact, Model, Dataset, Input, Output, Metrics)
from kfp import dsl

### Components

#### Preprocess

In [28]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.BIGQUERY,
        config.packages.PANDAS,
        config.packages.DB_TYPES,
    ],
)
def fetch_big_data_table(
    project_id: str,
    dataset_id: str,
    table_id: str,
    dataset_artifact: Output[Dataset]
):
    """Dump every row of a BigQuery table into the output artifact as CSV.

    Args:
        project_id: Project used for the BigQuery client and as the first
            part of the fully qualified table name.
        dataset_id: Dataset part of the fully qualified table name.
        table_id: Table part of the fully qualified table name.
        dataset_artifact: Output artifact; the CSV (without the DataFrame
            index) is written to its ``path``.
    """
    # Imported inside the function because the body runs in the component's
    # own container, not in the notebook kernel.
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=project_id)

    select_all_sql = f"""
        SELECT *
        FROM `{project_id}.{dataset_id}.{table_id}`
        """

    query_job = bq_client.query(select_all_sql)
    table_df = query_job.to_dataframe()

    table_df.to_csv(dataset_artifact.path, index=False)
    

In [29]:
@component(
    base_image="southamerica-east1-docker.pkg.dev/pebolas-sandbox/sample-model/prepare_data:latest"
)
def prepare_data(
    input_data: Input[Dataset],
    scaler_artifact: Output[Artifact],
    dataset_artifact: Output[Dataset]
):
    """Clean, encode and scale the raw passenger CSV.

    Steps, in order: fill missing ``Age`` with the column mean, fill missing
    ``Embarked`` with the most frequent value, drop ``Cabin``/``Name``/
    ``Ticket``, map ``Sex`` to 0/1, one-hot encode ``Embarked`` (first level
    dropped) and standard-scale ``Age`` and ``Fare``.

    Args:
        input_data: Input artifact whose ``path`` holds the raw CSV.
        scaler_artifact: Output artifact that receives the pickled, fitted
            ``StandardScaler``.
        dataset_artifact: Output artifact that receives the processed CSV
            (written without the DataFrame index).
    """
    import pickle as pkl
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    passengers = pd.read_csv(input_data.path, index_col=False)

    # Impute missing values: mean for Age, most frequent value for Embarked.
    mean_age = passengers["Age"].mean()
    passengers["Age"] = passengers["Age"].fillna(mean_age)

    most_common_port = passengers["Embarked"].mode()[0]
    passengers["Embarked"] = passengers["Embarked"].fillna(most_common_port)

    # Remove the columns that are not used downstream.
    passengers = passengers.drop(columns=["Cabin", "Name", "Ticket"])

    # Values other than "male"/"female" become NaN under this mapping.
    passengers["Sex"] = passengers["Sex"].map({"male": 0, "female": 1})

    features = pd.get_dummies(
        passengers, columns=["Embarked"], drop_first=True
    )

    # The scaler is fitted on the full table that this component receives.
    numeric_columns = ["Age", "Fare"]
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(features[numeric_columns])
    features[numeric_columns] = scaled_values

    # Persist the fitted scaler first, then the processed table.
    with open(scaler_artifact.path, "wb") as scaler_file:
        pkl.dump(scaler, scaler_file)

    features.to_csv(dataset_artifact.path, index=False)

    
    

In [30]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[config.packages.AIPLATFORM],
)
def create_dataset(
    display_name: str,
    dataset: Input[Dataset]
) -> Dataset:
    """Create a Vertex AI TabularDataset from the prepared CSV artifact.

    Args:
        display_name: Display name given to the created TabularDataset.
        dataset: Input artifact; its ``uri`` is passed as the dataset's
            ``gcs_source``.

    Returns:
        A ``Dataset`` artifact whose ``uri`` is the resource name of the
        created TabularDataset.
    """
    from google.cloud import aiplatform

    # NOTE(review): init() is called without project/location, so the SDK
    # uses whatever defaults it resolves in the container — confirm these
    # match the project and region the pipeline is meant to target.
    aiplatform.init()

    tabular_dataset = aiplatform.TabularDataset.create(
        display_name=display_name,
        gcs_source=dataset.uri,
    )

    resource_name = tabular_dataset.resource_name
    return Dataset(uri=resource_name)
    

#### Train

In [None]:
def train():
    """Placeholder for the training step; it only prints a marker message."""
    message = "training :)"
    print(message)

#### Deploy

In [31]:
def deploy(
    model_display_name="test_model",
    project="pebolas-sandbox",
    location="southamerica-east1",
    serving_container_image_uri="southamerica-east1-docker.pkg.dev/pebolas-sandbox/sample-model/deploy_model:latest",
    batch_job_display_name="test_batch_predict",
    gcs_source="gs://pebolas-sandbox-vertex-staging-southamerica-east1/test.jsonl",
    gcs_destination_prefix="gs://pebolas-sandbox-vertex-staging-southamerica-east1/",
    machine_type="n1-standard-2",
):
    """Upload a custom-container model and run a batch prediction with it.

    Every argument defaults to the value that used to be hard-coded in the
    body, so calling ``deploy()`` with no arguments behaves as before.

    Args:
        model_display_name: Display name passed to ``Model.upload``.
        project: Project passed to ``Model.upload``.
        location: Region passed to ``Model.upload``.
        serving_container_image_uri: Image URI of the serving container.
        batch_job_display_name: Display name of the batch prediction job.
        gcs_source: GCS URI passed to ``batch_predict`` as ``gcs_source``.
        gcs_destination_prefix: GCS prefix passed to ``batch_predict`` as
            ``gcs_destination_prefix``.
        machine_type: Machine type passed to ``batch_predict``.
    """
    # The predict and health routes stay fixed at the values the original
    # code passed for the serving container.
    model = aiplatform.Model.upload(
        display_name=model_display_name,
        project=project,
        location=location,
        serving_container_image_uri=serving_container_image_uri,
        serving_container_predict_route="/predict",
        serving_container_health_route="/health"
    )

    # sync=True is passed explicitly, as in the original call.
    model.batch_predict(
        job_display_name=batch_job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination_prefix,
        machine_type=machine_type,
        sync=True
    )

### Pipeline

In [32]:
@pipeline(
    name=config.pipeline.name,
    description=config.pipeline.description,
    pipeline_root=config.pipeline.root + config.pipeline.name
)
def titaninc_pipeline(
    project_id: str,
    dataset_id: str,
    table_id: str,
    dataset_display_name: str
):
    """Chain the three components: fetch table -> prepare data -> create dataset.

    Args:
        project_id: Forwarded to ``fetch_big_data_table``.
        dataset_id: Forwarded to ``fetch_big_data_table``.
        table_id: Forwarded to ``fetch_big_data_table``.
        dataset_display_name: Forwarded to ``create_dataset`` as its
            ``display_name``.
    """
    raw_table_task = fetch_big_data_table(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
    )

    # fetch_big_data_table has a single output, "dataset_artifact".
    prepared_task = prepare_data(
        input_data=raw_table_task.outputs["dataset_artifact"],
    )

    # Only the processed CSV is passed on; the scaler artifact is not consumed here.
    create_dataset(
        display_name=dataset_display_name,
        dataset=prepared_task.outputs["dataset_artifact"],
    )

In [33]:
# Set the notebook-side Vertex AI SDK defaults from the loaded config.
aiplatform.init(project=config.project_id, location=config.location)

In [34]:
# Compile the pipeline function into the package file named in the config;
# the PipelineJob created later reads that same path as its template.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=titaninc_pipeline,
    package_path=config.pipeline.package_path,
)

In [35]:
# Runtime arguments for the pipeline; the keys match the parameter names of
# titaninc_pipeline.
pipeline_parameters = {
    'project_id': config.project_id,
    'dataset_id': config.parameters.dataset_id,
    'table_id': config.parameters.table_id,
    'dataset_display_name': config.parameters.dataset_display_name,
}

job = pipeline_jobs.PipelineJob(
    display_name=config.pipeline.name,
    template_path=config.pipeline.package_path,
    parameter_values=pipeline_parameters,
)

In [36]:
# Submit the pipeline job built above. The captured output below shows the
# SDK creating the PipelineJob and then repeatedly reporting its state.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241009162220
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241009162220')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/southamerica-east1/pipelines/runs/titanic-pipeline-20241009162220?project=470842673491
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241009162220 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241009162220 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241009162220 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241009162220 current state:
3
Pip