In [77]:
!pip install google-cloud-aiplatform
!pip install google-cloud-pipeline-components

[0m

In [78]:
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

from typing import NamedTuple

from kfp.dsl import component
from kfp.dsl import pipeline
from kfp import compiler
from kfp.dsl import (Artifact, Dataset, Input, Output)
from kfp import dsl

In [79]:
# Google Cloud project used both to initialise the Vertex AI SDK and as the
# `project_id` pipeline parameter.
PROJECT_ID = "pebolas-sandbox"

# GCS prefix for pipeline artifacts. The trailing slash matters: the pipeline
# name is appended to this value by plain string concatenation.
PIPELINE_ROOT = "gs://sample-model-kubeflow-pipeline/"

In [80]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        "google-cloud-bigquery==3.26.0",
        "pandas==2.2.3",
        "db-dtypes==1.3.0",
    ],
)
def fetch_big_data_table(
    project_id: str,
    dataset_id: str,
    table_id: str,
    dataset_artifact: Output[Dataset]
):
    """Dump every row of a BigQuery table into a CSV dataset artifact.

    Args:
        project_id: Project used for the BigQuery client and as the first part
            of the table reference.
        dataset_id: BigQuery dataset containing the table.
        table_id: Table to read in full (``SELECT *``).
        dataset_artifact: Output artifact; the query result is written as CSV
            (without the DataFrame index) to its ``path``.
    """
    # Imported inside the function body, matching the other components in this notebook.
    from google.cloud import bigquery

    # The three identifiers are interpolated directly into the SQL text.
    sql = f"""
        SELECT *
        FROM `{project_id}.{dataset_id}.{table_id}`
        """

    bq_client = bigquery.Client(project=project_id)
    query_job = bq_client.query(sql)
    table_frame = query_job.to_dataframe()

    table_frame.to_csv(dataset_artifact.path, index=False)
    

In [81]:
@component(
    base_image="southamerica-east1-docker.pkg.dev/pebolas-sandbox/sample-model/prepare_data:latest"
)
def prepare_data(
    input_data: Input[Dataset],
    dataset_artifact: Output[Dataset]
):
    """Clean the raw CSV dataset and write the result as a new CSV artifact.

    Steps applied:
      * missing ``Age`` values are replaced by the column mean;
      * missing ``Embarked`` values are replaced by the first mode of the column;
      * ``Sex`` is mapped to ``male -> 0`` / ``female -> 1``;
      * ``Cabin``, ``Name`` and ``Ticket`` are dropped;
      * ``Embarked`` is one-hot encoded with the first level dropped.

    Args:
        input_data: Dataset artifact whose ``path`` points at the CSV to clean.
        dataset_artifact: Output artifact; the cleaned frame is written as CSV
            (without the DataFrame index) to its ``path``.
    """
    import pandas as pd

    raw = pd.read_csv(input_data.path, index_col=False)

    # Fill values are computed from the untouched input columns.
    age_fill = raw["Age"].mean()
    embarked_fill = raw["Embarked"].mode()[0]

    # `assign` on existing columns keeps their position, so column order
    # matches a column-by-column in-place update.
    cleaned = raw.assign(
        Age=raw["Age"].fillna(age_fill),
        Embarked=raw["Embarked"].fillna(embarked_fill),
        Sex=raw["Sex"].map({"male": 0, "female": 1}),
    ).drop(columns=["Cabin", "Name", "Ticket"])

    encoded = pd.get_dummies(cleaned, columns=["Embarked"], drop_first=True)

    encoded.to_csv(dataset_artifact.path, index=False)

    
    

In [82]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        "scikit-learn==1.5.2",
        "pandas==2.2.3",
        "db-dtypes==1.3.0",
    ],
)
def scale_data(
    input_data: Input[Dataset],
    scaler_artifact: Output[Artifact],
    dataset_artifact: Output[Dataset]
):
    """Standardise the ``Age`` and ``Fare`` columns and persist the fitted scaler.

    Args:
        input_data: Dataset artifact whose ``path`` points at the CSV to scale.
        scaler_artifact: Output artifact; the fitted ``StandardScaler`` is
            pickled to its ``path``.
        dataset_artifact: Output artifact; the frame with the scaled columns is
            written as CSV (without the DataFrame index) to its ``path``.
    """
    import pickle
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    numeric_columns = ["Age", "Fare"]

    frame = pd.read_csv(input_data.path, index_col=False)

    # Fit on, and transform, the same two columns; the others pass through untouched.
    standardizer = StandardScaler()
    frame[numeric_columns] = standardizer.fit_transform(frame[numeric_columns])

    # Persist the fitted scaler as a separate pickle artifact alongside the scaled data.
    with open(scaler_artifact.path, "wb") as scaler_file:
        pickle.dump(standardizer, scaler_file)

    frame.to_csv(dataset_artifact.path, index=False)
    

In [83]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=["google-cloud-bigquery==3.26.0", "pandas==2.2.3", "db-dtypes==1.3.0"]
)
def save_big_query_table(
    project_id: str,
    dataset_id: str,
    table_id: str,
    input_data: Input[Dataset]
):
    """Load a CSV dataset artifact into a BigQuery table.

    The CSV at ``input_data.path`` is read into a DataFrame and loaded into
    ``{project_id}.{dataset_id}.{table_id}``; the call blocks until the load
    job has finished.

    Args:
        project_id: Project used for the BigQuery client and the destination table.
        dataset_id: BigQuery dataset of the destination table.
        table_id: Name of the destination table.
        input_data: Dataset artifact whose ``path`` points at the CSV to load.
    """
    import pandas as pd
    from google.cloud import bigquery

    client = bigquery.Client(project=project_id)

    data = pd.read_csv(input_data.path, index_col=False)

    # Address the destination by its fully-qualified ID rather than through
    # the deprecated `Client.dataset()` helper; the client was created for
    # `project_id`, so this resolves to the same table.
    destination = f"{project_id}.{dataset_id}.{table_id}"

    # NOTE(review): no job config / write disposition is passed, so the load
    # uses the library default — confirm whether re-running the pipeline
    # should append to or replace the existing table.
    load_job = client.load_table_from_dataframe(data, destination)

    # Wait for completion so a failed load surfaces as a failed pipeline step.
    load_job.result()

In [84]:
@pipeline(
    name="titanic-pipeline",
    description="Pipeline to preprocess Titanic dataset",
    pipeline_root=PIPELINE_ROOT + "titanic-pipeline"
)
def titaninc_pipeline(
    project_id: str,
    dataset_id: str,
    table_id: str,
    output_table_id: str
):
    """Chain the four components: fetch -> prepare -> scale -> save.

    Args:
        project_id: Project passed to both BigQuery components.
        dataset_id: BigQuery dataset used for both the source and output tables.
        table_id: Source table to read.
        output_table_id: Table (in the same dataset) that receives the scaled data.
    """
    # 1. Export the source table to a CSV artifact.
    raw_table_task = fetch_big_data_table(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
    )

    # 2. Clean / encode the raw rows.
    cleaned_task = prepare_data(
        input_data=raw_table_task.outputs["dataset_artifact"],
    )

    # 3. Scale the numeric columns. This step also emits a scaler artifact,
    #    which is not consumed by any later step in this pipeline.
    scaled_task = scale_data(
        input_data=cleaned_task.outputs["dataset_artifact"],
    )

    # 4. Load the scaled dataset back into BigQuery.
    save_big_query_table(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=output_table_id,
        input_data=scaled_task.outputs["dataset_artifact"],
    )

In [85]:
# Set the default project and region for the Vertex AI SDK calls made later
# in this notebook.
aiplatform.init(project=PROJECT_ID, location="southamerica-east1")

In [86]:
# Compile the pipeline function into a local JSON spec; the PipelineJob cell
# reads this same file name as its template.
compiler.Compiler().compile(
    pipeline_func=titaninc_pipeline,
    package_path="titanic_pipeline.json",
)

In [87]:
# Describe one run of the compiled pipeline: read `titanic.train_data` and
# write the processed rows to `titanic.train_cleaned` in PROJECT_ID.
job = pipeline_jobs.PipelineJob(
    display_name="titanic-pipeline",
    template_path="titanic_pipeline.json",
    parameter_values=dict(
        project_id=PROJECT_ID,
        dataset_id="titanic",
        table_id="train_data",
        output_table_id="train_cleaned",
    ),
)

In [88]:
# Submit the pipeline job. The recorded output below shows the job being
# created and its state being polled repeatedly.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241002195428
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241002195428')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/southamerica-east1/pipelines/runs/titanic-pipeline-20241002195428?project=470842673491
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241002195428 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241002195428 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241002195428 current state:
3
PipelineJob projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241002195428 current state:
3
Pip