### Load Config File

In [14]:
import yaml
from box import ConfigBox


# Parse the YAML settings once and wrap them in a ConfigBox so nested keys
# can be read as attributes (e.g. ``config.pipeline.name``).
with open("config.yaml", "r") as ymlfile:
    config = ConfigBox(yaml.safe_load(ymlfile))

### Imports

In [15]:
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

from typing import NamedTuple

from kfp.dsl import component
from kfp.dsl import pipeline
from kfp import compiler
from kfp.dsl import (Artifact, Model, Dataset, Input, Output, Metrics)

### Components

#### Preprocess

In [16]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.BIGQUERY,
        config.packages.PANDAS,
        config.packages.DB_TYPES,
    ],
)
def fetch_big_data_table(
    project_id: str,
    dataset_id: str,
    table_id: str,
    dataset_artifact: Output[Dataset]
):
    """Export a whole BigQuery table to CSV at the output artifact's path.

    Args:
        project_id: Project used both for the BigQuery client and as the
            first part of the table reference.
        dataset_id: BigQuery dataset holding the table.
        table_id: Name of the table to read.
        dataset_artifact: Output artifact; the CSV (without the index
            column) is written to ``dataset_artifact.path``.
    """
    # Imported inside the function body, like every other component in this
    # notebook.
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=project_id)

    # Selects every column and every row of the table.
    query = f"""
        SELECT *
        FROM `{project_id}.{dataset_id}.{table_id}`
        """

    query_job = bq_client.query(query)
    table_frame = query_job.to_dataframe()

    print('Big query data fetched')
    table_frame.to_csv(dataset_artifact.path, index=False)
    

In [17]:
@component(
    base_image="us-central1-docker.pkg.dev/pebolas-sandbox/sample-model/prepare_data:latest"
)
def prepare_data(
    input_data: Input[Dataset],
    scaler_artifact: Output[Artifact],
    dataset_artifact: Output[Dataset]
):
    """Clean, encode and standardise the raw CSV produced upstream.

    Steps, in order:
      1. Fill missing ``Age`` with the column mean and missing ``Embarked``
         with the most frequent value.
      2. Encode ``Sex`` as 0 (male) / 1 (female).
      3. Drop the ``Cabin``, ``Name`` and ``Ticket`` columns.
      4. One-hot encode ``Embarked`` and ``Pclass`` (first level dropped).
      5. Standardise every remaining column with ``StandardScaler``.

    Args:
        input_data: Dataset artifact whose ``path`` is the CSV to read.
        scaler_artifact: Output artifact; the fitted scaler is dumped to
            its ``path`` with joblib.
        dataset_artifact: Output artifact; the scaled data is written to
            its ``path`` as CSV without the index column.
    """
    import joblib
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    raw = pd.read_csv(input_data.path, index_col=False)

    # Imputation and encoding are built as a new frame instead of mutating
    # the loaded one in place.
    cleaned = raw.assign(
        Age=raw["Age"].fillna(raw["Age"].mean()),
        Embarked=raw["Embarked"].fillna(raw["Embarked"].mode()[0]),
        Sex=raw["Sex"].map({"male": 0, "female": 1}),
    ).drop(columns=["Cabin", "Name", "Ticket"])

    # Dummy columns are appended after the untouched columns in the order
    # the source columns are listed: Embarked first, then Pclass.
    encoded = pd.get_dummies(
        cleaned, columns=["Embarked", "Pclass"], drop_first=True
    )

    # NOTE(review): every remaining column is scaled, including any
    # identifier or label column still present in the table - confirm this
    # is what the training container expects.
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(encoded.to_numpy())

    processed_data = pd.DataFrame(
        scaled_values, index=encoded.index, columns=encoded.columns
    )

    joblib.dump(scaler, scaler_artifact.path)
    processed_data.to_csv(dataset_artifact.path, index=False)

    
    

In [18]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[config.packages.AIPLATFORM],
)
def fetch_dataset(
    display_name: str,
    dataset_artifact: Input[Dataset],
) -> str:
    """Register the prepared data as a Vertex AI tabular dataset.

    Despite its name, this component creates a new ``TabularDataset`` on
    every call, using the artifact's URI as the source.

    Args:
        display_name: Display name given to the created dataset.
        dataset_artifact: Dataset artifact whose ``uri`` is passed as
            ``gcs_source``.

    Returns:
        The display name of the dataset that was created.
    """
    from google.cloud import aiplatform

    tabular_dataset = aiplatform.TabularDataset.create(
        display_name=display_name,
        gcs_source=dataset_artifact.uri,
    )

    return tabular_dataset.display_name

#### Train

In [19]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.AIPLATFORM
    ]
)
def fetch_model(
    display_name: str
) -> str:
    """Look up an existing Vertex AI model by display name.

    Args:
        display_name: Display name of the model to search for.

    Returns:
        The resource name of the first model returned for that display
        name, or an empty string when no model matches.
    """
    from google.cloud import aiplatform

    # NOTE: aiplatform.init() is not called here, so project and location
    # are whatever the SDK resolves by default; setting them explicitly
    # would be good practice.

    # The value is quoted so it is matched as a string literal even when the
    # display name contains spaces or punctuation.
    matching_models = aiplatform.Model.list(
        filter=f'display_name="{display_name}"'
    )

    if not matching_models:
        # The empty string stands for "no parent model" because the
        # component's declared return type is ``str``.
        return ''

    return matching_models[0].resource_name

In [20]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.AIPLATFORM
    ]
)
def train_model(
    parent_model: str,
    dataset_name: str,
    training_job_name: str,
    train_image_uri: str,
    staging_bucket: str,
    model_serving_container_image_uri: str,
    machine_type: str = "n1-standard-4",
    max_iter: int = 1000,
    service_account: str = "guido-owner@pebolas-sandbox.iam.gserviceaccount.com",
) -> Model:
    """Run a custom-container training job against a Vertex AI tabular dataset.

    Args:
        parent_model: Resource name of an existing model to pass as
            ``parent_model`` to the training run, or ``''`` for none.
        dataset_name: Display name of the tabular dataset to train on.
        training_job_name: Display name of the training job.
        train_image_uri: URI of the training container image.
        staging_bucket: Staging bucket passed to the training job.
        model_serving_container_image_uri: Serving container image passed
            to the training job.
        machine_type: Machine type for the training run.
        max_iter: Value forwarded to the training container as
            ``--max_iter=<value>``.
        service_account: Service account the training run executes as.

    Returns:
        A ``Model`` artifact carrying the trained model's ``uri`` and, as
        its name, the model's resource name.

    Raises:
        IndexError: If no tabular dataset has the given display name.
    """
    from google.cloud import aiplatform

    # Display names are not unique and the upstream component creates a new
    # dataset on every run, so order explicitly and take the newest match.
    # The value is quoted so it is matched as a string literal.
    matching_datasets = aiplatform.TabularDataset.list(
        filter=f'display_name="{dataset_name}"',
        order_by="create_time desc",
    )
    if not matching_datasets:
        # Same exception type the bare ``[0]`` lookup used to raise, now
        # with a message that names the missing dataset.
        raise IndexError(
            f"No Vertex AI tabular dataset found with display_name={dataset_name!r}"
        )
    dataset = matching_datasets[0]

    job = aiplatform.CustomContainerTrainingJob(
        display_name=training_job_name,
        container_uri=train_image_uri,
        model_serving_container_image_uri=model_serving_container_image_uri,
        staging_bucket=staging_bucket,
    )

    model = job.run(
        dataset=dataset,
        # An empty string is the upstream convention for "no parent model".
        parent_model=parent_model if parent_model != '' else None,
        machine_type=machine_type,
        args=[f"--max_iter={max_iter}"],
        service_account=service_account,
    )

    return Model(uri=model.uri, name=model.resource_name)

#### Deploy

In [21]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.AIPLATFORM
    ]
)
def deploy(
    model_artifact: Input[Model],
):
    """Placeholder deployment step; currently it only prints a message.

    Args:
        model_artifact: Model artifact from the training step. It is not
            used by the current body.
    """
    print("deploy :)")
    # TODO: sketch of a batch-prediction call, kept for reference. It is not
    # runnable as written: no ``model`` object is defined in this function,
    # so one would first have to be obtained from ``model_artifact``.
    # model.batch_predict(
    #     job_display_name="test_batch_predict",
    #     gcs_source="gs://pebolas-sandbox-vertex-staging-us-central1/test.jsonl",
    #     gcs_destination_prefix="gs://pebolas-sandbox-vertex-staging-us-central1/",
    #     machine_type="n1-standard-2",
    #     sync=True
    # )

### Pipeline

In [22]:
@pipeline(
    name=config.pipeline.name,
    description=config.pipeline.description,
    pipeline_root=config.pipeline.root + config.pipeline.name,
)
def titaninc_pipeline(
    project_id: str,
    dataset_id: str,
    table_id: str,
    dataset_display_name: str,
    train_model_name: str,
    train_job_name: str,
    train_image_uri: str,
    staging_bucket: str,
    model_serving_container_image_uri: str,
):
    """Wire the components into one pipeline: fetch, prepare, register, train, deploy.

    Args:
        project_id: Project of the source BigQuery table.
        dataset_id: BigQuery dataset of the source table.
        table_id: Source table name.
        dataset_display_name: Display name for the Vertex AI tabular dataset.
        train_model_name: Display name used to look up an existing model.
        train_job_name: Display name of the training job.
        train_image_uri: Training container image URI.
        staging_bucket: Staging bucket for the training job.
        model_serving_container_image_uri: Serving container image URI.
    """
    # Data branch: BigQuery table -> prepared CSV -> Vertex AI dataset.
    raw_table = fetch_big_data_table(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
    )
    prepared = prepare_data(input_data=raw_table.output)
    registered_dataset = fetch_dataset(
        display_name=dataset_display_name,
        dataset_artifact=prepared.outputs["dataset_artifact"],
    )

    # Model branch: look up an existing model; this step takes no input
    # from the data branch.
    existing_model = fetch_model(display_name=train_model_name)

    # Training consumes the outputs of both branches.
    trained = train_model(
        parent_model=existing_model.output,
        dataset_name=registered_dataset.output,
        training_job_name=train_job_name,
        train_image_uri=train_image_uri,
        staging_bucket=staging_bucket,
        model_serving_container_image_uri=model_serving_container_image_uri,
    )

    deploy(model_artifact=trained.output)
    

In [23]:
# Point the Vertex AI SDK at the project and location from the config before
# the pipeline job is created and run in the cells below.
aiplatform.init(
    project=config.project_id, 
    location=config.location
)

In [24]:
# Compile the pipeline function into a package file at the configured path;
# the same path is used as the PipelineJob's template_path.
compiler.Compiler().compile(
    pipeline_func=titaninc_pipeline, 
    package_path=config.pipeline.package_path
    )

In [25]:
# Build the pipeline job from the compiled package. The keys of
# ``parameter_values`` are the parameter names of ``titaninc_pipeline``.
job = pipeline_jobs.PipelineJob(
    display_name=config.pipeline.name,
    template_path=config.pipeline.package_path,
    parameter_values=dict(
        project_id=config.project_id,
        dataset_id=config.parameters.dataset_id,
        table_id=config.parameters.table_id,
        dataset_display_name=config.parameters.dataset_display_name,
        train_model_name=config.parameters.train_model_name,
        train_job_name=config.parameters.train_job_name,
        train_image_uri=config.parameters.train_image_uri,
        staging_bucket=config.parameters.staging_bucket,
        model_serving_container_image_uri=config.parameters.model_serving_container_image_uri,
    ),
)

In [26]:
# Submit the pipeline job. The output recorded below shows the job state being
# polled and this run ending in a RuntimeError because the train-model task
# failed.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/470842673491/locations/us-central1/pipelineJobs/titanic-pipeline-20241017143159
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/470842673491/locations/us-central1/pipelineJobs/titanic-pipeline-20241017143159')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/titanic-pipeline-20241017143159?project=470842673491
PipelineJob projects/470842673491/locations/us-central1/pipelineJobs/titanic-pipeline-20241017143159 current state:
3
PipelineJob projects/470842673491/locations/us-central1/pipelineJobs/titanic-pipeline-20241017143159 current state:
3
PipelineJob projects/470842673491/locations/us-central1/pipelineJobs/titanic-pipeline-20241017143159 current state:
3
PipelineJob projects/470842673491/locations/us-central1/pipelineJobs/titanic-pipeline-20241017143159 current state:
3
PipelineJob projects/470842673491/locations/us-centr

RuntimeError: Job failed with:
code: 9
message: " The DAG failed because some tasks failed. The failed tasks are: [train-model].; Job (project_id = pebolas-sandbox, job_id = 5888208847792766976) is failed due to the above error.; Failed to handle the job: {project_number = 470842673491, job_id = 5888208847792766976}"
