### Load Config File

In [22]:
import yaml
from box import ConfigBox


# Parse the YAML settings file, then wrap the parsed result in a ConfigBox so
# the rest of the notebook can read nested keys with attribute access
# (e.g. config.pipeline.name).
with open("config.yaml", "r") as config_file:
    raw_settings = yaml.safe_load(config_file)

config = ConfigBox(raw_settings)

### Imports

In [23]:
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

from typing import NamedTuple

from kfp.dsl import component
from kfp.dsl import pipeline
from kfp import compiler
from kfp.dsl import (Artifact, Model, Dataset, Input, Output, OutputPath, InputPath)
from kfp import dsl

### Components

In [24]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.BIGQUERY,
        config.packages.PANDAS,
        config.packages.DB_TYPES,
    ],
)
def fetch_big_data_table(
    project_id: str,
    dataset_id: str,
    table_id: str,
    dataset_artifact: Output[Dataset],
):
    """Dump every row and column of a BigQuery table into a CSV artifact.

    Args:
        project_id: Project used both for the BigQuery client and as the
            first part of the fully qualified table name.
        dataset_id: Dataset that contains the table.
        table_id: Table to read.
        dataset_artifact: Output artifact; the query result is written to its
            path as CSV without the DataFrame index.
    """
    # Kept inside the body: the component runs from this function's source in
    # its own container, so module-level notebook imports are not available.
    from google.cloud import bigquery

    # The three identifiers are interpolated directly into the statement.
    sql = f"""
        SELECT *
        FROM `{project_id}.{dataset_id}.{table_id}`
        """

    bq_client = bigquery.Client(project=project_id)
    table_frame = bq_client.query(sql).to_dataframe()
    table_frame.to_csv(dataset_artifact.path, index=False)
    

In [25]:
# NOTE(review): no packages_to_install here, so pandas is presumably provided
# by this custom image -- confirm against the image definition.
@component(
    base_image="southamerica-east1-docker.pkg.dev/pebolas-sandbox/sample-model/prepare_data:latest",
)
def prepare_data(
    input_data: Input[Dataset],
    dataset_artifact: Output[Dataset],
):
    """Clean the raw CSV and encode its categorical columns.

    Steps, in order: fill missing ``Age`` with the column mean, fill missing
    ``Embarked`` with the first mode, drop ``Cabin``/``Name``/``Ticket``, map
    ``Sex`` to 0 (male) / 1 (female), and one-hot encode ``Embarked`` with the
    first level dropped.

    Args:
        input_data: CSV dataset artifact to clean.
        dataset_artifact: Output artifact; the cleaned table is written to its
            path as CSV without the DataFrame index.
    """
    import pandas as pd

    frame = pd.read_csv(input_data.path, index_col=False)

    # Impute missing values before any column is removed or re-encoded.
    frame["Age"] = frame["Age"].fillna(frame["Age"].mean())
    embarked_mode = frame["Embarked"].mode()[0]
    frame["Embarked"] = frame["Embarked"].fillna(embarked_mode)

    frame = frame.drop(columns=["Cabin", "Name", "Ticket"])

    # Values other than "male"/"female" become NaN under this mapping.
    frame["Sex"] = frame["Sex"].map({"male": 0, "female": 1})

    encoded = pd.get_dummies(frame, columns=["Embarked"], drop_first=True)
    encoded.to_csv(dataset_artifact.path, index=False)

    
    

In [26]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.SKLEARN,
        config.packages.PANDAS,
        config.packages.DB_TYPES,
    ],
)
def scale_data(
    input_data: Input[Dataset],
    scaler_artifact: Output[Artifact],
    dataset_artifact: Output[Dataset],
):
    """Standardise the ``Age`` and ``Fare`` columns and persist the scaler.

    NOTE(review): the scaler is fitted on every row it receives; if this step
    runs before the train/test split, test rows influence the scaling
    statistics.

    Args:
        input_data: CSV dataset artifact containing ``Age`` and ``Fare``.
        scaler_artifact: Output artifact; receives the pickled, fitted
            ``StandardScaler``.
        dataset_artifact: Output artifact; receives the table with the two
            columns replaced by their scaled values, as CSV without the index.
    """
    import pickle as pkl
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    numeric_columns = ["Age", "Fare"]

    frame = pd.read_csv(input_data.path, index_col=False)

    standardizer = StandardScaler()
    frame[numeric_columns] = standardizer.fit_transform(frame[numeric_columns])

    # Persist the fitted scaler as its own artifact, separate from the data.
    with open(scaler_artifact.path, "wb") as scaler_file:
        pkl.dump(standardizer, scaler_file)

    frame.to_csv(dataset_artifact.path, index=False)
    

In [27]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.SKLEARN,
        config.packages.PANDAS,
        config.packages.DB_TYPES,
    ],
)
def split_train_test(
    input_data: Input[Dataset],
    dataset_train: Output[Artifact],
    dataset_test: Output[Artifact],
):
    """Split the table into an 80/20 train/test partition and pickle both.

    The ``Survived`` column is the target; every other column is a feature.
    The split uses ``random_state=42``, so it is repeatable for the same input.

    Args:
        input_data: CSV dataset artifact containing a ``Survived`` column.
        dataset_train: Output artifact; receives a pickled dict with keys
            ``x_train`` and ``y_train``.
        dataset_test: Output artifact; receives a pickled dict with keys
            ``x_test`` and ``y_test``.
    """
    import pickle as pkl
    import pandas as pd
    from sklearn import model_selection

    frame = pd.read_csv(input_data.path, index_col=False)

    features = frame.drop(columns=["Survived"])
    target = frame["Survived"]

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
    )

    # Each partition goes to its own artifact, train first, then test.
    payloads = (
        (dataset_train.path, {'x_train': x_train, 'y_train': y_train}),
        (dataset_test.path, {'x_test': x_test, 'y_test': y_test}),
    )
    for destination, payload in payloads:
        with open(destination, "wb") as handle:
            pkl.dump(payload, handle)
    

In [28]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.SKLEARN,
        config.packages.PANDAS,
        config.packages.DB_TYPES,
    ],
)
def train_model(
    train_data: Input[Artifact],
    test_data: Input[Artifact],
    model: Output[Model],
    metrics_artifact: Output[Artifact],
):
    """Fit a logistic regression and score it on the held-out partition.

    Args:
        train_data: Pickled dict with keys ``x_train`` and ``y_train``.
        test_data: Pickled dict with keys ``x_test`` and ``y_test``.
        model: Output artifact; receives the pickled, fitted classifier.
        metrics_artifact: Output artifact; receives a pickled dict with keys
            ``f1`` and ``accuracy`` computed on the test partition.
    """
    import pickle as pkl
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score, accuracy_score

    # --- training -----------------------------------------------------------
    with open(train_data.path, "rb") as train_file:
        train_split = pkl.load(train_file)

    train_features = train_split['x_train']
    train_target = train_split['y_train']

    classifier = LogisticRegression()
    classifier.fit(train_features, train_target)

    # --- evaluation ---------------------------------------------------------
    with open(test_data.path, "rb") as test_file:
        test_split = pkl.load(test_file)

    test_features = test_split['x_test']
    test_target = test_split['y_test']

    predicted = classifier.predict(test_features)

    metrics = {
        "f1": f1_score(test_target, predicted),
        "accuracy": accuracy_score(test_target, predicted),
    }

    # --- persistence: model first, then its metrics -------------------------
    with open(model.path, "wb") as model_file:
        pkl.dump(classifier, model_file)

    with open(metrics_artifact.path, "wb") as metrics_file:
        pkl.dump(metrics, metrics_file)

In [29]:
@component(
    base_image="python:3.11-slim",
    packages_to_install=[
        config.packages.AIPLATFORM
    ]
)
def upload_model_regestry(
    model: Input[Model],
    metrics_artifact: Input[Artifact]
):
    """Register the trained model in Vertex AI together with its metrics.

    The evaluation metrics are read from ``metrics_artifact`` and stored, as a
    JSON string, in the description of the uploaded model.

    Args:
        model: Trained model artifact; its URI is passed as ``artifact_uri``.
        metrics_artifact: Pickled dict of evaluation metrics.
    """
    import json
    import pickle as pkl
    from google.cloud import aiplatform

    # Read the metrics before uploading so that an unreadable metrics file
    # fails the step without leaving a half-described model in the registry.
    # The pickle is produced by an upstream step of the same pipeline; do not
    # point this component at artifacts from an untrusted source.
    with open(metrics_artifact.path, "rb") as f:
        metrics = pkl.load(f)

    # default=str keeps serialisation from failing on scalar types that the
    # json module does not handle natively.
    metrics_summary = json.dumps(metrics, default=str, sort_keys=True)

    # The object returned by Model.upload has no add_metrics method, so the
    # previous post-upload call raised AttributeError after the model had
    # already been registered. The metrics are attached through the
    # description argument of the upload call instead.
    aiplatform.Model.upload(
        display_name="titanic_survivors",
        artifact_uri=model.uri,
        serving_container_image_uri="southamerica-east1-docker.pkg.dev/pebolas-sandbox/sample-model/titanic_model:latest",
        description=f"Evaluation metrics: {metrics_summary}",
    )

In [30]:
# Pipeline wiring: fetch -> prepare -> scale -> split -> train -> upload.
# Each step consumes the artifact(s) produced by the step before it.
@pipeline(
    name=config.pipeline.name,
    description=config.pipeline.description,
    pipeline_root=config.pipeline.root + config.pipeline.name,
)
def titaninc_pipeline(project_id: str, dataset_id: str, table_id: str):
    # Pull the source table out of BigQuery.
    raw_table = fetch_big_data_table(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
    )

    # Impute, drop and encode columns.
    cleaned = prepare_data(input_data=raw_table.output)

    # scale_data has two outputs (scaler + dataset), so the dataset is
    # selected by name further down instead of through `.output`.
    scaled = scale_data(input_data=cleaned.output)

    splits = split_train_test(
        input_data=scaled.outputs["dataset_artifact"],
    )

    trained = train_model(
        train_data=splits.outputs["dataset_train"],
        test_data=splits.outputs["dataset_test"],
    )

    # Final step; nothing downstream uses its task handle.
    upload_model_regestry(
        model=trained.outputs["model"],
        metrics_artifact=trained.outputs["metrics_artifact"],
    )

In [31]:
# Set the project and location used by the Vertex AI SDK calls that follow.
aiplatform.init(project=config.project_id, location=config.location)

In [32]:
# Compile the pipeline function into the package file configured under
# config.pipeline.package_path.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=titaninc_pipeline,
    package_path=config.pipeline.package_path,
)

In [33]:
# Values for the three pipeline parameters, taken from the loaded config.
run_parameters = {
    'project_id': config.project_id,
    'dataset_id': config.parameters.dataset_id,
    'table_id': config.parameters.table_id,
}

# Describe a run of the compiled package; nothing is submitted yet.
job = pipeline_jobs.PipelineJob(
    display_name=config.pipeline.name,
    template_path=config.pipeline.package_path,
    parameter_values=run_parameters,
)

In [34]:
# Submit the pipeline job; the log printed below this cell shows the resource
# name of the PipelineJob that was created and a console link to the run.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241004170914
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/470842673491/locations/southamerica-east1/pipelineJobs/titanic-pipeline-20241004170914')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/southamerica-east1/pipelines/runs/titanic-pipeline-20241004170914?project=470842673491
