In [1]:
from kfp.v2 import dsl
import kfp.v2.compiler as compiler
from kfp.v2.dsl import component, Input, Output, Artifact, Model, Dataset

from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

from typing import List, NamedTuple

  from kfp.v2 import dsl


In [2]:
# Configuration template: every setting starts out as an empty string.
# The following cell assigns project-specific values to most of these names;
# any name it does not reassign keeps the empty placeholder defined here.

# GCP project, bucket and region.
PROJECT_ID = ""
GCS_BUCKET_NAME = ""
PROJECT_REGION = ""

# Vertex AI resource display names.
VERTEX_DATASET_NAME = ""
VERTEX_MODEL_NAME = ""
VERTEX_PREDICTION_NAME = ""

# BigQuery dataset and table names.
BQ_DATASET_NAME = ""
BQ_SOURCE_TABLE = ""
BQ_TRAIN_TABLE = ""
BQ_PREDICT_TABLE = ""



In [3]:
# Project-specific values that replace the empty placeholders defined earlier.
# PROJECT_ID and PROJECT_REGION are what the notebook later hands to
# aiplatform.init and to the pipeline as its project / location parameters.
PROJECT_ID = "wb-ai-acltr-tbs-3-pr-a62583"
GCS_BUCKET_NAME = "bkt_b2b_wf_prediction"
PROJECT_REGION = "northamerica-northeast1"

# Display names for the Vertex AI dataset and model.
VERTEX_DATASET_NAME = "b2b_wf_short_term_prediction"
VERTEX_MODEL_NAME = "b2b_wf_short_term_model"

# BigQuery dataset and the source view/table the pipeline queries.
BQ_DATASET_NAME = "b2b_wf_prediction"
BQ_SOURCE_TABLE = "vw_wf_daily_historical"


In [4]:
# NOTE(review): BQ_TRAIN_TABLE is only ever assigned the empty placeholder in the
# visible cells, so this path ends in a trailing "." with no table name. Nothing
# in the visible cells reads TRAINING_DATASET_BQ_PATH - confirm whether it is
# still needed, and set BQ_TRAIN_TABLE if it is.
TRAINING_DATASET_BQ_PATH = f"bq://{PROJECT_ID}.{BQ_DATASET_NAME}.{BQ_TRAIN_TABLE}"

# Bucket name is the project ID and the bucket suffix joined by an underscore;
# this URI is passed to aiplatform.init as the staging bucket.
BUCKET_URI = f"gs://{PROJECT_ID}_{GCS_BUCKET_NAME}"

# Local file the compiled pipeline spec is written to and later submitted from.
PIPELINE_PACKAGE_PATH = "short_term_pipeline.json"

In [5]:
# Attribute columns passed to the pipeline as `attribute_columns`; the
# preprocessing component groups by them and concatenates them into the
# series identifier. Order matters because it fixes the identifier's layout.
EXPERIMENT_FEATURES = [
    "District", "Region_Type",
    "Product", "Product_Grp",
    "Technology",
    "Work_Order_Action", "Work_Order_Action_Grp",
    "Work_Force",
]

In [6]:
# Set the SDK-wide defaults (project, region, staging bucket) that subsequent
# Vertex AI calls in this notebook rely on.
aiplatform.init(
    staging_bucket=BUCKET_URI,
    location=PROJECT_REGION,
    project=PROJECT_ID,
)

In [7]:
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/kfp-preprocess-slim:2.0.1"
)
def query_and_preprocess(
    project_id: str,
    project_location: str,
    bq_dataset: str,
    bq_source_table: str,
    time_column: str,
    target_column: str,
    attribute_columns: List[str],
    output_dataset: Output[Dataset]
):
    """
    Query a BigQuery table, aggregate the target, and export the result as CSV.

    The query sums `target_column` per (`time_column`, attribute columns) group,
    keeping only rows whose `time_column` is on or before 2025-03-31, and adds a
    `Series_Identifier` column built by concatenating the attribute values
    (NULLs replaced by the literal string 'None') separated by single spaces.

    Args:
        project_id: GCP project ID that owns the BigQuery dataset.
        project_location: Location/region used for the BigQuery client.
        bq_dataset: BigQuery dataset name.
        bq_source_table: Source table (or view) name inside `bq_dataset`.
        time_column: Name of the time column to filter and group by.
        target_column: Name of the column that is summed per group.
        attribute_columns: Non-empty list of columns to group by; they also
            form the series identifier, in the order given.
        output_dataset: Output artifact; the CSV is written to its `.path`.

    Raises:
        ValueError: If `attribute_columns` is empty, because the generated SQL
            would otherwise be syntactically invalid.

    Returns:
        Nothing. Writes a CSV (no index column) with the columns
        Series_Identifier, `time_column`, each attribute column, and
        `target_column`.
    """

    # Imports live inside the function because KFP ships only the function body
    # to the component container.
    from google.cloud import bigquery

    if not attribute_columns:
        raise ValueError("attribute_columns must contain at least one column name")

    def create_series_identifier(columns):
        # NULL attributes become the string 'None' so CONCAT never yields NULL.
        coalesce_parts = [f"COALESCE({column}, 'None')" for column in columns]
        glue = ", ' ', "
        return f"CONCAT({glue.join(coalesce_parts)}) AS Series_Identifier"

    # The caller-supplied time_column / target_column are used as-is; they are
    # no longer overwritten with hard-coded names inside the component.
    attribute_list = ','.join(attribute_columns)

    # NOTE(review): the cutoff date below is hard-coded in the SQL text.
    experiment_train_data_query = f"""
    WITH historical_table AS (
        SELECT 
            {time_column},
            {attribute_list},
            SUM({target_column}) AS {target_column}
        FROM `{project_id}.{bq_dataset}.{bq_source_table}`
        WHERE {time_column} <= DATE('2025-03-31')
        GROUP BY {time_column},{attribute_list}
    )
    SELECT 
        {create_series_identifier(attribute_columns)},
        {time_column},
        {attribute_list},
        {target_column}
    FROM historical_table
    """

    client = bigquery.Client(
        project=project_id,
        location=project_location)

    processed_data = client.query(experiment_train_data_query).to_dataframe()

    processed_data.to_csv(output_dataset.path, index=False)
    print(f"CSV file written to {output_dataset.path}")


In [None]:
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/mlops/kfp-2.0.0/kfp-load-model-slim:1.0.0"
)
def split_rolling_forecast(
    input_dataset: Input[Dataset],
    time_column: str,
    output_train1: Output[Dataset],
    output_test1: Output[Dataset],
    output_train2: Output[Dataset],
    output_test2: Output[Dataset],
    output_train3: Output[Dataset],
    output_test3: Output[Dataset],
    output_train4: Output[Dataset],
    output_test4: Output[Dataset],
):
    """
    Cut the preprocessed CSV into four expanding-window train/test pairs.

    Every training window starts on 2022-01-01 and grows by one quarter per
    split; each test window is the quarter immediately after its training
    window. All window bounds are inclusive.

    Args:
        input_dataset: Artifact whose `.path` is the preprocessed CSV to split.
        time_column: Name of the column holding the row timestamps.
        output_train1..output_train4: Artifacts receiving the training CSVs.
        output_test1..output_test4: Artifacts receiving the matching test CSVs.
    """

    import pandas as pd

    frame = pd.read_csv(input_dataset.path, index_col=False)
    frame[time_column] = pd.to_datetime(frame[time_column])

    raw_windows = [
        {
            "train_start": "2022-01-01",
            "train_end":   "2024-03-31",
            "test_start":  "2024-04-01",
            "test_end":    "2024-06-30"
        },
        {
            "train_start": "2022-01-01",
            "train_end":   "2024-06-30",
            "test_start":  "2024-07-01",
            "test_end":    "2024-09-30"
        },
        {
            "train_start": "2022-01-01",
            "train_end":   "2024-09-30",
            "test_start":  "2024-10-01",
            "test_end":    "2024-12-31"
        },
        {
            "train_start": "2022-01-01",
            "train_end":   "2024-12-31",
            "test_start":  "2025-01-01",
            "test_end":    "2025-03-31"
        },
    ]

    # Output artifacts in the same order as the windows above.
    destinations = [
        (output_train1, output_test1),
        (output_train2, output_test2),
        (output_train3, output_test3),
        (output_train4, output_test4),
    ]

    timestamps = frame[time_column]
    for raw_window, (train_out, test_out) in zip(raw_windows, destinations):
        window = {label: pd.to_datetime(day) for label, day in raw_window.items()}

        # Series.between is inclusive on both ends by default.
        in_train = timestamps.between(window["train_start"], window["train_end"])
        in_test = timestamps.between(window["test_start"], window["test_end"])

        frame.loc[in_train].to_csv(train_out.path, index=False)
        frame.loc[in_test].to_csv(test_out.path, index=False)
    
    

In [9]:
# Two-step pipeline: query + preprocess BigQuery data, then split the result
# into rolling train/test datasets.
@dsl.pipeline(
    name="forecast-training-pipeline",
    description="A Kubeflow pipeline for training forecast models using AutoML Forecast on Vertex AI Pipelines from a BigQuery view."
)
def forecast_pipeline(
    project_id: str,
    project_location: str,
    bq_dataset: str,
    bq_source_table: str,
    time_column: str,
    target_column: str,
    attribute_columns: List[str]
):
    # Step 1: run the BigQuery aggregation and export it as a CSV dataset.
    preprocess_step = query_and_preprocess(
        project_id=project_id,
        project_location=project_location,
        bq_dataset=bq_dataset,
        bq_source_table=bq_source_table,
        time_column=time_column,
        target_column=target_column,
        attribute_columns=attribute_columns,
    )

    # Step 2: split the preprocessed CSV; consuming step 1's output artifact
    # is what makes this step run after it.
    split_rolling_forecast(
        input_dataset=preprocess_step.outputs["output_dataset"],
        time_column=time_column,
    )

In [10]:
# Compile the pipeline function into a JSON spec on local disk.
compiler.Compiler().compile(
    package_path=PIPELINE_PACKAGE_PATH,
    pipeline_func=forecast_pipeline,
)

# Build the Vertex AI pipeline job from the compiled spec; the keys of
# parameter_values match the parameters of forecast_pipeline.
job = pipeline_jobs.PipelineJob(
    display_name="b2b_wf_short_term_prediction",
    template_path=PIPELINE_PACKAGE_PATH,
    parameter_values={
        "project_id": PROJECT_ID,
        "project_location": PROJECT_REGION,
        "bq_dataset": BQ_DATASET_NAME,
        "bq_source_table": BQ_SOURCE_TABLE,
        "time_column": "Appointment_Day",
        "target_column": "SWT",
        "attribute_columns": EXPERIMENT_FEATURES,
    },
)

# Submit the job; the captured output below shows the SDK repeatedly printing
# the job state while this call is in progress.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/7796273458/locations/northamerica-northeast1/pipelineJobs/forecast-training-pipeline-20250408144422
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/7796273458/locations/northamerica-northeast1/pipelineJobs/forecast-training-pipeline-20250408144422')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/northamerica-northeast1/pipelines/runs/forecast-training-pipeline-20250408144422?project=7796273458
PipelineJob projects/7796273458/locations/northamerica-northeast1/pipelineJobs/forecast-training-pipeline-20250408144422 current state:
3
PipelineJob projects/7796273458/locations/northamerica-northeast1/pipelineJobs/forecast-training-pipeline-20250408144422 current state:
3
PipelineJob projects/7796273458/locations/northamerica-northeast1/pipelineJobs/forecast-training-pipeline-20250408144422 current state:
3
PipelineJob projects/7796273458/locations/northame