In [1]:
from kfp import dsl
import kfp.compiler as compiler
from kfp.dsl import component, Input, Output, Artifact, Model, Dataset
    
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.custom_job import create_custom_training_job_from_component


from typing import Any, Dict, List

In [2]:
# --- Configuration template -------------------------------------------------
# Every setting the notebook relies on is declared here as an empty string so
# the full list of expected names is visible in one place. Concrete values are
# assigned in a later cell.

# Google Cloud project / storage
PROJECT_ID = ""
GCS_BUCKET_NAME = ""
PROJECT_REGION = ""

# Vertex AI resource names
VERTEX_DATASET_NAME = ""
VERTEX_MODEL_NAME = ""
VERTEX_PREDICTION_NAME = ""

# BigQuery dataset and tables
BQ_DATASET_NAME = ""
BQ_SOURCE_TABLE = ""
BQ_TRAIN_TABLE = ""
BQ_PREDICT_TABLE = ""



In [3]:
# --- Environment-specific values ---------------------------------------------
# NOTE(review): VERTEX_PREDICTION_NAME, BQ_TRAIN_TABLE and BQ_PREDICT_TABLE are
# not assigned in this cell, so they keep whatever value an earlier cell gave
# them (the placeholder cell sets them to ''). Confirm that is intended.

# Google Cloud project / storage
PROJECT_ID = "wb-ai-acltr-tbs-3-pr-a62583"
GCS_BUCKET_NAME = "bkt_b2b_wf_prediction"
PROJECT_REGION = "northamerica-northeast1"

# Vertex AI resource names
VERTEX_DATASET_NAME = "b2b_wf_short_term_prediction"
VERTEX_MODEL_NAME = "b2b_wf_short_term_model"

# BigQuery source
BQ_DATASET_NAME = "b2b_wf_prediction"
BQ_SOURCE_TABLE = "vw_wf_daily_historical"


In [4]:
# --- Derived locations --------------------------------------------------------
# Fully-qualified BigQuery URI of the training table.
# NOTE(review): BQ_TRAIN_TABLE appears to still be '' at this point (only the
# placeholder cell assigns it), which would leave this URI ending in a dot --
# confirm the table name is set before this cell runs.
TRAINING_DATASET_BQ_PATH = f"bq://{PROJECT_ID}.{BQ_DATASET_NAME}.{BQ_TRAIN_TABLE}"

# Cloud Storage bucket URI; the bucket name is prefixed with the project id.
BUCKET_URI = f"gs://{PROJECT_ID}_{GCS_BUCKET_NAME}"

# File name for the compiled pipeline definition.
PIPELINE_PACKAGE_PATH = "short_term_pipeline.json"

In [5]:
# --- Experiment settings ------------------------------------------------------
EXPERIMENT_NAME = "b2b-wf-daily-forecast-model"
RUN_NAME = "sma-training"

# Model family to train; the alternative mentioned here is 'SARIMA'.
MODEL = "SMA"

# Forecast horizon (the split component documents this as a number of days).
FORECAST_HORIZON = 91

# Categorical attribute columns used for the experiment.
EXPERIMENT_FEATURES = [
    "District", "Region_Type",
    "Product", "Product_Grp",
    "Technology",
    "Work_Order_Action", "Work_Order_Action_Grp",
    "Work_Force",
]

# Preprocess

In [6]:
# query_and_preprocess
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc"
)
def query_and_preprocess(
    project_id: str,
    project_location: str,
    bq_dataset: str,
    bq_source_table: str,
    time_column: str,
    target_column: str,
    series_identifier: str,
    attribute_columns: List[str],
    output_dataset: Output[Dataset]
):
    """
    Queries BigQuery data, aggregates it, and exports the result as a CSV dataset.

    The query sums the target column per (time_column, attribute_columns) group,
    keeps only rows whose time_column is on or before 2025-03-31 (hard-coded
    cutoff), and adds a series identifier built by concatenating the attribute
    values separated by a single space; NULL attribute values are replaced by
    the string 'None' inside the identifier only.

    Args:
        project_id: GCP project ID (used for the BigQuery client and the table path)
        project_location: GCP project location/region for the BigQuery client
        bq_dataset: BigQuery dataset name
        bq_source_table: Source table name
        time_column: Name of the date/time column to group and filter by
        target_column: Name of the numeric column that is summed per group
        series_identifier: Name given to the generated identifier column
        attribute_columns: Non-empty list of columns to group by
        output_dataset: Output artifact whose path receives the CSV file

    Raises:
        ValueError: If attribute_columns is empty (the generated SQL would be invalid).

    Returns:
        Nothing. Writes a CSV file (no index column) to output_dataset.path with columns:
        - <series_identifier>: Concatenated string of attribute values
        - <time_column>: Time column
        - <attribute_columns>: The original grouping columns
        - <target_column>: Aggregated (summed) target variable
    """

    from google.cloud import bigquery

    # An empty column list would render as "SELECT <time>, , SUM(...)" and only
    # fail later with an opaque BigQuery syntax error, so fail fast instead.
    if not attribute_columns:
        raise ValueError("attribute_columns must contain at least one column name")

    def create_series_identifier(columns, series_identifier):
        """Build the SQL expression that concatenates the attribute columns."""
        coalesce_parts = [f"COALESCE({column}, 'None')" for column in columns]
        separator = "' '"
        return f"CONCAT({f', {separator}, '.join(coalesce_parts)}) AS {series_identifier}"

    ATTRIBUTE_STRING = ','.join(attribute_columns)

    # NOTE: table and column names are interpolated directly into the SQL text
    # because identifiers cannot be passed as BigQuery query parameters. These
    # values must therefore come from trusted pipeline configuration only.
    experiment_train_data_query = f"""
    WITH historical_table AS (
        SELECT 
            {time_column},
            {ATTRIBUTE_STRING},
            SUM({target_column}) AS {target_column}
        FROM `{project_id}.{bq_dataset}.{bq_source_table}`
        WHERE {time_column} <= DATE('2025-03-31')
        GROUP BY {time_column},{ATTRIBUTE_STRING}
    )
    SELECT 
        {create_series_identifier(attribute_columns, series_identifier)},
        {time_column},
        {ATTRIBUTE_STRING},
        {target_column}
    FROM historical_table
    """

    client = bigquery.Client(
        project=project_id,
        location=project_location)

    # Run the query and materialise the whole result set in memory.
    processed_data = client.query(experiment_train_data_query).to_dataframe()

    processed_data.to_csv(output_dataset.path, index=False)
    print(f"CSV file written to {output_dataset.path}")


In [7]:
# generate_time_series_cv
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc"
)
def generate_time_series_cv(
    input_dataset: Input[Dataset],
    time_column: str,
    forecast_horizon: int,
    output_train: Output[Dataset],
    output_test: Output[Dataset]
):
    """
    Splits preprocessed time series data into expanding-window train/test sets.

    The component reads a CSV dataset, builds four temporal splits and writes one
    combined training CSV and one combined test CSV. Every row carries a
    'split_index' column identifying the split it belongs to. Because all splits
    share the same training start, a row can appear in the training output once
    per split whose training window contains it.

    Split Structure:
        - All splits start training from 2022-01-01 (hard-coded)
        - The first training period ends on 2024-03-31 (hard-coded)
        - Each test period is the forecast_horizon days following its training period
        - Each subsequent training period ends where the previous test period ended
        - Data is labeled with split_index (1-4) to identify which split it belongs to

    Args:
        input_dataset: Input[Dataset]
            The preprocessed CSV dataset containing time series data
        time_column: str
            Name of the column containing timestamps
        forecast_horizon: int
            Number of days for each test period
        output_train: Output[Dataset]
            Output path for the combined training dataset
            Contains all training data with a 'split_index' column
        output_test: Output[Dataset]
            Output path for the combined test dataset
            Contains all test data with a 'split_index' column

    Output Dataset Structure:
        Both training and test datasets include all original columns plus:
        - split_index: int (1-4)
            Identifies which temporal split the row belongs to
            Allows filtering/grouping data by split for analysis or modeling
    """
    # The docstring must be the first statement of the function; imports follow it.
    import pandas as pd
    import logging
    from datetime import timedelta

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Number of splits; split_index runs from 1 to num_splits inclusive.
    num_splits = 4

    logger.info(f"Reading input dataset from: {input_dataset.path}")
    forecast_processed_data = pd.read_csv(input_dataset.path, parse_dates=[time_column])

    train_start = pd.to_datetime("2022-01-01")

    splits = []
    current_train_end = pd.to_datetime("2024-03-31")

    for split_index in range(1, num_splits + 1):
        # Both window bounds are inclusive, hence the "- 1" on the horizon.
        test_start = current_train_end + timedelta(days=1)
        test_end = test_start + timedelta(days=forecast_horizon-1)

        splits.append({
            "split_index": split_index,
            "train_start": train_start,
            "train_end": current_train_end,
            "test_start": test_start,
            "test_end": test_end
        })

        # The next split trains on everything up to the end of this test window.
        current_train_end = test_end

    logger.info("Generated splits:")
    for s in splits:
        logger.info(f"Split {s['split_index']}:")
        logger.info(f"  Train: {s['train_start'].strftime('%Y-%m-%d')} to {s['train_end'].strftime('%Y-%m-%d')}")
        logger.info(f"  Test:  {s['test_start'].strftime('%Y-%m-%d')} to {s['test_end'].strftime('%Y-%m-%d')}")

    all_train_data = []
    all_test_data = []

    for s in splits:
        logger.info(f"\nProcessing split {s['split_index']}")

        train_mask = (forecast_processed_data[time_column] >= s["train_start"]) & \
                     (forecast_processed_data[time_column] <= s["train_end"])

        test_mask = (forecast_processed_data[time_column] >= s["test_start"]) & \
                    (forecast_processed_data[time_column] <= s["test_end"])

        # Copy so that adding split_index does not touch the source frame.
        train_df = forecast_processed_data.loc[train_mask].copy()
        test_df = forecast_processed_data.loc[test_mask].copy()

        train_df['split_index'] = s['split_index']
        test_df['split_index'] = s['split_index']

        all_train_data.append(train_df)
        all_test_data.append(test_df)

        logger.info(f"Split {s['split_index']} - Train shape: {train_df.shape}, Test shape: {test_df.shape}")

    combined_train = pd.concat(all_train_data, ignore_index=True)
    combined_test = pd.concat(all_test_data, ignore_index=True)

    logger.info(f"\nSaving combined training data (shape: {combined_train.shape}) to {output_train.path}")
    combined_train.to_csv(output_train.path, index=False)

    logger.info(f"Saving combined test data (shape: {combined_test.shape}) to {output_test.path}")
    combined_test.to_csv(output_test.path, index=False)

    for split_index in range(1, num_splits + 1):
        train_count = len(combined_train[combined_train['split_index'] == split_index])
        test_count = len(combined_test[combined_test['split_index'] == split_index])
        logger.info(f"\nSplit {split_index} summary:")
        logger.info(f"  Training samples: {train_count}")
        logger.info(f"  Test samples: {test_count}")


# Data Metadata

In [8]:
#generate_dataset_statistics
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc"
)
def generate_dataset_statistics(
    train_dataset: Input[Dataset],
    test_dataset: Input[Dataset],
    time_column: str,
    target_column: str,
    attribute_columns: List[str],
    output_statistics: Output[Artifact]
) -> Dict[str, Dict[str, Dict[str, Any]]]:
    """
    Generates statistics for the training and test datasets produced by the generate_time_series_cv component.

    This component analyzes the combined training and test datasets, which include a split_index column
    identifying different temporal splits. It generates statistics for each split (1 through 4) as well
    as overall statistics, writes them as JSON to output_statistics.path, and returns the same dictionary.

    Args:
        train_dataset: Input[Dataset]
            Combined training dataset (CSV) with split_index column
        test_dataset: Input[Dataset]
            Combined test dataset (CSV) with split_index column
        time_column: str
            Name of the timestamp column
        target_column: str
            Name of the target column (worked hours)
        attribute_columns: List[str]
            List of categorical columns (location, type of work, technology, product)
        output_statistics: Output[Artifact]
            Output artifact to store the generated statistics (JSON)

    Returns:
        Dict containing statistics for:
        - Each training split (train_split_1 through train_split_4)
        - Each test split (test_split_1 through test_split_4)
        - "overall": statistics over the concatenation of both input files
        - "summary": row counts, overall date range and per-split row counts

        For a frame without rows the date_range entries are None and the numeric
        target statistics are NaN (pandas' result for empty input).
    """
    import pandas as pd
    import json
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def format_date(value):
        """Render a timestamp as YYYY-MM-DD; None for NaT, whose strftime raises."""
        return None if pd.isna(value) else value.strftime("%Y-%m-%d")

    def calculate_statistics(df: pd.DataFrame, dataset_type: str, split_number: int) -> Dict[str, Any]:
        """Compute row count, date range, target statistics and per-attribute distributions for df."""
        # Work on a local series so a filtered slice passed by the caller is never written to.
        timestamps = df[time_column]
        if not pd.api.types.is_datetime64_any_dtype(timestamps):
            timestamps = pd.to_datetime(timestamps)

        stats = {
            "dataset_type": dataset_type,
            "split_number": split_number,
            "total_rows": len(df),
            "date_range": {
                "start": format_date(timestamps.min()),
                "end": format_date(timestamps.max())
            },
            "target_column": {
                "mean": float(df[target_column].mean()),
                "median": float(df[target_column].median()),
                "min": float(df[target_column].min()),
                "max": float(df[target_column].max()),
                "std": float(df[target_column].std()),
                "total": float(df[target_column].sum())
            },
            "null_counts": df.isnull().sum().to_dict(),
            "categorical_columns": {}
        }

        for col in attribute_columns:
            value_counts = df[col].value_counts()
            stats["categorical_columns"][col] = {
                "unique_values": int(df[col].nunique()),
                "top_5_values": value_counts.nlargest(5).to_dict(),
                "null_count": int(df[col].isnull().sum()),
                "total_count": int(len(df)),
                # Share of all rows (including rows where col is null) held by each of the top 5 values.
                "distribution_percentage": value_counts.nlargest(5).apply(lambda x: float(x/len(df) * 100)).to_dict()
            }

        return stats

    all_statistics = {}

    logger.info("Reading input datasets")
    train_df = pd.read_csv(train_dataset.path)
    test_df = pd.read_csv(test_dataset.path)

    # Convert the time column to datetime once, up front, for both frames.
    train_df[time_column] = pd.to_datetime(train_df[time_column])
    test_df[time_column] = pd.to_datetime(test_df[time_column])

    logger.info("Processing training splits")
    for split_index in range(1, 5):
        split_train = train_df[train_df['split_index'] == split_index]
        logger.info(f"Processing training split {split_index} (shape: {split_train.shape})")
        all_statistics[f"train_split_{split_index}"] = calculate_statistics(split_train, "train", split_index)

    logger.info("Processing test splits")
    for split_index in range(1, 5):
        split_test = test_df[test_df['split_index'] == split_index]
        logger.info(f"Processing test split {split_index} (shape: {split_test.shape})")
        all_statistics[f"test_split_{split_index}"] = calculate_statistics(split_test, "test", split_index)

    logger.info("Calculating overall statistics")
    # NOTE(review): this concatenates every row of both files. If the splits overlap
    # (e.g. expanding training windows), a record is counted once per split it appears
    # in, which inflates the "overall" totals -- confirm this is the intended meaning.
    all_data = pd.concat([train_df, test_df])
    all_statistics["overall"] = calculate_statistics(all_data, "overall", 0)

    # Calculate summary statistics
    earliest_date = all_data[time_column].min()
    latest_date = all_data[time_column].max()

    all_statistics["summary"] = {
        "total_rows": {
            "train": len(train_df),
            "test": len(test_df),
            "total": len(all_data)
        },
        "date_range": {
            "earliest": format_date(earliest_date),
            "latest": format_date(latest_date)
        },
        "splits_info": {
            f"split_{i}": {
                "train_rows": len(train_df[train_df['split_index'] == i]),
                "test_rows": len(test_df[test_df['split_index'] == i])
            } for i in range(1, 5)
        }
    }

    logger.info(f"Saving statistics to {output_statistics.path}")
    with open(output_statistics.path, "w") as f:
        json.dump(all_statistics, f, indent=2)

    logger.info("Statistics generation completed")
    return all_statistics


In [9]:
# generate_statistics_visualization
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc"
)
def generate_statistics_visualization(
    statistics_artifact: Input[Artifact],
    train_dataset: Input[Artifact],
    test_dataset: Input[Artifact],
    attribute_columns: List[str],
    time_column: str,
    target_column: str,
    output_visualization: Output[Artifact]
):
    """
    Generates an interactive HTML report visualizing the statistics from the dataset analysis.
    Focuses on split 4 and writes three figures, in this order:
    1. Target column statistics card (summary table plus a violin plot of log(target + 1))
    2. Weekly time series comparison between the last 365 days of train data and the
       test data of split 4
    3. Categorical variables distribution (top values per attribute, as percentages)

    Args:
        statistics_artifact: Input artifact containing the JSON statistics; must hold a
            'train_split_4' entry
        train_dataset: Training dataset (CSV with a split_index column)
        test_dataset: Test dataset (CSV with a split_index column)
        attribute_columns: Categorical columns to plot; each must be present under
            'categorical_columns' in the split-4 statistics
        time_column: Name of the timestamp column
        target_column: Name of the target column (worked hours)
        output_visualization: Output artifact for the HTML report
    """
    import json
    import math
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import logging
    from datetime import datetime, timedelta

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    logger.info("Loading statistics from JSON")
    with open(statistics_artifact.path, 'r') as f:
        stats = json.load(f)

    logger.info("Loading datasets")
    train_df = pd.read_csv(train_dataset.path)
    train_df[time_column] = pd.to_datetime(train_df[time_column])

    test_df = pd.read_csv(test_dataset.path)
    test_df[time_column] = pd.to_datetime(test_df[time_column])

    train_split4 = train_df[train_df['split_index'] == 4]
    test_split4 = test_df[test_df['split_index'] == 4]
    split4_stats = stats['train_split_4']

    figures = []

    def create_categorical_distributions():
        """Bar charts of the top-value percentages for every attribute column."""
        logger.info("Creating categorical distributions chart")

        # Two charts per row; size the grid from the column count rather than
        # assuming exactly eight attributes (8 columns -> 4 rows, height 1200).
        n_rows = max(1, math.ceil(len(attribute_columns) / 2))

        fig = make_subplots(
            rows=n_rows, cols=2,
            subplot_titles=attribute_columns,
            # Plotly rejects spacing above 1 / (rows - 1); shrink it for tall grids.
            vertical_spacing=min(0.12, 0.5 / max(n_rows - 1, 1)),
            horizontal_spacing=0.1
        )

        for idx, col in enumerate(attribute_columns, 1):
            row = ((idx-1) // 2) + 1
            col_num = ((idx-1) % 2) + 1

            col_stats = split4_stats['categorical_columns'][col]
            values = list(col_stats['distribution_percentage'].values())
            labels = list(col_stats['distribution_percentage'].keys())

            fig.add_trace(
                go.Bar(
                    x=labels,
                    y=values,
                    name=col,
                    text=[f'{v:.1f}%' for v in values],
                    textposition='auto',
                    showlegend=False
                ),
                row=row, col=col_num
            )

            fig.update_xaxes(tickangle=45, row=row, col=col_num)
            fig.update_yaxes(title='Percentage (%)', row=row, col=col_num)

        fig.update_layout(
            height=300 * n_rows,
            title_text="Categorical Variables Distribution (Train Split 4)",
            showlegend=False
        )
        return fig

    def create_target_stats_card():
        """Summary table of the target statistics next to a violin plot of log(target + 1)."""
        logger.info("Creating target statistics card")
        target_stats = split4_stats['target_column']

        fig = make_subplots(
            rows=1, cols=2,
            column_widths=[0.5, 0.5],
            specs=[[{"type": "table"}, {"type": "violin"}]],
            horizontal_spacing=0.05
        )

        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Metric', 'Value'],
                    fill_color='lightgrey',
                    align='left',
                    font=dict(size=14)
                ),
                cells=dict(
                    values=[
                        ['Mean', 'Standard Deviation', 'Minimum', 'Maximum'],
                        [
                            f"{target_stats['mean']:,.2f}",
                            f"{target_stats['std']:,.2f}",
                            f"{target_stats['min']:,.2f}",
                            f"{target_stats['max']:,.2f}",
                        ]
                    ],
                    align='left',
                    font=dict(size=13)
                )
            ),
            row=1, col=1
        )

        # log1p keeps zero values plottable (log(0 + 1) == 0).
        log_swt = np.log1p(train_split4[target_column])

        fig.add_trace(
            go.Violin(
                y=log_swt,
                name=f"log({target_column} + 1)",
                box_visible=False,
                meanline_visible=True,
                points=False,
                fillcolor='lightblue',
                line_color='blue',
                showlegend=False,
                hovertemplate="log(%{y:.2f})<extra></extra>"
            ),
            row=1, col=2
        )

        fig.update_layout(
            title=f"Target Column Statistics - log({target_column} + 1) (Train Split 4)",
            height=400,
            showlegend=False,
            yaxis=dict(
                title=f"log({target_column} + 1)",
                tickformat=".2f",
                side='right'
            )
        )

        return fig

    def create_weekly_timeseries_comparison():
        """Weekly totals of the target: last 365 days of split-4 train data vs. split-4 test data."""
        logger.info("Creating weekly time series comparison")

        last_train_date = train_split4[time_column].max()
        one_year_ago = last_train_date - timedelta(days=365)

        train_last_year = train_split4[train_split4[time_column] >= one_year_ago].copy()

        # 'W-MON' buckets are weekly periods anchored on Monday.
        train_weekly = train_last_year.groupby(pd.Grouper(key=time_column, freq='W-MON'))[target_column].sum().reset_index()
        test_weekly = test_split4.groupby(pd.Grouper(key=time_column, freq='W-MON'))[target_column].sum().reset_index()

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=train_weekly[time_column],
                y=train_weekly[target_column],
                name='Training Data',
                line=dict(color='blue', width=2),
                mode='lines',
                hovertemplate="Date: %{x}<br>Value: %{y:,.0f}<extra></extra>"
            )
        )

        fig.add_trace(
            go.Scatter(
                x=test_weekly[time_column],
                y=test_weekly[target_column],
                name='Test Data',
                line=dict(color='red', width=2),
                mode='lines',
                hovertemplate="Date: %{x}<br>Value: %{y:,.0f}<extra></extra>"
            )
        )

        fig.update_layout(
            title="Weekly Time Series Comparison (Split 4)",
            xaxis_title="Date",
            yaxis_title=target_column,
            height=500,
            showlegend=True,
            legend=dict(
                yanchor="top",
                y=0.99,
                xanchor="left",
                x=0.01
            ),
            hovermode='x unified'
        )

        return fig

    figures.extend([
        create_target_stats_card(),
        create_weekly_timeseries_comparison(),
        create_categorical_distributions()
    ])

    # plotly.js is deliberately NOT loaded from the "plotly-latest" CDN alias here:
    # that alias is frozen at an old 1.x release. The first figure emits the CDN
    # <script> tag matching the installed plotly version instead (see loop below).
    html_parts = [
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        "<meta charset='utf-8'>",
        "<title>Workforce Time Series Analysis Report - Split 4</title>",
        "<style>",
        "body { margin: 20px; }",
        ".header { text-align: center; margin-bottom: 30px; }",
        ".plot { margin-bottom: 40px; }",
        ".stats-summary { background: #f5f5f5; padding: 20px; border-radius: 5px; margin-bottom: 20px; }",
        "</style>",
        "</head>",
        "<body>",
        "<div class='header'>",
        "<h1>Workforce Time Series Analysis Report - Split 4</h1>",
        f"<p>Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>",
        "</div>",
        "<div class='stats-summary'>",
        "<h2>Dataset Overview</h2>",
        f"<p>Total Records: {split4_stats['total_rows']:,}</p>",
        f"<p>Date Range: {split4_stats['date_range']['start']} to {split4_stats['date_range']['end']}</p>",
        "</div>"
    ]

    for position, fig in enumerate(figures):
        # Only the first figure pulls in plotly.js; the others reuse the loaded library.
        plotlyjs = 'cdn' if position == 0 else False
        html_parts.append(f"<div class='plot'>{fig.to_html(full_html=False, include_plotlyjs=plotlyjs)}</div>")

    html_parts.extend([
        "</body>",
        "</html>"
    ])

    logger.info(f"Saving HTML report to {output_visualization.path}")
    # Explicit UTF-8 so labels with non-ASCII characters are written consistently.
    with open(output_visualization.path, "w", encoding="utf-8") as f:
        f.write("\n".join(html_parts))

    logger.info("Visualization generation completed")


# Model training

In [10]:
#sma_trainer_component
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc"
)
def sma_trainer_component(
    dataset: Input[Dataset],
    experiment_name: str,
    window_size: int,
    project_id: str,
    project_location: str,
    time_column: str,
    target_column: str,
    series_identifier: str,
    forecast_horizon: int,
    run_name: str,
    output_model: Output[Model],
    experiment_run: Output[Artifact]
):
    """
    Produces simple-moving-average (SMA) forecasts for every series/split combination.

    For each (series, split_index) pair found in the dataset, the last window_size
    observations seed a rolling window; each forecast step is the mean of the window
    and is then fed back into the window for the next step. The forecasts are pickled
    to output_model.path, and parameters/metrics are logged to a Vertex AI experiment
    run named "<run_name>-<timestamp>".

    Args:
        dataset: CSV dataset containing time_column, target_column, series_identifier
            and a 'split_index' column
        experiment_name: Vertex AI experiment to log the run under
        window_size: Number of most recent observations in the moving average (>= 1);
            series shorter than this use all of their observations
        project_id: GCP project ID used to initialise the Vertex AI SDK
        project_location: GCP region used to initialise the Vertex AI SDK
        time_column: Name of the timestamp column
        target_column: Name of the numeric column to forecast
        series_identifier: Name of the column identifying each series
        forecast_horizon: Number of daily steps to forecast after the last observation
        run_name: Prefix of the experiment run name
        output_model: Output artifact receiving the pickled dict with keys
            'model_type', 'parameters' and 'predictions' (keyed by (series_id, split_index))
        experiment_run: Output artifact receiving a JSON description of the run

    Raises:
        ValueError: If window_size is smaller than 1.
    """
    import pickle
    from google.cloud import aiplatform
    import pandas as pd
    import numpy as np
    from datetime import datetime, timedelta
    import json
    import logging

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

    # values[-0:] is the whole array and negative sizes slice from the front, so a
    # non-positive window would silently average the wrong observations.
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    def generate_rolling_predictions(values, window_size, horizon):
        """Generate predictions using a rolling window approach"""
        predictions = []
        # Force a float window: with an integer target column the array would be
        # integer-typed and writing the (fractional) mean back into it would truncate.
        rolling_window = np.array(values[-window_size:], dtype=float)

        for _ in range(horizon):
            next_pred = np.mean(rolling_window)
            predictions.append(next_pred)
            # Drop the oldest value and append the new prediction.
            rolling_window = np.roll(rolling_window, -1)
            rolling_window[-1] = next_pred

        return predictions

    logger.info(f"Starting SMA trainer component with window size: {window_size}")

    logger.info(f"Initializing Vertex AI SDK for project: {project_id}, location: {project_location}")
    aiplatform.init(project=project_id, location=project_location, experiment=experiment_name)

    logger.info(f"Reading dataset from: {dataset.path}")
    df = pd.read_csv(dataset.path, parse_dates=[time_column])
    logger.info(f"Dataset shape: {df.shape}")

    model_output = {
        'model_type': 'SMA',
        'parameters': {'window_size': window_size},
        'predictions': {}
    }

    timestamp = datetime.now().strftime('%Y-%m-%d-%H%M')

    # The context manager ends the run when the block exits (also on error), so the
    # run is not left open in the experiment.
    with aiplatform.start_run(f"{run_name}-{timestamp}") as run:
        unique_series = df[series_identifier].unique()
        unique_splits = df['split_index'].unique()

        total_combinations = len(unique_series) * len(unique_splits)
        successful_predictions = 0

        for series_id in unique_series:
            for split_index in unique_splits:
                series_data = df[
                    (df[series_identifier] == series_id) &
                    (df['split_index'] == split_index)
                ].sort_values(time_column)

                if len(series_data) == 0:
                    logger.warning(f"No data for series {series_id}, split {split_index}")
                    continue

                # Best effort: a failure on one series is logged and must not abort the rest.
                try:
                    historical_values = series_data[target_column].values
                    future_predictions = generate_rolling_predictions(
                        historical_values,
                        window_size,
                        forecast_horizon
                    )

                    last_date = series_data[time_column].iloc[-1]
                    future_dates = [(last_date + timedelta(days=i+1)).strftime('%Y-%m-%d')
                                  for i in range(forecast_horizon)]

                    model_output['predictions'][(series_id, split_index)] = {
                        'series_id': series_id,
                        'split_index': split_index,
                        'timestamps': future_dates,
                        'values': future_predictions,
                        'metadata': {
                            'last_training_date': last_date.strftime('%Y-%m-%d'),
                            'window_size': window_size
                        }
                    }
                    successful_predictions += 1
                except Exception as e:
                    logger.error(f"Error processing series {series_id}, split {split_index}: {str(e)}")

        logger.info(f"Successfully processed {successful_predictions} out of {total_combinations} combinations")

        with open(output_model.path, "wb") as f:
            pickle.dump(model_output, f)

        run.log_params({
            "window_size": window_size,
            "model_uri": output_model.path,
            "forecast_horizon": forecast_horizon,
            "total_series": len(unique_series),
            "total_splits": len(unique_splits)
        })

        # An empty dataset yields zero combinations; report 0.0 instead of dividing by zero.
        completion_rate = (
            successful_predictions / total_combinations if total_combinations else 0.0
        )

        run.log_metrics({
            "total_combinations": total_combinations,
            "successful_predictions": successful_predictions,
            "completion_rate": completion_rate
        })

        run_info = {
            "run_name": run.name,
            "experiment": experiment_name,
            "project_id": project_id,
            "location": project_location
        }
        with open(experiment_run.path, 'w') as f:
            json.dump(run_info, f)

    logger.info("SMA trainer component completed successfully")

# Custom-training-job wrapper around the SMA trainer component: same component,
# but with an explicit display name, machine type and service account.
# NOTE(review): the service account embeds the project id as a literal; keep it
# in sync with PROJECT_ID if the project changes.
sma_trainer_component_job = create_custom_training_job_from_component(
    sma_trainer_component,
    service_account="notebook-service-account@wb-ai-acltr-tbs-3-pr-a62583.iam.gserviceaccount.com",
    machine_type="e2-highcpu-16",
    display_name="sma-model-training",
)


In [11]:
#sarima_trainer_component
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc"
)
def sarima_trainer_component(
    dataset: Input[Dataset],
    experiment_name: str,
    project_id: str,
    project_location: str,
    time_column: str,
    target_column: str,
    series_identifier: str,
    forecast_horizon: int,
    run_name: str,
    output_model: Output[Model],
    experiment_run: Output[Artifact]
):
    """Fit one auto-ARIMA model per (series, split) pair and forecast ahead.

    Args:
        dataset: CSV artifact; must contain ``time_column``, ``target_column``,
            ``series_identifier`` and a ``split_index`` column.
        experiment_name: Vertex AI experiment the run is recorded under.
        project_id: GCP project passed to ``aiplatform.init``.
        project_location: GCP region passed to ``aiplatform.init``.
        time_column: Name of the date column (parsed as datetime on load).
        target_column: Name of the column being forecast.
        series_identifier: Name of the column identifying each series.
        forecast_horizon: Number of daily steps to forecast after the last
            training date of each series/split.
        run_name: Prefix of the experiment run name; a timestamp is appended.
        output_model: Receives a pickled dict with ``model_type``,
            ``parameters`` and ``predictions`` (both keyed by
            ``(series_id, split_index)``).
        experiment_run: Receives a JSON file with the run name, experiment,
            project and location so later components can resume the run.
    """

    import pickle
    from google.cloud import aiplatform
    import pandas as pd
    import numpy as np
    from datetime import datetime, timedelta
    import json
    import logging
    from concurrent.futures import ProcessPoolExecutor
    import os
    from pmdarima import auto_arima

    # Include the process name so log lines from worker processes are attributable.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(processName)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

    def fit_sarima_model(values):
        """Fit a weekly-seasonal (m=7) model with auto_arima; return None on failure."""
        if len(values) == 0:
            logger.warning("Empty array passed to fit_sarima_model. Returning None.")
            return None
        try:
            model = auto_arima(values, seasonal=True, m=7, suppress_warnings=True, error_action="ignore")
            return model
        except Exception as e:
            logger.error(f"Error in fit_sarima_model: {str(e)}")
            return None

    def generate_sarima_predictions(model, horizon):
        """Forecast ``horizon`` steps; fall back to zeros if the model is missing or fails."""
        if model is None:
            logger.warning("None model passed to generate_sarima_predictions. Returning zeros.")
            return np.zeros(horizon)
        try:
            return model.predict(n_periods=horizon)
        except Exception as e:
            logger.error(f"Error in generate_sarima_predictions: {str(e)}")
            return np.zeros(horizon)

    def process_series_split_sarima(args):
        """Worker: fit and forecast a single series/split; returns a result dict or None."""
        series_id, split_index, series_data, forecast_horizon, time_column, target_column = args

        try:
            if len(series_data) == 0:
                return None

            historical_values = series_data[target_column].values

            sarima_model = fit_sarima_model(historical_values)
            if sarima_model is None:
                logger.warning(f"Could not fit SARIMA model for series {series_id} in split {split_index}")
                return None

            future_predictions = generate_sarima_predictions(sarima_model, forecast_horizon)

            last_date = series_data[time_column].iloc[-1]
            future_dates = [(last_date + timedelta(days=i+1)).strftime('%Y-%m-%d')
                            for i in range(forecast_horizon)]

            # `order` and `seasonal_order` are attributes of the fitted pmdarima
            # estimator, not methods; calling them raised a TypeError that the
            # except below swallowed, so every series was reported as failed.
            order = tuple(sarima_model.order)
            seasonal_order = tuple(sarima_model.seasonal_order)

            return {
                'series_id': series_id,
                'split_index': split_index,
                'timestamps': future_dates,
                'values': np.asarray(future_predictions).tolist(),
                'metadata': {
                    'last_training_date': last_date.strftime('%Y-%m-%d'),
                    'model_order': order,
                    'seasonal_order': seasonal_order
                },
                'parameters': {
                    'order': order,
                    'seasonal_order': seasonal_order
                }
            }
        except Exception as e:
            logger.error(f"Error processing series {series_id}, split {split_index}: {str(e)}")
            return None

    logger.info("Starting optimized SARIMA trainer component")

    logger.info(f"Initializing Vertex AI SDK for project: {project_id}, location: {project_location}")
    aiplatform.init(project=project_id, location=project_location, experiment=experiment_name)

    logger.info(f"Reading dataset from: {dataset.path}")
    df = pd.read_csv(dataset.path, parse_dates=[time_column])
    logger.info(f"Dataset shape: {df.shape}")

    model_output = {
        'model_type': 'SARIMA',
        'parameters': {},
        'predictions': {}
    }

    timestamp = datetime.now().strftime('%Y-%m-%d-%H%M')
    run = aiplatform.start_run(f"{run_name}-{timestamp}")

    unique_series = df[series_identifier].unique()
    unique_splits = df['split_index'].unique()

    # One work item per series/split pair, including pairs with no rows
    # (the worker returns None for those and they count as unsuccessful).
    work_items = []
    for series_id in unique_series:
        for split_index in unique_splits:
            series_split_df = df[(df[series_identifier] == series_id) &
                               (df['split_index'] == split_index)].sort_values(time_column)

            work_items.append((
                series_id,
                split_index,
                series_split_df,
                forecast_horizon,
                time_column,
                target_column
            ))

    total_combinations = len(work_items)
    logger.info(f"Processing {total_combinations} series-split combinations using multiprocessing")

    # Leave one CPU free, but never request fewer than one worker:
    # os.cpu_count() may return None, and a single-CPU host would yield 0.
    num_processes = max(1, (os.cpu_count() or 1) - 1)
    logger.info(f"Using {num_processes} processes for parallel processing")

    # ProcessPoolExecutor pickles the worker callable; a function nested inside
    # this component cannot be pickled by reference with the stdlib pickler.
    # If that happens, run sequentially instead of failing the whole component.
    try:
        with ProcessPoolExecutor(max_workers=num_processes) as executor:
            results = list(executor.map(process_series_split_sarima, work_items))
    except (pickle.PicklingError, AttributeError) as e:
        logger.warning(f"Process pool could not serialise the worker ({str(e)}); falling back to sequential processing")
        num_processes = 1
        results = [process_series_split_sarima(item) for item in work_items]

    # Collect results. Keys are (series_id, split_index) tuples because the
    # evaluator component unpacks each prediction key into exactly that pair.
    successful_predictions = 0
    for result in results:
        if result is not None:
            result_key = (result['series_id'], result['split_index'])
            model_output['predictions'][result_key] = result
            model_output['parameters'][result_key] = result['parameters']
            successful_predictions += 1

    logger.info(f"Successfully processed {successful_predictions} out of {total_combinations} combinations")

    with open(output_model.path, "wb") as f:
        pickle.dump(model_output, f)

    run.log_params({
        "model_type": "SARIMA",
        "model_uri": output_model.path,
        "forecast_horizon": forecast_horizon,
        "total_series": len(unique_series),
        "total_splits": len(unique_splits),
        "num_processes": num_processes
    })

    # Guard against an empty dataset (no combinations) to avoid ZeroDivisionError.
    completion_rate = successful_predictions / total_combinations if total_combinations else 0.0
    run.log_metrics({
        "total_combinations": total_combinations,
        "successful_predictions": successful_predictions,
        "completion_rate": completion_rate
    })

    # Persist the run coordinates so downstream components can resume this run.
    run_info = {
        "run_name": run.name,
        "experiment": experiment_name,
        "project_id": project_id,
        "location": project_location
    }
    with open(experiment_run.path, 'w') as f:
        json.dump(run_info, f)

    logger.info("Optimized SARIMA trainer component completed successfully")

# Wrap the SARIMA trainer component so that it runs as a Vertex AI custom
# training job on the requested machine type under the given service account.
sarima_trainer_component_job = create_custom_training_job_from_component(
    sarima_trainer_component,
    service_account='notebook-service-account@wb-ai-acltr-tbs-3-pr-a62583.iam.gserviceaccount.com',
    machine_type='e2-highcpu-16',
    display_name='sarima-model-training',
)

# Model Eval

In [12]:
#model_evaluator_component
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc"
)
def model_evaluator_component(
    test_dataset: Input[Dataset],
    trained_model: Input[Model],
    experiment_run: Input[Artifact],
    time_column: str,
    target_column: str,
    series_identifier: str,
    output_metrics: Output[Artifact]
):
    """Score stored forecasts against the test set, per tier and overall.

    Args:
        test_dataset: CSV artifact containing ``time_column``,
            ``target_column``, ``series_identifier`` and ``split_index``.
        trained_model: Pickled dict with ``model_type``, ``predictions`` and
            optionally ``parameters``, as written by the trainer components.
        experiment_run: JSON file with ``project_id``, ``location``,
            ``experiment`` and ``run_name`` of the run to resume.
        time_column: Name of the date column.
        target_column: Name of the column holding actual values.
        series_identifier: Name of the column identifying each series.
        output_metrics: Receives the computed metrics as JSON.
    """
    import pickle
    from google.cloud import aiplatform
    import pandas as pd
    import numpy as np
    import logging
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import json

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    def get_tier(series_id):
        """Return the tier label embedded in the series id, or "Unknown"."""
        if "Tier 1" in series_id:
            return "Tier 1"
        elif "Tier 2" in series_id:
            return "Tier 2"
        elif "Tier 3" in series_id:
            return "Tier 3"
        return "Unknown"

    def calculate_metrics(y_true, y_pred):
        """Compute MAE, WAPE, RMSE and bias (mean of prediction minus actual)."""
        mae = mean_absolute_error(y_true, y_pred)
        # The small epsilon keeps WAPE finite when all actuals are zero.
        wape = np.sum(np.abs(y_true - y_pred)) / (np.sum(np.abs(y_true)) + 1e-8)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        bias = np.mean(y_pred - y_true)

        return {
            'MAE': mae,
            'WAPE': wape,
            'RMSE': rmse,
            'Bias': bias
        }

    logger.info("Starting model evaluator component")

    with open(experiment_run.path, 'r') as f:
        run_info = json.load(f)

    logger.info("Initializing Vertex AI SDK and continuing experiment run")
    aiplatform.init(project=run_info['project_id'], location=run_info['location'], experiment=run_info['experiment'])
    run = aiplatform.start_run(run_info['run_name'], resume=True)

    logger.info(f"Reading test dataset from: {test_dataset.path}")
    test_df = pd.read_csv(test_dataset.path)
    test_df[time_column] = pd.to_datetime(test_df[time_column])

    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # only safe because the artifact is produced by this pipeline's own trainer.
    logger.info(f"Loading model predictions from: {trained_model.path}")
    with open(trained_model.path, "rb") as f:
        model_output = pickle.load(f)

    tier_predictions = {
        "Tier 1": {"true": [], "pred": []},
        "Tier 2": {"true": [], "pred": []},
        "Tier 3": {"true": [], "pred": []},
        "Overall": {"true": [], "pred": []}
    }

    logger.info(f"Evaluating model type: {model_output['model_type']}")

    for key, predictions in model_output['predictions'].items():
        # Prediction keys are normally (series_id, split_index) tuples, but a
        # trainer may use a flat string key; in that case read the ids stored
        # inside the prediction record instead of failing on unpacking.
        if isinstance(key, tuple) and len(key) == 2:
            series_id, split = key
        else:
            series_id, split = predictions['series_id'], predictions['split_index']

        tier = get_tier(series_id)

        series_test = test_df[(test_df[series_identifier] == series_id) &
                              (test_df['split_index'] == split)].sort_values(time_column)

        if len(series_test) == 0:
            logger.warning(f"No test data for series {series_id}, split {split}")
            continue

        # Align predictions to actuals by date. Truncating the prediction list
        # to len(y_true) pairs the wrong days whenever the test data is missing
        # a date inside the forecast horizon.
        pred_by_date = dict(zip(pd.to_datetime(predictions['timestamps']), predictions['values']))
        matched = series_test[series_test[time_column].isin(list(pred_by_date))]
        y_true = matched[target_column].to_numpy()
        y_pred = matched[time_column].map(pred_by_date).to_numpy(dtype=float)

        if len(y_true) == 0 or len(y_pred) == 0:
            logger.warning(f"No matching data for series {series_id}, split {split}")
            continue

        logger.info(f"Series {series_id}, Split {split}: True shape: {y_true.shape}, Pred shape: {y_pred.shape}")

        # Series without a recognised tier only contribute to "Overall".
        if tier in tier_predictions:
            tier_predictions[tier]["true"].extend(y_true)
            tier_predictions[tier]["pred"].extend(y_pred)
        tier_predictions["Overall"]["true"].extend(y_true)
        tier_predictions["Overall"]["pred"].extend(y_pred)

    all_metrics = {}

    for tier, values in tier_predictions.items():
        if not len(values['true']):
            logger.warning(f"No data for {tier}")
            continue

        y_true = np.array(values["true"])
        y_pred = np.array(values["pred"])

        logger.info(f"{tier} - True shape: {y_true.shape}, Pred shape: {y_pred.shape}")

        metrics = calculate_metrics(y_true, y_pred)

        prefix = "Overall_" if tier == "Overall" else f"{tier}_"
        # Cast to plain floats so the values are safe for both the experiment
        # tracker and json.dump.
        all_metrics.update({
            f"{prefix}{metric_name}": float(value)
            for metric_name, value in metrics.items()
        })

        logger.info(f"\n{tier} Metrics:")
        for metric_name, value in metrics.items():
            logger.info(f"{metric_name}: {value:.4f}")

    run.log_metrics(all_metrics)

    model_params = {
        "model_type": model_output['model_type'],
        "time_column": time_column,
        "target_column": target_column,
        "series_identifier": series_identifier
    }

    # Both global parameters (flat dict) and per-series parameters (dict of
    # dicts) are dicts, so distinguish them by the type of their values.
    parameters = model_output.get('parameters') or {}
    if isinstance(parameters, dict) and parameters:
        if all(isinstance(v, dict) for v in parameters.values()):
            # Per-series parameters: log the first series as an example.
            first_series_params = next(iter(parameters.values()))
            model_params.update({
                f"example_series_params_{k}": str(v)
                for k, v in first_series_params.items()
            })
        else:
            # Global parameters shared by all series.
            model_params.update(parameters)

    run.log_params(model_params)

    logger.info("Model evaluator component completed successfully")

    with open(output_metrics.path, 'w') as f:
        json.dump(all_metrics, f)


In [13]:
#generate_model_report_card
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc"
)
def generate_model_report_card(
    evaluation_metrics: Input[Artifact],
    statistics_artifact: Input[Artifact],
    trained_model: Input[Model],  # Added to access model parameters
    time_column: str,
    target_column: str,
    series_identifier: str,
    output_report: Output[Artifact]
):
    """Render an HTML report card from evaluation metrics and dataset statistics.

    Args:
        evaluation_metrics: JSON file of metrics keyed ``"<tier>_<metric>"``
            (e.g. ``Overall_MAE``, ``Tier 1_WAPE``).
        statistics_artifact: JSON file with a ``summary`` section and a
            ``train_split_4`` section (``null_counts``, ``categorical_columns``).
        trained_model: Pickled model output dict; ``model_type`` and
            ``parameters`` are read for the overview text.
        time_column: Name of the date column, shown in the feature tables.
        target_column: Name of the target column, shown in the feature tables.
        series_identifier: Name of the series id column, shown in the tables.
        output_report: Receives the generated HTML document.
    """
    import json
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import logging
    from datetime import datetime
    import pickle

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    logger.info("Loading metrics, statistics, and model")
    with open(evaluation_metrics.path, 'r') as f:
        metrics = json.load(f)
    with open(statistics_artifact.path, 'r') as f:
        stats = json.load(f)
    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # only safe because the artifact is produced by this pipeline's own trainer.
    with open(trained_model.path, 'rb') as f:
        model_output = pickle.load(f)

    def create_metrics_comparison():
        """Build a 2x2 grid of bar charts, one metric per panel, one bar per tier."""
        logger.info("Creating metrics comparison plots")

        tiers = ['Overall', 'Tier 1', 'Tier 2', 'Tier 3']
        metric_types = ['MAE', 'WAPE', 'RMSE', 'Bias']

        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=metric_types,
            vertical_spacing=0.15,
            horizontal_spacing=0.1
        )

        colors = px.colors.qualitative.Set3

        for idx, metric in enumerate(metric_types, 1):
            # Map the 1-based metric index onto the 2x2 grid, row-major.
            row = ((idx-1) // 2) + 1
            col = ((idx-1) % 2) + 1

            # Missing tier/metric combinations are plotted as gaps labelled N/A.
            values = []
            for tier in tiers:
                prefix = 'Overall_' if tier == 'Overall' else f"{tier}_"
                values.append(metrics.get(f"{prefix}{metric}"))

            fig.add_trace(
                go.Bar(
                    x=tiers,
                    y=values,
                    name=metric,
                    text=[f'{v:.2f}' if v is not None else 'N/A' for v in values],
                    textposition='auto',
                    marker_color=colors[idx-1],
                    showlegend=False
                ),
                row=row, col=col
            )

            fig.update_xaxes(title='Tier', row=row, col=col)
            fig.update_yaxes(title=metric, row=row, col=col)

        fig.update_layout(
            height=800,
            title_text="Model Performance Metrics by Tier",
            showlegend=False
        )
        return fig

    def create_dataset_summary():
        """Build a table of row counts and date range from the statistics summary."""
        logger.info("Creating dataset summary")

        summary = stats['summary']
        split_info = summary['splits_info']

        # NOTE(review): the split shown is hard-coded to 'split_4' — confirm
        # the statistics component always emits that key.
        fig = go.Figure(data=[go.Table(
            header=dict(
                values=['Metric', 'Value'],
                fill_color='lightgrey',
                align='left',
                font=dict(size=14)
            ),
            cells=dict(
                values=[
                    [
                        'Total Training Rows',
                        'Total Test Rows',
                        'Date Range',
                        'Split 4 Training Rows',
                        'Split 4 Test Rows'
                    ],
                    [
                        f"{summary['total_rows']['train']:,}",
                        f"{summary['total_rows']['test']:,}",
                        f"{summary['date_range']['earliest']} to {summary['date_range']['latest']}",
                        f"{split_info['split_4']['train_rows']:,}",
                        f"{split_info['split_4']['test_rows']:,}"
                    ]
                ],
                align='left',
                font=dict(size=13)
            )
        )])

        fig.update_layout(
            title="Dataset Summary",
            height=300
        )
        return fig

    def create_feature_summary():
        """Build a table of null counts and unique-value counts per feature."""
        logger.info("Creating detailed feature summary")

        split_stats = stats['train_split_4']

        feature_names = []
        null_counts = []
        unique_values = []

        feature_names.append(time_column)
        null_counts.append(split_stats['null_counts'].get(time_column, 'N/A'))
        unique_values.append('N/A (Date column)')

        feature_names.append(target_column)
        null_counts.append(split_stats['null_counts'].get(target_column, 'N/A'))
        unique_values.append('N/A (Continuous)')

        feature_names.append(series_identifier)
        null_counts.append(split_stats['null_counts'].get(series_identifier, 'N/A'))
        unique_values.append(split_stats['categorical_columns'].get(series_identifier, {}).get('unique_values', 'N/A'))

        # Remaining categorical columns; the series identifier was added above.
        for col, col_stats in split_stats['categorical_columns'].items():
            if col != series_identifier:
                feature_names.append(col)
                null_counts.append(split_stats['null_counts'].get(col, 'N/A'))
                unique_values.append(col_stats['unique_values'])

        fig = go.Figure(data=[go.Table(
            header=dict(
                values=['Feature', 'Null Count', 'Unique Values'],
                fill_color='lightgrey',
                align='left',
                font=dict(size=14)
            ),
            cells=dict(
                values=[feature_names, null_counts, unique_values],
                align='left',
                font=dict(size=13)
            )
        )])

        fig.update_layout(
            title="Feature Summary",
            height=400
        )
        return fig

    def get_model_parameters_description():
        """Dynamically generate model parameters description based on model type"""
        model_type = model_output['model_type']
        parameters = model_output.get('parameters', {})

        if model_type == 'SMA':
            return f"The Simple Moving Average model uses a window size of {parameters.get('window_size')} to generate predictions based on historical values."
        elif model_type == 'SARIMA':
            # Show the first series as an example; if no series was fitted the
            # parameters dict is empty, so avoid raising StopIteration.
            first_series_params = next(iter(parameters.values()), None)
            if first_series_params is None:
                return ("The SARIMA model uses automatically determined parameters for each series. "
                        "No series was fitted successfully, so no example parameters are available.")
            return (f"The SARIMA model uses automatically determined parameters for each series. "
                   f"Example parameters from one series - Order: {first_series_params.get('order')}, "
                   f"Seasonal Order: {first_series_params.get('seasonal_order')}")
        else:
            return f"This is a {model_type} model. Refer to model documentation for specific parameter details."

    figures = [
        create_metrics_comparison(),
        create_dataset_summary(),
        create_feature_summary()
    ]

    html_parts = [
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        "<title>Model Report Card</title>",
        "<style>",
        "body { margin: 20px; font-family: Arial, sans-serif; }",
        ".header { text-align: center; margin-bottom: 30px; }",
        ".plot { margin-bottom: 40px; }",
        ".metrics-summary { background: #f5f5f5; padding: 20px; border-radius: 5px; margin-bottom: 20px; }",
        ".section { margin-bottom: 40px; }",
        "h2 { color: #2c3e50; }",
        "</style>",
        "</head>",
        "<body>",
        "<div class='header'>",
        "<h1>Model Performance Report Card</h1>",
        f"<p>Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>",
        "</div>",
        "<div class='section'>",
        "<h2>Model Overview</h2>",
        f"<p>This report presents the performance metrics of the {model_output['model_type']} model across different tiers, ",
        "along with dataset statistics and detailed feature information.</p>",
        f"<p>{get_model_parameters_description()}</p>",
        "</div>"
    ]

    # Let plotly emit the CDN <script> tag with the first figure so the
    # plotly.js version matches the installed plotly.py; the 'plotly-latest'
    # CDN alias is no longer updated and can mis-render newer figures.
    for position, fig in enumerate(figures):
        include_js = 'cdn' if position == 0 else False
        html_parts.append(f"<div class='plot'>{fig.to_html(full_html=False, include_plotlyjs=include_js)}</div>")

    html_parts.extend([
        "<div class='section'>",
        "<h2>Metrics Interpretation</h2>",
        "<ul>",
        "<li><strong>MAE (Mean Absolute Error):</strong> Average absolute difference between predicted and actual values.</li>",
        "<li><strong>WAPE (Weighted Absolute Percentage Error):</strong> Percentage error weighted by the magnitude of actual values.</li>",
        "<li><strong>RMSE (Root Mean Square Error):</strong> Square root of the average squared differences, penalizing larger errors.</li>",
        "<li><strong>Bias:</strong> Average difference between predicted and actual values, indicating systematic over/under-prediction.</li>",
        "</ul>",
        "</div>",
        "<div class='section'>",
        "<h2>Feature Information</h2>",
        "<ul>",
        f"<li><strong>Time Column ({time_column}):</strong> Used to order the time series data.</li>",
        f"<li><strong>Target Column ({target_column}):</strong> The variable being predicted by the model.</li>",
        f"<li><strong>Series Identifier ({series_identifier}):</strong> Used to distinguish between different time series within the dataset.</li>",
        "</ul>",
        "</div>",
        "</body>",
        "</html>"
    ])

    logger.info(f"Saving HTML report to {output_report.path}")
    with open(output_report.path, "w") as f:
        f.write("\n".join(html_parts))

    logger.info("Report card generation completed")


In [17]:
#log_reports
@component(
    base_image='northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.2-rc'
)
def log_reports(
    data_report: Input[Artifact],
    eval_report: Input[Artifact],
    experiment_run: Input[Artifact]
):
    """Attach the report locations to the experiment run and close the run.

    Args:
        data_report: Dataset report artifact; its path is logged as
            ``dataset_report_uri``.
        eval_report: Model evaluation report artifact; its path is logged as
            ``model_eval_uri``.
        experiment_run: JSON file holding ``project_id``, ``location``,
            ``experiment`` and ``run_name`` of the run to resume.
    """
    from google.cloud import aiplatform
    import logging
    import json

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    # Recover the coordinates of the run started by the trainer component.
    with open(experiment_run.path, 'r') as run_file:
        run_details = json.load(run_file)

    logger.info(f"Initializing Vertex AI SDK and continuing experiment run")
    aiplatform.init(
        project=run_details['project_id'],
        location=run_details['location'],
        experiment=run_details['experiment'],
    )
    resumed_run = aiplatform.start_run(run_details['run_name'], resume=True)

    report_locations = {
        'dataset_report_uri': data_report.path,
        'model_eval_uri': eval_report.path,
    }
    resumed_run.log_params(report_locations)

    # This is the last component touching the run, so close it here.
    resumed_run.end_run()

# Build Pipeline

In [15]:
@dsl.pipeline(
    name="b2b-wf-short-term-prediction-experiments",
    description="A Kubeflow pipeline for training forecast models using AutoML Forecast on Vertex AI Pipelines from a BigQuery view."
)
def forecast_pipeline(
    project_id: str,
    project_location: str,
    bq_dataset: str,
    bq_source_table: str,
    time_column: str,
    target_column: str,
    series_identifier: str,
    window_size: int,
    experiment_name: str,
    forecast_horizon: int,
    run_name: str,
    attribute_columns: List[str]
):
    """Wire preprocessing, CV splitting, statistics, training, evaluation and reporting.

    The trainer step is chosen when the pipeline is compiled, from the
    notebook-level ``MODEL`` constant ('SMA' or 'SARIMA'); ``window_size`` is
    only passed to the SMA trainer.
    """
    query_and_preprocess_task = query_and_preprocess(
        project_id=project_id,
        project_location=project_location,
        bq_dataset=bq_dataset,
        bq_source_table=bq_source_table,
        time_column=time_column,
        target_column=target_column,
        series_identifier=series_identifier,
        attribute_columns=attribute_columns
    )

    generate_time_series_cv_task = generate_time_series_cv(
        time_column=time_column,
        input_dataset=query_and_preprocess_task.outputs['output_dataset'],
        forecast_horizon=forecast_horizon
    )

    generate_dataset_statistics_task = generate_dataset_statistics(
        train_dataset=generate_time_series_cv_task.outputs['output_train'],
        test_dataset=generate_time_series_cv_task.outputs['output_test'],
        time_column=time_column,
        target_column=target_column,
        attribute_columns=attribute_columns
    )

    generate_statistics_visualization_task = generate_statistics_visualization(
        statistics_artifact=generate_dataset_statistics_task.outputs['output_statistics'],
        train_dataset=generate_time_series_cv_task.outputs['output_train'],
        test_dataset=generate_time_series_cv_task.outputs['output_test'],
        attribute_columns=attribute_columns,
        time_column=time_column,
        target_column=target_column
    )

    # MODEL is a plain Python constant, so this branch is resolved at compile
    # time and only one trainer ends up in the compiled pipeline.
    if MODEL == 'SMA':
        model_trainer_task = sma_trainer_component_job(
            project=project_id,
            location=project_location,
            dataset=generate_time_series_cv_task.outputs['output_train'],
            experiment_name=experiment_name,
            window_size=window_size,
            project_id=project_id,
            project_location=project_location,
            time_column=time_column,
            target_column=target_column,
            forecast_horizon=forecast_horizon,
            run_name=run_name,
            series_identifier=series_identifier
        )
    elif MODEL == 'SARIMA':
        model_trainer_task = sarima_trainer_component_job(
            project=project_id,
            location=project_location,
            dataset=generate_time_series_cv_task.outputs['output_train'],
            experiment_name=experiment_name,
            project_id=project_id,
            project_location=project_location,
            time_column=time_column,
            target_column=target_column,
            series_identifier=series_identifier,
            forecast_horizon=forecast_horizon,
            run_name=run_name
        )
    else:
        # Without this, an unknown MODEL left model_trainer_task unbound and
        # surfaced as a NameError further down.
        raise ValueError(f"Unsupported MODEL {MODEL!r}; expected 'SMA' or 'SARIMA'")

    model_evaluator_task = model_evaluator_component(
        test_dataset=generate_time_series_cv_task.outputs['output_test'],
        trained_model=model_trainer_task.outputs['output_model'],
        experiment_run=model_trainer_task.outputs['experiment_run'],
        time_column=time_column,
        target_column=target_column,
        series_identifier=series_identifier
    )
    model_evaluator_task.set_cpu_limit('16')

    generate_model_report_card_task = generate_model_report_card(
        evaluation_metrics=model_evaluator_task.outputs['output_metrics'],
        statistics_artifact=generate_dataset_statistics_task.outputs['output_statistics'],
        trained_model=model_trainer_task.outputs['output_model'],
        time_column=time_column,
        target_column=target_column,
        series_identifier=series_identifier
    )

    # Final step: records both report locations on the run and ends it.
    log_report_task = log_reports(
        data_report=generate_statistics_visualization_task.outputs['output_visualization'],
        eval_report=generate_model_report_card_task.outputs['output_report'],
        experiment_run=model_trainer_task.outputs['experiment_run']
    )

In [None]:
# Point the SDK at the project/region and the bucket used for staging.
aiplatform.init(
    project=PROJECT_ID,
    location=PROJECT_REGION,
    staging_bucket=BUCKET_URI
)

# Compile the pipeline definition to a local JSON package.
compiler.Compiler().compile(
    pipeline_func=forecast_pipeline,
    package_path=PIPELINE_PACKAGE_PATH
)

# The display name follows the selected MODEL so SARIMA runs are not
# labelled "sma" (for MODEL = 'SMA' the name is unchanged).
job = pipeline_jobs.PipelineJob(
    display_name=f"b2b_wf_short_term_prediction_{MODEL.lower()}_pipeline",
    template_path=PIPELINE_PACKAGE_PATH,
    parameter_values={
            'project_id': PROJECT_ID,
            'project_location': PROJECT_REGION,
            'bq_dataset': BQ_DATASET_NAME,
            'bq_source_table': BQ_SOURCE_TABLE,
            'time_column': "Appointment_Day",
            'target_column': "SWT",
            'series_identifier': "Series_Identifier",
            'window_size': 183,  # only consumed by the SMA trainer
            'run_name': RUN_NAME,
            'forecast_horizon': FORECAST_HORIZON,
            'attribute_columns': EXPERIMENT_FEATURES,
            'experiment_name': EXPERIMENT_NAME
        }
)

# Submit the job; the recorded cell output shows this call returning after
# the run completed.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/7796273458/locations/northamerica-northeast1/pipelineJobs/b2b-wf-short-term-prediction-experiments-20250416182535
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/7796273458/locations/northamerica-northeast1/pipelineJobs/b2b-wf-short-term-prediction-experiments-20250416182535')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/northamerica-northeast1/pipelines/runs/b2b-wf-short-term-prediction-experiments-20250416182535?project=7796273458
PipelineJob run completed. Resource name: projects/7796273458/locations/northamerica-northeast1/pipelineJobs/b2b-wf-short-term-prediction-experiments-20250416182535
