In [None]:
from kfp.v2 import dsl
import kfp.v2.compiler as compiler
from kfp.v2.dsl import component, Input, Output, Artifact, Model, Dataset

from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

from typing import Any, Dict, List

In [None]:
# Blank template listing every configuration value this notebook refers to.
# NOTE(review): the next cell assigns real values to most of these names, but
# VERTEX_PREDICTION_NAME, BQ_TRAIN_TABLE and BQ_PREDICT_TABLE are not reassigned
# there and therefore stay empty strings.

# GCP project, staging bucket suffix and region.
PROJECT_ID = ''
GCS_BUCKET_NAME = ''
PROJECT_REGION = ''

# Vertex AI resource display names.
VERTEX_DATASET_NAME = ''
VERTEX_MODEL_NAME = ''
VERTEX_PREDICTION_NAME = ''

# BigQuery dataset and table names.
BQ_DATASET_NAME = ''
BQ_SOURCE_TABLE = ''
BQ_TRAIN_TABLE = ''
BQ_PREDICT_TABLE = ''



In [None]:
# Concrete configuration for this run; overrides the blank template above.
# GCP project, bucket suffix (combined with PROJECT_ID to form BUCKET_URI) and region.
PROJECT_ID = 'wb-ai-acltr-tbs-3-pr-a62583'
GCS_BUCKET_NAME = 'bkt_b2b_wf_prediction'
PROJECT_REGION = 'northamerica-northeast1'

# Vertex AI resource display names.
VERTEX_DATASET_NAME = 'b2b_wf_short_term_prediction'
VERTEX_MODEL_NAME = 'b2b_wf_short_term_model'

# BigQuery dataset and the source view/table queried by the pipeline.
BQ_DATASET_NAME = 'b2b_wf_prediction'
BQ_SOURCE_TABLE = 'vw_wf_daily_historical'


In [None]:
# Paths derived from the configuration cells above.
# NOTE(review): BQ_TRAIN_TABLE is still '' at this point, so this path ends with
# a dangling '.'; set BQ_TRAIN_TABLE before relying on TRAINING_DATASET_BQ_PATH.
TRAINING_DATASET_BQ_PATH   = f"bq://{PROJECT_ID}.{BQ_DATASET_NAME}.{BQ_TRAIN_TABLE}"
# Staging bucket URI: the bucket name is the project id and bucket suffix joined by '_'.
BUCKET_URI = f"gs://{PROJECT_ID}_{GCS_BUCKET_NAME}"
# Local file the compiled pipeline definition is written to and submitted from.
PIPELINE_PACKAGE_PATH = 'short_term_pipeline.json'

In [None]:
# Experiment name passed to the pipeline as `experiment_name`.
EXPERIMENT_NAME = "sma-full-no-hype-tuning"
# Columns passed to the pipeline as `attribute_columns`: the query groups by them
# and concatenates their values into the series identifier.
EXPERIMENT_FEATURES = [
    "District",
    "Region_Type",
    "Product",
    "Product_Grp",
    "Technology",
    "Work_Order_Action",
    "Work_Order_Action_Grp",
    "Work_Force"
]

In [None]:
# Set the notebook-wide Vertex AI SDK defaults (project, region, staging bucket)
# used when the pipeline job is created and submitted further down.
aiplatform.init(
    project=PROJECT_ID,
    location=PROJECT_REGION,
    staging_bucket=BUCKET_URI)

In [None]:
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/kfp-preprocess-slim:2.0.1"
)
def query_and_preprocess(
    project_id: str,
    project_location: str,
    bq_dataset: str,
    bq_source_table: str,
    time_column: str,
    target_column: str,
    series_identifier: str,
    attribute_columns: List[str],
    output_dataset: Output[Dataset]
):
    """
    Query BigQuery, aggregate the target per time value and attribute combination,
    and export the result as a CSV dataset.

    The query sums ``target_column`` grouped by ``time_column`` and every column in
    ``attribute_columns``, keeps rows up to and including 2025-03-31, and adds a
    series identifier column built by concatenating the attribute values
    (NULLs replaced by the string 'None', values separated by a single space).

    Args:
        project_id: GCP project ID that owns the BigQuery table and runs the query.
        project_location: Location passed to the BigQuery client.
        bq_dataset: BigQuery dataset name.
        bq_source_table: Source table (or view) name.
        time_column: Name of the time column to group and filter by.
        target_column: Name of the column that is summed.
        series_identifier: Name given to the generated series identifier column.
        attribute_columns: Columns to group by and to build the identifier from.
        output_dataset: Output artifact; the CSV is written to its ``path``.

    Raises:
        ValueError: If ``attribute_columns`` is empty (the query cannot be built).

    Output CSV columns, in order: ``series_identifier``, ``time_column``,
    each attribute column, ``target_column``.
    """
    from google.cloud import bigquery

    if not attribute_columns:
        raise ValueError("attribute_columns must contain at least one column name")

    # Last day of history included in the export.
    training_cutoff_date = "2025-03-31"

    def create_series_identifier(columns, series_identifier):
        """Return the SQL expression that concatenates the NULL-safe attribute values."""
        coalesce_parts = [f"COALESCE({column}, 'None')" for column in columns]
        joined_parts = ", ' ', ".join(coalesce_parts)
        return f"CONCAT({joined_parts}) AS {series_identifier}"

    attribute_string = ','.join(attribute_columns)

    # NOTE: column and table names are interpolated directly into the SQL text
    # (identifiers cannot be bound as query parameters), so these values must come
    # from trusted pipeline configuration only.
    experiment_train_data_query = f"""
    WITH historical_table AS (
        SELECT 
            {time_column},
            {attribute_string},
            SUM({target_column}) AS {target_column}
        FROM `{project_id}.{bq_dataset}.{bq_source_table}`
        WHERE {time_column} <= DATE('{training_cutoff_date}')
        GROUP BY {time_column},{attribute_string}
    )
    SELECT 
        {create_series_identifier(attribute_columns, series_identifier)},
        {time_column},
        {attribute_string},
        {target_column}
    FROM historical_table
    """

    client = bigquery.Client(
        project=project_id,
        location=project_location)

    processed_data = client.query(experiment_train_data_query).to_dataframe()

    processed_data.to_csv(output_dataset.path, index=False)
    print(f"CSV file written to {output_dataset.path}")


In [None]:
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/mlops/kfp-2.0.0/kfp-load-model-slim:1.0.0"
)
def generate_time_series_cv(
    input_dataset: Input[Dataset],
    time_column: str,
    output_train: Output[Dataset],
    output_test: Output[Dataset]
):
    """
    Build expanding-window train/test splits from the preprocessed CSV.

    The input CSV is read with ``time_column`` parsed as dates. Four splits are
    produced; each training window starts on 2022-01-01 and ends one quarter later
    than the previous one, and each test window is the quarter that follows:

        Split 1: Train 2022-01-01 .. 2024-03-31, Test 2024-04-01 .. 2024-06-30
        Split 2: Train 2022-01-01 .. 2024-06-30, Test 2024-07-01 .. 2024-09-30
        Split 3: Train 2022-01-01 .. 2024-09-30, Test 2024-10-01 .. 2024-12-31
        Split 4: Train 2022-01-01 .. 2024-12-31, Test 2025-01-01 .. 2025-03-31

    All boundaries are inclusive. Rows of every split are tagged with a
    ``split_index`` column (1-4) and stacked, so a row that falls in several
    training windows appears once per window.

    Args:
        input_dataset: Preprocessed CSV dataset to split.
        time_column: Name of the column holding the timestamps.
        output_train: Receives the stacked training rows of all splits as CSV.
        output_test: Receives the stacked test rows of all splits as CSV.
    """
    import logging

    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    logger.info(f"Reading input dataset from: {input_dataset.path}")
    forecast_frame = pd.read_csv(input_dataset.path, parse_dates=[time_column])

    # Every training window opens on the same day; only the end moves forward.
    common_train_start = pd.Timestamp("2022-01-01")

    # (split_index, train_end, test_start, test_end)
    split_windows = [
        (1, "2024-03-31", "2024-04-01", "2024-06-30"),
        (2, "2024-06-30", "2024-07-01", "2024-09-30"),
        (3, "2024-09-30", "2024-10-01", "2024-12-31"),
        (4, "2024-12-31", "2025-01-01", "2025-03-31"),
    ]

    timestamps = forecast_frame[time_column]
    train_parts = []
    test_parts = []

    for split_index, train_end, test_start, test_end in split_windows:
        logger.info(f"Processing split {split_index}")

        # `between` is inclusive on both ends, matching the >= / <= window bounds.
        in_train = timestamps.between(common_train_start, pd.Timestamp(train_end))
        in_test = timestamps.between(pd.Timestamp(test_start), pd.Timestamp(test_end))

        train_part = forecast_frame.loc[in_train].assign(split_index=split_index)
        test_part = forecast_frame.loc[in_test].assign(split_index=split_index)

        train_parts.append(train_part)
        test_parts.append(test_part)

        logger.info(f"Split {split_index} - Train shape: {train_part.shape}, Test shape: {test_part.shape}")

    combined_train = pd.concat(train_parts, ignore_index=True)
    combined_test = pd.concat(test_parts, ignore_index=True)

    logger.info(f"Saving combined training data (shape: {combined_train.shape}) to {output_train.path}")
    combined_train.to_csv(output_train.path, index=False)

    logger.info(f"Saving combined test data (shape: {combined_test.shape}) to {output_test.path}")
    combined_test.to_csv(output_test.path, index=False)
    

In [None]:
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/mlops/kfp-2.0.0/kfp-load-model-slim:1.0.0"
)
def generate_dataset_statistics(
    train_dataset: Input[Dataset],
    test_dataset: Input[Dataset],
    time_column: str,
    target_column: str,
    attribute_columns: List[str],
    output_statistics: Output[Artifact]
) -> Dict[str, Dict[str, Dict[str, Any]]]:
    """
    Compute descriptive statistics for the combined train/test CSVs, per split and overall.

    Both inputs are expected to carry a ``split_index`` column. Statistics are
    computed for split indices 1-4 of each dataset, for the concatenation of both
    datasets ("overall"), and a short row-count/date-range "summary" is added.
    The result is written as JSON to ``output_statistics`` and also returned.

    Args:
        train_dataset: Combined training dataset with a split_index column.
        test_dataset: Combined test dataset with a split_index column.
        time_column: Name of the timestamp column (parsed as dates on read).
        target_column: Name of the numeric target column.
        attribute_columns: Categorical columns to profile.
        output_statistics: Output artifact; the JSON file is written to its path.

    Returns:
        Dict keyed by ``train_split_<i>``, ``test_split_<i>`` (i in 1..4),
        ``overall`` and ``summary``. For a split without rows the date range
        entries are ``None`` and the numeric target statistics are NaN
        (``total`` is 0.0).
    """
    import json
    import logging

    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Split indices profiled here; generate_time_series_cv labels its splits 1-4.
    split_indices = range(1, 5)

    def format_date(value):
        """Format a timestamp as YYYY-MM-DD; min()/max() of an empty frame is NaT -> None."""
        return None if pd.isna(value) else value.strftime("%Y-%m-%d")

    def calculate_statistics(df: pd.DataFrame, dataset_type: str, split_number: int) -> Dict[str, Any]:
        """Return row count, date range, target summary and categorical profile for one frame."""
        row_count = len(df)
        target = df[target_column]

        stats = {
            "dataset_type": dataset_type,
            "split_number": split_number,
            "total_rows": row_count,
            "date_range": {
                "start": format_date(df[time_column].min()),
                "end": format_date(df[time_column].max())
            },
            "target_column": {
                "mean": float(target.mean()),
                "median": float(target.median()),
                "min": float(target.min()),
                "max": float(target.max()),
                "std": float(target.std()),
                "total": float(target.sum())
            },
            "null_counts": df.isnull().sum().to_dict(),
            "categorical_columns": {}
        }

        for col in attribute_columns:
            # value_counts() ignores nulls; only the five most frequent values are reported.
            top_values = df[col].value_counts().nlargest(5)
            stats["categorical_columns"][col] = {
                "unique_values": int(df[col].nunique()),
                "top_5_values": top_values.to_dict(),
                "null_count": int(df[col].isnull().sum()),
                "total_count": int(row_count),
                # Share of all rows (nulls included in the denominator), in percent.
                "distribution_percentage": top_values.apply(lambda x: float(x/row_count * 100)).to_dict()
            }

        return stats

    all_statistics = {}

    # Read datasets
    logger.info("Reading input datasets")
    train_df = pd.read_csv(train_dataset.path, parse_dates=[time_column])
    test_df = pd.read_csv(test_dataset.path, parse_dates=[time_column])

    # Process each split in training data
    logger.info("Processing training splits")
    for split_index in split_indices:
        split_train = train_df[train_df['split_index'] == split_index]
        logger.info(f"Processing training split {split_index} (shape: {split_train.shape})")
        all_statistics[f"train_split_{split_index}"] = calculate_statistics(split_train, "train", split_index)

    # Process each split in test data
    logger.info("Processing test splits")
    for split_index in split_indices:
        split_test = test_df[test_df['split_index'] == split_index]
        logger.info(f"Processing test split {split_index} (shape: {split_test.shape})")
        all_statistics[f"test_split_{split_index}"] = calculate_statistics(split_test, "test", split_index)

    # Calculate overall statistics
    # NOTE(review): the training windows overlap (each split restarts from the same
    # start date), so rows shared by several splits are counted once per split here.
    logger.info("Calculating overall statistics")
    all_data = pd.concat([train_df, test_df])
    all_statistics["overall"] = calculate_statistics(all_data, "overall", 0)

    # Add summary statistics
    all_statistics["summary"] = {
        "total_rows": {
            "train": len(train_df),
            "test": len(test_df),
            "total": len(all_data)
        },
        "date_range": {
            "earliest": format_date(all_data[time_column].min()),
            "latest": format_date(all_data[time_column].max())
        },
        "splits_info": {
            f"split_{i}": {
                "train_rows": len(train_df[train_df['split_index'] == i]),
                "test_rows": len(test_df[test_df['split_index'] == i])
            } for i in split_indices
        }
    }

    # Save statistics to a JSON file
    logger.info(f"Saving statistics to {output_statistics.path}")
    with open(output_statistics.path, "w") as f:
        json.dump(all_statistics, f, indent=2)

    logger.info("Statistics generation completed")
    return all_statistics


In [None]:
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/kfp-plot-slim:2.0.0"
)
def generate_statistics_visualization(
    statistics_artifact: Input[Artifact],
    train_dataset: Input[Artifact],
    test_dataset: Input[Artifact],
    attribute_columns: List[str],
    time_column: str,
    target_column: str,
    output_visualization: Output[Artifact]
):
    """
    Render an HTML report for train split 4 from the statistics JSON and the training CSV.

    The report contains:
    1. A dataset overview (row count and date range of train split 4).
    2. A target statistics card: a table (mean, std, min, max) next to a violin
       plot of log(target + 1) computed from the train split 4 rows.
    3. One bar chart per attribute column showing the percentage share of its
       most frequent values, as stored in the statistics JSON.

    Args:
        statistics_artifact: JSON statistics; must contain a 'train_split_4' entry.
        train_dataset: Combined training CSV with a split_index column.
        test_dataset: Combined test CSV. Accepted for interface compatibility;
            no figure currently reads it.
        attribute_columns: Categorical columns to chart (two charts per row).
        time_column: Name of the timestamp column.
        target_column: Name of the target column.
        output_visualization: Output artifact; the HTML is written to its path.
    """
    import json
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import logging
    from datetime import datetime

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Read statistics
    logger.info("Loading statistics from JSON")
    with open(statistics_artifact.path, 'r') as f:
        stats = json.load(f)

    # Read datasets
    logger.info("Loading datasets")

    train_df = pd.read_csv(train_dataset.path)
    train_df[time_column] = pd.to_datetime(train_df[time_column])

    # Filter for split 4
    train_split4 = train_df[train_df['split_index'] == 4]
    split4_stats = stats['train_split_4']

    def create_categorical_distributions():
        """Build a two-column grid of bar charts, one per attribute column."""
        logger.info("Creating categorical distributions chart")

        grid_cols = 2
        # Enough rows for every attribute (8 attributes -> 4 rows); at least one row.
        grid_rows = max(1, (len(attribute_columns) + grid_cols - 1) // grid_cols)
        # plotly rejects a vertical spacing larger than 1 / (rows - 1), so shrink it for tall grids.
        row_spacing = 0.12 if grid_rows == 1 else min(0.12, 0.5 / (grid_rows - 1))

        fig = make_subplots(
            rows=grid_rows, cols=grid_cols,
            subplot_titles=attribute_columns,
            vertical_spacing=row_spacing,
            horizontal_spacing=0.1
        )

        for idx, col in enumerate(attribute_columns):
            row = (idx // grid_cols) + 1
            col_num = (idx % grid_cols) + 1

            col_stats = split4_stats['categorical_columns'][col]
            values = list(col_stats['distribution_percentage'].values())
            labels = list(col_stats['distribution_percentage'].keys())

            fig.add_trace(
                go.Bar(
                    x=labels,
                    y=values,
                    name=col,
                    text=[f'{v:.1f}%' for v in values],
                    textposition='auto',
                    showlegend=False
                ),
                row=row, col=col_num
            )

            fig.update_xaxes(tickangle=45, row=row, col=col_num)
            fig.update_yaxes(title='Percentage (%)', row=row, col=col_num)

        fig.update_layout(
            # 300 px per grid row (1200 px for the 4-row layout).
            height=300 * grid_rows,
            title_text="Categorical Variables Distribution (Train Split 4)",
            showlegend=False
        )
        return fig

    def create_target_stats_card():
        """Build the target summary table with a log-scaled violin plot beside it."""
        logger.info("Creating target statistics card")
        target_stats = split4_stats['target_column']

        # Create subplot with 1 row and 2 columns
        fig = make_subplots(
            rows=1, cols=2,
            column_widths=[0.5, 0.5],
            specs=[[{"type": "table"}, {"type": "violin"}]],
            horizontal_spacing=0.05
        )

        # Add table
        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Metric', 'Value'],
                    fill_color='lightgrey',
                    align='left',
                    font=dict(size=14)
                ),
                cells=dict(
                    values=[
                        ['Mean', 'Standard Deviation', 'Minimum', 'Maximum'],
                        [
                            f"{target_stats['mean']:,.2f}",
                            f"{target_stats['std']:,.2f}",
                            f"{target_stats['min']:,.2f}",
                            f"{target_stats['max']:,.2f}",
                        ]
                    ],
                    align='left',
                    font=dict(size=13)
                )
            ),
            row=1, col=1
        )

        # log1p keeps zero-valued targets plottable: log(0 + 1) = 0.
        log_target = np.log1p(train_split4[target_column])

        # Add violin plot (density plot) of the log-scaled target
        fig.add_trace(
            go.Violin(
                y=log_target,
                name=f"log({target_column} + 1)",
                box_visible=False,
                meanline_visible=True,
                points=False,
                fillcolor='lightblue',
                line_color='blue',
                showlegend=False,
                hovertemplate="log(%{y:.2f})<extra></extra>"
            ),
            row=1, col=2
        )

        # Update layout
        fig.update_layout(
            title=f"Target Column Statistics - log({target_column} + 1) (Train Split 4)",
            height=400,
            showlegend=False,
            yaxis=dict(
                title=f"log({target_column} + 1)",
                tickformat=".2f",
                side='right'
            )
        )

        return fig

    # Generate figures
    figures = [
        create_target_stats_card(),
        create_categorical_distributions()
    ]

    # Create HTML content
    # No hard-coded plotly.js <script> here: the 'plotly-latest' CDN alias is frozen
    # at an old plotly.js 1.x release. The first figure emits a CDN tag that matches
    # the installed plotly.py instead (see the loop below).
    html_parts = [
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        "<title>Workforce Time Series Analysis Report - Split 4</title>",
        "<style>",
        "body { margin: 20px; }",
        ".header { text-align: center; margin-bottom: 30px; }",
        ".plot { margin-bottom: 40px; }",
        ".stats-summary { background: #f5f5f5; padding: 20px; border-radius: 5px; margin-bottom: 20px; }",
        "</style>",
        "</head>",
        "<body>",
        "<div class='header'>",
        "<h1>Workforce Time Series Analysis Report - Split 4</h1>",
        f"<p>Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>",
        "</div>",
        "<div class='stats-summary'>",
        "<h2>Dataset Overview</h2>",
        f"<p>Total Records: {split4_stats['total_rows']:,}</p>",
        f"<p>Date Range: {split4_stats['date_range']['start']} to {split4_stats['date_range']['end']}</p>",
        "</div>"
    ]

    # Add figures; only the first one carries the plotly.js <script> tag.
    for position, fig in enumerate(figures):
        plotlyjs_mode = 'cdn' if position == 0 else False
        html_parts.append(f"<div class='plot'>{fig.to_html(full_html=False, include_plotlyjs=plotlyjs_mode)}</div>")

    # Close HTML
    html_parts.extend([
        "</body>",
        "</html>"
    ])

    # Save HTML report
    logger.info(f"Saving HTML report to {output_visualization.path}")
    with open(output_visualization.path, "w") as f:
        f.write("\n".join(html_parts))

    logger.info("Visualization generation completed")


In [None]:
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/components/kfp-2.0.0/kfp-pycaret-slim:1.1.3"
)
def sma_trainer_component(
    dataset: Input[Dataset],
    experiment_name: str,
    window_size: int,
    project_id: str,
    project_location: str,
    time_column: str,
    target_column: str,
    series_identifier: str,
    output_model: Output[Model],
):
    """
    Compute a simple moving average (SMA) per series and split, and log the run to Vertex AI.

    For every (series, split_index) pair in the input CSV the rows are sorted by
    ``time_column`` and a rolling mean of ``target_column`` over ``window_size``
    rows is computed. The non-null averages are stored as a list under the key
    ``(series_id, split)`` in a dict that is pickled to ``output_model``. A series
    with fewer than ``window_size`` rows yields an empty list.

    Args:
        dataset: Combined training CSV with a split_index column.
        experiment_name: Vertex AI experiment the run is recorded under.
        window_size: Number of rows in the rolling window; must be >= 1.
        project_id: GCP project ID for the Vertex AI SDK.
        project_location: GCP region for the Vertex AI SDK.
        time_column: Column used to order each series.
        target_column: Column that is averaged.
        series_identifier: Column that identifies a series.
        output_model: Output artifact; the pickled dict is written to its path.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    import pickle
    from google.cloud import aiplatform
    import pandas as pd
    from datetime import datetime
    import logging

    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

    # Minimal progress reporter that writes one log line per update.
    class TqdmToLogger(object):
        def __init__(self, desc=None, total=None, logger=None):
            self.logger = logger or logging.getLogger(__name__)
            self.desc = desc
            self.total = total
            self.current = 0

        def update(self, n=1):
            self.current += n
            if self.desc:
                self.logger.info(f'{self.desc}: {self.current}/{self.total}')
            else:
                self.logger.info(f'Progress: {self.current}/{self.total}')

    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    logger.info(f"Starting SMA trainer component with window size: {window_size}")

    # Initialize Vertex AI SDK
    logger.info(f"Initializing Vertex AI SDK for project: {project_id}, location: {project_location}")
    aiplatform.init(project=project_id, location=project_location, experiment=experiment_name)

    logger.info(f"Reading dataset from: {dataset.path}")
    df = pd.read_csv(dataset.path)
    logger.info(f"Dataset shape: {df.shape}")

    models = {}

    # Zero-padded timestamp with seconds: unpadded fields are ambiguous
    # (hour 1 / minute 23 and hour 12 / minute 3 both render as "123") and could
    # produce the same run name twice.
    run_name = "sma-training-" + datetime.now().strftime("%Y-%m-%d-%H%M%S")

    # Using the run as a context manager ends it when the block exits (marked
    # failed if an exception escapes) instead of leaving it open.
    with aiplatform.start_run(run_name) as run:
        unique_splits = sorted(df["split_index"].unique())
        total_combinations = sum(len(df[df["split_index"] == split][series_identifier].unique())
                               for split in unique_splits)

        logger.info(f"Processing {total_combinations} series across {len(unique_splits)} splits")

        # Main progress tracking
        overall_progress = TqdmToLogger(desc="Overall Progress", total=total_combinations, logger=logger)

        for split in unique_splits:
            split_df = df[df["split_index"] == split]
            unique_series = split_df[series_identifier].unique()

            logger.info(f"Processing Split {split} ({len(unique_series)} series)")

            for series_id in unique_series:
                series_df = split_df[split_df[series_identifier] == series_id].sort_values(time_column)
                sma_series = series_df[target_column].rolling(window=window_size).mean()
                # The first window_size - 1 positions have no full window and are NaN.
                models[(series_id, split)] = sma_series.dropna().tolist()
                overall_progress.update(1)

        logger.info(f"Total models created: {len(models)}")
        logger.info(f"Saving models to: {output_model.path}")

        with open(output_model.path, "wb") as f:
            pickle.dump(models, f)

        logger.info("Logging to Vertex AI")
        # Log the artifact URI rather than the container-local mount path so the
        # recorded location is usable outside this component.
        run.log_params({"window_size": window_size, "model_uri": output_model.uri})
        run.log_metrics({"total_models": len(models)})
    

In [None]:
# TODO:
# - Implement the evaluation step.
# - Generate the model card.

In [None]:
@dsl.pipeline(
    name="b2b-wf-short-term-prediction-experiments",
    description="A Kubeflow pipeline that trains simple moving average (SMA) baseline forecast models on Vertex AI Pipelines from a BigQuery view."
)
def forecast_pipeline(
    project_id: str,
    project_location: str,
    bq_dataset: str,
    bq_source_table: str,
    time_column: str,
    target_column: str,
    series_identifier: str,
    window_size: int,
    experiment_name: str,
    attribute_columns: List[str]
):
    """
    Wire the forecasting experiment steps together.

    Steps: query and aggregate the BigQuery source -> build the train/test
    splits -> compute dataset statistics -> render the HTML statistics report,
    and (from the training split output) train the SMA models.

    Args:
        project_id: GCP project ID used for BigQuery and Vertex AI.
        project_location: GCP region used for BigQuery and Vertex AI.
        bq_dataset: BigQuery dataset name.
        bq_source_table: BigQuery source table or view name.
        time_column: Name of the timestamp column.
        target_column: Name of the target column.
        series_identifier: Name of the generated series identifier column.
        window_size: Rolling window length passed to the SMA trainer.
        experiment_name: Vertex AI experiment the training run is logged to.
        attribute_columns: Categorical columns defining a series.
    """
    query_and_preprocess_task = query_and_preprocess(
        project_id=project_id,
        project_location=project_location,
        bq_dataset=bq_dataset,
        bq_source_table=bq_source_table,
        time_column=time_column,
        target_column=target_column,
        series_identifier=series_identifier,
        attribute_columns=attribute_columns
    )

    generate_time_series_cv_task = generate_time_series_cv(
        time_column=time_column,
        input_dataset=query_and_preprocess_task.outputs['output_dataset']
    )

    generate_dataset_statistics_task = generate_dataset_statistics(
        train_dataset=generate_time_series_cv_task.outputs['output_train'],
        test_dataset=generate_time_series_cv_task.outputs['output_test'],
        time_column=time_column,
        target_column=target_column,
        attribute_columns=attribute_columns
    )

    # Leaf steps: nothing consumes their outputs, so the task handles are not kept.
    generate_statistics_visualization(
        statistics_artifact=generate_dataset_statistics_task.outputs['output_statistics'],
        train_dataset=generate_time_series_cv_task.outputs['output_train'],
        test_dataset=generate_time_series_cv_task.outputs['output_test'],
        attribute_columns=attribute_columns,
        time_column=time_column,
        target_column=target_column
    )

    sma_trainer_component(
        dataset=generate_time_series_cv_task.outputs['output_train'],
        experiment_name=experiment_name,
        window_size=window_size,
        project_id=project_id,
        project_location=project_location,
        time_column=time_column,
        target_column=target_column,
        series_identifier=series_identifier
    )

In [None]:
# Compile the pipeline function into the package file that is submitted below.
compiler.Compiler().compile(pipeline_func=forecast_pipeline, package_path=PIPELINE_PACKAGE_PATH)

# Runtime arguments for forecast_pipeline, grouped by concern.
pipeline_parameters = dict(
    # Where the data lives
    project_id=PROJECT_ID,
    project_location=PROJECT_REGION,
    bq_dataset=BQ_DATASET_NAME,
    bq_source_table=BQ_SOURCE_TABLE,
    # Column roles
    time_column="Appointment_Day",
    target_column="SWT",
    series_identifier="Series_Identifier",
    attribute_columns=EXPERIMENT_FEATURES,
    # Model and experiment tracking settings
    window_size=183,
    experiment_name=EXPERIMENT_NAME,
)

# Create the Vertex AI pipeline job from the compiled package and run it.
job = pipeline_jobs.PipelineJob(
    display_name="b2b_wf_short_term_prediction",
    template_path=PIPELINE_PACKAGE_PATH,
    parameter_values=pipeline_parameters,
)
job.run()

PipelineJob created. Resource name: projects/7796273458/locations/northamerica-northeast1/pipelineJobs/b2b-wf-short-term-prediction-experiments-20250409220140
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/7796273458/locations/northamerica-northeast1/pipelineJobs/b2b-wf-short-term-prediction-experiments-20250409220140')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/northamerica-northeast1/pipelines/runs/b2b-wf-short-term-prediction-experiments-20250409220140?project=7796273458
