In [1]:
# Configuration template: every setting the notebook reads is declared here
# as an empty placeholder; the following cell assigns the real values.
PROJECT_ID      = ""
PROJECT_REGION  = ""

GCS_BUCKET_NAME = ""

VERTEX_DATASET_NAME    = ""
VERTEX_MODEL_NAME      = ""
VERTEX_PREDICTION_NAME = ""

BQ_DATASET_NAME   = ""
BQ_TRAIN_TABLE    = ""
BQ_PREDICT_TABLE  = ""
BQ_FORECAST_TABLE = ""

# SQL date expression (for example "DATE('2024-07-01')") that is interpolated
# verbatim into the queries as the train/test cut-off.
TRAIN_TEST_DATA_SPLIT = ""

In [2]:
# --- Google Cloud project ---------------------------------------------------
PROJECT_ID      = "wb-ai-acltr-tbs-3-pr-a62583"
PROJECT_REGION  = "northamerica-northeast1"
GCS_BUCKET_NAME = "bkt_b2b_wf_prediction"

# --- Vertex AI display names ------------------------------------------------
VERTEX_DATASET_NAME    = "b2b_wf_prediction_panorama"
VERTEX_MODEL_NAME      = "b2b_wf_prediction_panorama"
VERTEX_PREDICTION_NAME = "b2b_wf_prediction_batch"

# --- BigQuery dataset and tables --------------------------------------------
BQ_DATASET_NAME   = "b2b_wf_prediction"
BQ_TRAIN_TABLE    = "vw_wf_experiment_historical"
BQ_PREDICT_TABLE  = "bq_wf_temp_predictions"
BQ_FORECAST_TABLE = "bq_wf_forecast"

# SQL date expression (not a Python date): it is pasted verbatim into queries.
TRAIN_TEST_DATA_SPLIT = "DATE('2024-07-01')"

In [3]:
import sys
sys.path.insert(0, '/workspaces/b2b-wf-experiments/src')

from components.data_evaluation_preprocessor import DataEvaluationPreprocessor
from components.data_evaluator import Evaluation

In [4]:
import google.cloud.aiplatform as aiplatform
from google.cloud import bigquery
from dataclasses import dataclass
import datetime
import json

# Staging bucket: the bucket name is prefixed with the project id.
BUCKET_URI = f"gs://{PROJECT_ID}_{GCS_BUCKET_NAME}"

# Every BigQuery URI below shares the same `bq://project.dataset` prefix,
# which is also where batch prediction output is written.
PREDICTION_OUTPUT_PREFIX   = f"bq://{PROJECT_ID}.{BQ_DATASET_NAME}"
TRAINING_DATASET_BQ_PATH   = f"{PREDICTION_OUTPUT_PREFIX}.{BQ_TRAIN_TABLE}"
PREDICTION_DATASET_BQ_PATH = f"{PREDICTION_OUTPUT_PREFIX}.{BQ_PREDICT_TABLE}"

In [5]:
# Initialise the Vertex AI SDK with the project, region and staging bucket
# configured above.
aiplatform.init(
    project=PROJECT_ID, 
    staging_bucket=BUCKET_URI,
    location=PROJECT_REGION
)

In [6]:
# BigQuery client bound to the same project and region as Vertex AI; every
# SQL statement in this notebook is executed through it.
client = bigquery.Client(
    project=PROJECT_ID, 
    location=PROJECT_REGION
)



In [7]:
@dataclass(frozen=True)
class Experiment:
    """Settings of one forecasting experiment run by this notebook.

    The dataclass is frozen, so fields cannot be reassigned after creation
    (the list fields themselves remain mutable lists).
    """
    # Short identifier; appended to the Vertex AI model/dataset display names
    # and used in the model version description.
    name: str
    # Label written to the `Model` column when forecasts/metrics are persisted.
    model: str
    # Attribute columns used for grouping and for building the series identifier.
    experiment_columns: list[str]
    # Passed as `optimization_objective` to the AutoML forecasting job.
    objective: str
    # Passed as `forecast_horizon` to training; also sizes the prediction date range.
    forecast_horizon: int
    # Passed as `context_window` to training.
    context_window: int
    # Passed as `data_granularity_unit` to training (e.g. 'day').
    data_granularity_unit: str
    # Passed as `holiday_regions` to training.
    holiday_regions: list[str]
    

In [8]:
def _daily_automl_experiment(name, experiment_columns):
    """Build a daily AutoML experiment; only the name and columns vary.

    A fresh list is created for every list-valued field so the resulting
    experiments never share mutable state.
    """
    return Experiment(
        name=name,
        model="AutoML",
        experiment_columns=list(experiment_columns),
        objective="minimize-rmse",
        forecast_horizon=184,
        context_window=368,
        data_granularity_unit='day',
        holiday_regions=["CA"],
    )


# Full feature set.
daily_forecast_experiment = _daily_automl_experiment(
    "daily_forecast",
    (
        "District",
        "Region_Type",
        "Product",
        "Product_Grp",
        "Technology",
        "Work_Order_Action",
        "Work_Order_Action_Grp",
        "Work_Force",
    ),
)

# Reduced feature set: only the grouped dimensions.
daily_forecast_minimal_features_experiment = _daily_automl_experiment(
    "minimal_features_daily_forecast",
    (
        "District",
        "Region_Type",
        "Product_Grp",
        "Work_Order_Action_Grp",
    ),
)

## Select experiment

In [9]:
# The experiment every following cell operates on; switch the right-hand side
# to run a different configuration.
running_experiment = daily_forecast_minimal_features_experiment

## Create train data view

In [10]:
def create_series_identifier(columns, alias="Series_Identifier"):
    """Build the SQL select-expression that derives a time-series identifier.

    Each column is wrapped in ``COALESCE(<column>, 'None')`` and the results
    are concatenated with a single space between them.

    Args:
        columns: Iterable of column names that make up the identifier.
        alias: Name given to the resulting SQL column.

    Returns:
        A string of the form
        ``CONCAT(COALESCE(a, 'None'), ' ', COALESCE(b, 'None')) as <alias>``.

    Raises:
        ValueError: If ``columns`` is empty; ``CONCAT()`` without arguments
            would otherwise produce invalid SQL further downstream.
    """
    coalesce_parts = [f"COALESCE({column}, 'None')" for column in columns]
    if not coalesce_parts:
        raise ValueError("columns must contain at least one column name")

    # The separator is itself a SQL string literal (' ') placed between parts.
    concat_arguments = ", ' ', ".join(coalesce_parts)
    return f"CONCAT({concat_arguments}) as {alias}"

In [11]:
time_column                   = "Appointment_Day"
time_series_identifier_column = "Series_Identifier"
target_column                 = "SWT"

# Identifies this forecast run: it is stored with the persisted predictions and
# used again to select them for evaluation, so the evaluation cells must run in
# the same kernel session as the persist step.
FORECAST_TIMESTAMP = datetime.datetime.now()
ATTRIBUTE_COLUMNS = running_experiment.experiment_columns
ATTRIBUTE_STRING = ','.join(ATTRIBUTE_COLUMNS)

# Column types handed to the AutoML training job; every attribute column is
# treated as categorical.
COLUMN_SPECS = {
    time_column:   "timestamp",
    target_column: "numeric",
    **{category: "categorical" for category in ATTRIBUTE_COLUMNS},
}


def _with_experiment_suffix(kind, current_name):
    """Return ``<base>_<experiment name>`` without stacking suffixes on re-run.

    The un-suffixed base name and the last derived name are remembered per
    ``kind``. If ``current_name`` is the value this function produced last
    time, the remembered base is reused; otherwise ``current_name`` is taken
    as a new base (first run, or the configuration cell was executed again).
    """
    state = globals().setdefault("_VERTEX_NAME_STATE", {})
    base_name, derived_name = state.get(kind, (current_name, None))
    if current_name != derived_name:
        base_name = current_name
    derived_name = f"{base_name}_{running_experiment.name}"
    state[kind] = (base_name, derived_name)
    return derived_name


# Previously `+=`, which appended the suffix again every time this cell ran.
VERTEX_MODEL_NAME = _with_experiment_suffix("model", VERTEX_MODEL_NAME)
VERTEX_DATASET_NAME = _with_experiment_suffix("dataset", VERTEX_DATASET_NAME)

In [12]:
# CTE that aggregates the target per time step and attribute combination,
# restricted to rows before the train/test split. The WHERE clause now uses
# `time_column` like the SELECT and GROUP BY do, instead of a hard-coded
# column name.
experiment_data_cte = f"""
WITH historical_table AS (
  SELECT
    {time_column},
    {ATTRIBUTE_STRING},
    SUM({target_column}) AS {target_column}
  FROM `{BQ_DATASET_NAME}.vw_wf_historical`
  WHERE {time_column} < {TRAIN_TEST_DATA_SPLIT}
  GROUP BY {time_column},{ATTRIBUTE_STRING}
)"""


# DDL that (re)creates the training view: the aggregated rows plus the derived
# series identifier column.
experiment_train_data_query = f"""
CREATE OR REPLACE VIEW `{BQ_DATASET_NAME}.{BQ_TRAIN_TABLE}` AS
{experiment_data_cte}
SELECT
  {create_series_identifier(ATTRIBUTE_COLUMNS)},
  {time_column},
  {ATTRIBUTE_STRING},
  {target_column}
FROM historical_table
"""

In [13]:
# Execute the CREATE OR REPLACE VIEW statement and block until it completes.
client.query_and_wait(experiment_train_data_query)

<google.cloud.bigquery.table.RowIterator at 0x7fffac46a0e0>

In [14]:
# Reuse an existing Vertex AI time-series dataset with this display name, or
# create one backed by the training view. The filter value is quoted, which is
# the documented form and also copes with names that are not bare tokens.
dataset_list = aiplatform.TimeSeriesDataset.list(
    filter=f'display_name="{VERTEX_DATASET_NAME}"'
)

if len(dataset_list) == 0:
    print("... creating new dataset ... ")
    dataset = aiplatform.TimeSeriesDataset.create(
        display_name=VERTEX_DATASET_NAME,
        bq_source=[TRAINING_DATASET_BQ_PATH],
    )
else:
    print("... using existent dataset ... ")
    # Several datasets may share a display name; the first one listed is used.
    dataset = dataset_list[0]

... creating new dataset ... 
Creating TimeSeriesDataset
Create TimeSeriesDataset backing LRO: projects/7796273458/locations/northamerica-northeast1/datasets/1705650397936353280/operations/2547550299615133696
TimeSeriesDataset created. Resource name: projects/7796273458/locations/northamerica-northeast1/datasets/1705650397936353280
To use this TimeSeriesDataset in another session:
ds = aiplatform.TimeSeriesDataset('projects/7796273458/locations/northamerica-northeast1/datasets/1705650397936353280')


In [15]:
# Look for an already-registered model with this display name. If one exists
# its resource name is passed to training as `parent_model`; otherwise
# `parent_model` stays None. The filter value is quoted, which is the
# documented form and also copes with names that are not bare tokens.
model_list = aiplatform.Model.list(
    filter=f'display_name="{VERTEX_MODEL_NAME}"'
)

if len(model_list) == 0:
    print("... training a new model ... ")
    parent_model = None
else:
    print("... using existent model ... ")
    # `model` is only bound on this branch; the training cell rebinds it.
    model = model_list[0]
    print(model)
    parent_model = model.resource_name

... training a new model ... 


In [16]:
# Configure the AutoML forecasting job with the experiment's optimisation
# objective and the per-column types collected in COLUMN_SPECS.
training_job = aiplatform.AutoMLForecastingTrainingJob(
    display_name=VERTEX_MODEL_NAME,
    optimization_objective=running_experiment.objective,
    column_specs=COLUMN_SPECS,
)

In [18]:
# Launch training on the dataset selected above and bind the resulting model.
# - Only the time column is declared available at forecast time; the target is
#   declared unavailable.
# - The experiment's attribute columns are passed as time-series attributes.
# - `budget_milli_node_hours=1000` is 1 node hour.
# - `parent_model` is None when no model with this display name was found,
#   otherwise the existing model's resource name.
# - No weight column and no predefined split column are supplied.
model = training_job.run(
    dataset=dataset,
    target_column=target_column,
    time_column=time_column,
    time_series_identifier_column=time_series_identifier_column,
    available_at_forecast_columns=[time_column],
    unavailable_at_forecast_columns=[target_column],
    time_series_attribute_columns=ATTRIBUTE_COLUMNS,
    forecast_horizon=running_experiment.forecast_horizon,
    context_window=running_experiment.context_window,
    data_granularity_unit=running_experiment.data_granularity_unit,
    data_granularity_count=1,
    weight_column=None,
    budget_milli_node_hours=1000,
    parent_model = parent_model,
    model_display_name=VERTEX_MODEL_NAME,
    is_default_version = True,
    model_version_description = f"{running_experiment.name} model generated on {datetime.date.today().isoformat()}",
    predefined_split_column_name=None,
    holiday_regions=running_experiment.holiday_regions
)

No dataset split provided. The service will use a default split.
View Training:
https://console.cloud.google.com/ai/platform/locations/northamerica-northeast1/training/1048846681723895808?project=7796273458
AutoMLForecastingTrainingJob projects/7796273458/locations/northamerica-northeast1/trainingPipelines/1048846681723895808 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/7796273458/locations/northamerica-northeast1/trainingPipelines/1048846681723895808 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/7796273458/locations/northamerica-northeast1/trainingPipelines/1048846681723895808 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/7796273458/locations/northamerica-northeast1/trainingPipelines/1048846681723895808 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/7796273458/locations/northamerica-northeast1/trainingPipelines/

In [19]:
# SQL fragments shared by several of the sub-queries below.
_attributes_not_null = " IS NOT NULL AND ".join(ATTRIBUTE_COLUMNS) + " IS NOT NULL"
_h_attributes = ','.join(f'h.{column}' for column in ATTRIBUTE_COLUMNS)
_last_day_offset = running_experiment.forecast_horizon - 1

# One row per day of the forecast horizon, starting at the split date.
date_range = f"""SELECT
    (
      DATE(DATE_TRUNC({TRAIN_TEST_DATA_SPLIT}, DAY)) + INTERVAL i DAY
    ) AS {time_column}
  FROM
    UNNEST (GENERATE_ARRAY(0, {_last_day_offset})) AS i"""

# Distinct series (identifier + attributes) from the training view, skipping
# rows where any attribute is NULL.
columns_dim = f"""SELECT DISTINCT
    Series_Identifier,
    {ATTRIBUTE_STRING}
  FROM `{BQ_DATASET_NAME}.{BQ_TRAIN_TABLE}`
  WHERE
      {_attributes_not_null}
"""

# Every series crossed with every future date; the target is NULL and the
# rows are tagged 'predicted'.
future_values = f"""SELECT
    h.Series_Identifier,
    CAST(d.{time_column} AS DATE) AS {time_column},
    {_h_attributes},
    NULL AS {target_column},
    'predicted' AS {target_column}_Type
  FROM columns_dim h,
    date_range d
  """

# Rows of the training view, tagged 'actual', with the same NULL filter.
past_values = f"""SELECT
    Series_Identifier,
    {time_column},
    {ATTRIBUTE_STRING},
    {target_column},
    'actual' AS {target_column}_Type
  FROM `{BQ_DATASET_NAME}.{BQ_TRAIN_TABLE}`
  WHERE
    {_attributes_not_null}"""

# Both halves of the UNION ALL project the same column list.
_union_select = f"""SELECT
  Series_Identifier,
  {time_column},
  {ATTRIBUTE_STRING},
  {target_column},
  {target_column}_Type"""

predicton_table_query = f"""WITH date_range AS (
  {date_range}
), columns_dim AS (
  {columns_dim}
),future_values AS (
  {future_values}
), past_values AS (
  {past_values}
)
{_union_select}
FROM future_values
UNION ALL
{_union_select}
FROM past_values
"""


In [20]:
# Materialise the prediction input (future rows + past rows) as a table,
# replacing any previous version.
client.query_and_wait(f"""CREATE OR REPLACE TABLE `{BQ_DATASET_NAME}.{BQ_PREDICT_TABLE}` AS {predicton_table_query}""")

<google.cloud.bigquery.table.RowIterator at 0x7fffa7f83820>

In [22]:
# Run a batch prediction with the trained model: input is the prediction table
# in BigQuery, output goes back to BigQuery under the dataset prefix, with
# explanations requested. `sync=True` makes this call block until the job ends.
batch_prediction_job = model.batch_predict(
    job_display_name=VERTEX_PREDICTION_NAME,
    bigquery_source=PREDICTION_DATASET_BQ_PATH,
    instances_format="bigquery",
    bigquery_destination_prefix=PREDICTION_OUTPUT_PREFIX,
    predictions_format="bigquery",
    generate_explanation=True,
    sync=True,
)

Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/7796273458/locations/northamerica-northeast1/batchPredictionJobs/4091028230012665856
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/7796273458/locations/northamerica-northeast1/batchPredictionJobs/4091028230012665856')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/northamerica-northeast1/batch-predictions/4091028230012665856?project=7796273458
BatchPredictionJob projects/7796273458/locations/northamerica-northeast1/batchPredictionJobs/4091028230012665856 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/7796273458/locations/northamerica-northeast1/batchPredictionJobs/4091028230012665856 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/7796273458/locations/northamerica-northeast1/batchPredictionJobs/4091028230012665856 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob proj

In [23]:
# Name of the BigQuery table the batch prediction job reports as its output;
# it is combined with BQ_DATASET_NAME in the queries that follow.
batch_table  = batch_prediction_job.output_info.bigquery_output_table

In [25]:
# Select the forecast rows from the batch prediction output.
# The source-side column names are derived from `time_column` and
# `target_column`, mirroring how the prediction input table was built (its
# `<target>_Type` tag column and time column are named from those variables).
# The output aliases stay literal because they must match the fixed column
# list of the forecast table in the INSERT below.
prediction_data = f"""
SELECT
  CAST('{FORECAST_TIMESTAMP}' AS TIMESTAMP) AS Forecast_Date,
  CAST({time_column} AS DATE) AS Appointment_Day,
  Series_Identifier,
  {ATTRIBUTE_STRING},
  predicted_{target_column}.value AS SWT
FROM
  `{BQ_DATASET_NAME}.{batch_table}`
WHERE
  {target_column}_Type = 'predicted'
"""


# Append the forecast rows to the persistent forecast table, labelled with the
# experiment's model name. This is a plain INSERT: running it twice for the
# same batch table stores the rows twice.
perisist_predictions_query = f"""
INSERT INTO `{BQ_DATASET_NAME}.{BQ_FORECAST_TABLE}`
(
  Model,
  Forecast_Date,
  Series_Identifier,
  Appointment_Day,
  {ATTRIBUTE_STRING},
  SWT
)
WITH prediction_data AS (
  {prediction_data}
)
SELECT DISTINCT
  '{running_experiment.model}' AS Model,
  Forecast_Date,
  Series_Identifier,
  Appointment_Day,
  {ATTRIBUTE_STRING},
  SWT
FROM prediction_data
"""

In [27]:
# Persist the forecasts, then drop the temporary batch prediction output table.
# NOTE(review): this cell is not re-runnable. Once the table has been dropped,
# the INSERT (which reads from it) fails with a 404 Not Found; the recorded
# output below is consistent with that — confirm before relying on it.
client.query_and_wait(perisist_predictions_query)
client.query_and_wait(f"DROP TABLE `{BQ_DATASET_NAME}.{batch_table}`")

NotFound: 404 POST https://bigquery.googleapis.com/bigquery/v2/projects/wb-ai-acltr-tbs-3-pr-a62583/queries?prettyPrint=false: Not found: Table wb-ai-acltr-tbs-3-pr-a62583:b2b_wf_prediction.predictions_2025_02_12T02_21_46_830Z_277 was not found in location northamerica-northeast1

# Evaluation

In [15]:
# Dimensions the evaluation is broken down by; the same list appears in the
# SELECT, GROUP BY and ORDER BY clauses.
_forecast_dims = ",\n  ".join(
    ["Product_Grp", "Work_Order_Action_Grp", "District", "Region_Type"]
)

# Monthly totals of the forecasts persisted by this run (matched on model
# label and forecast timestamp).
forecast_query = f"""
SELECT
  DATE_TRUNC(Appointment_Day, MONTH) AS Appointment_Day,
  {_forecast_dims},
  SUM({target_column}) as SWT
FROM `{BQ_DATASET_NAME}.{BQ_FORECAST_TABLE}`
WHERE 
  Model = '{running_experiment.model}'
  AND Forecast_Date = CAST('{FORECAST_TIMESTAMP}' AS TIMESTAMP)
GROUP BY
  DATE_TRUNC({time_column}, MONTH),
  {_forecast_dims}
ORDER BY
  {time_column},
  {_forecast_dims}
"""

In [16]:
# Run the forecast aggregation and load the result into a DataFrame.
forecast_df = client.query_and_wait(forecast_query).to_dataframe()

In [17]:
# Preview the first rows of the monthly forecast totals.
forecast_df.head()

Unnamed: 0,Appointment_Day,Product_Grp,Work_Order_Action_Grp,District,Region_Type,SWT
0,2024-07-01,Managed,Install,AFF Abitibi,Tier 2,2271.060026
1,2024-07-01,Managed,Install,AFF Amqui,Tier 3,1926.847691
2,2024-07-01,Managed,Install,AFF Baie-Comeau,Tier 3,2985.671451
3,2024-07-01,Managed,Install,AFF Basse Côte-Nord,Tier 3,1111.542418
4,2024-07-01,Managed,Install,AFF Bonaventure,Tier 3,1831.934429


In [18]:
# Monthly actuals for the evaluation window.
# The window is the half-open interval [split, split + forecast_horizon days),
# i.e. exactly the dates generated for the prediction input. The previous
# `BETWEEN split AND DATE('2025-01-01')` was inclusive on a hard-coded end
# date, which pulled in one day beyond the 184-day horizon and produced an
# extra, partial monthly bucket with no forecast counterpart.
historical_query = f"""
WITH historical_table AS (
  SELECT
    DATE_TRUNC({time_column}, MONTH) AS Appointment_Day,
    Product_Grp,
    Work_Order_Action_Grp,
    District,
    Region_Type,
    SUM({target_column}) AS SWT
  FROM `{BQ_DATASET_NAME}.vw_wf_historical`
  WHERE {time_column} >= {TRAIN_TEST_DATA_SPLIT}
    AND {time_column} < DATE_ADD({TRAIN_TEST_DATA_SPLIT}, INTERVAL {running_experiment.forecast_horizon} DAY)
  GROUP BY
    DATE_TRUNC({time_column}, MONTH),
    Product_Grp,
    Work_Order_Action_Grp,
    District,
    Region_Type
)
SELECT
  Appointment_Day,
  Product_Grp,
  Work_Order_Action_Grp,
  District,
  Region_Type,
  SWT
FROM historical_table
ORDER BY
  Appointment_Day,
  Product_Grp,
  Work_Order_Action_Grp,
  District,
  Region_Type
"""

In [19]:
# Run the actuals aggregation and load the result into a DataFrame.
historical_df = client.query_and_wait(historical_query).to_dataframe()


In [20]:
# Preview the first rows of the monthly actuals.
historical_df.head()

Unnamed: 0,Appointment_Day,Product_Grp,Work_Order_Action_Grp,District,Region_Type,SWT
0,2024-07-01,Managed,Install,AFF Abitibi,Tier 2,12.0
1,2024-07-01,Managed,Install,AFF Amqui,Tier 3,7.0
2,2024-07-01,Managed,Install,AFF Baie-Comeau,Tier 3,5.0
3,2024-07-01,Managed,Install,AFF Basse Côte-Nord,Tier 3,35.0
4,2024-07-01,Managed,Install,AFF Bonaventure,Tier 3,4.0


In [21]:
# Wrap actuals and forecasts in the project's preprocessor before evaluation.
# NOTE(review): what the preprocessor does to the frames is defined in
# components.data_evaluation_preprocessor and is not visible here.
historical_data = DataEvaluationPreprocessor(historical_df)
forecast_data = DataEvaluationPreprocessor(forecast_df)

In [22]:
evaluation = Evaluation(historical_data, forecast_data)

# Region tiers each metric is broken down by, in reporting order.
_region_tiers = ('Tier 1', 'Tier 2', 'Tier 3')


def _metric_by_region_tier(metric_name):
    """Compute one metric overall and then once per region tier.

    Returns a dict keyed by 'overall' followed by each tier label.
    """
    results = {'overall': evaluation.calculate_metric(metric_name)}
    for tier in _region_tiers:
        results[tier] = evaluation.calculate_metric(
            metric_name, filters={'Region_Type': tier}
        )
    return results


rmse = _metric_by_region_tier('rmse')

wape = _metric_by_region_tier('wape')

In [23]:
# INSERT statement that records this run's experiment settings and metrics in
# the evaluation table: one row with the model label, the forecast timestamp,
# a STRUCT of the experiment configuration, and one STRUCT each for WAPE and
# RMSE (overall plus the three region tiers).
# The list-valued settings are rendered with json.dumps, which yields a
# bracketed, double-quoted list that is pasted in as an array literal.
# NOTE(review): all values are interpolated as text; a metric that formats as
# `nan` or `inf` would not be a valid SQL numeric literal — confirm that
# calculate_metric cannot return such values.
evaluation_insert_query = f"""
INSERT INTO `{BQ_DATASET_NAME}.bq_wf_evaluation`
  (Model, Forecast_Date, Experiment_Config, WAPE, RMSE)
VALUES (
  '{running_experiment.model}',               
  '{FORECAST_TIMESTAMP}',
  STRUCT(
    '{running_experiment.name}' AS Name,
    '{running_experiment.objective}' AS Objective,
    {running_experiment.forecast_horizon} AS Forecast_Horizon,
    {running_experiment.context_window} AS Context_Window,
    '{running_experiment.data_granularity_unit}' AS Data_Granularity_Unit,
    {json.dumps(running_experiment.holiday_regions)} AS Holiday_Regions,
    {json.dumps(running_experiment.experiment_columns)} AS Experiment_Columns
  ),
  STRUCT(
    {wape['overall']} AS Overall,
    {wape['Tier 1']} AS Tier_1,
    {wape['Tier 2']} AS Tier_2,
    {wape['Tier 3']} AS Tier_3
  ),
  STRUCT(
    {rmse['overall']} AS Overall,
    {rmse['Tier 1']} AS Tier_1,
    {rmse['Tier 2']} AS Tier_2,
    {rmse['Tier 3']} AS Tier_3
  )
)
"""

In [24]:
# Store the evaluation row. This is a plain INSERT, so re-running the cell
# adds another row for the same run.
client.query_and_wait(evaluation_insert_query)

<google.cloud.bigquery.table.RowIterator at 0x7fffa7c88a60>