In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train a BigQuery ML ARIMA_PLUS Model using Vertex AI tabular workflows

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/tabular_workflows/bqml_arima_plus.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Ftabular_workflows%2Fbqml_arima_plus.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/tabular_workflows/bqml_arima_plus.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br>
      View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/tabular_workflows/bqml_arima_plus.ipynb">
        <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br>
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview

In this tutorial, you take on the role of a store planner who determines how much inventory for each product needs to be ordered for each store for November 2019. You accomplish this by training a BigQuery ML (BQML) [ARIMA_PLUS](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-time-series) forecasting model using historical sales data. A BQML ARIMA_PLUS model is useful if you need to perform many quick iterations of model training or if you need an inexpensive baseline to measure other models against.

Learn more about [BQML ARIMA+ forecasting for tabular data](https://cloud.google.com/vertex-ai/docs/tabular-data/forecasting-arima/overview).

### Objective

In this notebook, you learn how to create the BigQuery ML ARIMA_PLUS model using a training [Vertex AI Pipeline](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) from [Google Cloud Pipeline Components](https://cloud.google.com/vertex-ai/docs/pipelines/components-introduction) (GCPC), and then do a batch prediction using the corresponding prediction pipeline. You then train a Vertex AI forecasting model using the same data and compare the evaluation metrics.

This tutorial uses the following Google Cloud ML services and resources:

- BigQuery
- Vertex AI

The steps performed are:

- Train the BigQuery ML ARIMA_PLUS model.
- View BigQuery ML model evaluation.
- Make a batch prediction with the BigQuery ML model.


### Dataset

To demonstrate the tradeoffs between BigQuery ML and Vertex AI forecasting, this tutorial uses a synthetic dataset where product sales are dependent on a variety of factors such as advertisements, holidays, and locations. You see how well a univariate model like ARIMA_PLUS can forecast future sales without knowing information about these factors explicitly, and how well a multivariate model like Vertex AI forecasting can perform when these factors are known.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* BigQuery / BigQuery ML

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Get started

### Install Vertex AI SDK for Python and other required packages

In [None]:
# Install required packages.
! pip3 install --quiet --upgrade google-cloud-aiplatform \
                                 google-cloud-bigquery \
                                 google-cloud-pipeline-components \
                                 db-dtypes

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only Colab needs an explicit kernel restart to pick up the new packages.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.

In [None]:
import sys

# Interactive user authentication is only performed when running inside Colab.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Google Cloud project used by the Vertex AI SDK, the BigQuery client and the bucket.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region used for the Vertex AI SDK and for the Cloud Storage bucket.
LOCATION = "us-central1"  # @param {type:"string"}
# Location assigned to the BigQuery dataset created later in this notebook.
DATA_LOCATION = "US"

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage URI used as the SDK staging bucket and as the root for pipeline artifacts.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**If your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in LOCATION under PROJECT_ID (skip if it already exists).
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

#### Service Account

**If you don't know your service account**, try to get it with the `gcloud` command by running the second cell below.

In [None]:
# Service account that runs the pipelines; leave the placeholder to have the next cell look it up.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step. You only need to run this step once per service account.

In [None]:
# Allow the service account to write pipeline artifacts to the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Allow the service account to read pipeline artifacts from the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### Import libraries and define constants

In [None]:
import json
import os
import urllib
import urllib.parse
import uuid

from google.cloud import aiplatform, bigquery
from google_cloud_pipeline_components.v1.automl.forecasting import utils

## Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Set SDK-wide defaults so later calls do not have to repeat them.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

## Define train and prediction data

### Create a BigQuery dataset

In [None]:
arima_dataset_name = "forecasting_demo_arima"
arima_dataset_path = ".".join([PROJECT_ID, arima_dataset_name])

# Must be same location as TRAINING_DATASET_BQ_PATH.
client = bigquery.Client(project=PROJECT_ID)
bq_dataset_pre = bigquery.Dataset(arima_dataset_path)
bq_dataset_pre.location = DATA_LOCATION
# exists_ok=True reuses an existing dataset while still surfacing every other
# failure (permissions, invalid project, ...) instead of hiding it behind a
# bare `except:`.
bq_dataset = client.create_dataset(bq_dataset_pre, exists_ok=True)
# Report the location of the dataset actually returned, which may differ from
# DATA_LOCATION if the dataset already existed.
print(f"BigQuery dataset {arima_dataset_path} is ready in {bq_dataset.location}")

### Prepare training data in BigQuery

Before training a model, you must first generate the dataset of store sales. This dataset includes multiple products and stores, and it also simulates factors such as advertisements and holiday effects. The data is then split into `TRAIN`, `VALIDATE`, `TEST`, and `PREDICT` sets, where the last three sets are all 1 month in duration.

#### Begin by defining the subqueries that create this base sales data.

In [None]:
# SQL prefix (a WITH clause) that later cells embed in front of their own SELECT.
# It defines three chained subqueries:
#   * time_series: one row per date x product x store with advertisement and
#     holiday flags plus the TRAIN / VALIDATE / TEST / PREDICT split label.
#   * time_series_sales_factors: deterministic, FARM_FINGERPRINT-derived
#     multipliers for product, store, weekday, month, advertisement and noise.
#   * base_data: the final synthetic `sales` value computed from those factors.
base_data_query = """
  WITH 

    -- Create time series for each product + store with some covariates.
    time_series AS (
      SELECT
        CONCAT("id_", store_id, "_", product_id) AS id,
        CONCAT('store_', store_id) AS store,
        CONCAT('product_', product_id) AS product,
        date,
        -- Advertise 1/100 products.
        IF(
          ABS(MOD(FARM_FINGERPRINT(CONCAT(product_id, date)), 100)) = 0,
          1,
          0
        ) AS advertisement,
        -- Mark Thanksgiving sales as holiday sales.
        IF(
          EXTRACT(DAYOFWEEK FROM date) = 6
            AND EXTRACT(MONTH FROM date) = 11
            AND EXTRACT(DAY FROM date) BETWEEN 23 AND 29,
          1,
          0
        ) AS holiday,
        -- Set when each data split ends.
        CASE
          WHEN date < '2019-09-01' THEN 'TRAIN'
          WHEN date < '2019-10-01' THEN 'VALIDATE'
          WHEN date < '2019-11-01' THEN 'TEST'
          ELSE 'PREDICT'
        END AS split,
      -- Generate the sales with one SKU per date.
      FROM
        UNNEST(GENERATE_DATE_ARRAY('2017-01-01', '2019-12-01')) AS date
      CROSS JOIN
        UNNEST(GENERATE_ARRAY(0, 10)) AS product_id
      CROSS JOIN
        UNNEST(GENERATE_ARRAY(0, 3)) AS store_id  
    ),
    
    -- Randomly determine factors that contribute to how syntheic sales are calculated. 
    time_series_sales_factors AS (
      SELECT
        *,
        ABS(MOD(FARM_FINGERPRINT(product), 10)) AS product_factor,
        ABS(MOD(FARM_FINGERPRINT(store), 10)) AS store_factor,
        [1.6, 0.6, 0.8, 1.0, 1.2, 1.8, 2.0][
          ORDINAL(EXTRACT(DAYOFWEEK FROM date))] AS day_of_week_factor,
        1 +  SIN(EXTRACT(MONTH FROM date) * 2.0 * 3.14 / 24.0) AS month_factor,    
        -- Advertised products have increased sales factors for 5 days.
        CASE
          WHEN LAG(advertisement, 0) OVER w = 1.0 THEN 1.2
          WHEN LAG(advertisement, 1) OVER w = 1.0 THEN 1.8
          WHEN LAG(advertisement, 2) OVER w = 1.0 THEN 2.4
          WHEN LAG(advertisement, 3) OVER w = 1.0 THEN 3.0
          WHEN LAG(advertisement, 4) OVER w = 1.0 THEN 1.4
          ELSE 1.0
        END AS advertisement_factor,
        IF(holiday = 1.0, 2.0, 1.0) AS holiday_factor,
        0.001 * ABS(MOD(FARM_FINGERPRINT(CONCAT(product, store, date)), 100)) AS noise_factor
      FROM
        time_series
      WINDOW w AS (PARTITION BY id ORDER BY date)
    ),
  
    -- Use factors to calculate synthetic sales for each time series. 
    base_data AS (
      SELECT
        id,
        store,
        product,
        date,
        split,
        advertisement,
        holiday,
        (
          (1 + store_factor) 
          * (1 + product_factor) 
          * (1 + month_factor + day_of_week_factor) 
          * (
            1.0 
            + 2.0 * advertisement_factor 
            + 3.0 * holiday_factor 
            + 5.0 * noise_factor
          )
        ) AS sales
      FROM
        time_series_sales_factors
      )
"""

Next, convert this base sales data into a dataset you use to train a model, and a dataset you pass to a trained model at serving time. The training dataset includes the `TRAIN`, `VALIDATE`, and `TEST` splits, while the prediction dataset includes the `PREDICT` split and also the `TEST` split to provide context information.

In [None]:
# Fully qualified table names; the SQL below and the pipelines later in the
# notebook all read these constants, so the names cannot drift apart.
TRAINING_DATASET_BQ_PATH = f"{arima_dataset_path}.train"
PREDICTION_DATASET_BQ_PATH = f"{arima_dataset_path}.pred"

# Training table: every split except PREDICT.
train_query = f"""
    CREATE OR REPLACE TABLE `{TRAINING_DATASET_BQ_PATH}` AS
    {base_data_query}
    SELECT *
    FROM base_data
    WHERE split != 'PREDICT'
"""
client.query(train_query).result()
print(f"Created {TRAINING_DATASET_BQ_PATH}.")

# Prediction table: the TEST split as context rows plus the PREDICT split with
# the target column blanked out.
pred_query = f"""
    CREATE OR REPLACE TABLE `{PREDICTION_DATASET_BQ_PATH}` AS
    {base_data_query}
    SELECT *
    FROM base_data
    WHERE split = 'TEST'

    UNION ALL

    SELECT * EXCEPT (sales), NULL AS sales
    FROM base_data
    WHERE split = 'PREDICT'
"""
client.query(pred_query).result()
print(f"Created {PREDICTION_DATASET_BQ_PATH}.")

You can take a look at the sales data that was generated. Later in this tutorial, you can see the visualization of the time series along with the forecast.

The model is trained with data from January 2017 to October 2019 inclusive.

#### Look at the training data

In [None]:
# Preview a few rows of the training table.
query = f"SELECT * FROM `{arima_dataset_path}.train` LIMIT 10"
train_preview = client.query(query).to_dataframe()
train_preview.head()

The table used for prediction contains data from November 2019. It also includes actuals from October 2019 as context information.

#### Look at the prediction data

In [None]:
# Preview a few rows of the prediction table.
query = f"SELECT * FROM `{arima_dataset_path}.pred` LIMIT 10"
pred_preview = client.query(query).to_dataframe()
pred_preview.head()

# Create a BigQuery ML ARIMA_PLUS model

Now you are ready to start creating your own BigQuery ML ARIMA_PLUS model.

Like with Vertex AI forecasting, the pipeline you run trains evaluation models using the training and validation sets and uses backtesting to create evaluation metrics on the test set. Finally, a serving model that uses all available data can be produced.

**How do you estimate the cost?**

Backtesting involves training a single BigQuery ML model for each period in the test set, so the cost is a function of the length of the test set after any downsampling done by the windowing strategy. The cost is also multiplied by the number of candidate models trained, which is determined by `max_order`.

According to [BQ pricing](https://cloud.google.com/bigquery-ml/pricing), BigQuery ML model creation costs $250 per TB. You can use a max order of 3, which translates to 20 candidate models when there are multiple time series. The demo dataset is 3 MB in size, and includes 31 test periods. 

In this tutorial, the model create stage of the pipeline costs `3 MB * ($250 / 1024^2) * (31 / 1) periods * 20 candidates = $0.44`.

## Create and run the training job
To train a model using the ARIMA pipeline, you perform two steps: 

1. download the training pipeline from GCPC.
1. run the job

#### Create training job

The training pipeline expects the following parameters:

- `bigquery_destination_uri`: (optional) BigQuery Dataset URI. Used to export the metrics table and model. If not given, a new dataset is created for you.
- `data_granularity_unit`: Enum used to specify the time granularity (hour, day, week, month, etc).
- `data_source_csv_filenames` or `data_source_bigquery_table_path`: A URI for either a CSV stored in Cloud Storage or a BigQuery table, respectively.
- `evaluated_examples_destination_uri`: (optional) BigQuery Dataset URI OR Table URI. Used to export the evaluated examples table. Uses bigquery_destination_uri, if not provided.
- `forecast_horizon`: Integer number of periods to predict.
- A data splitting strategy of either:
  - `predefined_split_key`: A column containing `TRAIN`, `VALIDATE`, or `TEST` to denote the splits for each row.
  -  `training_fraction`, `validation_fraction`, and `test_fraction` to set the fractions to split on chronologically on the time column.
  - `timestamp_split_key` plus the fractions in the previous option to perform fractional splitting on a column other than the time column.
- A windowing strategy of either:
  - `window_column`: A boolean column that decides whether or not each row gets considered when calculating the evaluation metrics.
  - `window_stride_length`: Every N rows are used to compute the evaluation metrics.
  - `window_max_count`: Downsample rows such that only the given number are used to calculate the evaluation metrics.
- `target_column`: Name of target column.
- `time_column`: Name of time column.
- `time_series_identifier_column`: Name of id column.
- `max_order`: Integer between 1 and 5 representing the size of the parameter search space for ARIMA_PLUS. 5 would result in the highest accuracy model, but also the longest training runtime/cost.

For a full list of parameters, see the GCPC SDK [documentation](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.9.0/api/v1/automl/forecasting.html#v1.automl.forecasting.get_bqml_arima_train_pipeline_and_parameters).

The execution of the training pipeline can take up to 20 minutes or more.

In [None]:
# Column roles and pipeline settings (editable as Colab form fields).
time_column = "date"  # @param {type: "string"}
time_series_identifier_column = "id"  # @param {type: "string"}
target_column = "sales"  # @param {type: "string"}
forecast_horizon = 30  # @param {type: "integer"}
data_granularity_unit = "day"  # @param {type: "string"}
split_column = "split"  # @param {type: "string"}
window_stride_length = 1  # @param {type: "integer"}
max_order = 3  # @param {type: "integer"}
override_destination = True  # @param {type: "boolean"}

# Collect the keyword arguments first so they are easy to inspect before the
# pipeline template and its parameter values are generated.
train_pipeline_kwargs = dict(
    project=PROJECT_ID,
    location=LOCATION,
    root_dir=os.path.join(BUCKET_URI, "pipeline_root"),
    time_column=time_column,
    time_series_identifier_column=time_series_identifier_column,
    target_column=target_column,
    forecast_horizon=forecast_horizon,
    data_granularity_unit=data_granularity_unit,
    predefined_split_key=split_column,
    data_source_bigquery_table_path=TRAINING_DATASET_BQ_PATH,
    window_stride_length=window_stride_length,
    bigquery_destination_uri=arima_dataset_path,
    override_destination=override_destination,
    max_order=max_order,
)
train_job_spec_path, train_parameter_values = (
    utils.get_bqml_arima_train_pipeline_and_parameters(**train_pipeline_kwargs)
)

### Run the training pipeline

Use the Vertex AI Python SDK to kick off a training pipeline run. Once the run has started, the following cell outputs a link that lets you monitor the run. The link should look like this: 

`https://console.cloud.google.com/vertex-ai/locations/[LOCATION]/pipelines/runs/[TRAIN_DISPLAY_NAME]`

In [None]:
# The display name should be unique even if this cell is rerun.
TRAIN_DISPLAY_NAME = f"forecasting-demo-train-{str(uuid.uuid1())}"

job = aiplatform.PipelineJob(
    job_id=TRAIN_DISPLAY_NAME,
    display_name=TRAIN_DISPLAY_NAME,
    pipeline_root=os.path.join(BUCKET_URI, TRAIN_DISPLAY_NAME),
    template_path=train_job_spec_path,
    parameter_values=train_parameter_values,
    enable_caching=False,
)
job.run(service_account=SERVICE_ACCOUNT)

## Review model evaluation scores
After your model has finished training, you can review the evaluation scores for it.

#### Metrics are always reported via the `metrics` table in the destination dataset.

In [None]:
# Find the pipeline task that published the evaluation metrics artifact.
metrics_task = next(
    (
        detail
        for detail in job.gca_resource.job_detail.task_details
        if detail.task_name == "create-metrics-artifact"
    ),
    None,
)
if metrics_task is None:
    raise ValueError("Couldn't find the model evaluation task.")
metrics = metrics_task.outputs["evaluation_metrics"].artifacts[0].metadata

print("Evaluation metrics:\n")
dict(metrics)

You can view the predictions used to calculate the evaluation metrics if you want to calculate your own. 

#### View predictions used to calculate the evaluation metrics

This table containing all these predictions is called `evaluated_examples`. In this table, each distinct `predicted_on_date` represents the starting period of a window of predictions. The backtesting metrics make use of all these windows.

In [None]:
# Load every backtesting prediction and show the first rows.
query = f"SELECT * FROM `{arima_dataset_path}.evaluated_examples`"
evaluated_examples_job = client.query(query)
arima_examples = evaluated_examples_job.to_dataframe()
arima_examples.head()

## Create and run prediction job

### Create prediction job
Now that your Model resource is trained, you can make a batch prediction using the prediction pipeline, with the following parameters:

- `bigquery_destination_uri`: (optional) BigQuery Dataset URI. Used to export the metrics table and model. If not given, a new dataset is created for you.
- `data_source_csv_filenames` or `data_source_bigquery_table_path`: A URI for either a CSV stored in Cloud Storage or a BigQuery table, respectively.
- `generate_explanation`: If True, the predictions table can have some extra explanations columns.
- `model_name`: Name of an existing BigQuery ML ARIMA_PLUS model to use for predictions.

For a full list of parameters, see the GCPC SDK [documentation](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.9.0/api/v1/automl/forecasting.html#v1.automl.forecasting.get_bqml_arima_predict_pipeline_and_parameters).

The execution of the prediction pipeline can take up to 5 minutes or more.

In [None]:
# Locate the task that created the BigQuery ML model; the same information is
# visible in the execution graph in Vertex AI Pipelines.
create_model_task = next(
    (
        detail
        for detail in job.gca_resource.job_detail.task_details
        if detail.task_name == "bigquery-create-model-job"
    ),
    None,
)
if create_model_task is None:
    raise ValueError("Couldn't find the model training task.")
model_name = create_model_task.outputs["model"].artifacts[0].metadata["modelId"]

# Generate the prediction pipeline template and its parameter values.
predict_job_spec_path, predict_parameter_values = (
    utils.get_bqml_arima_predict_pipeline_and_parameters(
        project=PROJECT_ID,
        location=LOCATION,
        model_name=f"{arima_dataset_path}.{model_name}",
        data_source_bigquery_table_path=PREDICTION_DATASET_BQ_PATH,
        bigquery_destination_uri=arima_dataset_path,
    )
)

### Run the prediction pipeline

Use the Vertex AI Python SDK to kick off a prediction pipeline run. Once the run has started, the following cell outputs a link that lets you monitor the run. The link should look like this: 

`https://console.cloud.google.com/vertex-ai/locations/[LOCATION]/pipelines/runs/[PRED_DISPLAY_NAME]`

In [None]:
# The display name should be unique even if this cell is rerun.
PRED_DISPLAY_NAME = f"forecasting-demo-predict-{str(uuid.uuid1())}"


pred_job = aiplatform.PipelineJob(
    job_id=PRED_DISPLAY_NAME,
    display_name=PRED_DISPLAY_NAME,
    pipeline_root=os.path.join(BUCKET_URI, PRED_DISPLAY_NAME),
    template_path=predict_job_spec_path,
    parameter_values=predict_parameter_values,
    enable_caching=False,
)
pred_job.run(service_account=SERVICE_ACCOUNT)

### Get the predictions

Next, get the results from the completed batch prediction job. These are always written to a table called `predictions` under the output dataset.

In [None]:
# Get the prediction table programmatically, you can find this by looking at the execution graph in Vertex AI Pipelines.
for task_detail in pred_job.gca_resource.job_detail.task_details:
    if task_detail.task_name == "bigquery-query-job":
        pred_table = (
            task_detail.outputs["destination_table"].artifacts[0].metadata["tableId"]
        )
        break
else:
    raise ValueError("Couldn't find the prediction task.")

query = f"SELECT * FROM `{arima_dataset_path}.{pred_table}`"
arima_preds = client.query(query).to_dataframe()
arima_preds.head()

## Visualize the forecasts

Lastly, follow the given link to visualize the generated forecasts in [Data Studio](https://support.google.com/datastudio/answer/6283323?hl=en).
The code block included in this section dynamically generates a Data Studio link that specifies the template, the location of the forecasts, and the query to generate the chart. The data is populated from the forecasts generated earlier.

You can inspect the used template at https://datastudio.google.com/c/u/0/reporting/067f70d2-8cd6-4a4c-a099-292acd1053e8. This was created by Google specifically to view forecasting predictions.

In [None]:
def _sanitize_bq_uri(bq_uri: str) -> str:
    """Converts a BigQuery URI into a dotted path usable inside a query.

    Args:
        bq_uri: Table reference, optionally prefixed with `bq://` and
            optionally using `:` as a separator.

    Returns:
        The reference without the `bq://` prefix and with every `:` replaced
        by `.`.
    """
    if bq_uri.startswith("bq://"):
        bq_uri = bq_uri[5:]
    return bq_uri.replace(":", ".")


def get_data_studio_link(
    batch_prediction_bq_input_uri: str,
    batch_prediction_bq_output_uri: str,
    time_column: str,
    time_series_identifier_column: str,
    target_column: str,
    project_id: "str | None" = None,
) -> str:
    """Creates a link that fills in the demo Data Studio template.

    Args:
        batch_prediction_bq_input_uri: BigQuery table holding the historical
            (actual) values.
        batch_prediction_bq_output_uri: BigQuery table holding the predictions.
        time_column: Name of the time column shared by both tables.
        time_series_identifier_column: Name of the series id column shared by
            both tables.
        target_column: Name of the target column; the prediction is read from
            `predicted_<target_column>.value`.
        project_id: Project used as the data source and billing project.
            Defaults to the notebook-level `PROJECT_ID` when omitted.

    Returns:
        A Data Studio reporting URL whose `params` query argument carries the
        JSON-encoded template configuration, including the custom query.
    """
    # Fall back to the notebook-level constant only when no project was passed.
    if project_id is None:
        project_id = PROJECT_ID
    batch_prediction_bq_input_uri = _sanitize_bq_uri(batch_prediction_bq_input_uri)
    batch_prediction_bq_output_uri = _sanitize_bq_uri(batch_prediction_bq_output_uri)
    # LEFT JOIN keeps every row of the input table, so rows without a matching
    # prediction still appear.
    query = f"""
        SELECT
          CAST(input.{time_column} as DATETIME) timestamp_col,
          CAST(input.{time_series_identifier_column} as STRING) time_series_identifier_col,
          CAST(input.{target_column} as NUMERIC) historical_values,
          CAST(predicted_{target_column}.value as NUMERIC) predicted_values,
        FROM `{batch_prediction_bq_input_uri}` input
        LEFT JOIN `{batch_prediction_bq_output_uri}` output
          ON
            TIMESTAMP(input.{time_column}) = TIMESTAMP(output.{time_column})
            AND CAST(input.{time_series_identifier_column} as STRING) = CAST(
              output.{time_series_identifier_column} as STRING)
    """
    params = {
        "templateId": "067f70d2-8cd6-4a4c-a099-292acd1053e8",
        "ds0.connector": "BIG_QUERY",
        "ds0.projectId": project_id,
        "ds0.billingProjectId": project_id,
        "ds0.type": "CUSTOM_QUERY",
        "ds0.sql": query,
    }
    base_url = "https://datastudio.google.com/c/u/0/reporting"
    # The whole configuration travels as one JSON document in `params`.
    url_params = urllib.parse.urlencode({"params": json.dumps(params)})
    return f"{base_url}?{url_params}"

In [None]:
# Table of true values for every split except TRAIN, used as the "actuals"
# side of the visualization.
actuals_table = ".".join([arima_dataset_path, "actuals"])
create_actuals_sql = f"""
    CREATE OR REPLACE TABLE `{actuals_table}` AS
    {base_data_query}
    SELECT *
    FROM base_data
    WHERE split != 'TRAIN'
"""
actuals_job = client.query(create_actuals_sql)
actuals_job.result()
print(f"Created {actuals_table}.")

In [None]:
# Build the report URL first, then print it under a short caption.
arima_report_link = get_data_studio_link(
    batch_prediction_bq_input_uri=actuals_table,
    batch_prediction_bq_output_uri=f"{arima_dataset_path}.{pred_table}",
    time_column=time_column,
    time_series_identifier_column=time_series_identifier_column,
    target_column=target_column,
)
print("Click the link below to view ARIMA predictions:")
print(arima_report_link)

## Clean up Vertex AI and BigQuery resources

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources that you created in this tutorial:

In [None]:
# Delete output datasets
client.delete_dataset(arima_dataset_path, delete_contents=True, not_found_ok=True)

job.delete()
pred_job.delete()

delete_bucket = True
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI