In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TODO

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/automl_tabular_on_vertex_pipelines.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/automl_forecasting_on_vertex_pipelines.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/automl/automl_forecasting_on_vertex_pipelines.ipynb">
        <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

# Onboarding Instructions

Send an email to warehouse-trusted-testers-mailing-external+managers@google.com asking to join the group and to have your project added to the allowlist. Please mention the project ID in the email.

For the onboarding documentation, see [here](https://docs.google.com/document/d/1sPI5WQUgq9s8raAkxl3k00kxh3PhKyxIhOn8X1GNmw8/edit?usp=sharing).

For the Colab critical user journey (CUJ), see [here](https://docs.google.com/document/d/1sPI5WQUgq9s8raAkxl3k00kxh3PhKyxIhOn8X1GNmw8/edit#bookmark=id.1t2o3wjj4lss).


## Install additional packages

Download and install the Vision AI SDK wheel (`visionai` version `0.0.5`) together with the Abseil Python library, which this notebook uses to define its flags.


In [None]:
# @title Download SDK
! gsutil cp gs://visionai-private-artifacts/f54638dfe40e579744dd9b7d59dfce85e813891e/visionai-0.0.5-py3-none-any.whl .

# @title Install SDK
# You can restart the runtime if it asks for.
! pip3 install visionai-0.0.5-py3-none-any.whl --force-reinstall
! pip3 install absl

### Colab only: Uncomment the following cell to restart the kernel.

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable the Vertex AI API: Vertex AI APIs, Dataflow APIs, Compute Engine APIs, and Cloud Storage](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,dataflow.googleapis.com,compute_component,storage-component.googleapis.com).

4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Replace the placeholder with the ID of the Google Cloud project to use.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the active one in the local gcloud configuration.
! gcloud config set project {PROJECT_ID}

### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region for the services used by this notebook; it is copied into LOCATION_ID
# in the "Set Up Variables" section.
REGION = "us-central1"  # @param {type: "string"}

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

## Set Up Variables

In [None]:
# Re-run this whole section after changing the value of any variable below.
# Look up the numeric project number for PROJECT_ID; the shell capture returns
# a list of output lines, and the first line is converted to an int.
PROJECT_NUMBER_STR = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUMBER = int(PROJECT_NUMBER_STR[0])

# Only us-central1 is supported.
# Note that this region applies to the Vision AI services. Speech
# transcription may not respect the region set here.
LOCATION_ID = REGION

CORPUS_DISPLAY_NAME = "Demo corpus"  # @param {type: "string"}
CORPUS_DESCRIPTION = "Demo corpus to demo warehouse transformations and search"  # @param {type: "string"}
# If CORPUS_ID is specified, that corpus is used and no new corpus is created.
CORPUS_ID = None  # @param {type: "string"}

# External users can only access the PROD environment.
ENV = "PROD"  # @param {type: "string"}

# When USE_EXISTING_CLUSTER is False, the notebook creates a new cluster named
# CLUSTER_ID, so CLUSTER_ID must be set in that case.
# You can also create a cluster via the UI by creating a stream. Setting
# CLUSTER_ID to None and USE_EXISTING_CLUSTER to True uses the cluster created
# from the UI.
USE_EXISTING_CLUSTER = False  # @param {type: "boolean"}
CLUSTER_ID = "application-cluster-0"  # @param {type: "string"}

# If DEPLOYED_INDEX_ID is specified, use the existing index instead of creating
# and deploying a new index.
DEPLOYED_INDEX_ID = None  # @param {type: "string"}
INDEX_DISPLAY_NAME = "Demo Index"  # @param {type: "string"}
INDEX_ENDPOINT_DISPLAY_NAME = "Demo Index Endpoint"  # @param {type: "string"}

# Each option, when True, makes the final "Clean up" cell delete the
# corresponding resources.
CLEAN_UP_ASSETS = False  # @param {type: "boolean"}
CLEAN_UP_INDEX = False  # @param {type: "boolean"}
CLEAN_UP_CORPUS = False  # @param {type: "boolean"}
CLEAN_UP_CLUSTER = False  # @param {type: "boolean"}

In [None]:
# Cloud Storage URIs of the sample videos to process. All of them live under
# the same public sample-data folder, so only the file names are listed.
_SAMPLE_VIDEO_PREFIX = "gs://cloud-samples-data/video/"
GCS_FILES = [
    _SAMPLE_VIDEO_PREFIX + video_file
    for video_file in (
        "animals.mp4",
        "googlework_short.mp4",
        "chicago.mp4",
        "Machine Learning Solving Problems Big, Small, and Prickly.mp4",
        "JaneGoodall.mp4",
        "gbikes_dinosaur.mp4",
        "pizza.mp4",
    )
]

In [None]:
from absl import flags

# Define the flags only once per kernel session so that this cell can be
# re-run safely. FLAGS_IS_DEFINED is created at the end of the first run; on
# the first run the name does not exist yet, which raises NameError and routes
# execution to the definitions below. Only NameError is caught so that any
# other failure is reported instead of being silently swallowed.
try:
    if FLAGS_IS_DEFINED:
        print("Flags have been defined.")
except NameError:
    _PROJECT_NUMBER = flags.DEFINE_integer("project_number", None, "Project number.")
    _LOCATION_ID = flags.DEFINE_string("location_id", "us-central1", "Location id.")
    _CORPUS_DISPLAY_NAME = flags.DEFINE_string(
        "corpus_display_name", "Demo Corpus", "Corpus display name."
    )
    # NOTE: the variable name is misspelled ("DISCRIPTION") but other cells
    # read it under this name, so it is kept as is.
    _CORPUS_DISCRIPTION = flags.DEFINE_string(
        "corpus_description",
        "Demo Corpus to interact with warehouse",
        "Corpus description.",
    )
    _CORPUS_ID = flags.DEFINE_string(
        "corpus_id", None, "If specified, use existing VOD corpus."
    )
    _GCS_FILES = flags.DEFINE_list(
        "gcs_files",
        [
            "gs://cloud-samples-data/video/animals.mp4",
            "gs://cloud-samples-data/video/googlework_short.mp4",
            "gs://cloud-samples-data/video/chicago.mp4",
            (
                "gs://cloud-samples-data/video/Machine Learning Solving Problems"
                " Big, Small, and Prickly.mp4"
            ),
            "gs://cloud-samples-data/video/JaneGoodall.mp4",
        ],
        "GCS files.",
    )
    _ENV = flags.DEFINE_enum(
        "env",
        "PROD",
        ["AUTOPUSH", "STAGING", "PROD"],
        "The environment.",
    )
    # The help text describes the flag as it is actually used: a false value
    # makes the notebook create a new cluster.
    _USE_EXISTING_CLUSTER = flags.DEFINE_bool(
        "use_existing_cluster",
        False,
        "Whether to use an existing cluster instead of creating a new one. If"
        " this is true and the cluster_id flag is None, the cluster created"
        " from the UI is used.",
    )
    _CLUSTER_ID = flags.DEFINE_string(
        "cluster_id",
        None,
        "The cluster. If not specified, by default using the cluster created"
        " via UI.",
    )
    _DEPLOYED_INDEX_ID = flags.DEFINE_string(
        "deployed_index_id",
        None,
        "If specified, use existing index instead of creating and deploying a new"
        " one.",
    )
    _INDEX_DISPLAY_NAME = flags.DEFINE_string(
        "index_display_name", "Demo Index", "Index display name."
    )
    _INDEX_ENDPOINT_DISPLAY_NAME = flags.DEFINE_string(
        "index_endpoint_display_name",
        "Demo index endpoint",
        "Display name for the index endpoint.",
    )
    _CLEAN_UP_ASSETS = flags.DEFINE_bool(
        "clean_up_assets", False, "Whether clean up assets."
    )
    _CLEAN_UP_CORPUS = flags.DEFINE_bool(
        "clean_up_corpus", False, "Whether clean up corpus."
    )
    _CLEAN_UP_INDEX = flags.DEFINE_bool(
        "clean_up_index", False, "Whether clean up index and index endpoint."
    )
    _CLEAN_UP_CLUSTER = flags.DEFINE_bool(
        "clean_up_cluster", False, "Whether clean up cluster."
    )
    # Marks the definitions as done so that a re-run takes the branch above.
    FLAGS_IS_DEFINED = True

In [None]:
FLAGS = flags.FLAGS

# Flags that always receive the value chosen in the "Set Up Variables" section.
_always_set_flags = {
    "project_number": PROJECT_NUMBER,
    "location_id": LOCATION_ID,
    "corpus_display_name": CORPUS_DISPLAY_NAME,
    "corpus_description": CORPUS_DESCRIPTION,
    "gcs_files": GCS_FILES,
    "env": ENV,
    "use_existing_cluster": USE_EXISTING_CLUSTER,
    "index_display_name": INDEX_DISPLAY_NAME,
    "index_endpoint_display_name": INDEX_ENDPOINT_DISPLAY_NAME,
    "clean_up_assets": CLEAN_UP_ASSETS,
    "clean_up_index": CLEAN_UP_INDEX,
    "clean_up_corpus": CLEAN_UP_CORPUS,
    "clean_up_cluster": CLEAN_UP_CLUSTER,
}
for _flag_name, _flag_value in _always_set_flags.items():
    FLAGS[_flag_name].parse(_flag_value)

# Optional flags: a truthy value is parsed into the flag, while an empty or
# None value resets the flag so that a value from an earlier run does not
# linger.
_optional_flags = {
    "corpus_id": CORPUS_ID,
    "cluster_id": CLUSTER_ID,
    "deployed_index_id": DEPLOYED_INDEX_ID,
}
for _flag_name, _flag_value in _optional_flags.items():
    if _flag_value:
        FLAGS[_flag_name].parse(_flag_value)
    else:
        FLAGS[_flag_name].unparse()

# No command line is parsed in a notebook, so declare the flags parsed to make
# their values readable.
FLAGS.mark_as_parsed()

## Enable API

In [None]:
# Enable the Video Intelligence API for the active gcloud project.
!gcloud services enable videointelligence.googleapis.com

In [None]:
def get_service_endpoint(env: str) -> str:
    """Return the Vision AI service hostname for an environment name.

    Args:
        env: Environment name. "STAGING" and "AUTOPUSH" select the matching
            sandbox hostnames; any other value selects the production
            hostname.

    Returns:
        The hostname of the Vision AI service for that environment.
    """
    sandbox_hosts = {
        "STAGING": "staging-visionai.sandbox.googleapis.com",
        "AUTOPUSH": "autopush-visionai.sandbox.googleapis.com",
    }
    return sandbox_hosts.get(env, "visionai.googleapis.com")


# Resolve the service hostname once for the environment selected in ENV.
visionai_service_endpoint = get_service_endpoint(ENV)

In [None]:
# Enable the Vision AI service that matches the selected environment.
!gcloud services enable {visionai_service_endpoint}

# Example

In [None]:
# @title Config logging
import logging

# basicConfig() does nothing when the root logger already has handlers, so the
# level is set on the root logger explicitly rather than through basicConfig.
logging.basicConfig()
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Logger used by the remaining cells of this notebook.
_logger = logging.getLogger("colab")

In [None]:
# @title Imports
import concurrent
import logging

from absl import flags
from visionai.python.gapic.visionai import visionai_v1
from visionai.python.net import channel
from visionai.python.streams import client as streams_client
from visionai.python.warehouse.transformer import \
    asset_indexing_transformer as ait
from visionai.python.warehouse.transformer import (ocr_transformer,
                                                   speech_transformer,
                                                   transformer_factory)
from visionai.python.warehouse.utils import (vod_asset, vod_corpus,
                                             vod_index_endpoint)

In [None]:
# @title Creates a warehouse client to talk with warehouse.
# Look up the warehouse endpoint for the selected environment, then point the
# client at it.
warehouse_environment = channel.Environment[_ENV.value]
warehouse_endpoint = channel.get_warehouse_service_endpoint(warehouse_environment)
warehouse_client = visionai_v1.WarehouseClient(
    client_options=dict(api_endpoint=warehouse_endpoint)
)

In [None]:
# @title Creates a cluster.
# A new cluster is created only when the user did not ask to reuse an existing
# one; in that case a cluster id is mandatory.
should_create_cluster = not _USE_EXISTING_CLUSTER.value
if should_create_cluster and _CLUSTER_ID.value is None:
    raise ValueError("Cluster must be specified when creating new cluster.")
if should_create_cluster:
    new_cluster_options = channel.ConnectionOptions(
        _PROJECT_NUMBER.value,
        _LOCATION_ID.value,
        _CLUSTER_ID.value,
        channel.Environment[_ENV.value],
    )
    streams_client.create_cluster(new_cluster_options)

In [None]:
# @title Creates a corpus or use existing corpus.
if _CORPUS_ID.value is not None:
    # Reuse the given corpus: only its resource name has to be built.
    corpus_name = visionai_v1.WarehouseClient.corpus_path(
        _PROJECT_NUMBER.value, _LOCATION_ID.value, _CORPUS_ID.value
    )
else:
    # No corpus id was given: create a new corpus and keep its resource name.
    new_corpus = vod_corpus.create_corpus(
        warehouse_client,
        _PROJECT_NUMBER.value,
        _LOCATION_ID.value,
        _CORPUS_DISPLAY_NAME.value,
        _CORPUS_DISCRIPTION.value,
    )
    corpus_name = new_corpus.name

In [None]:
# @title Creates an executor to upload and transform assets in parallel.
# `import concurrent` alone does not load the `futures` submodule, so import
# it explicitly here instead of relying on another library having done so.
import concurrent.futures

# Thread pool shared by the upload and transform cells; at most 8 tasks run
# at the same time.
executor = concurrent.futures.ThreadPoolExecutor(max_workers=8)

In [None]:
# @title Creates and Uploads Assets
# Submit one create-and-upload task per input file to the shared executor.
new_asset_futures = [
    executor.submit(
        vod_asset.create_and_upload_asset,
        warehouse_client,
        gcs_file,
        corpus_name,
    )
    for gcs_file in _GCS_FILES.value
]
# Block until every task has either finished or failed.
done_or_error, _ = concurrent.futures.wait(
    new_asset_futures, return_when=concurrent.futures.ALL_COMPLETED
)
# Collect the results of the successful tasks. A failed task is logged and
# skipped so that the remaining assets can still be processed.
asset_names = []
for done_future in done_or_error:
    try:
        uploaded_asset = done_future.result()
    except Exception as e:
        _logger.exception(e)
    else:
        asset_names.append(uploaded_asset)
        _logger.info("Create and upload asset succeeded %s", uploaded_asset)

In [None]:
# @title Create index and index endpoint for the corpus, or use existing index
# and index endpoint if specified.
if _DEPLOYED_INDEX_ID.value is None:
    # Creates index for the corpus.
    index_name = vod_corpus.index_corpus(
        warehouse_client, corpus_name, _INDEX_DISPLAY_NAME.value
    )
    # Creates index endpoint and deploys the created index above to the index
    # endpoint.
    index_endpoint_name = vod_index_endpoint.create_index_endpoint(
        warehouse_client,
        _PROJECT_NUMBER.value,
        _LOCATION_ID.value,
        _INDEX_ENDPOINT_DISPLAY_NAME.value,
    ).name
    deploy_operation = warehouse_client.deploy_index(
        visionai_v1.DeployIndexRequest(
            index_endpoint=index_endpoint_name,
            deployed_index=visionai_v1.DeployedIndex(
                index=index_name,
            ),
        )
    )
    _logger.info("Wait for index to be deployed %s.", deploy_operation.operation.name)
    # Wait for the deploy index operation. Depending on the amount of data to
    # be indexed, the timeout (in seconds) may need to be increased.
    deploy_operation.result(timeout=7200)
    _logger.info("Index is deployed.")
else:
    # Reuse an existing index: build its resource name under the corpus and
    # fetch it to find the endpoint it is deployed to.
    index_name = "{}/indexes/{}".format(corpus_name, _DEPLOYED_INDEX_ID.value)
    index = warehouse_client.get_index(visionai_v1.GetIndexRequest(name=index_name))
    _logger.info("Use existing index %s.", index)
    if index.state != visionai_v1.Index.State.CREATED:
        _logger.critical("Invalid index. The index state must be Created.")
    if not index.deployed_indexes:
        # Without a deployment there is no endpoint to search. Fail with a
        # clear error instead of an IndexError on the lookup below.
        raise ValueError("Invalid index. The index must be deployed.")
    index_endpoint_name = index.deployed_indexes[0].index_endpoint

In [None]:
# @title Run Transforms
# If you run into errors like "Resource '.../analyses/ocr-warehouse-text-lang
# was not found.", please set variable USE_EXISTING_CLUSTER to False, it will
# create the cluster.

# OCR transformer configuration; the cluster id is only set when one is given.
ocr_config = ocr_transformer.OcrTransformerInitConfig(
    corpus_name=corpus_name,
    env=channel.Environment[_ENV.value],
)
if _CLUSTER_ID.value:
    ocr_config.cluster_id = _CLUSTER_ID.value

# Speech transformer configuration for US English.
speech_config = speech_transformer.SpeechTransformerInitConfig(
    corpus_name=corpus_name, language_code="en-US"
)

# Create the ML transformers with embedding enabled plus the speech and OCR
# configurations built above.
ml_config = transformer_factory.MlTransformersCreationConfig(
    run_embedding=True,
    speech_transformer_init_config=speech_config,
    ocr_transformer_init_config=ocr_config,
)
ml_transformers = transformer_factory.create_ml_transformers(
    warehouse_client, ml_config
)
# Creates indexing transformer to index assets.
asset_indexing_transformer = ait.AssetIndexingTransformer(warehouse_client, index_name)

# Runs the transformers for the assets, one task per asset on the shared
# executor.
futures = [
    executor.submit(
        vod_asset.transform_single_asset,
        asset_name,
        ml_transformers,
        asset_indexing_transformer,
    )
    for asset_name in asset_names
]
done_or_error, _ = concurrent.futures.wait(
    futures, return_when=concurrent.futures.ALL_COMPLETED
)
# A failed transformation is logged and does not stop the other assets.
for future in done_or_error:
    try:
        future.result()
    except Exception as e:
        _logger.exception(e)

# Tear down every transformer once all tasks are done.
all_transformers = ml_transformers + [asset_indexing_transformer]
for transformer in all_transformers:
    transformer.teardown()

In [None]:
# @title Search
# Plain text query against the index endpoint, with a page size of 10.
dinosaur_request = visionai_v1.SearchIndexEndpointRequest(
    index_endpoint=index_endpoint_name,
    text_query="dinosaur",
    page_size=10,
)
search_response = warehouse_client.search_index_endpoint(dinosaur_request)
_logger.info("Search response: %s", search_response)

In [None]:
# Text query "river" combined with a criterion on the "speech" field.
speech_values = visionai_v1.StringArray(txt_values=["kid"])
cr = visionai_v1.Criteria(field="speech", text_array=speech_values)
river_request = visionai_v1.SearchIndexEndpointRequest(
    index_endpoint=index_endpoint_name,
    text_query="river",
    criteria=[cr],
    page_size=100,
)
search_response = warehouse_client.search_index_endpoint(river_request)
_logger.info("Search response: %s", search_response)

In [None]:
# Text query "trees" combined with a criterion on the "text" field.
text_values = visionai_v1.StringArray(txt_values=["National Park"])
cr = visionai_v1.Criteria(field="text", text_array=text_values)
trees_request = visionai_v1.SearchIndexEndpointRequest(
    index_endpoint=index_endpoint_name,
    text_query="trees",
    criteria=[cr],
    page_size=100,
)
search_response = warehouse_client.search_index_endpoint(trees_request)
_logger.info("Search response: %s", search_response)

In [None]:
# @title Clean up
# Each step below runs only when the matching clean-up flag is set.

# Assets: delete every asset created by this notebook run.
if _CLEAN_UP_ASSETS.value:
    for asset_name in asset_names:
        delete_asset_request = visionai_v1.DeleteAssetRequest(name=asset_name)
        warehouse_client.delete_asset(delete_asset_request)
        _logger.info("Deleted asset %s", asset_name)

# Index: undeploy it first, then delete the index and its endpoint.
if _CLEAN_UP_INDEX.value:
    undeploy_request = visionai_v1.UndeployIndexRequest(
        index_endpoint=index_endpoint_name
    )
    undeploy_operation = warehouse_client.undeploy_index(undeploy_request)
    _logger.info(
        "Wait for index to be undeployed %s.",
        undeploy_operation.operation.name,
    )
    # Wait for the undeploy index operation.
    undeploy_operation.result(timeout=1800)
    _logger.info("Index is undeployed.")

    delete_index_request = visionai_v1.DeleteIndexRequest(name=index_name)
    warehouse_client.delete_index(delete_index_request)
    _logger.info("Deleted index %s", index_name)

    delete_endpoint_request = visionai_v1.DeleteIndexEndpointRequest(
        name=index_endpoint_name
    )
    warehouse_client.delete_index_endpoint(delete_endpoint_request)
    _logger.info("Deleted index endpoint %s", index_endpoint_name)

# Corpus.
if _CLEAN_UP_CORPUS.value:
    delete_corpus_request = visionai_v1.DeleteCorpusRequest(name=corpus_name)
    warehouse_client.delete_corpus(delete_corpus_request)
    _logger.info("Deleted corpus %s", corpus_name)

# Cluster: it can only be deleted when its id is known.
if _CLEAN_UP_CLUSTER.value and _CLUSTER_ID.value is None:
    _logger.warning("Can't clean up cluster since cluster_id is not specified.")
elif _CLEAN_UP_CLUSTER.value:
    cluster_to_delete = channel.ConnectionOptions(
        _PROJECT_NUMBER.value,
        _LOCATION_ID.value,
        _CLUSTER_ID.value,
        channel.Environment[_ENV.value],
    )
    streams_client.delete_cluster(cluster_to_delete)