In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/official/pipelines/google_cloud_pipeline_components_bqml_text.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/official/pipelines/google_cloud_pipeline_components_bqml_text.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/official/pipelines/google_cloud_pipeline_components_bqml_text.ipynb">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

                                                                                            

## Overview

This notebook shows how to use the `DataflowPythonJobOp` and the main BigQuery ML (BQML) components in a text categorization Vertex AI pipeline.

The pipeline will 

1. Read raw text (HTML) documents stored in Google Cloud Storage
2. Extract title, content and topic of (HTML) documents using Dataflow and ingest into BigQuery
3. Apply the Swivel model to generate embeddings of our document’s content
4. Train a Logistic regression model to classify if an article is about corporate acquisitions (`acq` category). 
5. Evaluate the model 
6. Apply the model to a dataset in order to generate predictions

### Dataset

The dataset is [Reuters-21578 Text Categorization Collection Data Set](https://archive.ics.uci.edu/ml/datasets/reuters-21578+text+categorization+collection).

The dataset is a collection of publicly available news articles that appeared on the Reuters newswire in 1987. They were assembled and indexed with categories by personnel from Reuters Ltd. and Carnegie Group, Inc. in 1987.

### Objective

In this notebook, you will learn how to build a simple BigQuery ML pipeline on Vertex AI Pipelines that calculates text embeddings of the articles' content and classifies the articles
according to the *corporate acquisitions* category.

### Costs 

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* BigQuery
* Dataflow

### Set up your local development environment

**If you are using Colab or Google Cloud Notebooks**, your environment already meets
all the requirements to run this notebook. You can skip this step.

**Otherwise**, make sure your environment meets this notebook's requirements.
You need the following:

* The Google Cloud SDK
* Git
* Python 3
* virtualenv
* Jupyter notebook running in a virtual environment with Python 3

The Google Cloud guide to [Setting up a Python development
environment](https://cloud.google.com/python/setup) and the [Jupyter
installation guide](https://jupyter.org/install) provide detailed instructions
for meeting these requirements. The following steps provide a condensed set of
instructions:

1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)

1. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)

1. [Install
   virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)
   and create a virtual environment that uses Python 3. Activate the virtual environment.

1. To install Jupyter, run `pip3 install jupyter` on the
command-line in a terminal shell.

1. To launch Jupyter, run `jupyter notebook` on the command-line in a terminal shell.

1. Open this notebook in the Jupyter Notebook Dashboard.

### Install additional packages

Install additional package dependencies not installed in your notebook environment, such as Vertex AI SDK. Use the latest major GA version of each package.

In [None]:
import os

# The presence of this metadata file is used to detect a Google Cloud Notebook
# instance, which needs its dependencies installed with pip's '--user' flag.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Empty everywhere else, so the pip commands run without an extra flag.
USER_FLAG = "--user" if IS_GOOGLE_CLOUD_NOTEBOOK else ""

In [None]:
# Test-harness workaround: when IS_TESTING is set, touch the METADATA file of
# the google_api_core 2.7.1 distribution so that the file exists.
# NOTE(review): presumably this keeps the pip installs below from failing in
# the test image - confirm before changing the hard-coded path or version.
if os.getenv("IS_TESTING"):
    ! touch /builder/home/.local/lib/python3.9/site-packages/google_api_core-2.7.1.dist-info/METADATA

In [None]:
# Install the package versions this tutorial was written for. USER_FLAG is
# "--user" on Google Cloud Notebooks and empty elsewhere (see the cell above).
# The Beam, bs4, nltk and tensorflow constraints match the requirements.txt
# and setup.py files written later for the Dataflow job.
! pip3 install {USER_FLAG} --upgrade "apache-beam[gcp]==2.36.0"
! pip3 install {USER_FLAG} --upgrade "bs4==0.0.1"
! pip3 install {USER_FLAG} --upgrade "nltk==3.7"
! pip3 install {USER_FLAG} --upgrade "tensorflow<2.8.0"
! pip3 install {USER_FLAG} --upgrade "tensorflow-hub==0.12.0"
! pip3 install {USER_FLAG} --upgrade "kfp==1.8.2"
! pip3 install {USER_FLAG} --upgrade "google-cloud-aiplatform==1.10.0"
! pip3 install {USER_FLAG} --upgrade "google_cloud_pipeline_components==1.0.1"

### Restart the kernel

After you install the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
# Automatically restart the kernel after the installs so that the newly
# installed packages can be imported.
import os

if not os.getenv("IS_TESTING"):
    # Skipped when IS_TESTING is set, so automated runs keep the same kernel.
    import IPython

    app = IPython.Application.instance()
    # The True argument asks for a restart rather than a plain shutdown.
    app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)

1. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
import os

PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Otherwise, set your project ID here.

In [None]:
# Manual fallback: fill in the project ID below when it is still unset.
# The `@param` marker renders the value as a form field in Colab.
if PROJECT_ID is None or PROJECT_ID == "":
    PROJECT_ID = ""  # @param {type:"string"}

In [None]:
# Make PROJECT_ID the active project of the Cloud SDK configuration.
!gcloud config set project $PROJECT_ID

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append it onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp (YYYYMMDDHHMMSS) appended to the names of created resources.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

### Authenticate your Google Cloud account

**If you are using Google Cloud Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "Vertex AI"
into the filter box, and select
   **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [None]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# Detect the Google Cloud Notebook environment again: the kernel may have been
# restarted since the installation cells, which discards earlier variables.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on Google Cloud Notebooks, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    # Colab: authenticate interactively with the logged-in user account.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    # NOTE(review): %env appears to store its argument literally, so leaving ''
    # in place would set the variable to two quote characters - confirm.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

Set the name of your Cloud Storage bucket below. It must be unique across all
Cloud Storage buckets.

You may also change the `REGION` variable, which is used for operations
throughout the rest of this notebook. We suggest that you [choose a region where Vertex AI services are
available](https://cloud.google.com/vertex-ai/docs/general/locations#available_regions).

In [None]:
# Placeholders: replace them with your own values, or leave them unchanged to
# get the defaults assigned in the next cell.
BUCKET_URI = "gs://[your-bucket-name]"  # @param {type:"string"}
REGION = "[your-region]"  # @param {type:"string"}

In [None]:
# No bucket chosen: derive a name from the project ID and the session timestamp.
if BUCKET_URI in ("", None, "gs://[your-bucket-name]"):
    BUCKET_URI = "".join(("gs://", PROJECT_ID, "-aip-", TIMESTAMP))

# No region chosen: fall back to us-central1.
if REGION == "[your-region]":
    REGION = "us-central1"

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Make the bucket (`mb`) in REGION, owned by PROJECT_ID.
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# Long listing (-l) of the bucket, including all object versions (-a).
! gsutil ls -al $BUCKET_URI

### Set project template

In [None]:
# Local working directories, relative to the notebook's working directory.
DATA_PATH = "data"  # raw Reuters files
KFP_COMPONENTS_PATH = "components"  # SQL files of the BigQuery steps
SRC = "src"  # Python package with the Dataflow module
BUILD = "build"  # compiled pipeline definition

In [None]:
# Create the directories with mode 777; -p makes the command safe to re-run.
!mkdir -m 777 -p {DATA_PATH} {KFP_COMPONENTS_PATH} {SRC} {BUILD}

### Prepare input data

In the following code, you will 

1) Get dataset from UCI archive.
2) Untar the dataset
3) Copy the dataset to a Cloud Storage location.

In [None]:
# Download the Reuters-21578 archive into the raw-data directory.
!wget --no-parent https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz --directory-prefix={DATA_PATH}/raw
# Unpack into a scratch directory so that only the .sgm files are kept.
!mkdir -m 777 -p {DATA_PATH}/raw/temp {DATA_PATH}/raw
!tar -zxvf {DATA_PATH}/raw/reuters21578.tar.gz -C {DATA_PATH}/raw/temp/
# Keep the .sgm files, then remove the scratch directory and the archive.
!mv {DATA_PATH}/raw/temp/*.sgm {DATA_PATH}/raw && rm -rf {DATA_PATH}/raw/temp && rm -f {DATA_PATH}/raw/reuters21578.tar.gz

In [None]:
# Upload the raw files to the bucket; -m runs the copy in parallel.
!gsutil -m cp -R {DATA_PATH}/raw $BUCKET_URI/{DATA_PATH}/raw

### Import libraries 

In [None]:
import random
from pathlib import Path as path
from urllib.parse import urlparse

import tensorflow_hub as hub

os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "UNCOMPRESSED"

import google.cloud.aiplatform as vertex_ai
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component

### Define constants

For preprocessing, we use the [Swivel](https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1) embedding model, which was trained on the English Google News 130GB corpus and produces 20-dimensional embeddings.

In [None]:
# Dataflow job settings.
# NOTE: JOB_NAME is assigned again in the "Build Pipeline" section, and that
# later value is the one passed to the pipeline.
JOB_NAME = f"reuters-ingest-{TIMESTAMP}"
# Each URI below replaces the path component of BUCKET_URI with a fixed path.
SETUP_FILE_URI = urlparse(BUCKET_URI)._replace(path="setup.py").geturl()
RUNNER = "DataflowRunner"
STAGING_LOCATION_URI = urlparse(BUCKET_URI)._replace(path="staging").geturl()
TMP_LOCATION_URI = urlparse(BUCKET_URI)._replace(path="temp").geturl()
INPUTS_URI = urlparse(BUCKET_URI)._replace(path=f"{DATA_PATH}/raw/*.sgm").geturl()
# BigQuery dataset, table and model names used by the SQL queries below.
BQ_DATASET = "mlops_bqml_text_analyisis"
BQ_TABLE = "reuters_ingested"
MODEL_NAME = "swivel_text_embedding_model"
EMBEDDINGS_TABLE = f"reuters_text_embeddings_{TIMESTAMP}"
# Wildcard over the Swivel model files, used as `model_path` in the
# preprocessing query.
# NOTE(review): presumably hub.resolve returns a gs:// location here because
# TFHUB_MODEL_LOAD_FORMAT was set to UNCOMPRESSED in the import cell - confirm.
MODEL_PATH = (
    f'{hub.resolve("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1")}/*'
)
PREPROCESSED_TABLE = f"reuters_text_preprocessed_{TIMESTAMP}"
CLASSIFICATION_MODEL_NAME = "logistic_reg"
PREDICT_TABLE = f"reuters_text_predict_{TIMESTAMP}"

### Initialize client

In [None]:
# Set the project, region and staging bucket used by the Vertex AI SDK.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

## Pipeline formalization

### Data ingestion component

#### Create Dataflow Python module

The following module contains a Dataflow pipeline that:

1) Reads the files from Cloud Storage.
2) Extracts each article and generates its title, topics, and content.
3) Loads the structured data into BigQuery.


In [None]:
# An empty __init__.py turns the source directory into a Python package.
!touch {SRC}/__init__.py 

In [None]:
%%writefile src/ingest_pipeline.py
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# General imports
from __future__ import absolute_import
import argparse
import logging
import os
import string

# Preprocessing imports
import tensorflow as tf
import bs4
import nltk

import apache_beam as beam
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


# Helpers -------------------------------------------------------- -------------

def get_args():
    """
    Parse the command line of the ingestion job.

    Options that are not declared here (for example the Apache Beam pipeline
    options) are not rejected; they are collected and returned separately.

    Returns:
      A tuple (args, pipeline_args): the namespace holding `inputs`,
      `bq_dataset` and `bq_table`, and the list of unrecognised arguments.
    """
    # (flag, default, help). argparse derives the attribute names from the long
    # option strings, e.g. --bq-dataset is stored as `bq_dataset`.
    option_specs = (
        ('--inputs', 'data/raw/reuters/*.sgm', 'A directory location of input data'),
        ('--bq-dataset', 'reuters_dataset', 'Dataset name used in BigQuery.'),
        ('--bq-table', 'reuters_ingested_table', 'Table name used in BigQuery.'),
    )
    parser = argparse.ArgumentParser()
    for flag, default, help_text in option_specs:
        parser.add_argument(flag, default=default, help=help_text)
    known_args, remaining_args = parser.parse_known_args()
    return known_args, remaining_args

def get_paths(data_pattern):
    """
    Expand a file pattern into the paths of the matching files.
    Args:
        data_pattern: A glob pattern selecting the input files.
    Returns:
        A list of file paths.
    """
    return tf.io.gfile.glob(data_pattern)


def get_title(article):
    """
    A function to get the title of an article.
    Args:
        article: A BeautifulSoup object of an article.
    Returns:
        The title as ASCII-encoded bytes with all characters outside
        string.printable removed, or None when the article has no <text>
        element or no title inside it.
    """
    text_node = article.find('text')
    title = text_node.title if text_node is not None else None
    if title is None:
        return None
    # Build the lookup set once instead of once per character.
    printable = set(string.printable)
    cleaned = ''.join(char for char in title.text if char in printable)
    return cleaned.encode('ascii', 'ignore')


def get_content(article):
    """
    A function to get the content of an article.
    Args:
        article: A BeautifulSoup object of an article.
    Returns:
        The body as ASCII-encoded bytes with one sentence per line, or None
        when the article has no <text> element or no body inside it.
    """
    text_node = article.find('text')
    body = text_node.body if text_node is not None else None
    if body is None:
        return None
    # Build the lookup set once instead of once per character.
    printable = set(string.printable)
    content = ''.join(char for char in body.text if char in printable)
    # Collapse every run of whitespace into a single space.
    content = ' '.join(content.split())
    try:
        sentences = nltk.sent_tokenize(content)
    except LookupError:
        # The tokenizer data is not available yet: download it and retry once.
        nltk.download('punkt')
        sentences = nltk.sent_tokenize(content)
    return '\n'.join(sentences).encode('ascii', 'ignore')


def get_topics(article):
    """
    A function to get the topics of an article.
    Args:
        article: A BeautifulSoup object of an article.
    Returns:
        A list with one ASCII-encoded bytes value per child of the article's
        <topics> element; an empty list when the element is missing or empty.
    """
    topics_node = article.topics
    if topics_node is None:
        return []
    # Build the lookup set once instead of once per character.
    printable = set(string.printable)
    topics = []
    for child in topics_node.children:
        cleaned = ''.join(char for char in child.text if char in printable)
        topics.append(cleaned.encode('ascii', 'ignore'))
    return topics


def get_articles(data_paths):
    """
    Parse one input file and return the articles it contains.
    Args:
        data_paths: Path of a single file; it is opened as one file. The
            plural name is kept for backward compatibility.
    Returns:
        A list of dicts with the keys 'title', 'content' and 'topics'.
        Articles for which any of the three is None or an empty list are
        skipped.
    """
    # Read inside a `with` block so the file handle is closed deterministically.
    with tf.io.gfile.GFile(data_paths, 'rb') as source:
        data = source.read()
    soup = bs4.BeautifulSoup(data, "html.parser")
    articles = []
    for raw_article in soup.find_all('reuters'):
        article = {
            'title': get_title(raw_article),
            'content': get_content(raw_article),
            'topics': get_topics(raw_article)
        }
        values = article.values()
        if None in values or [] in values:
            # Incomplete article: a field is missing or there are no topics.
            continue
        articles.append(article)
    return articles


def get_bigquery_schema():
    """
    Build the schema of the BigQuery table that receives the articles.
    Returns:
        A TableSchema with the repeated string field 'topics' and the
        nullable string fields 'title' and 'content', in that order.
    """
    # (name, type, mode) of every column, in table order.
    field_specs = (
        ('topics', 'string', 'repeated'),
        ('title', 'string', 'nullable'),
        ('content', 'string', 'nullable'),
    )
    table_schema = bigquery.TableSchema()
    for name, field_type, mode in field_specs:
        table_schema.fields.append(
            bigquery.TableFieldSchema(name=name, type=field_type, mode=mode))
    return table_schema


# Pipeline runner
def run(args, pipeline_args=None):
    """
    Build and launch the ingestion pipeline.

    With the DirectRunner (which Beam also uses when no --runner option is
    given) the articles are written to text files under data/processed as a
    dry run, and the call waits for the job to finish. With any other runner
    the articles are loaded into BigQuery and the call does not wait.

    Args:
        args: The parsed arguments (inputs, bq_dataset, bq_table).
        pipeline_args: The remaining command line arguments, from which the
            Beam PipelineOptions are built.
    Returns:
        None
    """

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    all_options = options.get_all_options()
    # An unset runner shows up as None here, but Beam then executes the
    # pipeline with the DirectRunner, so both cases are local dry runs.
    is_direct_run = all_options['runner'] in (None, 'DirectRunner')

    pipeline = beam.Pipeline(options=options)
    articles = (
            pipeline
            | 'Get Paths' >> beam.Create(get_paths(args.inputs))
            | 'Get Articles' >> beam.Map(get_articles)
            # Flatten the per-file lists into single articles.
            | 'Get Article' >> beam.FlatMap(lambda x: x)
    )
    if is_direct_run:
        articles | 'Dry run' >> beam.io.WriteToText('data/processed/reuters', file_name_suffix=".jsonl")
    else:
        (articles
         | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
                    project=all_options['project'],
                    dataset=args.bq_dataset,
                    table=args.bq_table,
                    schema=get_bigquery_schema(),
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
         )
    job = pipeline.run()

    if is_direct_run:
        job.wait_until_finish()


if __name__ == '__main__':
    # Show INFO-level log messages while the job is built and launched.
    logging.getLogger().setLevel(logging.INFO)
    known_args, beam_args = get_args()
    run(known_args, beam_args)

#### Create requirements

Next, create the `requirements.txt` file with the Python modules that are needed for the Apache Beam pipeline.

In [None]:
%%writefile requirements.txt
apache-beam[gcp]==2.36.0
bs4==0.0.1
nltk==3.7
tensorflow<2.8.0

#### Create Setup file

And add the setup file with Python modules that are needed for executing the Dataflow workers. 

In [None]:
%%writefile setup.py
# !/usr/bin/python

# Copyright 2022 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#      http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import setuptools

# Dependencies installed together with this package.
# NOTE(review): presumably this is how the Dataflow workers get the modules
# imported by src/ingest_pipeline.py - confirm against the --setup_file usage.
REQUIRED_PACKAGES = [
    'bs4==0.0.1',
    'nltk==3.7',
    'tensorflow<2.8.0']

# find_packages() collects the directories that contain an __init__.py.
setuptools.setup(
    name='ingest',
    version='0.0.1',
    author='author',
    author_email='author@google.com',
    install_requires=REQUIRED_PACKAGES,
    packages=setuptools.find_packages())

#### Copy the setup, the python module and requirements file to Cloud Storage

Finally, copy the Python module, requirements and setup file to your Cloud Storage bucket.

In [None]:
# Upload the Dataflow package directory, the requirements file and setup.py
# to the bucket.
!gsutil cp -R {SRC} {BUCKET_URI}/{SRC}
!gsutil cp requirements.txt {BUCKET_URI}/requirements.txt
!gsutil cp setup.py {BUCKET_URI}/setup.py

### BQML components

To build the next steps of our pipeline, we define a set of queries to:

1) Create the BigQuery dataset schema.
2) Preprocess our text data and generate the embeddings using the Swivel model.
3) Train the BigQuery ML Logistic Regression model.
4) Evaluate the model.
5) Run a batch prediction.


In [None]:
# One directory per BigQuery step; each receives the SQL file of that step.
!mkdir -m 777 -p {KFP_COMPONENTS_PATH}/bq_dataset_component
!mkdir -m 777 -p {KFP_COMPONENTS_PATH}/bq_preprocess_component
!mkdir -m 777 -p {KFP_COMPONENTS_PATH}/bq_model_component
!mkdir -m 777 -p {KFP_COMPONENTS_PATH}/bq_prediction_component

#### Create BQ Dataset query

With this query, we create the BigQuery dataset schema we are going to use to train our model.

In [None]:
# SQL that creates the BigQuery dataset unless it already exists.
create_bq_dataset_query = f"""
CREATE SCHEMA IF NOT EXISTS {BQ_DATASET}
"""

# Keep a copy of the query on disk; the `with` block closes the file.
sql_path = f"{KFP_COMPONENTS_PATH}/bq_dataset_component/create_bq_dataset.sql"
with open(sql_path, "w") as sql_file:
    sql_file.write(create_bq_dataset_query)

#### Create BQ Preprocess query

The following query uses the TFHub Swivel model to generate the embeddings of our text data and splits the dataset for training and serving purposes.

In [None]:
# SQL with two statements: import the embedding model into BigQuery ML, then
# build the preprocessed table with the 20 embedding values, the binary `acq`
# label and a TRAIN/PREDICT split derived from a hash of the title.
create_bq_preprocess_query = f"""
-- create the embedding model
CREATE OR REPLACE MODEL
  `{PROJECT_ID}.{BQ_DATASET}.{MODEL_NAME}` OPTIONS(model_type='tensorflow',
    model_path='{MODEL_PATH}');

-- create the preprocessed table
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.{PREPROCESSED_TABLE}`
AS (
  WITH
    -- Apply the model for embedding generation
    get_embeddings AS (
      SELECT
        title,
        sentences,
        output_0 as content_embeddings,
        topics
      FROM ML.PREDICT(MODEL `{PROJECT_ID}.{BQ_DATASET}.{MODEL_NAME}`,(
        SELECT topics, title, content AS sentences
        FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}`
      ))),
    -- Get label
    get_label AS (
        SELECT
            *,
            STRUCT( CASE WHEN 'acq' in UNNEST(topics) THEN 1 ELSE 0 END as acq ) AS label,
        FROM get_embeddings
    ),
    -- Train-serve splitting
    get_split AS (
        SELECT
            *,
            CASE WHEN ABS(MOD(FARM_FINGERPRINT(title), 10)) < 8 THEN 'TRAIN' ELSE 'PREDICT' END AS split
        FROM get_label
    )
    -- create training table
    SELECT
        title,
        sentences,
        STRUCT( content_embeddings[OFFSET(0)] AS content_embed_0,
                content_embeddings[OFFSET(1)] AS content_embed_1,
                content_embeddings[OFFSET(2)] AS content_embed_2,
                content_embeddings[OFFSET(3)] AS content_embed_3,
                content_embeddings[OFFSET(4)] AS content_embed_4,
                content_embeddings[OFFSET(5)] AS content_embed_5,
                content_embeddings[OFFSET(6)] AS content_embed_6,
                content_embeddings[OFFSET(7)] AS content_embed_7,
                content_embeddings[OFFSET(8)] AS content_embed_8,
                content_embeddings[OFFSET(9)] AS content_embed_9,
                content_embeddings[OFFSET(10)] AS content_embed_10,
                content_embeddings[OFFSET(11)] AS content_embed_11,
                content_embeddings[OFFSET(12)] AS content_embed_12,
                content_embeddings[OFFSET(13)] AS content_embed_13,
                content_embeddings[OFFSET(14)] AS content_embed_14,
                content_embeddings[OFFSET(15)] AS content_embed_15,
                content_embeddings[OFFSET(16)] AS content_embed_16,
                content_embeddings[OFFSET(17)] AS content_embed_17,
                content_embeddings[OFFSET(18)] AS content_embed_18,
                content_embeddings[OFFSET(19)] AS content_embed_19) AS feature,
        label.acq as label,
        split
    FROM
      get_split)
"""

# Keep a copy of the query on disk; the `with` block closes the file.
sql_path = f"{KFP_COMPONENTS_PATH}/bq_preprocess_component/bq_preprocess_query.sql"
with open(sql_path, "w") as sql_file:
    sql_file.write(create_bq_preprocess_query)

#### Create BQ Model query

Below is a simple query that builds a BigQuery ML logistic regression classifier to classify articles by topic.

In [None]:
# SQL that trains the logistic regression classifier on the TRAIN split,
# using the embedding values as features and `label` as the target.
create_bq_model_query = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.{BQ_DATASET}.{CLASSIFICATION_MODEL_NAME}`
  OPTIONS (
      model_type='logistic_reg',
      input_label_cols=['label']) AS
  SELECT
      label,
      feature.*
  FROM
     `{PROJECT_ID}.{BQ_DATASET}.{PREPROCESSED_TABLE}`
  WHERE split = 'TRAIN';
"""

# Keep a copy of the query on disk; the `with` block closes the file.
sql_path = f"{KFP_COMPONENTS_PATH}/bq_model_component/create_bq_model.sql"
with open(sql_path, "w") as sql_file:
    sql_file.write(create_bq_model_query)

#### Create BQ Prediction query

With the following query, we run a prediction job on the `PREDICT` split of the table created by the preprocessing query.

In [None]:
# SQL that selects the rows of the PREDICT split as input for batch prediction.
create_bq_prediction_query = f"""SELECT title, sentences, feature.* FROM `{PROJECT_ID}.{BQ_DATASET}.{PREPROCESSED_TABLE}` WHERE split = 'PREDICT' """

# Keep a copy of the query on disk; the `with` block closes the file.
sql_path = (
    f"{KFP_COMPONENTS_PATH}/bq_prediction_component/create_bq_prediction_query.sql"
)
with open(sql_path, "w") as sql_file:
    sql_file.write(create_bq_prediction_query)

### Build Pipeline

In [None]:
# Random number added to the job name on top of the session timestamp.
# NOTE(review): presumably this keeps job names unique across re-runs within
# one session - confirm.
ID = random.randint(1, 10000)
# Replaces the JOB_NAME value assigned in the "Define constants" section.
JOB_NAME = f"reuters-preprocess-{TIMESTAMP}-{ID}"
# Query-job configuration for the prediction step: it names the table that
# receives the predictions.
JOB_CONFIG = {
    "destinationTable": {
        "projectId": PROJECT_ID,
        "datasetId": BQ_DATASET,
        "tableId": PREDICT_TABLE,
    }
}

#### Create a custom component to pass `DataflowPythonJobOp` arguments

In [None]:
@component(base_image="python:3.8-slim")
def build_dataflow_args(
    bq_dataset: str,
    bq_table: str,
    job_name: str,
    setup_file_uri: str,
    runner: str,
    inputs_uri: str,
) -> list:
    # Command-line flags of the ingestion module, paired with their values.
    flag_values = (
        ("--job_name", job_name),
        ("--setup_file", setup_file_uri),
        ("--runner", runner),
        ("--inputs", inputs_uri),
        ("--bq-dataset", bq_dataset),
        ("--bq-table", bq_table),
    )
    # Flatten the pairs into one flat argument list: [flag, value, flag, ...].
    return [token for pair in flag_values for token in pair]

#### Create the pipeline

In [None]:
@dsl.pipeline(
    name="mlops-bqml-text-generate-embeddings",
    description="A batch pipeline to generate embeddings",
)
def pipeline(
    create_bq_dataset_query: str,
    job_name: str,
    inputs_uri: str,
    bq_dataset: str,
    bq_table: str,
    requirements_file_path: str,
    python_file_path: str,
    setup_file_uri: str,
    temp_location: str,
    runner: str,
    create_bq_preprocess_query: str,
    create_bq_model_query: str,
    create_bq_prediction_query: str,
    job_config: dict,
    project: str = PROJECT_ID,
    region: str = REGION,
):
    # Steps, chained strictly one after another with .after():
    #   create dataset -> build Dataflow args -> Dataflow ingestion -> wait for
    #   the Dataflow job -> preprocess -> train -> evaluate -> predict.
    # All BigQuery steps run in the hard-coded "US" location; only the
    # Dataflow job uses `region`.

    from google_cloud_pipeline_components.v1.bigquery import (
        BigqueryCreateModelJobOp, BigqueryEvaluateModelJobOp,
        BigqueryPredictModelJobOp, BigqueryQueryJobOp)
    from google_cloud_pipeline_components.v1.dataflow import \
        DataflowPythonJobOp
    from google_cloud_pipeline_components.v1.wait_gcp_resources import \
        WaitGcpResourcesOp

    # create the dataset
    bq_dataset_op = BigqueryQueryJobOp(
        query=create_bq_dataset_query,
        project=project,
        location="US",
    )
    # build the argument list of the Dataflow job
    build_dataflow_args_op = build_dataflow_args(
        job_name=job_name,
        inputs_uri=inputs_uri,
        bq_dataset=bq_dataset,
        bq_table=bq_table,
        setup_file_uri=setup_file_uri,
        runner=runner,
    ).after(bq_dataset_op)

    # run dataflow job
    dataflow_python_op = DataflowPythonJobOp(
        requirements_file_path=requirements_file_path,
        python_module_path=python_file_path,
        args=build_dataflow_args_op.output,
        project=project,
        location=region,
        temp_location=temp_location,
    ).after(build_dataflow_args_op)

    # wait for the Dataflow job reported in gcp_resources before querying
    dataflow_wait_op = WaitGcpResourcesOp(
        gcp_resources=dataflow_python_op.outputs["gcp_resources"]
    ).after(dataflow_python_op)

    # run preprocessing job
    bq_preprocess_op = BigqueryQueryJobOp(
        query=create_bq_preprocess_query,
        project=project,
        location="US",
    ).after(dataflow_wait_op)

    # create the logistic regression
    bq_model_op = BigqueryCreateModelJobOp(
        query=create_bq_model_query,
        project=project,
        location="US",
    ).after(bq_preprocess_op)

    # evaluate the logistic regression
    bq_evaluate_op = BigqueryEvaluateModelJobOp(
        project=project, location="US", model=bq_model_op.outputs["model"]
    ).after(bq_model_op)

    # simulate prediction: apply the trained model to the rows selected by
    # create_bq_prediction_query, using job_config as the query job configuration
    BigqueryPredictModelJobOp(
        model=bq_model_op.outputs["model"],
        query_statement=create_bq_prediction_query,
        job_configuration_query=job_config,
        project=project,
        location="US",
    ).after(bq_evaluate_op)

## Compile and Run the pipeline

In [None]:
# Cloud Storage location used as the pipeline root (and, in the next cell, as
# the Dataflow temp location).
PIPELINE_ROOT = urlparse(BUCKET_URI)._replace(path="pipeline_root").geturl()
# Local file that receives the compiled pipeline definition.
PIPELINE_PACKAGE = str(path(BUILD) / "mlops_bqml_text_analyisis_pipeline.json")
# Bucket locations of the files uploaded in the "Copy the setup..." step.
REQUIREMENTS_URI = urlparse(BUCKET_URI)._replace(path="requirements.txt").geturl()
PYTHON_FILE_URI = urlparse(BUCKET_URI)._replace(path="src/ingest_pipeline.py").geturl()
# NOTE(review): MODEL_URI is not referenced by any other cell shown here.
MODEL_URI = urlparse(BUCKET_URI)._replace(path="swivel_text_embedding_model").geturl()

# Compile the pipeline function into the JSON package.
compiler.Compiler().compile(pipeline_func=pipeline, package_path=PIPELINE_PACKAGE)

In [None]:
# Use a dedicated name for the job object. Binding it to `pipeline` would
# replace the pipeline function of the same name, and re-running the compile
# cell would then fail.
pipeline_job = vertex_ai.PipelineJob(
    display_name=f"data_preprocess_{TIMESTAMP}",
    template_path=PIPELINE_PACKAGE,
    pipeline_root=PIPELINE_ROOT,
    # One entry per pipeline parameter that has no default value.
    parameter_values={
        "create_bq_dataset_query": create_bq_dataset_query,
        "bq_dataset": BQ_DATASET,
        "job_name": JOB_NAME,
        "inputs_uri": INPUTS_URI,
        "bq_table": BQ_TABLE,
        "requirements_file_path": REQUIREMENTS_URI,
        "python_file_path": PYTHON_FILE_URI,
        "setup_file_uri": SETUP_FILE_URI,
        # The Dataflow job reuses the pipeline root as its temp location.
        "temp_location": PIPELINE_ROOT,
        "runner": RUNNER,
        "create_bq_preprocess_query": create_bq_preprocess_query,
        "create_bq_model_query": create_bq_model_query,
        "create_bq_prediction_query": create_bq_prediction_query,
        "job_config": JOB_CONFIG,
    },
    enable_caching=False,
)

pipeline_job.run()

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.


In [None]:
# Delete the bucket and everything in it (-r), in parallel (-m).
! gsutil -m rm -r $BUCKET_URI

# Delete the dataset (-d) with all of its contents (-r), without asking for
# confirmation (-f).
! bq rm -r -f -d $PROJECT_ID:$BQ_DATASET