In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# E2E ML on GCP: MLOps stage 3 : formalization: get started with Dataproc Serverless pipeline components

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_dataproc_serverless_pipeline_components.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
    <td>
        <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_dataproc_serverless_pipeline_components.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
        </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_dataproc_serverless_pipeline_components.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview

This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 3 : formalization: get started with Dataproc Serverless pipeline components.

### Objective

In this tutorial, you learn how to use prebuilt `Google Cloud Pipeline Components` for `Dataproc Serverless` service. The documentation for the components can be found [here](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html).

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Pipelines`
- `Google Cloud Pipeline Components`
- `Dataproc Serverless`

An example pipeline is provided for each Dataproc Serverless component, which includes:
- `DataprocPySparkBatchOp` for running PySpark batch workloads.
- `DataprocSparkBatchOp` for running Spark batch workloads.
- `DataprocSparkSqlBatchOp` for running Spark SQL batch workloads.
- `DataprocSparkRBatchOp` for running SparkR batch workloads.

### Costs
This tutorial uses billable components of Google Cloud:

- Vertex AI
- Dataproc Serverless
- Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Dataproc Serverless pricing](https://cloud.google.com/dataproc-serverless/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

### Before you begin

**Before proceeding, you should complete the following pre-requisites:**

* [Configure your project for Vertex AI Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project).

* [Enable the Dataproc API](https://console.cloud.google.com/flows/enableapi?apiid=dataproc.googleapis.com) in your project.

* Ensure your project meets the networking requirements detailed in [Dataproc Serverless for Spark network configuration](https://cloud.google.com/dataproc-serverless/docs/concepts/network).

## Installations

Install the following packages for executing this MLOps notebook.

In [None]:
import os

# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_GOOGLE_CLOUD_NOTEBOOK:
    USER_FLAG = "--user"

# Install the required packages
! pip3 install tensorflow-io $USER_FLAG -q
! pip3 install --upgrade google-cloud-pipeline-components kfp $USER_FLAG -q

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# When IS_TESTING is set the kernel is left running.
if not os.getenv("IS_TESTING"):
    import IPython

    # Restart the kernel so the freshly installed packages can be found.
    IPython.Application.instance().kernel.do_shutdown(True)

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you. The notebook will configure the Dataproc Serverless components to run in the same region.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

if REGION == "[your-region]":
    REGION = "us-central1"

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### Authenticate your Google Cloud account

**If you are using Google Cloud Notebooks**, your environment is already authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions when prompted to authenticate your account via OAuth.

**Otherwise**, follow these steps:

In the Cloud Console, go to the [Create service account key](https://console.cloud.google.com/apis/credentials/serviceaccountkey) page.

1. **Click Create service account**.

2. In the **Service account name** field, enter a name, and click **Create**.

3. In the **Grant this service account access to project** section, click the Role drop-down list. Type "Vertex" into the filter box, and select **Vertex Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

4. Click Create. A JSON file that contains your key downloads to your local environment.

5. Enter the path to your service account key as the GOOGLE_APPLICATION_CREDENTIALS variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# Google Cloud Notebook instances (detected via this metadata file) are
# already authenticated, so this whole block is skipped there.
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    # In Colab, authenticate interactively through the Colab auth helper.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the '' below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account. This branch is skipped when IS_TESTING is set.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you initialize the Vertex SDK for Python, you specify a Cloud Storage staging bucket. The staging bucket is where all the data associated with your dataset and model resources are retained across sessions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [None]:
# Derive a bucket URI from the project ID and the session timestamp when no
# bucket name was supplied.
if BUCKET_URI in ("", None, "gs://[your-bucket-name]"):
    BUCKET_URI = f"gs://{PROJECT_ID}aip-{TIMESTAMP}"

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
! gsutil mb -l $REGION $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
! gsutil ls -al $BUCKET_URI

#### Service Account

You use a service account to create the Vertex AI Pipeline job. If you do not want to use your project's Compute Engine service account, set `SERVICE_ACCOUNT` to another service account ID.

In [None]:
SERVICE_ACCOUNT = "[your-service-account]"

In [None]:
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT == "[your-service-account]"
    or SERVICE_ACCOUNT is None
):
    shell_output = ! gcloud projects describe $PROJECT_ID | sed -nre 's:.*projectNumber\: (.*):\1:p'
    SERVICE_ACCOUNT = (
        shell_output[0].replace("'", "") + "-compute@developer.gserviceaccount.com"
    )

print("Service Account:", SERVICE_ACCOUNT)

Run the following commands to grant your project's Compute Engine service account access to read and write pipeline artifacts in the bucket that you created in the previous step.

In [None]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

#### Grant Dataproc roles to the Service Account

**Note: You can skip this section if you're using your project's Compute Engine default service account, and if that account has been granted the project `Editor` role already.**

For simplicity, this notebook uses the same service account for Vertex AI Pipelines jobs and Dataproc Serverless workloads. Dataproc Serverless provides IAM roles to create and run workloads:

* The `Dataproc Editor` role grants the necessary IAM permissions to create a workload. The role should be granted to user and service accounts that create Dataproc batch workloads.
* The `Dataproc Worker` role grants the necessary IAM permissions to run a workload. The role should be granted to the service account that runs the batch workload.

You may not need to grant the `Dataproc Editor` and `Dataproc Worker` roles if your service account has already been granted with the  permissions required to create and run Dataproc batch workloads. For example, the Compute Engine default service account may already be granted the project `Editor` role. The `Editor` role provides sufficient permissions to both create and run Dataproc workloads.

**Note:** The following cells will fail if the account used by `gcloud` has not been granted permissions to modify IAM policies. In this case, you can set IAM policies for the service account by accessing the [IAM page in the Cloud Console](https://console.cloud.google.com/iam-admin/iam) with a user account that has been granted the required permissions.

Set `GRANT_DATAPROC_EDITOR_ROLE` to `True` if you wish to grant the `Dataproc Editor` role to your service account. The `Dataproc Editor` role grants the IAM permissions needed to create Dataproc Serverless workloads.

In [None]:
GRANT_DATAPROC_EDITOR_ROLE = False

if GRANT_DATAPROC_EDITOR_ROLE:
    ! gcloud projects add-iam-policy-binding $PROJECT_ID \
        --member="serviceAccount:$SERVICE_ACCOUNT" \
        --role="roles/dataproc.editor"

Set `GRANT_DATAPROC_WORKER_ROLE` to `True` if you wish to grant the `Dataproc Worker` role to your service account. The `Dataproc Worker` role grants the IAM permissions needed to run Dataproc Serverless workloads.

*Note*: The following cell will fail if the account used by `gcloud` has not been granted permissions to modify IAM policies. In this case, you can set IAM policies for the service account by accessing the [IAM page in the Cloud Console](https://console.cloud.google.com/iam-admin/iam) with a user account that has been granted the required permissions.

In [None]:
GRANT_DATAPROC_WORKER_ROLE = False

if GRANT_DATAPROC_WORKER_ROLE:
    ! gcloud projects add-iam-policy-binding $PROJECT_ID \
        --member="serviceAccount:$SERVICE_ACCOUNT" \
        --role="roles/dataproc.worker"

### Import libraries and define constants

In [None]:
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Pass the region explicitly so that pipeline jobs are created in REGION
# (the same region as the staging bucket and the Dataproc batches) rather
# than in the SDK's default location.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

## Running a PySpark workload

This section shows you how to create a PySpark batch workload from Vertex AI Pipelines. The pipeline uses `DataprocPySparkBatchOp` to run a Python script that counts the frequency of words used in Shakespeare.

### Write the PySpark word count program.

First, you write a Python script that uses PySpark to perform a simple word count. The code is written to a local file called `wordcount.py`.

In [None]:
%%writefile wordcount.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A PySpark program that counts the number of words in Shakespeare."""

import argparse
import sys
from pyspark.sql import SparkSession

def run(argv=None):
    """Count word occurrences in a text file and save the counts.

    Args:
        argv: Command-line arguments. `--input` names the text file to read
            and `--output` names the location the counts are written to;
            unrecognized arguments are ignored.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input',
        dest='input',
        default=' ',
        help='Input file to process.',
    )
    arg_parser.add_argument(
        '--output',
        dest='output',
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.',
    )

    # parse_known_args tolerates entries it does not recognize.
    options, _unknown = arg_parser.parse_known_args(argv)

    session = SparkSession.builder.appName("wordcount").getOrCreate()

    # Split each line on single spaces, pair every word with a count of 1,
    # then sum the counts per word.
    lines = session.sparkContext.textFile(options.input)
    tokens = lines.flatMap(lambda line: line.split(" "))
    pairs = tokens.map(lambda word: (word, 1))
    totals = pairs.reduceByKey(lambda a, b: a+b)
    totals.saveAsTextFile(options.output)

    session.stop()
    
if __name__ == '__main__':
    run(sys.argv)

### Copy python module to Cloud Storage

Next, you copy `wordcount.py` to a Cloud Storage bucket.

Additionally, you set the Cloud Storage locations for the input and output of the script.

In [None]:
GCS_WC_PY = BUCKET_URI + "/wordcount.py"
! gsutil cp wordcount.py $GCS_WC_PY

GCS_WC_OUT = BUCKET_URI + "/wc_out/"
GCS_WC_IN = "gs://dataproc-datasets-us-central1/shakespeare/all-lines.txt"

### Create and execute the pipeline job

In this example, the `DataprocPySparkBatchOp` component takes the following parameters:

- `batch_id`: The batch ID to use for the Dataproc Batch workload.
- `project_id`: The project ID.
- `location`: The region.
- `main_python_file_uri`: The URI of the main Python file.
- `service_account`: The service account that runs the workload.
- `args`: The arguments to pass to the PySpark program.

Learn more about the [Dataproc Serverless PySpark batch component](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html#google_cloud_pipeline_components.experimental.dataproc.DataprocPySparkBatchOp).

In [None]:
PIPELINE_ROOT = "{}/pipeline_root/dataproc_pyspark".format(BUCKET_URI)
BATCH_ID = "wordcount-pyspark-" + TIMESTAMP
ARGS = [
    "--input",
    GCS_WC_IN,
    "--output",
    GCS_WC_OUT,
]


@dsl.pipeline(
    name="dataproc-pyspark",
    description="An example pipeline that uses DataprocPySparkBatchOp for running a PySpark batch workload.",
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    main_python_file_uri: str = GCS_WC_PY,
    service_account: str = SERVICE_ACCOUNT,
    args: list = ARGS,
):
    # Single-task pipeline: submit the Python file at `main_python_file_uri`
    # as the Dataproc batch `batch_id` in `project_id`/`location`, run as
    # `service_account`, with `args` forwarded to the program.
    from google_cloud_pipeline_components.experimental.dataproc import \
        DataprocPySparkBatchOp

    # The task object is not used afterwards, hence the throwaway name.
    _ = DataprocPySparkBatchOp(
        project=project_id,
        location=location,
        batch_id=batch_id,
        main_python_file_uri=main_python_file_uri,
        service_account=service_account,
        args=args,
    )


compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

pipeline = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

pipeline.run()

! gsutil cat {GCS_WC_OUT}* | head -n10

! rm -f pipeline.json wordcount.py

### View Dataproc pipeline results

Finally, you will view the artifact outputs of each task in the pipeline.

In [None]:
import tensorflow as tf

PROJECT_NUMBER = pipeline.gca_resource.name.split("/")[1]
print(PROJECT_NUMBER)


def print_pipeline_output(job, output_task_name):
    JOB_ID = job.name
    print(JOB_ID)
    for _ in range(len(job.gca_resource.job_detail.task_details)):
        TASK_ID = job.gca_resource.job_detail.task_details[_].task_id
        EXECUTE_OUTPUT = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/executor_output.json"
        )
        GCP_RESOURCES = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/gcp_resources"
        )
        EVAL_METRICS = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/evaluation_metrics"
        )
        if tf.io.gfile.exists(EXECUTE_OUTPUT):
            print(EXECUTE_OUTPUT, "EXECUTE_OUTPUT")
            ! gsutil cat $EXECUTE_OUTPUT
            return EXECUTE_OUTPUT
        elif tf.io.gfile.exists(GCP_RESOURCES):
            ! gsutil cat $GCP_RESOURCES
            return GCP_RESOURCES
        elif tf.io.gfile.exists(EVAL_METRICS):
            ! gsutil cat $EVAL_METRICS
            return EVAL_METRICS

    return None


print("dataproc-create-pyspark-batch")
artifacts = print_pipeline_output(pipeline, "dataproc-create-pyspark-batch")
print("\n\n")

### Delete the pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Delete the batch

You can delete the created batch in Dataproc serverless using the following `gcloud` command.

In [None]:
! gcloud dataproc batches delete $BATCH_ID --region=$REGION --quiet

## Running a Spark workload
This section shows you how to create a Spark batch workload from Vertex AI Pipelines. The pipeline uses the `DataprocSparkBatchOp` component to run the `JavaWordCount` example that is pre-installed in the Dataproc Serverless default container image.

[View the source code](https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/JavaWordCount.java) for the `JavaWordCount` example.

### Create and execute the pipeline job

In this example, the `DataprocSparkBatchOp` component takes the following parameters:

- `batch_id`: The batch ID to use for the Dataproc Batch workload.
- `project_id`: The project ID.
- `location`: The region.
- `main_class`: The main class.
- `jar_file_uris`: The URIs of any required JARs to include in the executor and driver CLASSPATH.
- `service_account`: The service account that runs the workload.
- `args`: The arguments to pass to the Spark program.

Learn more about the [Dataproc Serverless Spark batch component](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html#google_cloud_pipeline_components.experimental.dataproc.DataprocSparkBatchOp).

In [None]:
PIPELINE_ROOT = "{}/pipeline_root/dataproc_spark".format(BUCKET_URI)
BATCH_ID = "wordcount-spark-" + TIMESTAMP

MAIN_CLASS = "org.apache.spark.examples.JavaWordCount"
JAR_FILE_URIS = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
ARGS = ["gs://dataproc-datasets-us-central1/shakespeare/all-lines.txt"]


@dsl.pipeline(
    name="dataproc-spark-wc",
    description="An example pipeline that uses DataprocSparkBatchOp to run a Spark batch workload.",
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    main_class: str = MAIN_CLASS,
    jar_file_uris: list = JAR_FILE_URIS,
    service_account: str = SERVICE_ACCOUNT,
    args: list = ARGS,
):
    # Single-task pipeline: run `main_class` from `jar_file_uris` as the
    # Dataproc batch `batch_id` in `project_id`/`location`, run as
    # `service_account`, with `args` forwarded to the program.
    from google_cloud_pipeline_components.experimental.dataproc import (
        DataprocSparkBatchOp,
    )

    # The returned task is not needed, so it is not bound to a name.
    DataprocSparkBatchOp(
        project=project_id,
        location=location,
        batch_id=batch_id,
        main_class=main_class,
        jar_file_uris=jar_file_uris,
        service_account=service_account,
        args=args,
    )


compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

pipeline = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

pipeline.run()

! rm -f pipeline.json

The `JavaWordCount` example prints the results to standard output, which is captured in Cloud Logging. 

When the pipeline finishes running, you can inspect the logging output of the Dataproc batch workload using the Cloud Console. Run the following cell to generate a link to the batch workload in the Cloud Console:

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Render a clickable link to this batch's monitoring page in the Cloud Console.
display(
    HTML(
        f"""<a href="https://console.cloud.google.com/dataproc/batches/{REGION}/{BATCH_ID}/monitoring?project={PROJECT_ID}">Link to Dataproc Batch workload.</a>"""
    )
)

### View Dataproc pipeline results

Finally, you will view the artifact outputs of each task in the pipeline.

In [None]:
PROJECT_NUMBER = pipeline.gca_resource.name.split("/")[1]
print(PROJECT_NUMBER)

print("dataproc-create-spark-batch")
artifacts = print_pipeline_output(pipeline, "dataproc-create-spark-batch")
print("\n\n")

### Delete the pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Delete the batch

You can delete the created batch in Dataproc serverless using the following `gcloud` command.

In [None]:
! gcloud dataproc batches delete $BATCH_ID --region=$REGION --quiet

## Running a Spark SQL workload
This section shows you how to create a Spark SQL batch workload from Vertex AI Pipelines. The pipeline uses the `DataprocSparkSqlBatchOp` component to run Spark SQL queries on a public sample dataset. The sample dataset is provided by the US Social Security Administration and contains approximately 7 MB of data about popular baby names.

Download and extract the baby names zip file:

In [None]:
! curl -OL https://www.ssa.gov/OACT/babynames/names.zip && unzip -o names.zip -d babynames

The data is in CSV format. Each row contains values for `name`, `gender`, and `count` in that order. Run the next cell to inspect the sample data:

In [None]:
! head babynames/yob2010.txt

Copy the sample data to Cloud Storage:

In [None]:
! gsutil -m cp babynames/*.txt $BUCKET_URI/babynames

Next, write the SQL queries to execute to a file. The queries in the file perform the following:

- Creates an external table called `babynames_2010` that uses one of the sample files that you uploaded to Cloud Storage (`yob2010.txt`). The `yob2010.txt` file contains baby names from the year 2010.
- Creates an external table called `top_2010`, which is populated using a `SELECT` statement that queries popular female names. The table data is stored as CSV files in Cloud Storage. 

The values for `bucket-name`, `output-location`, `gender`, and `max-results` will be provided at runtime using the `query_variables` parameter of the `DataprocSparkSqlBatchOp` component.

In [None]:
%%writefile top_names.sql

CREATE TABLE babynames_2010 (name STRING, gender STRING, count INT)
    USING CSV LOCATION '${bucket-name}/babynames/yob2010.txt';

CREATE TABLE top_2010 
    USING CSV LOCATION '${output-location}'
AS
    SELECT name, count 
        FROM babynames_2010 
    WHERE gender = '${gender}' ORDER BY count DESC LIMIT ${max-results};

Copy the query file to Cloud Storage.

In [None]:
! gsutil cp top_names.sql $BUCKET_URI

### Create and execute the pipeline job

In this example, the `DataprocSparkSqlBatchOp` component takes the following parameters:

- `batch_id`: The batch ID to use for the Dataproc Batch workload.
- `project_id`: The project ID.
- `location`: The region.
- `query_file_uri`: The URI of the file containing the SQL queries.
- `query_variables`: The mapping of query variable names to values (equivalent to the Spark SQL command  `SET name="value";`).
- `service_account`: The service account that runs the workload.

Learn more about the [Dataproc Serverless Spark SQL batch component](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html#google_cloud_pipeline_components.experimental.dataproc.DataprocSparkSqlBatchOp).

In [None]:
import os

PIPELINE_ROOT = "{}/pipeline_root/dataproc_spark_sql".format(BUCKET_URI)
BATCH_ID = "top-names-spark-sql-" + TIMESTAMP

# Cloud Storage URIs always use "/" as the separator, so they are built by
# string concatenation (as in the other cells) rather than os.path.join,
# which would use the local operating system's path separator.
QUERY_FILE_URI = BUCKET_URI + "/top_names.sql"
OUTPUT_LOCATION = BUCKET_URI + "/top_2010_names_f"

# Values substituted for the ${...} placeholders in top_names.sql.
QUERY_VARIABLES = {
    "bucket-name": BUCKET_URI,
    "output-location": OUTPUT_LOCATION,
    "max-results": "50",
    "gender": "F",
}


@dsl.pipeline(
    name="dataproc-spark-sql-top-names",
    description="An example pipeline that uses DataprocSparkSqlBatchOp to run Spark SQL queries.",
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    query_file_uri: str = QUERY_FILE_URI,
    query_variables: dict = QUERY_VARIABLES,
    service_account: str = SERVICE_ACCOUNT,
):
    # Single-task pipeline: run the queries in `query_file_uri`, with
    # `query_variables` supplying the variable values, as the Dataproc batch
    # `batch_id` in `project_id`/`location`, run as `service_account`.
    from google_cloud_pipeline_components.experimental.dataproc import (
        DataprocSparkSqlBatchOp,
    )

    # The returned task is not needed, so it is not bound to a name.
    DataprocSparkSqlBatchOp(
        project=project_id,
        location=location,
        batch_id=batch_id,
        query_file_uri=query_file_uri,
        query_variables=query_variables,
        service_account=service_account,
    )


compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

pipeline = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

pipeline.run()

! gsutil cat $OUTPUT_LOCATION/*.csv

### View Dataproc pipeline results

Finally, you will view the artifact outputs of each task in the pipeline.

In [None]:
PROJECT_NUMBER = pipeline.gca_resource.name.split("/")[1]
print(PROJECT_NUMBER)

print("dataproc-create-spark-sql-batch")
artifacts = print_pipeline_output(pipeline, "dataproc-create-spark-sql-batch")
print("\n\n")

### Delete the pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Delete the batch

You can delete the created batch in Dataproc serverless using the following `gcloud` command.

In [None]:
! gcloud dataproc batches delete $BATCH_ID --region=$REGION --quiet

## Running a SparkR workload
This section shows you how to create a SparkR batch workload from Vertex AI Pipelines. The pipeline uses the `DataprocSparkRBatchOp` component to run a simple R script that counts the frequency of words used in Shakespeare.

### Write the SparkR word count program.

First, you write an R script that performs a simple word count. The program code is written to a local file called `wordcount.R`.

In [None]:
%%writefile wordcount.R

library(SparkR)

sparkR.session(appName = "SparkR-wordcount")

args <- commandArgs(trailing = TRUE)
inputFile <- args[[1]]
outputFile <- args[[2]]

lines <- read.text(inputFile)
filtered <- selectExpr(lines, "regexp_replace(value, '[\",.?:!]', '') as filtered")
words <- selectExpr(filtered, "explode(split(filtered, ' ')) as word")
wordCounts <- count(groupBy(words, "word"))

write.df(wordCounts, outputFile, "com.databricks.spark.csv")

### Copy the R script to Cloud Storage

Next, you copy `wordcount.R` to your Cloud Storage bucket.

Additionally, you set the Cloud Storage locations for the input and output of the script.

In [None]:
GCS_WC_R = BUCKET_URI + "/wordcount.R"
! gsutil cp wordcount.R $GCS_WC_R

GCS_WC_R_OUT = BUCKET_URI + "/wc_r_out"
GCS_WC_R_IN = "gs://dataproc-datasets-us-central1/shakespeare/all-lines.txt"

### Create and execute the pipeline job

In this example, the `DataprocSparkRBatchOp` component takes the following parameters:

- `batch_id`: The batch ID to use for the Dataproc Batch workload.
- `project_id`: The project ID.
- `location`: The region.
- `main_r_file_uri`: The URI of the main R file.
- `service_account`: The service account that runs the workload.
- `args`: The arguments to pass to the Spark program.

Learn more about the [Dataproc Serverless SparkR batch component](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html#google_cloud_pipeline_components.experimental.dataproc.DataprocSparkRBatchOp).

In [None]:
PIPELINE_ROOT = "{}/pipeline_root/dataproc_sparkr".format(BUCKET_URI)
BATCH_ID = "wordcount-sparkr-" + TIMESTAMP

# [input-file, output-file]
ARGS = [GCS_WC_R_IN, GCS_WC_R_OUT]


@dsl.pipeline(
    name="dataproc-sparkr-wc",
    description="An example pipeline that uses DataprocSparkRBatchOp to run a SparkR batch workload.",
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    main_r_file_uri: str = GCS_WC_R,
    service_account: str = SERVICE_ACCOUNT,
    args: list = ARGS,
):
    # Single-task pipeline: submit the R file at `main_r_file_uri` as the
    # Dataproc batch `batch_id` in `project_id`/`location`, run as
    # `service_account`, with `args` forwarded to the script.
    from google_cloud_pipeline_components.experimental.dataproc import \
        DataprocSparkRBatchOp

    # `service_account` is forwarded so that the workload runs as the
    # configured account, matching the other Dataproc pipelines in this
    # notebook; previously the parameter was accepted but never used.
    _ = DataprocSparkRBatchOp(
        project=project_id,
        location=location,
        batch_id=batch_id,
        main_r_file_uri=main_r_file_uri,
        service_account=service_account,
        args=args,
    )


compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

pipeline = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

pipeline.run()

# Print the first 1KB of the CSV output:

! gsutil cat -r 0-1024 $GCS_WC_R_OUT/*.csv

### View Dataproc pipeline results

Finally, you will view the artifact outputs of each task in the pipeline.

In [None]:
PROJECT_NUMBER = pipeline.gca_resource.name.split("/")[1]
print(PROJECT_NUMBER)

print("dataproc-create-spark-r-batch")
artifacts = print_pipeline_output(pipeline, "dataproc-create-spark-r-batch")
print("\n\n")

### Delete the pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Delete the batch

You can delete the created batch in Dataproc serverless using the following `gcloud` command.

In [None]:
! gcloud dataproc batches delete $BATCH_ID --region=$REGION --quiet

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

### Delete Cloud Storage bucket
Set `delete_bucket` to `True` to delete the Cloud Storage bucket.

In [None]:
delete_bucket = False

if delete_bucket or os.getenv("IS_TESTING"):
    # Delete the Cloud storage bucket
    ! gsutil rm -r $BUCKET_URI