In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run Dataproc Templates from Vertex AI Pipelines

## Overview

This notebook shows how to build a Vertex AI Pipeline to run a Dataproc Template using the DataprocPySparkBatchOp component.

#### References

- [DataprocPySparkBatchOp reference](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html)
- [Kubeflow SDK Overview](https://www.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/)
- [Dataproc Serverless in Vertex AI Pipelines tutorial](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_dataproc_serverless_pipeline_components.ipynb)
- [Build a Vertex AI Pipeline](https://cloud.google.com/vertex-ai/docs/pipelines/build-pipeline)

This notebook is built to run on a Vertex AI User-Managed Notebook using the default Compute Engine Service Account.  
Check the Dataproc Serverless in Vertex AI Pipelines tutorial linked above to learn how to set up a different Service Account.  

#### Permissions

Make sure that the service account used to run the notebook has the following roles:

- roles/aiplatform.serviceAgent
- roles/aiplatform.customCodeServiceAgent
- roles/storage.objectCreator
- roles/storage.objectViewer
- roles/dataproc.editor
- roles/dataproc.worker

#### Install the required packages

In [1]:
import os

# Google Cloud notebooks requires dependencies to be installed with '--user'
! pip3 install --upgrade google-cloud-pipeline-components kfp --user -q

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
# Restart the kernel so that the newly installed packages can be imported.
# Comment this cell out to skip the automatic restart.
import os

running_under_test = os.getenv("IS_TESTING")
if not running_under_test:
    import IPython

    # do_shutdown(True) asks the kernel to shut down and restart.
    IPython.Application.instance().kernel.do_shutdown(True)

#### Import dependencies

In [2]:
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler
from datetime import datetime

#### Change working directory to the Dataproc Templates python folder

In [3]:
WORKING_DIRECTORY = "dataproc-templates/python/"
%cd /home/jupyter/dataproc-templates/python

/home/jupyter/dataproc-templates/python


#### Set Google Cloud properties

In [4]:
# User Inputs

get_project_id = ! gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = get_project_id[0]
REGION = "us-west1"
GCS_STAGING_LOCATION = "gs://shubham_bqtest"
SUBNET = "projects/yadavaja-sandbox/regions/us-west1/subnetworks/test-subnet1"
input_hive_database="hive2bq"
input_hive_tables="empdata"
output_bigquery_dataset="hive2bq"
temp_bucket="shubham_bqtest"
hive_metastore="thrift://hive-cluster-m:9083"
max_parallelism=10

#### Build Dataproc Templates python package

In [5]:
PACKAGE_EGG_FILE = "dist/dataproc_templates_distribution.egg"
! python ./setup.py bdist_egg --output=$PACKAGE_EGG_FILE

running bdist_egg
running egg_info
writing dataproc_templates.egg-info/PKG-INFO
writing dependency_links to dataproc_templates.egg-info/dependency_links.txt
writing requirements to dataproc_templates.egg-info/requires.txt
writing top-level names to dataproc_templates.egg-info/top_level.txt
reading manifest file 'dataproc_templates.egg-info/SOURCES.txt'
writing manifest file 'dataproc_templates.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/test
creating build/bdist.linux-x86_64/egg/test/util
copying build/lib/test/util/__init__.py -> build/bdist.linux-x86_64/egg/test/util
copying build/lib/test/util/test_argument_parsing.py -> build/bdist.linux-x86_64/egg/test/util
creating build/bdist.linux-x86_64/egg/test/hive
copying build/lib/test/hive/test_hive_to_gcs.py -> build/bdist.linux-x86_64/egg/test/hive
copying build/lib/test/hive/__init__.py -> b

#### Copy package to the GCS bucket

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/storage.objectCreator
 - roles/storage.objectViewer

In [6]:
! gsutil cp main.py $GCS_STAGING_LOCATION/
! gsutil cp -r $PACKAGE_EGG_FILE $GCS_STAGING_LOCATION/dist/
! gsutil cp dataproc_templates/hive/get_hive_tables.py $GCS_STAGING_LOCATION/

Copying file://main.py [Content-Type=text/x-python]...
/ [1 files][  3.6 KiB/  3.6 KiB]                                                
Operation completed over 1 objects/3.6 KiB.                                      
Copying file://dist/dataproc_templates_distribution.egg [Content-Type=application/octet-stream]...
/ [1 files][ 77.2 KiB/ 77.2 KiB]                                                
Operation completed over 1 objects/77.2 KiB.                                     
Copying file://dataproc_templates/hive/get_hive_tables.py [Content-Type=text/x-python]...
/ [1 files][  3.8 KiB/  3.8 KiB]                                                
Operation completed over 1 objects/3.8 KiB.                                      


#### Set Dataproc Templates properties

In [7]:
# GCS URIs derived from the staging location.
PIPELINE_ROOT = f"{GCS_STAGING_LOCATION}/pipeline_root/dataproc_pyspark"
MAIN_PYTHON_FILE = f"{GCS_STAGING_LOCATION}/main.py"
GET_HIVE_TABLES_PY = f"{GCS_STAGING_LOCATION}/get_hive_tables.py"
PYTHON_FILE_URIS = [f"{GCS_STAGING_LOCATION}/dist/dataproc_templates_distribution.egg"]

# Jar files passed to every Dataproc batch.
JARS = ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"]

#### Choose template and set template arguments

HIVETOBIGQUERY is chosen in this notebook as an example.  
Check the arguments in the template's documentation.  

In [8]:
# Arguments for the HIVETOBIGQUERY template, filled in from the user inputs.
TEMPLATE_SPARK_ARGS = [
    "--template=HIVETOBIGQUERY",
    f"--hive.bigquery.input.database={input_hive_database}",
    f"--hive.bigquery.input.table={input_hive_tables}",
    f"--hive.bigquery.output.dataset={output_bigquery_dataset}",
    "--hive.bigquery.output.mode=overwrite",
    f"--hive.bigquery.temp.bucket.name={temp_bucket}",
]

Get List of All the Hive Tables

In [9]:
# Batch ID should be 4-63 characters; it combines the Hive database name
# with a second-resolution timestamp.
BATCH_ID = f"b-{input_hive_database}-{datetime.now():%Y%m%d%H%M%S}"

# Spark runtime properties for the batch: point Spark at the Hive metastore.
runtime_prop = {"spark.hadoop.hive.metastore.uris": hive_metastore}

aiplatform.init(project=PROJECT_ID, staging_bucket=GCS_STAGING_LOCATION)

@dsl.pipeline(
    name="dataproc-templates-pyspark",
    description="DataprocPySparkBatchOp to get list of tables from hive metastore: "+hive_metastore,
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    main_python_file_uri: str = GET_HIVE_TABLES_PY,
    python_file_uris: list = PYTHON_FILE_URIS,
    jar_file_uris: list = JARS,
    subnetwork_uri: str = SUBNET,
    args: list = TEMPLATE_SPARK_ARGS,
):
    """Pipeline with a single Dataproc PySpark batch that lists the Hive tables.

    Every parameter is forwarded to the ``DataprocPySparkBatchOp`` argument
    of the matching name (``project_id`` is passed as ``project``).

    Args:
        batch_id: ID of the Dataproc batch to create.
        project_id: Project in which the batch runs.
        location: Region in which the batch runs.
        main_python_file_uri: URI of the main PySpark file
            (defaults to the staged get_hive_tables.py).
        python_file_uris: URIs of additional Python files for the batch.
        jar_file_uris: URIs of jar files for the batch.
        subnetwork_uri: Subnetwork used by the batch.
        args: Arguments passed to the main Python file.
    """
    from google_cloud_pipeline_components.experimental.dataproc import \
        DataprocPySparkBatchOp

    _ = DataprocPySparkBatchOp(
        project=project_id,
        location=location,
        # Use the pipeline parameter so that a batch_id supplied by the
        # caller is honoured instead of always taking the global BATCH_ID.
        batch_id=batch_id,
        main_python_file_uri=main_python_file_uri,
        python_file_uris=python_file_uris,
        jar_file_uris=jar_file_uris,
        # Module-level dict holding the Hive metastore URI property.
        runtime_config_properties=runtime_prop,
        subnetwork_uri=subnetwork_uri,
        args=args,
    )
        

# Compile the pipeline function into a job specification file.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

# Keep the job under its own name so that `pipeline` continues to refer to
# the pipeline function and the compile step above can be re-run on its own.
pipeline_job = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

# Submit the job; its state is reported in the cell output.
pipeline_job.run()



Creating PipelineJob
PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220731130011
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220731130011')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/dataproc-templates-pyspark-20220731130011?project=617357862702
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220731130011 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220731130011 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220731130011 current state:
PipelineState.PIPELINE_STATE_RUNNING
Pipel

# Copy Hive Tables File from GCS to Local

In [11]:
import os
import subprocess

# Local CSV file that receives the table names, named after the batch ID.
table_file = os.path.join(
    WORKING_DIRECTORY, "notebooks", "HIVE", "tables", "{}.csv".format(BATCH_ID)
)
# CSV output expected under gs://<temp_bucket>/<input_hive_database>/.
in_file = 'gs://'+temp_bucket+'/'+input_hive_database+'/*.csv'

# Make sure the destination folder exists before copying into it.
os.makedirs(os.path.dirname(table_file), exist_ok=True)

# An argument list avoids shell interpolation of the paths (gsutil expands
# the wildcard itself), and check=True raises CalledProcessError when the
# copy fails, so later cells never read a missing or stale file.
subprocess.run(["gsutil", "cp", in_file, table_file], check=True);

Copying gs://shubham_bqtest/hive2bq/part-00000-359f375a-7477-4ec5-a13c-aaf55e9594d2-c000.csv...
/ [1 files][   91.0 B/   91.0 B]                                                
Operation completed over 1 objects/91.0 B.                                       


0

In [17]:
# Debug aid: show the directory prefix that table_file was built from.
print(WORKING_DIRECTORY)
new-dataproc-templates/dataproc-templates/python/notebooks/HIVE/HIVEtoBigquery_vertex_pipeline_pyspark.ipynb

/home/jupyter/dataproc-templates/python/


# Calculate the number of tables per job so that at most `max_parallelism` jobs run

In [13]:
# Read the table names, one per line. The context manager closes the file,
# and blank lines are dropped so they are not counted as tables.
with open(table_file, "r") as tables_f:
    table_list = [line.strip() for line in tables_f if line.strip()]
table_count = len(table_list)

# ceil(table_count / max_parallelism) in exact integer arithmetic. Later cells
# use this value as the size of each slice of table_list, which yields at most
# max_parallelism slices.
result = (table_count + max_parallelism - 1) // max_parallelism
result

1

# Split the tables into per-job chunks of the size calculated above

In [9]:
import string
import random


def get_table_list(i):
    """Build the HIVETOBIGQUERY template arguments for one chunk of tables.

    Reads the module-level ``table_list``, ``result``, ``input_hive_database``,
    ``output_bigquery_dataset`` and ``temp_bucket``.

    Args:
        i: Index in ``table_list`` at which the chunk starts. The chunk
            holds up to ``result`` consecutive entries.

    Returns:
        list of str: The template arguments for the chunk, ending with a
        random 10-character ``--migration_id``.
    """
    # Strip each entry so that line endings read from the tables file never
    # end up inside an argument value.
    chunk_tables = [name.strip() for name in table_list[i:result + i]]
    chunk_tables_string = ",".join(chunk_tables)
    migration_id = "".join(random.choices(string.ascii_lowercase + string.digits, k=10))
    return [
        "--template=HIVETOBIGQUERY",
        "--hive.bigquery.input.database={}".format(input_hive_database),
        # Pass the comma-separated names, not the repr of the Python list.
        "--hive.bigquery.input.table={}".format(chunk_tables_string),
        "--hive.bigquery.output.dataset={}".format(output_bigquery_dataset),
        "--hive.bigquery.output.mode=overwrite",
        "--hive.bigquery.temp.bucket.name={}".format(temp_bucket),
        "--hive.database.all.tables={}".format(chunk_tables_string),
        "--migration_id={}".format(migration_id),
    ]



### Build pipeline and run Dataproc Template on Vertex AI Pipelines

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/dataproc.editor
 - roles/dataproc.worker

In [10]:
import time

# Spark runtime properties handed to every Dataproc batch of the pipeline below.
runtime_prop = {
    'spark.hadoop.hive.metastore.uris': hive_metastore,
    'mapreduce.fileoutputcommitter.marksuccessfuljobs': "false",
}

aiplatform.init(project=PROJECT_ID, staging_bucket=GCS_STAGING_LOCATION)

@dsl.pipeline(
    name="dataproc-templates-pyspark",
    description="DataprocPySparkBatchOp to run HiveToBigQuery PySpark Dataproc Template batch workload",
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    main_python_file_uri: str = MAIN_PYTHON_FILE,
    python_file_uris: list = PYTHON_FILE_URIS,
    jar_file_uris: list = JARS,
    subnetwork_uri: str = SUBNET,
    args: list = TEMPLATE_SPARK_ARGS,
):
    """Pipeline with one Dataproc PySpark batch per chunk of Hive tables.

    Chunks start at every ``result``-th index of the module-level
    ``table_list``; the arguments of each batch come from ``get_table_list``.

    Args:
        batch_id: Not used by the body; every batch gets its own generated
            ID. Kept so the pipeline signature stays unchanged.
        project_id: Project in which the batches run.
        location: Region in which the batches run.
        main_python_file_uri: URI of the main PySpark file.
        python_file_uris: URIs of additional Python files for the batches.
        jar_file_uris: URIs of jar files for the batches.
        subnetwork_uri: Subnetwork used by the batches.
        args: Not used by the body; the per-chunk arguments replace it.
            Kept so the pipeline signature stays unchanged.
    """
    from google_cloud_pipeline_components.experimental.dataproc import \
        DataprocPySparkBatchOp

    # One timestamp for the whole run. The job index appended to it makes
    # every batch ID unique, so no sleep between ops is needed to get
    # distinct timestamps.
    run_stamp = datetime.now().strftime("%Y%m%d%H%M%S")

    # range() rejects a zero step, which is what `result` is when there are
    # no tables, so that case is handled explicitly.
    chunk_starts = range(0, table_count, result) if table_count > 0 else range(0)

    for job_index, start in enumerate(chunk_starts):
        _ = DataprocPySparkBatchOp(
            project=project_id,
            location=location,
            # Batch IDs are limited to 4-63 characters.
            batch_id="b-{}-{}-{}".format(input_hive_database, run_stamp, job_index),
            main_python_file_uri=main_python_file_uri,
            python_file_uris=python_file_uris,
            jar_file_uris=jar_file_uris,
            runtime_config_properties=runtime_prop,
            subnetwork_uri=subnetwork_uri,
            args=get_table_list(start),
        )

# Compile the pipeline function into a job specification file.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

# Keep the job under its own name so that `pipeline` continues to refer to
# the pipeline function and the compile step above can be re-run on its own.
pipeline_job = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

# Submit the job; its state is reported in the cell output.
pipeline_job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220729044905
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220729044905')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/dataproc-templates-pyspark-20220729044905?project=617357862702
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220729044905 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220729044905 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/dataproc-templates-pyspark-20220729044905 current state:
PipelineState.PIPELINE_STATE_RUNNING
Pipel