#### Step 1:
#### Set Google Cloud properties

In [None]:
# User Configuration
# User Inputs
# Fill in the empty values below before running the remaining cells.

# Read the active project from the local gcloud configuration (stderr is discarded).
get_project_id = ! gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = get_project_id[0]
REGION = ""  # example "us-west1"
GCS_STAGING_LOCATION = "gs://<bucket_name>" # example "gs://bucket_name"; later cells append "/..." to this value
SUBNET = "" # example "projects/<project-id>/regions/<region-id>/subnetworks/<subnet-name>"
INPUT_HIVE_DATABASE= ""
INPUT_HIVE_TABLES= "" # example "table1,table2,table3..." or "*"
OUTPUT_BIGQUERY_DATASET= ""
TEMP_BUCKET= "<bucket_name>" # bucket name only; a later cell prepends "gs://" itself
HIVE_METASTORE= "" # example "thrift://hive-cluster-m:9083"
MAX_PARALLELISM=10 # Overwrite the value if you want to increase number of parallel Dataproc Batch Jobs

In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run Dataproc Templates from Vertex AI Pipelines

## Overview

This notebook shows how to build a Vertex AI Pipeline to run a Dataproc Template using the DataprocPySparkBatchOp component.

#### References

- [DataprocPySparkBatchOp reference](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html)
- [Kubeflow SDK Overview](https://www.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/)
- [Dataproc Serverless in Vertex AI Pipelines tutorial](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_dataproc_serverless_pipeline_components.ipynb)
- [Build a Vertex AI Pipeline](https://cloud.google.com/vertex-ai/docs/pipelines/build-pipeline)

This notebook is built to run on a Vertex AI User-Managed Notebook using the default Compute Engine Service Account.  
Check the Dataproc Serverless in Vertex AI Pipelines tutorial linked above to learn how to set up a different Service Account.  

#### Permissions
#### TODO - Discuss the permissions required and specific resources

Make sure that the service account used to run the notebook has the following roles:

- roles/aiplatform.serviceAgent
- roles/aiplatform.customCodeServiceAgent
- roles/storage.objectCreator
- roles/storage.objectViewer
- roles/dataproc.editor
- roles/dataproc.worker

#### Step 2:
#### Install the required packages

In [None]:
import os

# Google Cloud notebooks requires dependencies to be installed with '--user'
! pip3 install --upgrade google-cloud-pipeline-components kfp --user -q

### Once you've installed the additional packages, you may need to restart the notebook kernel so it can find the packages.

Uncomment and run the cell below if you want to restart the notebook kernel.

In [None]:
# import os

# if not os.getenv("IS_TESTING"):
#    import IPython
#    app = IPython.Application.instance()
#    app.kernel.do_shutdown(True)

#### Step 3:
#### Import dependencies

In [None]:
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler
from datetime import datetime

#### Step 4:
#### Change working directory to the Dataproc Templates python folder

In [None]:
WORKING_DIRECTORY = "/home/jupyter/dataproc-templates/python"
%cd /home/jupyter/dataproc-templates/python

#### Step 5:
#### Build Dataproc Templates python package

In [None]:
# Egg path, relative to the current working directory; the upload cell copies this file to GCS.
PACKAGE_EGG_FILE = "dist/dataproc_templates_distribution.egg"
# Run setup.py's bdist_egg command; $PACKAGE_EGG_FILE is expanded by IPython before the shell runs.
! python ./setup.py bdist_egg --output=$PACKAGE_EGG_FILE

#### Step 6:
#### Copy package to the GCS bucket

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/storage.objectCreator
 - roles/storage.objectViewer

In [None]:
# Upload the template entry point to the staging location.
! gsutil cp main.py $GCS_STAGING_LOCATION/
# Upload the packaged egg under dist/ in the staging location.
! gsutil cp -r $PACKAGE_EGG_FILE $GCS_STAGING_LOCATION/dist/
# Upload the script that the table-listing pipeline uses as its main Python file.
! gsutil cp dataproc_templates/hive/get_hive_tables.py $GCS_STAGING_LOCATION/

#### Step 7:
#### Choose template and set template arguments

In [None]:
# URIs of the artifacts uploaded to the staging location, plus the pipeline root.
PIPELINE_ROOT = f"{GCS_STAGING_LOCATION}/pipeline_root/dataproc_pyspark"
MAIN_PYTHON_FILE = f"{GCS_STAGING_LOCATION}/main.py"
PYTHON_FILE_URIS = [f"{GCS_STAGING_LOCATION}/dist/dataproc_templates_distribution.egg"]
JARS = ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"]
GET_HIVE_TABLES_PY = f"{GCS_STAGING_LOCATION}/get_hive_tables.py"

# Command-line arguments for the HIVETOBIGQUERY template, filled in from the
# user configuration cell.
TEMPLATE_SPARK_ARGS = [
    "--template=HIVETOBIGQUERY",
    f"--hive.bigquery.input.database={INPUT_HIVE_DATABASE}",
    f"--hive.bigquery.input.table={INPUT_HIVE_TABLES}",
    f"--hive.bigquery.output.dataset={OUTPUT_BIGQUERY_DATASET}",
    "--hive.bigquery.output.mode=overwrite",
    f"--hive.bigquery.temp.bucket.name={TEMP_BUCKET}",
]

#### Step 8:
#### Build pipeline and run Dataproc Template on Vertex AI Pipelines to get list of all the Hive Tables

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/dataproc.editor
 - roles/dataproc.worker

In [None]:
# Batch ID should be 4-63 characters.
# NOTE(review): the ID embeds INPUT_HIVE_DATABASE verbatim - confirm the database
# name only contains characters that Dataproc accepts in a batch ID.
BATCH_ID = "b-{}-{}".format(INPUT_HIVE_DATABASE, datetime.now().strftime("%Y%m%d%H%M%S"))

# Spark runtime properties handed to the batch: the Hive metastore to query.
runtime_prop = {'spark.hadoop.hive.metastore.uris': HIVE_METASTORE}

aiplatform.init(project=PROJECT_ID, staging_bucket=GCS_STAGING_LOCATION)

# Pipeline with a single DataprocPySparkBatchOp step whose main Python file
# defaults to GET_HIVE_TABLES_PY. Every parameter defaults to a value from the
# notebook configuration. Documentation is kept in comments rather than a
# docstring so the compiled pipeline metadata is not affected by it.
@dsl.pipeline(
    name="dataproc-templates-pyspark",
    description="DataprocPySparkBatchOp to get list of tables from hive metastore: "+HIVE_METASTORE,
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    main_python_file_uri: str = GET_HIVE_TABLES_PY,
    python_file_uris: list = PYTHON_FILE_URIS,
    jar_file_uris: list = JARS,
    subnetwork_uri: str = SUBNET,
    args: list = TEMPLATE_SPARK_ARGS,
):
    from google_cloud_pipeline_components.experimental.dataproc import \
        DataprocPySparkBatchOp

    _ = DataprocPySparkBatchOp(
            project=project_id,
            location=location,
            # Use the pipeline parameter (default: BATCH_ID) instead of the
            # global, so a batch_id supplied to the pipeline is honored.
            batch_id=batch_id,
            main_python_file_uri=main_python_file_uri,
            python_file_uris=python_file_uris,
            jar_file_uris=jar_file_uris,
            # runtime_prop is read from the notebook globals at compile time.
            runtime_config_properties=runtime_prop,
            subnetwork_uri=subnetwork_uri,
            args=args
        )
        

# Compile the pipeline function into pipeline.json in the current working directory.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

# NOTE: this rebinds the name `pipeline` from the pipeline function to the
# PipelineJob created from the compiled spec; caching is disabled for the job.
pipeline = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

# Submit the job.
# NOTE(review): run() presumably blocks until the job finishes - confirm, since
# the next step reads the files this job produces.
pipeline.run()

#### Step 9:
#### Copy Hive Tables File from GCS to Local

The Dataproc batch job above collects the table names from the Hive metastore and writes them as CSV to the temp bucket. This step copies that CSV into the local notebooks/HIVE/tables folder.

In [None]:
import os
import subprocess

# Local CSV that receives the table names; named after the batch that produced them.
table_file=WORKING_DIRECTORY+"/notebooks/HIVE/tables/{}.csv".format(BATCH_ID)
# CSV output under the database's folder in the temp bucket.
in_file='gs://'+TEMP_BUCKET+'/'+INPUT_HIVE_DATABASE+'/*.csv'

# Make sure the destination folder exists before copying into it.
os.makedirs(os.path.dirname(table_file), exist_ok=True)

# Pass an argument list (no shell involved; gsutil resolves the gs:// wildcard
# itself) and use check=True so a failed copy raises here instead of surfacing
# later as a missing-file error.
subprocess.run(["gsutil", "cp", in_file, table_file], check=True)

#### Step 10:
#### Calculate Split Count

Calculate the number of tables to be loaded in each batch job based on MAX_PARALLELISM value.

In [None]:
# Fail early on an unusable parallelism setting instead of dividing by zero below.
if MAX_PARALLELISM < 1:
    raise ValueError("MAX_PARALLELISM must be a positive integer")

# One table name per line. Surrounding whitespace and line endings are removed
# and blank lines are skipped so they are not counted as tables. The context
# manager closes the file once it has been read.
with open(table_file, 'r') as tables_f:
    table_list = [line.strip() for line in tables_f if line.strip()]
table_count=len(table_list)
# Tables per batch job: ceil(table_count / MAX_PARALLELISM), in exact integer arithmetic.
split_count=(table_count+MAX_PARALLELISM-1)//MAX_PARALLELISM

#### Step 11:
#### Get Table list function

Function to split table list based on the number of jobs to run

In [None]:
import string
import random
def get_table_list(i):
    """Build the HIVETOBIGQUERY template arguments for one batch of tables.

    Reads the notebook globals ``table_list``, ``split_count``,
    ``INPUT_HIVE_DATABASE``, ``OUTPUT_BIGQUERY_DATASET`` and ``TEMP_BUCKET``.

    Args:
        i: Index in ``table_list`` of the first table of the batch; up to
            ``split_count`` consecutive tables starting there are included.

    Returns:
        A list of command-line argument strings. It contains the batch's
        comma-separated table names and a random 10-character
        ``--migration_id`` (lowercase letters and digits).
    """
    # Strip line endings / surrounding whitespace left over from reading the file.
    batch_tables = [name.strip() for name in table_list[i:i + split_count]]
    batch_tables_csv = ','.join(batch_tables)
    migration_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=10))
    return [
        "--template=HIVETOBIGQUERY",
        "--hive.bigquery.input.database={}".format(INPUT_HIVE_DATABASE),
        # Pass the comma-separated names, not the repr of the Python list.
        "--hive.bigquery.input.table={}".format(batch_tables_csv),
        "--hive.bigquery.output.dataset={}".format(OUTPUT_BIGQUERY_DATASET),
        "--hive.bigquery.output.mode=overwrite",
        "--hive.bigquery.temp.bucket.name={}".format(TEMP_BUCKET),
        "--hive.database.all.tables={}".format(batch_tables_csv),
        "--migration_id={}".format(migration_id),
    ]


#### Step 12:
#### Build pipeline and run Dataproc Template on Vertex AI Pipelines to migrate HIVE tables to BigQuery

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/dataproc.editor
 - roles/dataproc.worker

In [None]:
# Spark runtime properties for the migration batches: the Hive metastore to read
# from, plus a Hadoop committer setting.
# NOTE(review): marksuccessfuljobs=false presumably suppresses _SUCCESS marker
# files in job output - confirm.
runtime_prop = {
    'spark.hadoop.hive.metastore.uris': HIVE_METASTORE,
    'mapreduce.fileoutputcommitter.marksuccessfuljobs': "false",
}

import time
aiplatform.init(project=PROJECT_ID, staging_bucket=GCS_STAGING_LOCATION)

# Pipeline that splits the Hive tables across several DataprocPySparkBatchOp
# steps: one step per slice of `split_count` tables, each running
# MAIN_PYTHON_FILE with the arguments built by get_table_list().
# The `batch_id` and `args` parameters are kept for signature compatibility;
# every step gets its own generated batch ID and its own table arguments.
# Documentation is kept in comments rather than a docstring so the compiled
# pipeline metadata is not affected by it.
@dsl.pipeline(
    name="dataproc-templates-pyspark",
    description="DataprocPySparkBatchOp to run HiveToBigQuery PySpark Dataproc Template batch workload",
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    main_python_file_uri: str = MAIN_PYTHON_FILE,
    python_file_uris: list = PYTHON_FILE_URIS,
    jar_file_uris: list = JARS,
    subnetwork_uri: str = SUBNET,
    args: list = TEMPLATE_SPARK_ARGS,
):
    from google_cloud_pipeline_components.experimental.dataproc import \
        DataprocPySparkBatchOp

    # One timestamp for the whole run; the per-step index appended below makes
    # each batch ID unique without sleeping between steps.
    batch_id_prefix = "b-"+INPUT_HIVE_DATABASE+"-"+ datetime.now().strftime("%Y%m%d%H%M%S")

    # max(..., 1) keeps range() valid when there are no tables (split_count == 0);
    # in that case the range is empty and no step is added.
    for batch_index, start in enumerate(range(0, table_count, max(split_count, 1))):
        _ = DataprocPySparkBatchOp(
            project=project_id,
            location=location,
            batch_id="{}-{}".format(batch_id_prefix, batch_index),
            main_python_file_uri=main_python_file_uri,
            python_file_uris=python_file_uris,
            jar_file_uris=jar_file_uris,
            # runtime_prop is read from the notebook globals at compile time.
            runtime_config_properties=runtime_prop,
            subnetwork_uri=subnetwork_uri,
            # Template arguments for the tables in this slice.
            args=get_table_list(start)
        )

# Compile the migration pipeline into pipeline.json in the current working
# directory (same file name as the table-listing pipeline used).
compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

# NOTE: this rebinds the name `pipeline` from the pipeline function to the
# PipelineJob created from the compiled spec; caching is disabled for the job.
pipeline = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)  

# Submit the job that runs the Hive-to-BigQuery batches.
pipeline.run()