In [35]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#### References

- [DataprocPySparkBatchOp reference](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html)
- [Kubeflow SDK Overview](https://www.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/)
- [Dataproc Serverless in Vertex AI Pipelines tutorial](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_dataproc_serverless_pipeline_components.ipynb)
- [Build a Vertex AI Pipeline](https://cloud.google.com/vertex-ai/docs/pipelines/build-pipeline)

This notebook is built to run on a Vertex AI User-Managed Notebook using the default Compute Engine Service Account.  
Check the Dataproc Serverless in Vertex AI Pipelines tutorial linked above to learn how to set up a different Service Account.  

#### Permissions

Make sure that the service account used to run the notebook has the following roles:

- roles/aiplatform.serviceAgent
- roles/aiplatform.customCodeServiceAgent
- roles/storage.objectCreator
- roles/storage.objectViewer
- roles/dataproc.editor
- roles/dataproc.worker

#### Step 1:
#### Install the required packages

In [36]:
# Google Cloud notebooks requires dependencies to be installed with '--user'
! pip3 install pyspark
! pip3 install --upgrade google-cloud-pipeline-components kfp --user -q
! pip3 install pip install google-auth==2.13.0
! pip3 install google-cloud-bigquery-migration
# Install latest JDK
! sudo apt-get update
! sudo apt-get install default-jdk -y

Hit:1 http://security.debian.org/debian-security buster/updates InRelease
Hit:2 http://packages.cloud.google.com/apt cloud-sdk-buster InRelease
Hit:3 http://packages.cloud.google.com/apt google-cloud-packages-archive-keyring-buster InRelease
Hit:4 http://deb.debian.org/debian buster InRelease                            
Hit:5 http://deb.debian.org/debian buster-updates InRelease                    
Hit:6 http://deb.debian.org/debian buster-backports InRelease                  
Hit:7 http://packages.cloud.google.com/apt gcsfuse-buster InRelease            
Get:8 https://download.docker.com/linux/debian buster InRelease [54.0 kB]      
Hit:10 http://packages.cloud.google.com/apt google-compute-engine-buster-stable InRelease
Get:9 https://packages.cloud.google.com/apt kubernetes-xenial InRelease [8993 B]
Fetched 63.0 kB in 1s (46.1 kB/s)  
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
default-jdk is already th

#### Once you've installed the additional packages, you may need to restart the notebook kernel so it can find the packages.

Uncomment and run this cell if you installed anything with the commands above.

In [37]:
# import os

# if not os.getenv("IS_TESTING"):
#    import IPython
#    app = IPython.Application.instance()
#    app.kernel.do_shutdown(True)

#### Step 2:
#### Set Google Cloud properties

**Overview**  
This notebook shows how to build a Vertex AI Pipeline to run a Dataproc Template   
using the DataprocPySparkBatchOp component.

In [38]:
# User Configuration
# User Inputs

get_project_id = ! gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = get_project_id[0]
REGION = ""  # example "us-west1"
GCS_STAGING_LOCATION = "gs://<bucket_name>" # example "gs://my_bucket_name"
SUBNET = "" # example "projects/<project-id>/regions/<region-id>/subnetworks/<subnet-name>" 
INPUT_HIVE_DATABASE= ""
INPUT_HIVE_TABLES= "*" # example "table1,table2,table3..." or "*"
OUTPUT_BIGQUERY_DATASET= ""
TEMP_BUCKET= ""
HIVE_METASTORE= "" # example "thrift://hive-cluster-m:9083"

## Change if needed
HIVE_OUTPUT_MODE="overwrite"
MAX_PARALLELISM=10 # Controlls number of parallel Dataproc Serverless Jobs

#### Step 3:
#### Import dependencies

In [40]:
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler
from datetime import datetime
from google_cloud_pipeline_components.experimental.dataproc import DataprocPySparkBatchOp
import time
import os
from pyspark.sql import SparkSession
import pandas as pd
from utils.hive_notebook_utils import *

#### Step 4:
#### Change working directory to the Dataproc Templates python folder

In [41]:
WORKING_DIRECTORY = "/home/jupyter/dataproc-templates/python"
%cd /home/jupyter/dataproc-templates/python

/home/jupyter/dataproc-templates/python


#### Step 5:
#### Build Dataproc Templates python package

In [42]:
# Relative path of the egg file produced by the build command below.
PACKAGE_EGG_FILE = "dist/dataproc_templates_distribution.egg"
# Build the Dataproc Templates package as an egg; IPython substitutes
# $PACKAGE_EGG_FILE into the shell command before running it.
! python ./setup.py bdist_egg --output=$PACKAGE_EGG_FILE

running bdist_egg
running egg_info
writing google_dataproc_templates.egg-info/PKG-INFO
writing dependency_links to google_dataproc_templates.egg-info/dependency_links.txt
writing requirements to google_dataproc_templates.egg-info/requires.txt
writing top-level names to google_dataproc_templates.egg-info/top_level.txt
reading manifest file 'google_dataproc_templates.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'google_dataproc_templates.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/test
creating build/bdist.linux-x86_64/egg/test/gcs
copying build/lib/test/gcs/test_gcs_to_mongo.py -> build/bdist.linux-x86_64/egg/test/gcs
copying build/lib/test/gcs/test_gcs_to_gcs.py -> build/bdist.linux-x86_64/egg/test/gcs
copying build/lib/test/gcs/test_gcs_to_bigtable.py -> build/bdist.linux-x86_64/egg/test/gcs
copying bu

#### Step 6:
#### Copy package to the GCS bucket

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/storage.objectCreator
 - roles/storage.objectViewer

In [43]:
# Upload the template entry point to the root of the staging location.
!gsutil cp main.py $GCS_STAGING_LOCATION/
# Upload the built egg under dist/ in the staging location.
!gsutil cp $PACKAGE_EGG_FILE $GCS_STAGING_LOCATION/dist/

Copying file://main.py [Content-Type=text/x-python]...
/ [1 files][  4.9 KiB/  4.9 KiB]                                                
Operation completed over 1 objects/4.9 KiB.                                      
Copying file://dist/dataproc_templates_distribution.egg [Content-Type=application/octet-stream]...
/ [1 files][155.4 KiB/155.4 KiB]                                                
Operation completed over 1 objects/155.4 KiB.                                    


#### Step 7:
#### Get Hive Tables 
If you want to load all the Hive tables from the database, the table list must first be retrieved from the metastore.

The cell below fetches all tables from the Hive database by running a Spark SQL query against the provided Hive Metastore.

In [44]:
# If Spark cannot locate Java, set these BEFORE the session is created
# (presumably the JVM is started by get_spark_session - verify for your image):
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Spark session built from the Hive Metastore URI; a later cell reuses `spark`.
spark = get_spark_session(HIVE_METASTORE)

if INPUT_HIVE_TABLES.strip() == "*":
    # Wildcard: list every table of the database through Spark SQL.
    TABLE_LIST_DF = spark.sql("show tables in " + INPUT_HIVE_DATABASE)
    TABLE_LIST = [row["tableName"] for row in TABLE_LIST_DF.select("tableName").collect()]
else:
    # Explicit list: tolerate spaces around the commas ("t1, t2") and skip
    # empty entries left by stray or trailing commas.
    TABLE_LIST = [name.strip() for name in INPUT_HIVE_TABLES.split(",") if name.strip()]

print("Table Sets to Migrate: ")
print(TABLE_LIST)

Table Sets to Migrate: 
['student', 'student2', 'student3', 'student4', 'studentm']


#### Step 8:
#### Extract Hive DDLs 
The cell below fetches the DDLs of the selected tables in the given database from the Hive Metastore and stores them in the following GCS location:

gs://{TEMP_BUCKET}/hive_ddls/input/{INPUT_HIVE_DATABASE}

In [45]:
# Extract Hive DDLs from the Hive Metastore for every table in TABLE_LIST,
# reusing the Spark session created in the previous step.
# NOTE(review): the markdown above says the DDLs are written under
# gs://{TEMP_BUCKET}/hive_ddls/input/{INPUT_HIVE_DATABASE}; the helper itself
# is defined in utils.hive_notebook_utils and not visible here - confirm.
get_hive_ddls(INPUT_HIVE_DATABASE,TABLE_LIST,TEMP_BUCKET,spark)

partitiondb
student
CREATE TABLE partitiondb.student (
  student_name STRING,
  class_name STRING,
  percentage FLOAT)
PARTITIONED BY (section STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'serialization.format' = ',',
  'field.delim' = ',')
STORED AS
  INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
TBLPROPERTIES (
  'bucketing_version' = '2',
  'transient_lastDdlTime' = '1658332738')

partitiondb
student2
CREATE TABLE partitiondb.student2 (
  student_name STRING,
  class_name STRING,
  percentage FLOAT)
PARTITIONED BY (datenow DATE)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'serialization.format' = ',',
  'field.delim' = ',')
STORED AS
  INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
TBLPROPERTIES (
  'bucketing

#### Step 9:
#### Create BQ translation Migration Workflow 
The cell below uses the BQ translation API to convert all the DDLs from Hive to BQ syntax. The translation API is intelligent enough to identify the columns that should be used as partitioning or clustering columns in the BQ tables.

All the converted queries can be found at:

gs://{TEMP_BUCKET}/hive_ddls/output/{INPUT_HIVE_DATABASE}

In [46]:
# Translate the extracted Hive DDLs into BigQuery DDLs.
# Input and output both live under the hive_ddls/ prefix of TEMP_BUCKET,
# in one folder per Hive database.
GCS_INPUT_PATH = f"gs://{TEMP_BUCKET}/hive_ddls/input/{INPUT_HIVE_DATABASE}"
GCS_OUTPUT_PATH = f"gs://{TEMP_BUCKET}/hive_ddls/output/{INPUT_HIVE_DATABASE}"

create_migration_workflow(
    GCS_INPUT_PATH,
    GCS_OUTPUT_PATH,
    PROJECT_ID,
    OUTPUT_BIGQUERY_DATASET,
)


Created workflow: demo-workflow-python-example-Hive2BQ
Current state: RUNNING
Current state: RUNNING
Current state: RUNNING
Current state: COMPLETED


#### Step 10:
#### Create BQ tables: 
The cell below creates empty BQ tables in BigQuery with partitioning and clustering keys.

In [47]:
# Create the BQ tables for INPUT_HIVE_DATABASE.
# NOTE(review): presumably this runs the translated DDLs stored under
# gs://{TEMP_BUCKET}/hive_ddls/output/{INPUT_HIVE_DATABASE}; the helper is
# defined in utils.hive_notebook_utils and not visible here - confirm.
create_bq_tables(TEMP_BUCKET,INPUT_HIVE_DATABASE)

QueryJob<project=yadavaja-sandbox, location=US, id=8dbdb2ec-3668-43dd-880a-26aac94151ef>


#### Step 11:

Split Hive Tables list based on MAX_PARALLELISM value provided by the user.

In [48]:
# Split TABLE_LIST into consecutive table sets of at most MAX_PARALLELISM
# tables; each set is later migrated by one pipeline run.

# Fail fast on an invalid limit: zero or a negative value cannot produce a
# batch, and a negative value would otherwise never consume the list.
if MAX_PARALLELISM < 1:
    raise ValueError(
        "MAX_PARALLELISM must be a positive integer, got {!r}".format(MAX_PARALLELISM)
    )

# Shallow copy so TABLE_LIST itself is never modified by this cell.
COMPLETE_LIST = list(TABLE_LIST)

# Slice the list in steps of MAX_PARALLELISM; the last set may be shorter.
JOB_LIST = [
    COMPLETE_LIST[start:start + MAX_PARALLELISM]
    for start in range(0, len(COMPLETE_LIST), MAX_PARALLELISM)
]

# Number of table sets (counts the final, partially filled set as well).
PARALLEL_JOBS = len(JOB_LIST)

print("List of tables for execution : ")
print(JOB_LIST)

List of tables for execution : 
[['student', 'student2', 'student3', 'student4', 'studentm']]


#### Step 12:

Set Dataproc Template Properties

In [49]:
# Artifacts handed to each Dataproc batch: the Spark BigQuery connector jar,
# the template entry point and the packaged templates egg (the last two were
# uploaded to the staging location in an earlier step).
JARS = ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"]
MAIN_PYTHON_FILE = GCS_STAGING_LOCATION + "/main.py"
PYTHON_FILE_URIS = [
    GCS_STAGING_LOCATION + "/dist/dataproc_templates_distribution.egg",
]

# Root folder passed to the Vertex AI PipelineJob as pipeline_root.
PIPELINE_ROOT = GCS_STAGING_LOCATION + "/pipeline_root/dataproc_pyspark"

#### Step 13:
#### Build pipeline and run Dataproc Template on Vertex AI Pipelines to migrate Hive tables to BigQuery

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/dataproc.editor
 - roles/dataproc.worker

In [50]:
# Spark properties applied to every Dataproc Serverless batch submitted below.
runtime_prop = {}
runtime_prop['spark.hadoop.hive.metastore.uris'] = HIVE_METASTORE
runtime_prop['mapreduce.fileoutputcommitter.marksuccessfuljobs'] = "false"


def migrate_hive(EXECUTION_LIST):
    """Compile and run one Vertex AI pipeline that migrates a set of Hive tables.

    One DataprocPySparkBatchOp task is added to the pipeline for each table in
    EXECUTION_LIST, each running the HIVETOBIGQUERY template. The compiled
    pipeline spec is written to "pipeline.json" in the current working
    directory and then submitted with PipelineJob.run().

    Args:
        EXECUTION_LIST: Sequence of Hive table names to migrate in this run.

    Returns:
        None.

    Raises:
        Nothing is caught here: errors from pipeline compilation or from
        PipelineJob.run() propagate to the caller.
    """
    # NOTE(review): no `location` is passed, so the PipelineJob runs in the
    # Vertex AI SDK's default region rather than REGION - confirm intended.
    aiplatform.init(project=PROJECT_ID, staging_bucket=GCS_STAGING_LOCATION)

    # Epoch seconds used as the batch-id suffix. time.time() is portable,
    # unlike strftime("%s"), which is a platform-specific extension.
    run_epoch = int(time.time())

    @dsl.pipeline(
        name="hive-to-bq-pyspark",
        description="Pipeline to migrate tables from hive to bq",
    )
    def pipeline(
        project_id: str = PROJECT_ID,
        location: str = REGION,
        main_python_file_uri: str = MAIN_PYTHON_FILE,
        python_file_uris: list = PYTHON_FILE_URIS,
        jar_file_uris: list = JARS,
        subnetwork_uri: str = SUBNET
    ):
        # This body runs once, while the pipeline is being compiled, to declare
        # the tasks. Sleeping here would only slow compilation down, so the
        # suffix is made unique per task by adding the task index instead.
        for index, table in enumerate(EXECUTION_LIST):
            # Dataproc batch ids accept only lowercase letters, digits and
            # hyphens, hence the underscore replacement and lower-casing.
            BATCH_ID = "hive2bq-{}-{}".format(table, run_epoch + index).replace('_', '-').lower()
            TEMPLATE_SPARK_ARGS = [
                "--template=HIVETOBIGQUERY",
                "--hive.bigquery.input.database={}".format(INPUT_HIVE_DATABASE),
                "--hive.bigquery.input.table={}".format(table),
                "--hive.bigquery.output.table={}".format(table),
                "--hive.bigquery.output.dataset={}".format(OUTPUT_BIGQUERY_DATASET),
                "--hive.bigquery.output.mode={}".format(HIVE_OUTPUT_MODE),
                "--hive.bigquery.temp.bucket.name={}".format(TEMP_BUCKET),
            ]
            _ = DataprocPySparkBatchOp(
                project=project_id,
                location=location,
                batch_id=BATCH_ID,
                main_python_file_uri=main_python_file_uri,
                python_file_uris=python_file_uris,
                jar_file_uris=jar_file_uris,
                subnetwork_uri=subnetwork_uri,
                runtime_config_properties=runtime_prop,
                args=TEMPLATE_SPARK_ARGS,
            )

    compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

    # Separate name so the decorated pipeline function above is not shadowed.
    pipeline_job = aiplatform.PipelineJob(
        display_name="pipeline",
        template_path="pipeline.json",
        pipeline_root=PIPELINE_ROOT,
        enable_caching=False,
    )
    # run() blocks until the pipeline finishes (see the polling log output).
    pipeline_job.run()

#### Step 14:

#### If the user wants to verify DDLs first, uncomment the below line to stop execution of the final cell which loads the data to BQ.

In [51]:
# # don't go beyond here with Run All
# assert False

#### Step 15:

Run Dataproc Batch Template based on the Hive table sets calculated in Step 11.

The below cell will call function migrate_hive to migrate tables using dataproc serverless batch job and also add an entry in Audit Table for each Table Set.

In [52]:
# Run one pipeline per table set and record an audit row for each set.
AUDIT_COLUMNS = ["Source_DB_Name","Source_Table_Set","Target_DB_Name","Target_Table_Set","Job_Start_Time","Job_End_Time","Job_Status"]
AUDIT_DICT = {}
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
audit_records = []

for execution_list in JOB_LIST:
    print("\n\nLoading Table Set: "+str(execution_list))
    # Fresh dict per table set so no value can leak from a previous iteration.
    AUDIT_DICT = {
        "Source_DB_Name": INPUT_HIVE_DATABASE,
        "Source_Table_Set": '|'.join(execution_list),
        "Target_DB_Name": OUTPUT_BIGQUERY_DATASET,
        "Target_Table_Set": '|'.join(execution_list),
        "Job_Start_Time": str(datetime.now()),
    }
    try:
        migrate_hive(execution_list)
    except Exception as exc:
        # Best effort: a failed table set is recorded and the loop continues
        # with the next one, but the reason is shown instead of being dropped.
        AUDIT_DICT["Job_Status"]="FAIL"
        print("\n\nSome Error Occured while loading Table Set: "+str(execution_list))
        print("Reason: {!r}".format(exc))
    else:
        AUDIT_DICT["Job_Status"]="PASS"
        print("\n\nLoaded Table Set: "+str(execution_list))

    AUDIT_DICT["Job_End_Time"]=str(datetime.now())
    audit_records.append(AUDIT_DICT)

# `columns=` fixes the column order and yields an empty, correctly shaped
# frame when JOB_LIST is empty.
AUDIT_DF = pd.DataFrame(audit_records, columns=AUDIT_COLUMNS)

if AUDIT_DF.empty:
    print("Audit Dataframe is Empty")
else:
    print(AUDIT_DF)
    # The audit file is written without a header row or index column.
    AUDIT_DF.to_csv("gs://"+TEMP_BUCKET+"/audit/audit_file_{}.csv".format(str(datetime.now())),index=False,header = False)



Loading Table Set: ['student', 'student2', 'student3', 'student4', 'studentm']




Creating PipelineJob


INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob


PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952


To use this PipelineJob in another session:


INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:


pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952')


INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952')


View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/hive-to-bq-pyspark-20230129120952?project=617357862702


INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/hive-to-bq-pyspark-20230129120952?project=617357862702


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob run completed. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob run completed. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230129120952




Loaded Table Set: ['student', 'student2', 'student3', 'student4', 'studentm']
  Source_DB_Name                             Source_Table_Set Target_DB_Name  \
0    partitiondb  student|student2|student3|student4|studentm       hivedemo   

                              Target_Table_Set              Job_Start_Time  \
0  student|student2|student3|student4|studentm  2023-01-29 12:09:26.955648   

                 Job_End_Time Job_Status  
0  2023-01-29 12:14:55.097716       PASS  
