In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#### References

- [DataprocPySparkBatchOp reference](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html)
- [Kubeflow SDK Overview](https://www.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/)
- [Dataproc Serverless in Vertex AI Pipelines tutorial](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_dataproc_serverless_pipeline_components.ipynb)
- [Build a Vertex AI Pipeline](https://cloud.google.com/vertex-ai/docs/pipelines/build-pipeline)

This notebook is built to run a Vertex AI User-Managed Notebook using the default Compute Engine Service Account.  
Check the Dataproc Serverless in Vertex AI Pipelines tutorial linked above to learn how to setup a different Service Account.  

#### Permissions

Make sure that the service account used to run the notebook has the following roles:

- roles/aiplatform.serviceAgent
- roles/aiplatform.customCodeServiceAgent
- roles/storage.objectCreator
- roles/storage.objectViewer
- roles/dataproc.editor
- roles/dataproc.worker

#### Step 1:
#### Install the required packages

In [1]:
# Google Cloud notebooks requires dependencies to be installed with '--user'
! pip3 install pyspark
! pip3 install --upgrade google-cloud-pipeline-components kfp --user -q
! pip3 install pip install google-auth==2.13.0
! pip3 install --upgrade google-cloud-bigquery-migration

# Install latest JDK
! sudo apt-get update
! sudo apt-get install default-jdk -y

Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=44d8023b2e263a156398338ca1ed44e00a2cc8f3de9b8198a8e3cb2327b4497b
  Stored in directory: /home/jupyter/.cache/pip/wheels/06/51/98/f7a41aad64c08302d6c26c90650e713c3dfeb5cdec4946db00
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.4.0
Collecting install


#### Once you've installed the additional packages, you may need to restart the notebook kernel so it can find the packages.

Uncomment & Run this cell if you have installed anything from above commands

In [None]:
# import os

# if not os.getenv("IS_TESTING"):
#    import IPython
#    app = IPython.Application.instance()
#    app.kernel.do_shutdown(True)

#### Step 2:
#### Set Google Cloud properties

**Overview**  
This notebook shows how to build a Vertex AI Pipeline to run a Dataproc Template   
using the DataprocPySparkBatchOp component.

In [1]:
# User Configuration
# User Inputs

get_project_id = ! gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = get_project_id[0]
REGION = ""  # example "us-west1"
GCS_STAGING_LOCATION = "gs://<bucket_name>" # example "gs://my_bucket_name"
SUBNET = "" # example "projects/<project-id>/regions/<region-id>/subnetworks/<subnet-name>" 
INPUT_HIVE_DATABASE= ""
INPUT_HIVE_TABLES= "*" # example "table1,table2,table3..." or "*"
OUTPUT_BIGQUERY_DATASET= ""
TEMP_BUCKET= "<bucket_name>"
HIVE_METASTORE= "" # example "thrift://shubu-hive2bqnb-m:9083"
BQ_DATASET_REGION="us"

## Change if needed
HIVE_OUTPUT_MODE="overwrite"
MAX_PARALLELISM=10 # Controlls number of parallel Dataproc Serverless Jobs

#### Step 3:
#### Import dependencies

In [3]:
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler
from datetime import datetime
from google_cloud_pipeline_components.experimental.dataproc import DataprocPySparkBatchOp
import time
import os
from pyspark.sql import SparkSession
import pandas as pd
from pathlib import Path
import subprocess


#### Step 4:
#### Change working directory to the Dataproc Templates python folder

In [4]:
cur_path = Path(os.getcwd())
WORKING_DIRECTORY = os.path.join(cur_path.parent.parent ,'python')

# If the above code doesn't fetches the correct path please
# provide complete path to python folder in your dataproc 
# template repo which you cloned 

# WORKING_DIRECTORY = "/home/jupyter/dataproc-templates/python/"
print(WORKING_DIRECTORY)

/home/jupyter/dataproc-templates/python


In [5]:
%cd $WORKING_DIRECTORY

/home/jupyter/dataproc-templates/python


#### Step 5:
#### Build Dataproc Templates python package

In [6]:
PACKAGE_EGG_FILE = "dist/dataproc_templates_distribution.egg"
! python ./setup.py bdist_egg --output=$PACKAGE_EGG_FILE

running bdist_egg
running egg_info
writing google_dataproc_templates.egg-info/PKG-INFO
writing dependency_links to google_dataproc_templates.egg-info/dependency_links.txt
writing requirements to google_dataproc_templates.egg-info/requires.txt
writing top-level names to google_dataproc_templates.egg-info/top_level.txt
reading manifest file 'google_dataproc_templates.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'google_dataproc_templates.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/dataproc_templates
creating build/bdist.linux-x86_64/egg/dataproc_templates/jdbc
copying build/lib/dataproc_templates/jdbc/__init__.py -> build/bdist.linux-x86_64/egg/dataproc_templates/jdbc
copying build/lib/dataproc_templates/jdbc/jdbc_to_gcs.py -> build/bdist.linux-x86_64/egg/dataproc_templates/jdbc
copying build/lib/dataproc

#### Step 6:
#### Copy package to the GCS bucket

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/storage.objectCreator
 - roles/storage.objectViewer

In [7]:
!gsutil cp main.py $GCS_STAGING_LOCATION/
!gsutil cp $PACKAGE_EGG_FILE $GCS_STAGING_LOCATION/dist/

Copying file://main.py [Content-Type=text/x-python]...
/ [1 files][  6.0 KiB/  6.0 KiB]                                                
Operation completed over 1 objects/6.0 KiB.                                      
Copying file://dist/dataproc_templates_distribution.egg [Content-Type=application/octet-stream]...
/ [1 files][236.1 KiB/236.1 KiB]                                                
Operation completed over 1 objects/236.1 KiB.                                    


#### Step 7:
#### Get Hive Tables 
In case user wants to load all the Hive tables from the database, we need to get the table list using the metastore.

Below cell will fetch all tables from the Hive database by running a Spark SQL query using the provided Hive Metastore.

In [8]:
if INPUT_HIVE_TABLES=="*":
    #os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
    #os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
    spark=SparkSession.builder \
          .master("local")\
          .appName("Spark Job to get HIVE table list") \
          .config("hive.metastore.uris",HIVE_METASTORE) \
          .enableHiveSupport() \
          .getOrCreate()  
    TABLE_LIST_DF=spark.sql("show tables in "+INPUT_HIVE_DATABASE)
    TABLE_LIST=TABLE_LIST_DF.select("tableName").rdd.flatMap(lambda x: x).collect()
    print("Table Sets to Migrate: ")
    print(TABLE_LIST)
    spark.stop()
else:
    TABLE_LIST=INPUT_HIVE_TABLES.split(",")
    print("Table Sets to Migrate: ")
    print(TABLE_LIST)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/07 13:35:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Table Sets to Migrate: 
['avro_nonpartitioned', 'avro_partitioned', 'orc_nonpartitioned', 'orc_partitioned', 'parquet_nonpartitioned', 'parquet_partitioned']


#### -----  Skip steps 8-16 to create non partitioned tables in Bigquery -----


#### Step 8:
#### Get Required Variables for HIVEDDLEXTRACTOR

In [56]:
DDL_INPUT_PATH=GCS_STAGING_LOCATION+"/hiveddl/input"
DDL_OUTPUT_PATH=GCS_STAGING_LOCATION+"/hiveddl/output"
os.environ["GCP_PROJECT"]=get_project_id[0]
os.environ["REGION"]=REGION
os.environ["GCS_STAGING_LOCATION"]=GCS_STAGING_LOCATION
os.environ["SUBNET"]=SUBNET
os.environ["HIVE_METASTORE"]=HIVE_METASTORE
os.environ["INPUT_HIVE_DATABASE"]=INPUT_HIVE_DATABASE
os.environ["GCS_STAGING_PATH"]=DDL_INPUT_PATH


#### Step 9:
#### Run HIVEDDLEXTRACTOR to extract HIVE DDLs

We will be making use of HIVEDDLEXTRACTOR utility to connect to thrift server and extracting all the DDLs. 

In [10]:
!./bin/start.sh \
    --properties=spark.hadoop.hive.metastore.uris=$HIVE_METASTORE \
    -- --template=HIVEDDLEXTRACTOR \
    --hive.ddl.extractor.input.database=$INPUT_HIVE_DATABASE \
    --hive.ddl.extractor.output.path=$GCS_STAGING_PATH


GCP_PROJECT=yadavaja-sandbox
REGION=us-west1
GCS_STAGING_LOCATION=gs://test-shubu
running bdist_egg
running egg_info
writing google_dataproc_templates.egg-info/PKG-INFO
writing dependency_links to google_dataproc_templates.egg-info/dependency_links.txt
writing requirements to google_dataproc_templates.egg-info/requires.txt
writing top-level names to google_dataproc_templates.egg-info/top_level.txt
reading manifest file 'google_dataproc_templates.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'google_dataproc_templates.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/dataproc_templates
creating build/bdist.linux-x86_64/egg/dataproc_templates/jdbc
copying build/lib/dataproc_templates/jdbc/__init__.py -> build/bdist.linux-x86_64/egg/dataproc_templates/jdbc
copying build/lib/dataproc_templates/jdbc/jdbc_to_gcs.py 

#### Step 10:
#### Get the latest DDL GCS path

HIVEDDLEXTRACTOR utility will create seperate directory for each run based on the hive database name and run time.

In [11]:
result=subprocess.run(f"gsutil ls  {DDL_INPUT_PATH}/{INPUT_HIVE_DATABASE} | sed '$!d'", capture_output=True, shell=True, encoding="utf-8")
latest_ddl_path=str(result.stdout)[:-1]
print(latest_ddl_path)

gs://test-shubu/hiveddl/input/default/05-07-2023 13.37.34/


#### Step 12:
#### Copy Global Typeconvert file into the input folder

Refer Reamde file in the current folder for more information.


In [12]:
!gsutil cp ../notebooks/util/global_typeconvert.config.yaml "$latest_ddl_path"

Copying file://../notebooks/util/global_typeconvert.config.yaml [Content-Type=application/octet-stream]...
/ [1 files][   81.0 B/   81.0 B]                                                
Operation completed over 1 objects/81.0 B.                                       


#### Step 13:
#### Create object name mapping

Object name mapping will help us to replace HIVE database name with Bigquery Dataset in the final DDL

In [15]:
obj_name_mapping={
                    "name_map": [{
                      "source": {
                        "schema": INPUT_HIVE_DATABASE,
                                 },
                      "target": {
                        "schema": OUTPUT_BIGQUERY_DATASET,
                                 }
                                }, 
                                ]
                  }

#### Step 14:
#### Call create_migration_workflow to convert HIVE DDls to Bigquery syntax

Below cell will call create_migration_workflow function to call BQ Translation API and then wait for the status to be completed.

It will create the translated DDLs in "gs://bucket_name/hiveddl/output/bq_dataset"

In [16]:
import sys
#Import required util functions
sys.path.insert(0, '../notebooks/util')
from utils import *

#Set required vraiables to be passed to the create_migration_workflow
gcs_input_path=latest_ddl_path
gcs_output_path=DDL_OUTPUT_PATH+"/"+OUTPUT_BIGQUERY_DATASET
project_id=get_project_id[0]
bq_dataset=OUTPUT_BIGQUERY_DATASET
default_database=get_project_id[0]
source_dilect="hive"
bq_region=BQ_DATASET_REGION

# Call create_migration_workflow with the require parameters
workflow_name,workflow_state=create_migration_workflow(
    gcs_input_path, gcs_output_path, project_id, bq_dataset,
    default_database, source_dilect, bq_region,obj_name_mapping)

#Get Worklow status
while (str(workflow_state) == "State.RUNNING"):
    print("Running Migration Workflow")
    time.sleep(5)
    workflow_state=get_migration_workflow_status(workflow_name).state
print(str(workflow_state))

Created workflow:
workflow-python-hive2bq
Current state:
State.RUNNING
Running Migration Workflow
Running Migration Workflow
State.COMPLETED


#### Step 14:
#### Extract translated DDLs

Below cell will read all the translated Bigquery DDLs

In [17]:

result=subprocess.run(f"gsutil ls  {gcs_output_path} ", capture_output=True, shell=True, encoding="utf-8")
translated_files=result.stdout.split("\n")
all_ddls=""
for file in translated_files:
    if "/_SUCCESS" not in file and "batch_translation_report.csv" not in file and "consumed_name_map.json" not in file and len(file)>0:
        bucket_name=file.replace("gs://","").split("/",1)[0]
        file_path=file.replace("gs://","").split("/",1)[1]
        all_ddls=all_ddls+(get_gcs_file_as_string(bucket_name,file_path))

#### Step 15:
#### Run translated ddls and create BQ partitioned and clustered tables

Below cell will try to run each bigquery DDL one by one and save the status in DDL_FAIL_AUDIT_DF dataframe

In [47]:
DDL_FAIL_AUDIT_DICT={}
DDL_FAIL_AUDIT_DF = pd.DataFrame(columns=["Source_DB_Name","Source_Table_Set","Target_DB_Name","Target_Table_Set","Job_Start_Time","Job_End_Time","Job_Status"])
for ddl in all_ddls[:-1].split(";\n"):
    tblnm=ddl.split("CREATE TABLE")[1].split("\n")[0].split(".")[2]
    try:
        run_bq_query(ddl)
        print(f"Table Created in bigquery: {OUTPUT_BIGQUERY_DATASET}.{tblnm}")
    except Exception as e:
        print(f"Failed to create table: {OUTPUT_BIGQUERY_DATASET}.{tblnm}")
        print(e)
        DDL_FAIL_AUDIT_DICT["Source_DB_Name"]=INPUT_HIVE_DATABASE
        DDL_FAIL_AUDIT_DICT["Source_Table_Set"]=tblnm
        DDL_FAIL_AUDIT_DICT["Target_DB_Name"]=OUTPUT_BIGQUERY_DATASET
        DDL_FAIL_AUDIT_DICT["Target_Table_Set"]=tblnm
        DDL_FAIL_AUDIT_DICT["Job_Start_Time"]=str(datetime.now())
        DDL_FAIL_AUDIT_DICT["Job_Status"]=f"FAIL REASON: {e}"
        DDL_FAIL_AUDIT_DICT["Job_End_Time"]=str(datetime.now())
        DDL_FAIL_AUDIT_DF=DDL_FAIL_AUDIT_DF.append(DDL_FAIL_AUDIT_DICT, ignore_index = True)


Table Created in bigquery: hive2bq.avro_nonpartitioned
Failed to create table: hive2bq.avro_partitioned
409 Already Exists: Table yadavaja-sandbox:hive2bq.avro_partitioned

Location: US
Job ID: fe29a624-d581-4fc7-8287-cfc52872a956

Table Created in bigquery: hive2bq.orc_nonpartitioned
Failed to create table: hive2bq.orc_partitioned
409 Already Exists: Table yadavaja-sandbox:hive2bq.orc_partitioned

Location: US
Job ID: 409def9b-ff08-4c4e-b2a6-a38ab7e4408a

Table Created in bigquery: hive2bq.parquet_nonpartitioned
Failed to create table: hive2bq.parquet_partitioned
409 Already Exists: Table yadavaja-sandbox:hive2bq.parquet_partitioned

Location: US
Job ID: 2a35b828-6fdb-4662-9f70-d9ad62793c3b



#### Step 16:
#### Remove failed tables from the final TABLE_LIST


In [50]:
import copy
FAILED_DDL_TBLS=DDL_FAIL_AUDIT_DF[['Target_Table_Set']].values.ravel().tolist()
TABLE_LIST_COPY=copy.deepcopy(TABLE_LIST)
for element in TABLE_LIST_COPY:
    if element in FAILED_DDL_TBLS:
        TABLE_LIST.remove(element)



#### Step 17:

Split Hive Tables list based on MAX_PARALLELISM value provided by the user.

In [52]:
COMPLETE_LIST = copy.deepcopy(TABLE_LIST)
PARALLEL_JOBS = len(TABLE_LIST)//MAX_PARALLELISM
JOB_LIST = []
while len(COMPLETE_LIST) > 0:
    SUB_LIST = []
    for i in range(MAX_PARALLELISM):
        if len(COMPLETE_LIST)>0 :
            SUB_LIST.append(COMPLETE_LIST[0])
            COMPLETE_LIST.pop(0)
        else:
            break
    JOB_LIST.append(SUB_LIST)
print("List of tables for execution : ")
print(JOB_LIST)

List of tables for execution : 
[['avro_nonpartitioned', 'orc_nonpartitioned', 'parquet_nonpartitioned']]


#### Step 18:

Set Dataproc Template Properties

In [53]:
PIPELINE_ROOT = GCS_STAGING_LOCATION + "/pipeline_root/dataproc_pyspark"
MAIN_PYTHON_FILE = GCS_STAGING_LOCATION + "/main.py"
JARS = ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"]
PYTHON_FILE_URIS = [GCS_STAGING_LOCATION + "/dist/dataproc_templates_distribution.egg"]

#### Step 19:
#### Build pipeline and run Dataproc Template on Vertex AI Pipelines to migrate Hive tables to BigQuery

For this, make sure that the service account used to run the notebook has the following roles:
 - roles/dataproc.editor
 - roles/dataproc.worker

In [54]:
runtime_prop={}
runtime_prop['spark.hadoop.hive.metastore.uris']=HIVE_METASTORE
runtime_prop['mapreduce.fileoutputcommitter.marksuccessfuljobs'] = "false"

def migrate_hive(EXECUTION_LIST):
    EXECUTION_LIST = EXECUTION_LIST
    aiplatform.init(project=PROJECT_ID, staging_bucket=GCS_STAGING_LOCATION)

    @dsl.pipeline(
        name="hive-to-bq-pyspark",
        description="Pipeline to migrate tables from hive to bq",
    )
    def pipeline(
        project_id: str = PROJECT_ID,
        location: str = REGION,
        main_python_file_uri: str = MAIN_PYTHON_FILE,
        python_file_uris: list = PYTHON_FILE_URIS,
        jar_file_uris: list = JARS,
        subnetwork_uri: str = SUBNET
    ):
        for table in EXECUTION_LIST:
            BATCH_ID = "hive2bq-{}-{}".format(table,datetime.now().strftime("%s")).replace('_','-')
            TEMPLATE_SPARK_ARGS = [
                                    "--template=HIVETOBIGQUERY",
                                    "--hive.bigquery.input.database={}".format(INPUT_HIVE_DATABASE),
                                    "--hive.bigquery.input.table={}".format(table),
                                    "--hive.bigquery.output.table={}".format(table),
                                    "--hive.bigquery.output.dataset={}".format(OUTPUT_BIGQUERY_DATASET),
                                    "--hive.bigquery.output.mode={}".format(HIVE_OUTPUT_MODE),
                                    "--hive.bigquery.temp.bucket.name={}".format(TEMP_BUCKET)                                    ]
            _ = DataprocPySparkBatchOp(
                project=project_id,
                location=location,
                batch_id=BATCH_ID,
                main_python_file_uri=main_python_file_uri,
                python_file_uris=python_file_uris,
                jar_file_uris=jar_file_uris,
                subnetwork_uri=subnetwork_uri,
                runtime_config_properties=runtime_prop,
                runtime_config_version="1.1", # issue 665
                args=TEMPLATE_SPARK_ARGS,
            )
            time.sleep(5)

    compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

    pipeline = aiplatform.PipelineJob(
            display_name="pipeline",
            template_path="pipeline.json",
            pipeline_root=PIPELINE_ROOT,
            enable_caching=False,
            )
    pipeline.run()

#### Step 20:

Run Dataproc Batch Template based on Hive Tables list calculated in Step 8.

The below cell will call function migrate_hive to migrate tables using dataproc serverless batch job and also add an entry in Audit Table for each Table Set.

In [55]:
AUDIT_DICT={}
AUDIT_DF = pd.DataFrame(columns=["Source_DB_Name","Source_Table_Set","Target_DB_Name","Target_Table_Set","Job_Start_Time","Job_End_Time","Job_Status"])
 
for execution_list in JOB_LIST:
    print("\n\nLoading Table Set: "+str(execution_list))
    AUDIT_DICT["Source_DB_Name"]=INPUT_HIVE_DATABASE
    AUDIT_DICT["Source_Table_Set"]='|'.join(execution_list)
    AUDIT_DICT["Target_DB_Name"]=OUTPUT_BIGQUERY_DATASET
    AUDIT_DICT["Target_Table_Set"]='|'.join(execution_list)
    AUDIT_DICT["Job_Start_Time"]=str(datetime.now())
    try:
        migrate_hive(execution_list)
    except Exception:
        AUDIT_DICT["Job_Status"]="FAIL"
        print("\n\nSome Error Occured while loading Table Set: "+str(execution_list))
    else:
        AUDIT_DICT["Job_Status"]="PASS"
        print("\n\nLoaded Table Set: "+str(execution_list))

    AUDIT_DICT["Job_End_Time"]=str(datetime.now())
    AUDIT_DF=AUDIT_DF.append(AUDIT_DICT, ignore_index = True)

AUDIT_DF_COMBINED = pd.concat([AUDIT_DF, DDL_FAIL_AUDIT_DF], axis=0)

if AUDIT_DF_COMBINED.empty:
    print("Audit Dataframe is Empty")
else:
    print(AUDIT_DF_COMBINED)
    AUDIT_DF_COMBINED.to_csv("gs://"+TEMP_BUCKET+"/audit/audit_file_{}.csv".format(str(datetime.now())),index=False,header = False)



Loading Table Set: ['avro_nonpartitioned', 'orc_nonpartitioned', 'parquet_nonpartitioned']




Creating PipelineJob


INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob


PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738


To use this PipelineJob in another session:


INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:


pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738')


INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738')


View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/hive-to-bq-pyspark-20230507135738?project=617357862702


INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/hive-to-bq-pyspark-20230507135738?project=617357862702


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738 current state:
PipelineState.PIPELINE_STATE_RUNNING


PipelineJob run completed. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob run completed. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/hive-to-bq-pyspark-20230507135738




Loaded Table Set: ['avro_nonpartitioned', 'orc_nonpartitioned', 'parquet_nonpartitioned']
  Source_DB_Name                                   Source_Table_Set  \
0        default  avro_nonpartitioned|orc_nonpartitioned|parquet...   
0        default                                   avro_partitioned   
1        default                                    orc_partitioned   
2        default                                parquet_partitioned   

  Target_DB_Name                                   Target_Table_Set  \
0        hive2bq  avro_nonpartitioned|orc_nonpartitioned|parquet...   
0        hive2bq                                   avro_partitioned   
1        hive2bq                                    orc_partitioned   
2        hive2bq                                parquet_partitioned   

               Job_Start_Time                Job_End_Time  \
0  2023-05-07 13:57:23.388685  2023-05-07 14:04:19.979226   
0  2023-05-07 13:56:18.495603  2023-05-07 13:56:18.495638   
1  2023-05-07