# MSSQL to POSTGRES Migration

In [31]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#### References
- [DataprocPySparkBatchOp reference](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.0/google_cloud_pipeline_components.experimental.dataproc.html)
- [Kubeflow SDK Overview](https://www.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/)
- [Dataproc Serverless in Vertex AI Pipelines tutorial](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_dataproc_serverless_pipeline_components.ipynb)
- [Build a Vertex AI Pipeline](https://cloud.google.com/vertex-ai/docs/pipelines/build-pipeline)

This notebook is built to run a Vertex AI User-Managed Notebook using the default Compute Engine Service Account.
Check the Dataproc Serverless in Vertex AI Pipelines tutorial linked above to learn how to setup a different Service Account.
#### Permissions
Make sure that the service account used to run the notebook has the following roles:
- roles/aiplatform.serviceAgent
- roles/aiplatform.customCodeServiceAgent
- roles/storage.objectCreator
- roles/storage.objectViewer
- roles/dataproc.editor
- roles/dataproc.worker

## Step 1: Install Libraries
#### Run Step 1 one time for each new notebook instance"

In [33]:
!pip3 install ipywidgets===8.0.0
!pip3 install pymssql SQLAlchemy
!pip3 install --upgrade google-cloud-pipeline-components kfp --user -q



In [None]:
# waititng some time for kernel to restart
import os
import IPython
if not os.getenv("IS_TESTING"):
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## Step 2: Import Libraries

In [3]:
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler
from datetime import datetime
import time
import copy
import json
import pandas as pd
from google_cloud_pipeline_components.experimental.dataproc import DataprocPySparkBatchOp
import ipywidgets as widgets
import sqlalchemy
import pymssql

In [4]:
# define widget style for the notebook
style = {'description_width': 'initial','width':'400px'}

## Step 3: Assign Parameters

### Step 3.1 Common Parameters
##### PROJECT : GCP project-id
##### REGION : GCP region
##### GCS_STAGING_LOCATION : GCS staging location to be used for this notebook to store artifacts
##### SUBNET : VPC subnet
##### JARS : list of jars. For this notebook mssql connectora and postgres connectorjar is required in addition with the dataproc template 
##### MAX_PARALLELISM : Parameter for number of jobs to run in parallel default value is 2

In [5]:
# Get GCP Project

PROJECT=widgets.Text(value="yadavaja-sandbox",description="PROJECT",placeholder="<project_id>",style=style)
display(PROJECT)

REGION=widgets.Text(value="us-west1",description="REGION",placeholder="eg. us-central1",style=style, width="auto")
display(REGION)

GCS_STAGING_LOCATION=widgets.Text(value="gs://test-styagi",placeholder="gs://<bucket_name>",description="GCS STAGING LOCATION",style=style)
display(GCS_STAGING_LOCATION)

SUBNET=widgets.Text(value="projects/yadavaja-sandbox/regions/us-west1/subnetworks/test-subnet1",placeholder="projects/<project-id>/regions/<region-id>/subnetworks/<subnet-name>",description="SUBNET",style=style)
display(SUBNET)

MAX_PARALLELISM=widgets.IntText(value="2",description="MAX PARALLELISM",style=style)
display(MAX_PARALLELISM)


Text(value='yadavaja-sandbox', description='PROJECT', placeholder='<project_id>', style=TextStyle(description_…

Text(value='us-west1', description='REGION', placeholder='eg. us-central1', style=TextStyle(description_width=…

Text(value='gs://test-styagi', description='GCS STAGING LOCATION', placeholder='gs://<bucket_name>', style=Tex…

Text(value='projects/yadavaja-sandbox/regions/us-west1/subnetworks/test-subnet1', description='SUBNET', placeh…

IntText(value=2, description='MAX PARALLELISM', style=DescriptionStyle(description_width='initial'))

### Step 3.2 MSSQL Parameters
#### MSSQL_HOST : MSSQL instance ip address
#### MSSQL_PORT : MSSQL instance port
#### MSSQL_USERNAME : MSSQL username
#### MSSQL_PASSWORD : MSSQL password
#### MSSQL_DATABASE : name of database that you want to migrate
#### MSSQLTABLE_LIST : list of tables you want to migrate eg: ['table1','table2'] else provide an empty list for migration whole database eg : []

In [32]:

display(widgets.Label("SQL Server Host"))
MSSQL_HOST=widgets.Text(value="10.203.210.5",style=style)
display(MSSQL_HOST)

display(widgets.Label("SQL Server Port"))
MSSQL_PORT=widgets.Text(value="1433",style=style)
display(MSSQL_PORT)

display(widgets.Label("SQL Server Username"))
MSSQL_USERNAME=widgets.Text(value="sqlserver",style=style)
display(MSSQL_USERNAME)

display(widgets.Label("SQL Server Password"))
MSSQL_PASSWORD=widgets.Text(value="password123",style=style)
display(MSSQL_PASSWORD)

display(widgets.Label("SQL Server Database"))
MSSQL_DATABASE=widgets.Text(value="AdvDB",style=style)
display(MSSQL_DATABASE)

display(widgets.Label("SQL Server Input Table List"))
MSSQLTABLE_LIST=widgets.Text(placeholder="<table1>,<table2>,...",style=style)
display(MSSQLTABLE_LIST)

display(widgets.Label("SQL Server Input Partition Column"))
JDBCTOJDBC_INTPUT_PARTITIONCOLUMN=widgets.Text(value="id",style=style)
display(JDBCTOJDBC_INTPUT_PARTITIONCOLUMN)

display(widgets.Label("SQL Server Input Lowerbound"))
JDBCTOJDBC_INTPUT_LOWERBOUND=widgets.Text(value="11",style=style)
display(JDBCTOJDBC_INTPUT_LOWERBOUND)

display(widgets.Label("SQL Server Input Upperbound"))
JDBCTOJDBC_INTPUT_UPPERBOUND=widgets.Text(value="20",style=style)
display(JDBCTOJDBC_INTPUT_UPPERBOUND)

display(widgets.Label("No. Of Partitions"))
JDBCTOJDBC_NUMOFPARTITIONS=widgets.Text(value="4",style=style)
display(JDBCTOJDBC_NUMOFPARTITIONS)




Label(value='SQL Server Host')

Text(value='10.203.210.5', style=TextStyle(description_width='initial'))

Label(value='SQL Server Port')

Text(value='1433', style=TextStyle(description_width='initial'))

Label(value='SQL Server Username')

Text(value='sqlserver', style=TextStyle(description_width='initial'))

Label(value='SQL Server Password')

Text(value='password123', style=TextStyle(description_width='initial'))

Label(value='SQL Server Database')

Text(value='AdvDB', style=TextStyle(description_width='initial'))

Label(value='SQL Server Input Table List')

Text(value='', placeholder='<table1>,<table2>,...', style=TextStyle(description_width='initial'))

Label(value='SQL Server Input Partition Column')

Text(value='id', style=TextStyle(description_width='initial'))

Label(value='SQL Server Input Lowerbound')

Text(value='11', style=TextStyle(description_width='initial'))

Label(value='SQL Server Input Upperbound')

Text(value='20', style=TextStyle(description_width='initial'))

Label(value='No. Of Partitions')

Text(value='4', style=TextStyle(description_width='initial'))

### Step 3.3 POSTGRES Parameters
#### POSTGRES_HOST : MSSQL instance ip address
#### POSTGRES_PORT : MSSQL instance port
#### POSTGRES_USERNAME : MSSQL username
#### POSTGRES_PASSWORD : MSSQL password
#### POSTGRES_DATABASE : name of database that you want to migrate


In [9]:
display(widgets.Label("PPOSTGRES Server Host"))
POSTGRES_HOST=widgets.Text(value="10.203.211.3",style=style)
display(POSTGRES_HOST)

display(widgets.Label("POSTGRES Server Port"))
POSTGRES_PORT=widgets.Text(value="5432",style=style)
display(POSTGRES_PORT)

display(widgets.Label("POSTGRES Server Username"))
POSTGRES_USERNAME=widgets.Text(value="postgres",style=style)
display(POSTGRES_USERNAME)

display(widgets.Label("POSTGRES Server Password"))
POSTGRES_PASSWORD=widgets.Text(value="password123",style=style)
display(POSTGRES_PASSWORD)

display(widgets.Label("POSTGRES Server Database"))
POSTGRES_DATABASE=widgets.Text(value="AdvDB",style=style)
display(POSTGRES_DATABASE)

display(widgets.Label("POSTGRES Output Table"))
JDBCTOJDBC_OUTPUT_TABLE=widgets.Text(value="employees_out_st",style=style)
display(JDBCTOJDBC_OUTPUT_TABLE)

display(widgets.Label("POSTGRES Output Mode"))
JDBCTOJDBC_OUTPUT_MODE=widgets.Text(value="overwrite",style=style)
display(JDBCTOJDBC_OUTPUT_MODE)

display(widgets.Label("POSTGRES Output Batch Size"))
JDBCTOJDBC_OUTPUT_BATCH_SIZE=widgets.Text(value="1000",style=style)
display(JDBCTOJDBC_OUTPUT_BATCH_SIZE)

Label(value='PPOSTGRES Server Host')

Text(value='10.203.211.3', style=TextStyle(description_width='initial'))

Label(value='POSTGRES Server Port')

Text(value='5432', style=TextStyle(description_width='initial'))

Label(value='POSTGRES Server Username')

Text(value='postgres', style=TextStyle(description_width='initial'))

Label(value='POSTGRES Server Password')

Text(value='password123', style=TextStyle(description_width='initial'))

Label(value='POSTGRES Server Database')

Text(value='AdvDB', style=TextStyle(description_width='initial'))

Label(value='POSTGRES Output Table')

Text(value='employees_out_st', style=TextStyle(description_width='initial'))

Label(value='POSTGRES Output Mode')

Text(value='overwrite', style=TextStyle(description_width='initial'))

Label(value='POSTGRES Output Batch Size')

Text(value='1000', style=TextStyle(description_width='initial'))

### Step 3.4 Notebook Configuration Parameters
#### Below variables shoulld not be changed unless required
#### In case required:

* Change disabled=False in the respective widget.Text arguments 
    * eg. PYMSSQL_DRIVER = widgets.Text(value="mssql+pymssql",description="Python MSSQL Driver",style=style,disabled=False)
* Fill in the value in the textbox

In [None]:
display(widgets.Label("Python MSSQL Driver"))
PYMSSQL_DRIVER = widgets.Text(value="mssql+pymssql",style=style,disabled=True)
display(PYMSSQL_DRIVER)

display(widgets.Label("JDBC MSSQL Driver"))
JDBC_INPUT_DRIVER = widgets.Text(value="com.microsoft.sqlserver.jdbc.SQLServerDriver",style=style,disabled=True)
display(JDBC_INPUT_DRIVER)

display(widgets.Label("JDBC MSSQL Url"))
JDBC_INPUT_URL = widgets.Text(value="jdbc:sqlserver://{0}:{1};databaseName={2};user={3};password={4}".format(MSSQL_HOST.value,MSSQL_PORT.value,MSSQL_DATABASE.value,MSSQL_USERNAME.value,MSSQL_PASSWORD.value),style=style,disabled=True)
display(JDBC_INPUT_URL)

display(widgets.Label("Dataproc Main Class"))
MAIN_CLASS = widgets.Text(value="com.google.cloud.dataproc.templates.main.DataProcTemplate",style=style,disabled=True)
display(MAIN_CLASS)

display(widgets.Label("Working Directory"))
WORKING_DIRECTORY = widgets.Text(value="/home/jupyter/dataproc-templates/python/",style=style,disabled=True)
display(WORKING_DIRECTORY)

display(widgets.Label("JDBC POSTGRES Driver"))
JDBC_OUTPUT_DRIVER = widgets.Text(value="org.postgresql.Driver",style=style,disabled=True)
display(JDBC_OUTPUT_DRIVER)

display(widgets.Label("JDBC POSTGRES Url"))
JDBC_OUTPUT_URL = widgets.Text(value="jdbc:postgresql://{0}:{1}/{2}?user={3}&password={4}".format(POSTGRES_HOST.value,POSTGRES_PORT.value,POSTGRES_DATABASE.value,POSTGRES_USERNAME.value,POSTGRES_PASSWORD.value),style=style,disabled=True)
display(JDBC_OUTPUT_URL)

display(widgets.Label("Dataproc Jar Name"))
JAR_FILE = widgets.Text(value="dataproc-templates-1.0-SNAPSHOT.jar",style=style,disabled=True)
display(JAR_FILE)

display(widgets.Label("Dataproc Package Egg File"))
PACKAGE_EGG_FILE = widgets.Text(value = "dist/dataproc_templates_distribution.egg",style=style,disabled=True)
display(PACKAGE_EGG_FILE)

PIPELINE_ROOT = GCS_STAGING_LOCATION.value + "/pipeline_root/dataproc_pyspark"
MAIN_PYTHON_FILE = GCS_STAGING_LOCATION.value + "/main.py"
PYTHON_FILE_URIS = [GCS_STAGING_LOCATION.value + "/dist/dataproc_templates_distribution.egg"]

# Do not change this parameter unless you want to refer below JARS from new location
JARS = [GCS_STAGING_LOCATION.value + "/jars/mssql-jdbc-6.4.0.jre8.jar", GCS_STAGING_LOCATION.value + "/jars/postgresql-42.2.6.jar", GCS_STAGING_LOCATION.value + "/" + JAR_FILE.value]

Label(value='Python MSSQL Driver')

Text(value='mssql+pymssql', disabled=True, style=TextStyle(description_width='initial'))

Label(value='JDBC MSSQL Driver')

Text(value='com.microsoft.sqlserver.jdbc.SQLServerDriver', disabled=True, style=TextStyle(description_width='i…

Label(value='JDBC MSSQL Url')

Text(value='jdbc:sqlserver://10.203.210.5:1433;databaseName=AdvDB;user=sqlserver;password=password123', disabl…

Label(value='Dataproc Main Class')

Text(value='com.google.cloud.dataproc.templates.main.DataProcTemplate', disabled=True, style=TextStyle(descrip…

Label(value='Working Directory')

Text(value='/home/jupyter/dataproc-templates/python/', disabled=True, style=TextStyle(description_width='initi…

Label(value='JDBC POSTGRES Driver')

Text(value='org.postgresql.Driver', disabled=True, style=TextStyle(description_width='initial'))

Label(value='JDBC POSTGRES Url')

Text(value='jdbc:postgresql://10.203.211.3:5432/postadvdb?user=postgres&password=password123', disabled=True, …

Label(value='Dataproc Jar Name')

Text(value='dataproc-templates-1.0-SNAPSHOT.jar', disabled=True, style=TextStyle(description_width='initial'))

Label(value='Dataproc Package Egg File')

Text(value='dist/dataproc_templates_distribution.egg', disabled=True, style=TextStyle(description_width='initi…

## Step 4: Generate MSSQL Table List
This step creates list of tables for migration. If MSSQLTABLE_LIST is kept empty all the tables in the MSSQL_DATABASE are listed for migration otherwise the provided list is used

In [168]:
SQLTABLE_LIST=[]
SQLTABLE_LIST=MSSQLTABLE_LIST.value.strip().split(",")
if len(SQLTABLE_LIST) == 0 or SQLTABLE_LIST[0]=='':
    SQLTABLE_LIST.pop()
    DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMSSQL_DRIVER.value,
                username=MSSQL_USERNAME.value,
                password=MSSQL_PASSWORD.value,
                database=MSSQL_DATABASE.value,
                host=MSSQL_HOST.value,
                port=MSSQL_PORT.value
              )
            )
    with DB.connect() as conn:
        print("connected to database")
        results = DB.execute('select TABLE_SCHEMA,TABLE_NAME from INFORMATION_SCHEMA.Tables').fetchall()
        print("Total Tables = ", len(results))
        for row in results:
            SQLTABLE_LIST.append(row[0]+"."+row[1])

print("list of tables for migration :")
print(SQLTABLE_LIST)

connected to database
Total Tables =  91
list of tables for migration :
['HumanResources.EmployeePayHistory', 'Sales.SalesOrderHeaderSalesReason', 'Sales.SalesPerson', 'Production.Illustration', 'HumanResources.JobCandidate', 'Production.Location', 'Person.Password', 'Sales.SalesPersonQuotaHistory', 'Person.Person', 'Sales.SalesReason', 'Sales.SalesTaxRate', 'Sales.PersonCreditCard', 'Person.vAdditionalContactInfo', 'Person.PersonPhone', 'HumanResources.vEmployee', 'Sales.SalesTerritory', 'HumanResources.vEmployeeDepartment', 'Person.PhoneNumberType', 'HumanResources.vEmployeeDepartmentHistory', 'Sales.vIndividualCustomer', 'Production.Product', 'Sales.vPersonDemographics', 'HumanResources.vJobCandidate', 'HumanResources.vJobCandidateEmployment', 'HumanResources.vJobCandidateEducation', 'Production.vProductAndDescription', 'Production.vProductModelCatalogDescription', 'Production.vProductModelInstructions', 'Sales.vSalesPerson', 'Sales.SalesTerritoryHistory', 'Sales.vSalesPersonSalesBy

## Step 5: Get Primary Keys for partition the tables
This step fetches primary key from MSSQL_DATABASE for the tables listed for migration

In [169]:
SQL_TABLE_PRIMARY_KEYS = {}
DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMSSQL_DRIVER.value,
                username=MSSQL_USERNAME.value,
                password=MSSQL_PASSWORD.value,
                database=MSSQL_DATABASE.value,
                host=MSSQL_HOST.value,
                port=MSSQL_PORT.value
              )
            )
with DB.connect() as conn:
    for table in SQLTABLE_LIST:
        primary_keys = []
        results = DB.execute("SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS T JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE K ON K.CONSTRAINT_NAME=T.CONSTRAINT_NAME  WHERE  K.TABLE_NAME='{0}'  AND K.TABLE_SCHEMA='{1}' AND T.CONSTRAINT_TYPE='PRIMARY KEY';".format(table.split(".")[1],table.split(".")[0])).fetchall()
        for row in results:
            primary_keys.append(row[0])
        if primary_keys:
            SQL_TABLE_PRIMARY_KEYS[table] = ",".join(primary_keys)
        else:
            SQL_TABLE_PRIMARY_KEYS[table] = ""
            

In [170]:
pkDF = pd.DataFrame({"table" : SQLTABLE_LIST, "primary_keys": list(SQL_TABLE_PRIMARY_KEYS.values())})
print("Below are identified primary keys for migrating mssql table to postgres:")
pkDF

Below are identified primary keys for migrating mssql table to postgres:


Unnamed: 0,table,primary_keys
0,HumanResources.EmployeePayHistory,"BusinessEntityID,RateChangeDate"
1,Sales.SalesOrderHeaderSalesReason,"SalesOrderID,SalesReasonID"
2,Sales.SalesPerson,BusinessEntityID
3,Production.Illustration,IllustrationID
4,HumanResources.JobCandidate,JobCandidateID
...,...,...
86,Sales.SalesOrderDetail,"SalesOrderDetailID,SalesOrderID"
87,Person.EmailAddress,"BusinessEntityID,EmailAddressID"
88,HumanResources.Employee,BusinessEntityID
89,Sales.SalesOrderHeader,SalesOrderID


## Step 6: Create JAR files and Upload to GCS
#### Run Step 6 one time for each new notebook instance

In [171]:
%cd $WORKING_DIRECTORY.value

/home/jupyter/dataproc-templates/python


#### Get JDBC Connector jars

In [172]:
%%bash
wget https://jdbc.postgresql.org/download/postgresql-42.2.6.jar
wget https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/6.4.0.jre8/mssql-jdbc-6.4.0.jre8.jar

--2022-09-22 13:16:01--  https://jdbc.postgresql.org/download/postgresql-42.2.6.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 842825 (823K) [application/java-archive]
Saving to: ‘postgresql-42.2.6.jar.5’

     0K .......... .......... .......... .......... ..........  6% 1.28M 1s
    50K .......... .......... .......... .......... .......... 12% 1.30M 1s
   100K .......... .......... .......... .......... .......... 18%  118M 0s
   150K .......... .......... .......... .......... .......... 24% 1.29M 0s
   200K .......... .......... .......... .......... .......... 30%  187M 0s
   250K .......... .......... .......... .......... .......... 36% 56.6M 0s
   300K .......... .......... .......... .......... .......... 42% 1.35M 0s
   350K .......... .......... .......... .......... .......... 48% 94.

#### Build Dataproc Templates python package

In [173]:
! python ./setup.py bdist_egg --output=$PACKAGE_EGG_FILE.value

running bdist_egg
running egg_info
writing dataproc_templates.egg-info/PKG-INFO
writing dependency_links to dataproc_templates.egg-info/dependency_links.txt
writing requirements to dataproc_templates.egg-info/requires.txt
writing top-level names to dataproc_templates.egg-info/top_level.txt
reading manifest file 'dataproc_templates.egg-info/SOURCES.txt'
writing manifest file 'dataproc_templates.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/dataproc_templates
copying build/lib/dataproc_templates/__init__.py -> build/bdist.linux-x86_64/egg/dataproc_templates
copying build/lib/dataproc_templates/template_name.py -> build/bdist.linux-x86_64/egg/dataproc_templates
creating build/bdist.linux-x86_64/egg/dataproc_templates/mongo
copying build/lib/dataproc_templates/mongo/__init__.py -> build/bdist.linux-x86_64/egg/dataproc_templates/mongo
copying build

#### Copying JAR files to GCS_STAGING_LOCATION

In [174]:
! gsutil cp main.py $GCS_STAGING_LOCATION.value/
! gsutil cp -r $PACKAGE_EGG_FILE.value $GCS_STAGING_LOCATION.value/
! gsutil cp mssql-jdbc-6.4.0.jre8.jar $GCS_STAGING_LOCATION.value/jars/mssql-jdbc-6.4.0.jre8.jar
! gsutil cp postgresql-42.2.6.jar $GCS_STAGING_LOCATION.value/jars/postgresql-42.2.6.jar


Copying file://main.py [Content-Type=text/x-python]...
/ [1 files][  4.2 KiB/  4.2 KiB]                                                
Operation completed over 1 objects/4.2 KiB.                                      
Copying file://dist/dataproc_templates_distribution.egg [Content-Type=application/octet-stream]...
/ [1 files][114.0 KiB/114.0 KiB]                                                
Operation completed over 1 objects/114.0 KiB.                                    
Copying file://mssql-jdbc-6.4.0.jre8.jar [Content-Type=application/java-archive]...
/ [1 files][884.7 KiB/884.7 KiB]                                                
Operation completed over 1 objects/884.7 KiB.                                    
Copying file://postgresql-42.2.6.jar [Content-Type=application/java-archive]...
/ [1 files][823.1 KiB/823.1 KiB]                                                
Operation completed over 1 objects/823.1 KiB.                                    


## Step 7: Calculate Parallel Jobs for MSSQL to POSTGRES
This step uses MAX_PARALLELISM parameter to calculate number of parallel jobs to run

In [175]:
COMPLETE_LIST = copy.deepcopy(SQLTABLE_LIST)
PARALLEL_JOBS = len(SQLTABLE_LIST)//MAX_PARALLELISM.value
JOB_LIST = []
while len(COMPLETE_LIST) > 0:
    SUB_LIST = []
    for i in range(MAX_PARALLELISM.value):
        if len(COMPLETE_LIST)>0 :
            SUB_LIST.append(COMPLETE_LIST[0])
            COMPLETE_LIST.pop(0)
        else:
            break
    JOB_LIST.append(SUB_LIST)
print("list of tables for execution : ")
print(JOB_LIST)


list of tables for execution : 
[['HumanResources.EmployeePayHistory', 'Sales.SalesOrderHeaderSalesReason', 'Sales.SalesPerson', 'Production.Illustration', 'HumanResources.JobCandidate', 'Production.Location', 'Person.Password', 'Sales.SalesPersonQuotaHistory', 'Person.Person', 'Sales.SalesReason'], ['Sales.SalesTaxRate', 'Sales.PersonCreditCard', 'Person.vAdditionalContactInfo', 'Person.PersonPhone', 'HumanResources.vEmployee', 'Sales.SalesTerritory', 'HumanResources.vEmployeeDepartment', 'Person.PhoneNumberType', 'HumanResources.vEmployeeDepartmentHistory', 'Sales.vIndividualCustomer'], ['Production.Product', 'Sales.vPersonDemographics', 'HumanResources.vJobCandidate', 'HumanResources.vJobCandidateEmployment', 'HumanResources.vJobCandidateEducation', 'Production.vProductAndDescription', 'Production.vProductModelCatalogDescription', 'Production.vProductModelInstructions', 'Sales.vSalesPerson', 'Sales.SalesTerritoryHistory'], ['Sales.vSalesPersonSalesByFiscalYears', 'Person.vStateProvi

## Step 8: Get Row Count of Tables and identify Partition Columns 
#### This step uses PARTITION_THRESHOLD parameter and any table having rows greater than PARTITION_THRESHOLD will be partitioned based on Primary Keys
#### Get Primary keys for all tables to be migrated and find an integer column to partition on

In [176]:
display(widgets.Label("Maximum Row Count Threshold for a Table"))
PARTITION_THRESHOLD = widgets.Text(value="10000",style=style)
display(PARTITION_THRESHOLD)

CHECK_PARTITION_COLUMN_LIST={}
mssql_to_postgres_jobs = []

Label(value='Maximum Row Count Threshold for a Table')

Text(value='10000', style=TextStyle(description_width='initial'))

In [177]:
with DB.connect() as conn:
    for table in SQLTABLE_LIST:
        results = DB.execute("SELECT count(1) FROM {}".format(table)).fetchall()
        if results[0][0]>int(PARTITION_THRESHOLD.value) and len(SQL_TABLE_PRIMARY_KEYS.get(table).split(",")[0])>0:
            column_list=SQL_TABLE_PRIMARY_KEYS.get(table).split(",")
            for column in column_list:
                results_datatype = DB.execute("SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{0}' AND TABLE_NAME   = '{1}' AND COLUMN_NAME  = '{2}'".format(table.split(".")[0],table.split(".")[1],column)).fetchall()      
                if results_datatype[0][0]=="int":
                    CHECK_PARTITION_COLUMN_LIST[table]=column
                
                
print(CHECK_PARTITION_COLUMN_LIST)


{'Sales.SalesOrderHeaderSalesReason': 'SalesReasonID', 'Person.Password': 'BusinessEntityID', 'Person.Person': 'BusinessEntityID', 'Sales.PersonCreditCard': 'CreditCardID', 'Person.PersonPhone': 'PhoneNumberTypeID', 'Person.Address': 'AddressID', 'Production.TransactionHistory': 'TransactionID', 'Person.BusinessEntity': 'BusinessEntityID', 'Production.TransactionHistoryArchive': 'TransactionID', 'Person.BusinessEntityAddress': 'BusinessEntityID', 'Production.WorkOrder': 'WorkOrderID', 'Sales.CreditCard': 'CreditCardID', 'Production.WorkOrderRouting': 'WorkOrderID', 'Sales.CurrencyRate': 'CurrencyRateID', 'Sales.Customer': 'CustomerID', 'Sales.SalesOrderDetail': 'SalesOrderID', 'Person.EmailAddress': 'EmailAddressID', 'Sales.SalesOrderHeader': 'SalesOrderID'}


## Step 9:Create Source Schemas in POSTGRES

In [206]:
import psycopg2
postgresDB = psycopg2.connect(
                user=POSTGRES_USERNAME.value,
                password=POSTGRES_PASSWORD.value,
                dbname=POSTGRES_DATABASE.value,
                host=POSTGRES_HOST.value,
                port=POSTGRES_PORT.value
            )
postgresDB.autocommit = True
conn=postgresDB.cursor()

for table in SQLTABLE_LIST:
    conn.execute('''CREATE SCHEMA IF NOT EXISTS {};'''.format(table.split(".")[0]))

conn.close()

## Step 10: Execute Pipeline to Migrate tables from MSSQL to POSTGRES

In [207]:
def migrate_mssql_to_postgres(EXECUTION_LIST):
    EXECUTION_LIST = EXECUTION_LIST
    aiplatform.init(project=PROJECT.value,staging_bucket=GCS_STAGING_LOCATION.value)
    
    @dsl.pipeline(
        name="python-mssql-to-postgres-pyspark",
        description="Pipeline to get data from mssql to postgres",
    )
    
    def pipeline(
        PROJECT_ID: str = PROJECT.value,
        LOCATION: str = REGION.value,
        MAIN_PYTHON_CLASS: str = MAIN_PYTHON_FILE,
        PYTHON_FILE_URIS: list = PYTHON_FILE_URIS,
        JAR_FILE_URIS: list = JARS,
        SUBNETWORK_URI: str = SUBNET.value
        ):
        for table in EXECUTION_LIST:
            BATCH_ID = "mssqltopostgres-{}".format(datetime.now().strftime("%s"))
            mssql_to_postgres_jobs.append(BATCH_ID)
            
            
            if table in CHECK_PARTITION_COLUMN_LIST.keys():
                TEMPLATE_SPARK_ARGS = [
                "--template=JDBCTOJDBC",
                "--jdbctojdbc.input.url={}".format(JDBC_INPUT_URL.value),
                "--jdbctojdbc.input.driver={}".format(JDBC_INPUT_DRIVER.value),
                "--jdbctojdbc.input.table={}".format(table),
                "--jdbctojdbc.output.url={}".format(JDBC_OUTPUT_URL.value),
                "--jdbctojdbc.output.driver={}".format(JDBC_OUTPUT_DRIVER.value),
                "--jdbctojdbc.output.table={}".format(table),
                "--jdbctojdbc.input.partitioncolumn={}".format(CHECK_PARTITION_COLUMN_LIST[table]),
                "--jdbctojdbc.input.lowerbound={}".format(JDBCTOJDBC_INTPUT_LOWERBOUND.value),
                "--jdbctojdbc.input.upperbound={}".format(PARTITION_THRESHOLD.value),
                "--jdbctojdbc.numpartitions={}".format(JDBCTOJDBC_NUMOFPARTITIONS.value),
                "--jdbctojdbc.output.mode={}".format(JDBCTOJDBC_OUTPUT_MODE.value),
                "--jdbctojdbc.output.batch.size={}".format(JDBCTOJDBC_OUTPUT_BATCH_SIZE.value)
                ]
            else:
                TEMPLATE_SPARK_ARGS = [
                "--template=JDBCTOJDBC",
                "--jdbctojdbc.input.url={}".format(JDBC_INPUT_URL.value),
                "--jdbctojdbc.input.driver={}".format(JDBC_INPUT_DRIVER.value),
                "--jdbctojdbc.input.table={}".format(table),
                "--jdbctojdbc.output.url={}".format(JDBC_OUTPUT_URL.value),
                "--jdbctojdbc.output.driver={}".format(JDBC_OUTPUT_DRIVER.value),
                "--jdbctojdbc.output.table={}".format(table),
                "--jdbctojdbc.numpartitions={}".format(JDBCTOJDBC_NUMOFPARTITIONS.value),
                "--jdbctojdbc.output.mode={}".format(JDBCTOJDBC_OUTPUT_MODE.value),
                "--jdbctojdbc.output.batch.size={}".format(JDBCTOJDBC_OUTPUT_BATCH_SIZE.value)
                ]
            

            _ = DataprocPySparkBatchOp(
                project=PROJECT_ID,
                location=LOCATION,
                batch_id=BATCH_ID,
                main_python_file_uri=MAIN_PYTHON_CLASS,
                jar_file_uris=JAR_FILE_URIS,
                python_file_uris=PYTHON_FILE_URIS,
                subnetwork_uri=SUBNETWORK_URI,
                args=TEMPLATE_SPARK_ARGS
                )
            time.sleep(3)

    compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

    pipeline = aiplatform.PipelineJob(
            display_name="pipeline",
        template_path="pipeline.json",
        pipeline_root=PIPELINE_ROOT,
        enable_caching=False,
        )
    pipeline.run()

In [208]:
for execution_list in JOB_LIST:
    print(execution_list)
    migrate_mssql_to_postgres(execution_list)

['HumanResources.EmployeePayHistory', 'Sales.SalesOrderHeaderSalesReason', 'Sales.SalesPerson', 'Production.Illustration', 'HumanResources.JobCandidate', 'Production.Location', 'Person.Password', 'Sales.SalesPersonQuotaHistory', 'Person.Person', 'Sales.SalesReason']
Creating PipelineJob
PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/python-mssql-to-postgres-pyspark-20220922134814
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/python-mssql-to-postgres-pyspark-20220922134814')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/python-mssql-to-postgres-pyspark-20220922134814?project=617357862702
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/python-mssql-to-postgres-pyspark-20220922134814 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locati

## Step 11: Get status for tables migrated from MSSQL to POSTGRES

In [209]:
def get_bearer_token():
    
    try:
        #Defining Scope
        CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]

        #Assigning credentials and project value
        credentials, project_id = google.auth.default(scopes=CREDENTIAL_SCOPES)

        #Refreshing credentials data
        credentials.refresh(requests.Request())

        #Get refreshed token
        token = credentials.token
        if token:
            return (token,200)
        else:
            return "Bearer token not generated"
    except Exception as error:
        return ("Bearer token not generated. Error : {}".format(error),500)

In [210]:
from google.auth.transport import requests
import google

token = get_bearer_token()
if token[1] == 200:
    print("Bearer token generated")
else:
    print(token)

Bearer token generated


In [211]:
import requests

mssql_to_postgres_status = []
job_status_url = "https://dataproc.googleapis.com/v1/projects/{}/locations/{}/batches/{}"
for job in mssql_to_postgres_jobs:
    auth = "Bearer " + token[0]
    url = job_status_url.format(PROJECT.value,REGION.value,job)
    headers = {
      'Content-Type': 'application/json; charset=UTF-8',
      'Authorization': auth 
    }
    response = requests.get(url, headers=headers)
    mssql_to_postgres_status.append(response.json()['state'])

In [248]:
statusDF = pd.DataFrame({"table" : SQLTABLE_LIST,"mssql_to_postgres_job" : mssql_to_postgres_jobs, "mssql_to_postgres_status" : mssql_to_postgres_status})
statusDF

Unnamed: 0,table,mssql_to_postgres_job,mssql_to_postgres_status
0,HumanResources.EmployeePayHistory,mssqltopostgres-1663854464,SUCCEEDED
1,Sales.SalesOrderHeaderSalesReason,mssqltopostgres-1663854467,SUCCEEDED
2,Sales.SalesPerson,mssqltopostgres-1663854470,SUCCEEDED
3,Production.Illustration,mssqltopostgres-1663854473,SUCCEEDED
4,HumanResources.JobCandidate,mssqltopostgres-1663854476,SUCCEEDED
...,...,...,...
86,Sales.SalesOrderDetail,mssqltopostgres-1663857120,SUCCEEDED
87,Person.EmailAddress,mssqltopostgres-1663857123,SUCCEEDED
88,HumanResources.Employee,mssqltopostgres-1663857126,SUCCEEDED
89,Sales.SalesOrderHeader,mssqltopostgres-1663857129,SUCCEEDED


## Step 12: Validate row counts of migrated tables from MSSQL to POSTGRES

In [249]:
mssql_row_count = []
postgres_row_count = []

In [250]:
# get mssql table counts
DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMSSQL_DRIVER.value,
                username=MSSQL_USERNAME.value,
                password=MSSQL_PASSWORD.value,
                database=MSSQL_DATABASE.value,
                host=MSSQL_HOST.value,
                port=MSSQL_PORT.value
              )
            )
with DB.connect() as conn:
    for table in SQLTABLE_LIST:
        results = DB.execute("select count(*) from {}".format(table)).fetchall()
        for row in results:
            mssql_row_count.append(row[0])

In [252]:
import psycopg2
postgresDB = psycopg2.connect(
                user=POSTGRES_USERNAME.value,
                password=POSTGRES_PASSWORD.value,
                dbname=POSTGRES_DATABASE.value,
                host=POSTGRES_HOST.value,
                port=POSTGRES_PORT.value
            )

conn=postgresDB.cursor()
for table in SQLTABLE_LIST:
    conn.execute('''select count(*) from {}'''.format(table))
    results = conn.fetchall()
    for row in results:
            postgres_row_count.append(row[0])

conn.close()

In [253]:
statusDF['mssql_row_count'] = mssql_row_count 
statusDF['postgres_row_count'] = postgres_row_count 
statusDF

Unnamed: 0,table,mssql_to_postgres_job,mssql_to_postgres_status,mssql_row_count,postgres_row_count
0,HumanResources.EmployeePayHistory,mssqltopostgres-1663854464,SUCCEEDED,316,316
1,Sales.SalesOrderHeaderSalesReason,mssqltopostgres-1663854467,SUCCEEDED,27647,27647
2,Sales.SalesPerson,mssqltopostgres-1663854470,SUCCEEDED,17,17
3,Production.Illustration,mssqltopostgres-1663854473,SUCCEEDED,5,5
4,HumanResources.JobCandidate,mssqltopostgres-1663854476,SUCCEEDED,13,13
...,...,...,...,...,...
86,Sales.SalesOrderDetail,mssqltopostgres-1663857120,SUCCEEDED,121317,121317
87,Person.EmailAddress,mssqltopostgres-1663857123,SUCCEEDED,19972,19972
88,HumanResources.Employee,mssqltopostgres-1663857126,SUCCEEDED,290,290
89,Sales.SalesOrderHeader,mssqltopostgres-1663857129,SUCCEEDED,31465,31465
