# <center>MySQL to Spanner Migration

## Step 1: Install Libraries
#### Run Step 1 one time for each new notebook instance

In [319]:
%%bash
pip3 install pymysql SQLAlchemy
pip3 install --upgrade google-cloud-pipeline-components kfp --user -q



In [322]:
%%bash
wget https://mirrors.estointernet.in/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz
tar -xf apache-maven-3.6.3-bin.tar.gz
sudo rm -rf /usr/bin/apache-maven-3.6.3
sudo mv apache-maven-3.6.3 /usr/bin/

--2022-09-01 00:40:24--  https://mirrors.estointernet.in/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz
Resolving mirrors.estointernet.in (mirrors.estointernet.in)... 43.255.166.254, 2403:8940:3:1::f
Connecting to mirrors.estointernet.in (mirrors.estointernet.in)|43.255.166.254|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9506321 (9.1M) [application/octet-stream]
Saving to: ‘apache-maven-3.6.3-bin.tar.gz.2’

     0K .......... .......... .......... .......... ..........  0%  108K 86s
    50K .......... .......... .......... .......... ..........  1%  238K 62s
   100K .......... .......... .......... .......... ..........  1%  440K 48s
   150K .......... .......... .......... .......... ..........  2%  514K 40s
   200K .......... .......... .......... .......... ..........  2%  550K 35s
   250K .......... .......... .......... .......... ..........  3%  773K 31s
   300K .......... .......... .......... .......... ..........  3% 1003K 28s
 

## Step 2: Import Libraries

In [4]:
import sqlalchemy
import pymysql
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler
from datetime import datetime
import time
import copy
import json
import pandas as pd
from google_cloud_pipeline_components.experimental.dataproc import DataprocSparkBatchOp

## Step 3: Assign Parameters

### Step 3.1 Common Parameters

###### PROJECT : GCP project-id
###### REGION : GCP region
###### GCS_STAGING_LOCATION : GCS staging locatio to be used for this notebook
###### SUBNET : subnet
###### JARS : list of jars. For this notebook mysql connector and avro jar is required in addition with the dataproc template jars
###### MAX_PARALLELISM : Parameter for number of jobs to run in parallel default value is 2

In [5]:
PROJECT = "yadavaja-sandbox"
REGION = "us-west1"
GCS_STAGING_LOCATION = "gs://python-dataproc-templates-temp/mysql-to-spanner-staging"
SUBNET = "projects/yadavaja-sandbox/regions/us-west1/subnetworks/test-subnet1"
JARS = ["gs://datproc_template_nk/jars/mysql-connector-java-8.0.29.jar","file:///usr/lib/spark/external/spark-avro.jar"]
MAX_PARALLELISM = 2

### Step 3.2 MYSQL to GCS Parameters

In [6]:
MYSQL_HOST = "10.203.209.12"
MYSQL_PORT = "3306"
MYSQL_USERNAME = "root"
MYSQL_PASSWORD = "naveen"
MYSQL_DATABASE = "nk"
MYSQLTABLE_LIST = ['employees','employees_nop','employees_mup'] # leave list empty for migrating complete database
MYSQL_OUTPUT_GCS_LOCATION = "gs://python-dataproc-templates/mysql-gcs-output"
MYSQL_OUTPUT_GCS_MODE = "overwrite"
MYSQL_OUTPUT_GCS_FORMAT = "avro"

### Step 3.3 GCS to SPANNER Parameters

In [7]:
SPANNER_INSTANCE = "dataproc-spark-test"
SPANNER_DATABASE = "spark-ci-db"
SPANNER_TABLE_PRIMARY_KEYS = {"employees_nop" : "id"}

### Step 3.4 Notebook Configuration Parameters
Below variables shoulld not be changed unless required

In [8]:
PYMYSQL_DRIVER = "mysql+pymysql"
JDBC_DRIVER = "com.mysql.cj.jdbc.Driver"
JDBC_URL = "jdbc:mysql://{}:{}/{}?user={}&password={}".format(MYSQL_HOST,MYSQL_PORT,MYSQL_DATABASE,MYSQL_USERNAME,MYSQL_PASSWORD)
MAIN_CLASS = "com.google.cloud.dataproc.templates.main.DataProcTemplate"
WORKING_DIRECTORY = "/home/jupyter/dataproc-templates/java/"
JAR_FILE = "dataproc-templates-1.0-SNAPSHOT.jar"
GRPC_JAR_PATH = "./grpc_lb/io/grpc/grpc-grpclb/1.40.1"
GRPC_JAR = "grpc-grpclb-1.40.1.jar"
LOG4J_PROPERTIES_PATH = "./src/test/resources"
LOG4J_PROPERTIES = "log4j-spark-driver-template.properties"
PIPELINE_ROOT = GCS_STAGING_LOCATION + "/pipeline_root/dataproc_pyspark"

JARS.append(GCS_STAGING_LOCATION + "/" + GRPC_JAR)
JARS.append(GCS_STAGING_LOCATION + "/" + JAR_FILE)

## Step 4: Generate MySQL Table List
This step creates list of tables for migration. If MYSQLTABLE_LIST is kept empty all the tables in the MYSQL_DATABASE are listed for migration otherwise the provided list is used

In [9]:
if len(MYSQLTABLE_LIST) == 0:
    DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMYSQL_DRIVER,
                username=MYSQL_USERNAME,
                password=MYSQL_PASSWORD,
                database=MYSQL_DATABASE,
                host=MYSQL_HOST,
                port=MYSQL_PORT
              )
            )
    with DB.connect() as conn:
        print("connected to database")
        results = DB.execute('show tables;').fetchall()
        print("Total Tables = ", len(results))
        for row in results:
            TABLE_LIST.append(row[0])

print("list of tables for migration :")
print(MYSQLTABLE_LIST)

list of tables for migration :
['employees', 'employees_nop', 'employees_mup']


## Step 5: Get Primary Keys for tables not present in SPANNER_TABLE_PRIMARY_KEYS
For tables which do not have primary key provided in dictonary SPANNER_TABLE_PRIMARY_KEYS this step fetches primary key from MYSQL_DATABASE

In [10]:
DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMYSQL_DRIVER,
                username=MYSQL_USERNAME,
                password=MYSQL_PASSWORD,
                database=MYSQL_DATABASE,
                host=MYSQL_HOST,
                port=MYSQL_PORT
              )
            )
with DB.connect() as conn:
    for table in MYSQLTABLE_LIST:
        primary_keys = []
        if table not in SPANNER_TABLE_PRIMARY_KEYS:
            results = DB.execute("SHOW KEYS FROM {} WHERE Key_name = 'PRIMARY'".format(table)).fetchall()
            for row in results:
                primary_keys.append(row[4])
            if primary_keys:
                SPANNER_TABLE_PRIMARY_KEYS[table] = ",".join(primary_keys)
            else:
                SPANNER_TABLE_PRIMARY_KEYS[table] = ""

In [11]:
pkDF = pd.DataFrame({"table" : MYSQLTABLE_LIST, "primary_keys": list(SPANNER_TABLE_PRIMARY_KEYS.values())})
print("Below are identified primary keys for migrating mysql table to spanner:")
pkDF

Below are identified primary keys for migrating mysql table to spanner:


Unnamed: 0,table,primary_keys
0,employees,id
1,employees_nop,id
2,employees_mup,"id,fname"


## Step 6: Create JAR files and Upload to GCS
#### Run Step 6 one time for each new notebook instance

In [12]:
%cd $WORKING_DIRECTORY

/home/jupyter/dataproc-templates/java


In [14]:
%%bash
export MAVEN_HOME="/usr/bin/apache-maven-3.6.3"
export PATH="$MAVEN_HOME/bin:$PATH"
export PATH
mvn clean spotless:apply install -DskipTests 
mvn dependency:get -Dartifact=io.grpc:grpc-grpclb:1.40.1 -Dmaven.repo.local=./grpc_lb 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [15]:
!gsutil cp target/$JAR_FILE $GCS_STAGING_LOCATION/$JAR_FILE
!gsutil cp $GRPC_JAR_PATH/$GRPC_JAR $GCS_STAGING_LOCATION/$GRPC_JAR
!gsutil cp $LOG4J_PROPERTIES_PATH/$LOG4J_PROPERTIES $GCS_STAGING_LOCATION/$LOG4J_PROPERTIES

Copying file://target/dataproc-templates-1.0-SNAPSHOT.jar [Content-Type=application/java-archive]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\ [1 files][158.5 MiB/158.5 MiB]                                                
Operation completed over 1 objects/158.5 MiB.                                    
Copying file://./grpc_lb/io/grpc/grpc-grpclb/1.40.

## Step 7: Calculate Parallel Jobs for MySQL to GCS
This step uses MAX_PARALLELISM parameter to calculate number of parallel jobs to run

In [16]:
# calculate parallel jobs:
COMPLETE_LIST = copy.deepcopy(MYSQLTABLE_LIST)
PARALLEL_JOBS = len(MYSQLTABLE_LIST)//MAX_PARALLELISM
JOB_LIST = []
while len(COMPLETE_LIST) > 0:
    SUB_LIST = []
    for i in range(MAX_PARALLELISM):
        if len(COMPLETE_LIST)>0 :
            SUB_LIST.append(COMPLETE_LIST[0].lower())
            COMPLETE_LIST.pop(0)
        else:
            break
    JOB_LIST.append(SUB_LIST)
print("list of tables for execution : ")
print(JOB_LIST)

list of tables for execution : 
[['employees', 'employees_nop'], ['employees_mup']]


## Step 8: Execute Pipeline to Migrate tables from MySQL to GCS

In [17]:
mysql_to_gcs_jobs = []

In [18]:
def migrate_mysql_to_gcs(EXECUTION_LIST):
    EXECUTION_LIST = EXECUTION_LIST
    aiplatform.init(project=PROJECT,staging_bucket=GCS_STAGING_LOCATION)
    
    @dsl.pipeline(
        name="java-mysql-to-gcs-pyspark",
        description="Pipeline to get data from mysql to gcs",
    )
    def pipeline(
        PROJECT_ID: str = PROJECT,
        LOCATION: str = REGION,
        MAIN_CLASS: str = MAIN_CLASS,
        JAR_FILE_URIS: list = JARS,
        SUBNETWORK_URI: str = SUBNET,
        FILE_URIS: list = [GCS_STAGING_LOCATION + "/" + LOG4J_PROPERTIES]
    ):
        for table in EXECUTION_LIST:
            BATCH_ID = "mysql2gcs-{}".format(datetime.now().strftime("%s"))
            mysql_to_gcs_jobs.append(BATCH_ID)
            TEMPLATE_SPARK_ARGS = [
            "--template=JDBCTOGCS",
            "--templateProperty", "project.id={}".format(PROJECT),
            "-templateProperty", "jdbctogcs.jdbc.url={}".format(JDBC_URL),
            "--templateProperty", "jdbctogcs.jdbc.driver.class.name={}".format(JDBC_DRIVER),
            "--templateProperty","jdbctogcs.output.location={}/{}".format(MYSQL_OUTPUT_GCS_LOCATION,table),
            "--templateProperty", "jdbctogcs.output.format={}".format(MYSQL_OUTPUT_GCS_FORMAT),
            "--templateProperty", "jdbctogcs.write.mode={}".format(MYSQL_OUTPUT_GCS_MODE),
            "--templateProperty", "jdbctogcs.sql=select * from {}".format(table),
            ]

            _ = DataprocSparkBatchOp(
                project=PROJECT_ID,
                location=LOCATION,
                batch_id=BATCH_ID,
                main_class=MAIN_CLASS,
                jar_file_uris=JAR_FILE_URIS,
                file_uris=FILE_URIS,
                subnetwork_uri=SUBNETWORK_URI,
                args=TEMPLATE_SPARK_ARGS
            )
            time.sleep(3)

    compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

    pipeline = aiplatform.PipelineJob(
            display_name="pipeline",
        template_path="pipeline.json",
        pipeline_root=PIPELINE_ROOT,
        enable_caching=False,
        )
    pipeline.run()

In [19]:
for execution_list in JOB_LIST:
    print(execution_list)
    migrate_mysql_to_gcs(execution_list)

['employees', 'employees_nop']




Creating PipelineJob
PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901005246
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901005246')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/java-mysql-to-gcs-pyspark-20220901005246?project=617357862702
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901005246 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901005246 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901005246 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob

## Step 9: Get status for tables migrated from MySql to GCS

In [20]:
def get_bearer_token():
    
    try:
        #Defining Scope
        CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]

        #Assining credentials and project value
        credentials, project_id = google.auth.default(scopes=CREDENTIAL_SCOPES)

        #Refreshing credentials data
        credentials.refresh(requests.Request())

        #Get refreshed token
        token = credentials.token
        if token:
            return (token,200)
        else:
            return "Bearer token not generated"
    except Exception as error:
        return ("Bearer token not generated. Error : {}".format(error),500)

In [21]:
from google.auth.transport import requests
import google
token = get_bearer_token()
if token[1] == 200:
    print("Bearer token generated")
else:
    print(token)

Bearer token generated


In [22]:
import requests

mysql_to_gcs_status = []
job_status_url = "https://dataproc.googleapis.com/v1/projects/{}/locations/{}/batches/{}"
for job in mysql_to_gcs_jobs:
    auth = "Bearer " + token[0]
    url = job_status_url.format(PROJECT,REGION,job)
    headers = {
      'Content-Type': 'application/json; charset=UTF-8',
      'Authorization': auth 
    }
    response = requests.get(url, headers=headers)
    mysql_to_gcs_status.append(response.json()['state'])

In [23]:
statusDF = pd.DataFrame({"table" : MYSQLTABLE_LIST,"mysql_to_gcs_job" : mysql_to_gcs_jobs, "mysql_to_gcs_status" : mysql_to_gcs_status})
statusDF

Unnamed: 0,table,mysql_to_gcs_job,mysql_to_gcs_status
0,employees,mysql2gcs-1661993560,SUCCEEDED
1,employees_nop,mysql2gcs-1661993563,SUCCEEDED
2,employees_mup,mysql2gcs-1661993911,SUCCEEDED


## Step 10: Execute Pipeline to Migrate tables from GCS to SPANNER

In [24]:
gcs_to_spanner_jobs = []

In [27]:
def migrate_gcs_to_spanner(EXECUTION_LIST):
    EXECUTION_LIST = EXECUTION_LIST
    aiplatform.init(project=PROJECT, staging_bucket=GCS_STAGING_LOCATION)


    @dsl.pipeline(
        name="java-gcs-to-spanner-pyspark",
        description="Pipeline to get data from gcs to spanner",
    )
    def pipeline(
        PROJECT_ID: str = PROJECT,
        LOCATION: str = REGION,
        MAIN_CLASS: str = MAIN_CLASS,
        JAR_FILE_URIS: list = JARS,
        SUBNETWORK_URIS: str = SUBNET,
        FILE_URIS: list = [GCS_STAGING_LOCATION + "/" + LOG4J_PROPERTIES]
    ):
        for table in EXECUTION_LIST:
            BATCH_ID = "gcs2spanner-{}".format(datetime.now().strftime("%s"))
            gcs_to_spanner_jobs.append(BATCH_ID)
            TEMPLATE_SPARK_ARGS = [
            "--template=GCSTOSPANNER",
            "--templateProperty", "project.id={}".format(PROJECT),
            "-templateProperty",  "gcs.spanner.input.format={}".format(MYSQL_OUTPUT_GCS_FORMAT),
            "--templateProperty", "gcs.spanner.input.location={}/{}/".format(MYSQL_OUTPUT_GCS_LOCATION,table),
            "--templateProperty", "gcs.spanner.output.instance={}".format(SPANNER_INSTANCE),
            "--templateProperty", "gcs.spanner.output.database={}".format(SPANNER_DATABASE),
            "--templateProperty", "gcs.spanner.output.table={}".format(table),
            "--templateProperty", "gcs.spanner.output.saveMode={}".format(MYSQL_OUTPUT_GCS_MODE.capitalize()),
            "--templateProperty", "gcs.spanner.output.primaryKey={}".format(SPANNER_TABLE_PRIMARY_KEYS[table])
            ]
            _ = DataprocSparkBatchOp(
                project=PROJECT_ID,
                location=LOCATION,
                batch_id=BATCH_ID,
                main_class=MAIN_CLASS,
                jar_file_uris=JAR_FILE_URIS,
                file_uris=FILE_URIS,
                subnetwork_uri=SUBNETWORK_URIS,
                args=TEMPLATE_SPARK_ARGS
            )
            time.sleep(3)
                                                    

    compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

    pipeline = aiplatform.PipelineJob(
            display_name="pipeline",
            template_path="pipeline.json",
            pipeline_root=PIPELINE_ROOT,
            enable_caching=False,
            )
    pipeline.run()

In [28]:
for execution_list in JOB_LIST:
    print(execution_list)
    migrate_gcs_to_spanner(execution_list)

['employees', 'employees_nop']




Creating PipelineJob
PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901010416
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901010416')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/java-gcs-to-spanner-pyspark-20220901010416?project=617357862702
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901010416 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901010416 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901010416 current state:
PipelineState.PIPELINE_STATE_RUNNING

## Step 11: Get status for tables migrated from GCS to SPANNER

In [29]:
from google.auth.transport import requests
import google

token = get_bearer_token()
if token[1] == 200:
    print("Bearer token generated")
else:
    print(token)

Bearer token generated


In [30]:
import requests

gcs_to_spanner_status = []
job_status_url = "https://dataproc.googleapis.com/v1/projects/{}/locations/{}/batches/{}"
for job in gcs_to_spanner_jobs:
    auth = "Bearer " + token[0]
    url = job_status_url.format(PROJECT,REGION,job)
    headers = {
      'Content-Type': 'application/json; charset=UTF-8',
      'Authorization': auth 
    }
    response = requests.get(url, headers=headers)
    gcs_to_spanner_status.append(response.json()['state'])

In [31]:
statusDF['gcs_to_spanner_job'] = gcs_to_spanner_jobs
statusDF['gcs_to_spanner_status'] = gcs_to_spanner_status
statusDF

Unnamed: 0,table,mysql_to_gcs_job,mysql_to_gcs_status,gcs_to_spanner_job,gcs_to_spanner_status
0,employees,mysql2gcs-1661993560,SUCCEEDED,gcs2spanner-1661994250,SUCCEEDED
1,employees_nop,mysql2gcs-1661993563,SUCCEEDED,gcs2spanner-1661994253,SUCCEEDED
2,employees_mup,mysql2gcs-1661993911,SUCCEEDED,gcs2spanner-1661994546,SUCCEEDED


## Step 12: Validate row counts of migrated tables from MySQL to SPANNER

In [32]:
mysql_row_count = []
spanner_row_count = []

In [33]:
# get mysql table counts
DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMYSQL_DRIVER,
                username=MYSQL_USERNAME,
                password=MYSQL_PASSWORD,
                database=MYSQL_DATABASE,
                host=MYSQL_HOST,
                port=MYSQL_PORT
              )
            )
with DB.connect() as conn:
    for table in MYSQLTABLE_LIST:
        results = DB.execute("select count(*) from {}".format(table)).fetchall()
        for row in results:
            mysql_row_count.append(row[0])

In [34]:
# get spanner table counts
from google.cloud import spanner

spanner_client = spanner.Client()
instance = spanner_client.instance(SPANNER_INSTANCE)
database = instance.database(SPANNER_DATABASE)

for table in MYSQLTABLE_LIST:
    with database.snapshot() as snapshot:
        results = snapshot.execute_sql("select count(*) from {}".format(table))
        for row in results:
            spanner_row_count.append(row[0])

[18]
[18]
[18]


In [35]:
statusDF['mysql_row_count'] = mysql_row_count 
statusDF['spanner_row_count'] = spanner_row_count 
statusDF

Unnamed: 0,table,mysql_to_gcs_job,mysql_to_gcs_status,gcs_to_spanner_job,gcs_to_spanner_status,mysql_row_count,spanner_row_count
0,employees,mysql2gcs-1661993560,SUCCEEDED,gcs2spanner-1661994250,SUCCEEDED,18,18
1,employees_nop,mysql2gcs-1661993563,SUCCEEDED,gcs2spanner-1661994253,SUCCEEDED,18,18
2,employees_mup,mysql2gcs-1661993911,SUCCEEDED,gcs2spanner-1661994546,SUCCEEDED,18,18
