# <center>MySQL to Spanner Migration

## Step 1: Install Libraries
#### Run Step 1 one time for each new notebook instance

In [1]:
%%bash
pip3 install pymysql SQLAlchemy
pip3 install --upgrade google-cloud-pipeline-components kfp --user -q

Collecting pymysql
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.8/43.8 kB 4.6 MB/s eta 0:00:00
Installing collected packages: pymysql
Successfully installed pymysql-1.0.2




In [2]:
%%bash
wget https://mirrors.estointernet.in/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz
tar -xf apache-maven-3.6.3-bin.tar.gz
sudo rm -rf /usr/bin/apache-maven-3.6.3
sudo mv apache-maven-3.6.3 /usr/bin/

--2022-09-01 01:41:48--  https://mirrors.estointernet.in/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz
Resolving mirrors.estointernet.in (mirrors.estointernet.in)... 43.255.166.254, 2403:8940:3:1::f
Connecting to mirrors.estointernet.in (mirrors.estointernet.in)|43.255.166.254|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9506321 (9.1M) [application/octet-stream]
Saving to: ‘apache-maven-3.6.3-bin.tar.gz’

     0K .......... .......... .......... .......... ..........  0%  116K 80s
    50K .......... .......... .......... .......... ..........  1%  257K 58s
   100K .......... .......... .......... .......... ..........  1%  474K 45s
   150K .......... .......... .......... .......... ..........  2%  550K 37s
   200K .......... .......... .......... .......... ..........  2%  591K 33s
   250K .......... .......... .......... .......... ..........  3%  836K 29s
   300K .......... .......... .......... .......... ..........  3% 1.05M 26s
   

In [4]:
import os
import IPython
if not os.getenv("IS_TESTING"):
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## Step 2: Import Libraries

In [1]:
import sqlalchemy
import pymysql
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler
from datetime import datetime
import time
import copy
import json
import pandas as pd
from google_cloud_pipeline_components.experimental.dataproc import DataprocSparkBatchOp

## Step 3: Assign Parameters

### Step 3.1 Common Parameters

##### PROJECT : GCP project-id
##### REGION : GCP region
##### GCS_STAGING_LOCATION : GCS staging location to be used for this notebook to store artifacts
##### SUBNET : VPC subnet
##### JARS : list of jars. For this notebook mysql connector and avro jar is required in addition with the dataproc template jars
##### MAX_PARALLELISM : Parameter for number of jobs to run in parallel default value is 2

In [89]:
PROJECT = <"project-id">
REGION = <"region""
GCS_STAGING_LOCATION = <"gs://bucket/[folder]">
SUBNET = <"projects/{project}/regions/{region}/subnetworks/{subnet}">
MAX_PARALLELISM = 2 # default value is set to 2
JARS = [GCS_STAGING_LOCATION + "/jars/mysql-connector-java-8.0.29.jar","file:///usr/lib/spark/external/spark-avro.jar"]

### Step 3.2 MYSQL to GCS Parameters
#### MYSQL_HOST : MYSQL instance ip address
#### MYSQL_PORT : MySQL instance port
#### MYSQL_USERNAME : MYSQL username
#### MYSQL_PASSWORD : MYSQL password
#### MYSQL_DATABASE : name of database that you want to migrate
#### MYSQLTABLE_LIST : list of tables you want to migrate eg: ['table1','table2'] else provide an empty list for migration whole database eg : [] 
#### MYSQL_OUTPUT_GCS_LOCATION : gcs location where mysql output will be writtes eg :"gs://bucket/[folder]"
#### MYSQL_OUTPUT_GCS_MODE : output mode for MYSQL data one of (overwrite|append)
#### MYSQL_OUTPUT_GCS_FORMAT : output file formate for MYSQL data one of (avro|parquet|orc)

In [90]:
MYSQL_HOST = <"host">
MYSQL_PORT = <"port">
MYSQL_USERNAME = <"username">
MYSQL_PASSWORD = <"password">
MYSQL_DATABASE = <"database">
MYSQLTABLE_LIST = [] # leave list empty for migrating complete database else provide tables as ['table1','table2']
MYSQL_OUTPUT_GCS_LOCATION = <"gs://bucket/[folder]">
MYSQL_OUTPUT_GCS_MODE = <"mode"> # one of overwrite|append
MYSQL_OUTPUT_GCS_FORMAT = <"format"> # one of avro|parquet|orc

### Step 3.3 GCS to SPANNER Parameters
#### SPANNER_INSTANCE : cloud spanner instance name
#### SPANNER_DATABASE : cloud spanner database name
#### SPANNER_TABLE_PRIMARY_KEYS : provide dictionary of format {"table_name":"primary_key"} for tables which do not have primary key in MYSQL

In [91]:
SPANNER_INSTANCE = <"instance">
SPANNER_DATABASE = <"database">
SPANNER_TABLE_PRIMARY_KEYS = <{}> # provide table which do not have PK in MYSQL {"table_name":"primary_key"}

### Step 3.4 Notebook Configuration Parameters
Below variables shoulld not be changed unless required

In [92]:
PYMYSQL_DRIVER = "mysql+pymysql"
JDBC_DRIVER = "com.mysql.cj.jdbc.Driver"
JDBC_URL = "jdbc:mysql://{}:{}/{}?user={}&password={}".format(MYSQL_HOST,MYSQL_PORT,MYSQL_DATABASE,MYSQL_USERNAME,MYSQL_PASSWORD)
MAIN_CLASS = "com.google.cloud.dataproc.templates.main.DataProcTemplate"
WORKING_DIRECTORY = "/home/jupyter/dataproc-templates/java/"
JAR_FILE = "dataproc-templates-1.0-SNAPSHOT.jar"
GRPC_JAR_PATH = "./grpc_lb/io/grpc/grpc-grpclb/1.40.1"
GRPC_JAR = "grpc-grpclb-1.40.1.jar"
LOG4J_PROPERTIES_PATH = "./src/test/resources"
LOG4J_PROPERTIES = "log4j-spark-driver-template.properties"
PIPELINE_ROOT = GCS_STAGING_LOCATION + "/pipeline_root/dataproc_pyspark"

# adding dataproc template JAR and grpc jar
JARS.append(GCS_STAGING_LOCATION + "/" + GRPC_JAR)
JARS.append(GCS_STAGING_LOCATION + "/" + JAR_FILE)

## Step 4: Generate MySQL Table List
This step creates list of tables for migration. If MYSQLTABLE_LIST is kept empty all the tables in the MYSQL_DATABASE are listed for migration otherwise the provided list is used

In [93]:
if len(MYSQLTABLE_LIST) == 0:
    DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMYSQL_DRIVER,
                username=MYSQL_USERNAME,
                password=MYSQL_PASSWORD,
                database=MYSQL_DATABASE,
                host=MYSQL_HOST,
                port=MYSQL_PORT
              )
            )
    with DB.connect() as conn:
        print("connected to database")
        results = DB.execute('show tables;').fetchall()
        print("Total Tables = ", len(results))
        for row in results:
            TABLE_LIST.append(row[0])

print("list of tables for migration :")
print(MYSQLTABLE_LIST)

list of tables for migration :
['employees', 'employees_nop', 'employees_mup']


## Step 5: Get Primary Keys for tables not present in SPANNER_TABLE_PRIMARY_KEYS
For tables which do not have primary key provided in dictonary SPANNER_TABLE_PRIMARY_KEYS this step fetches primary key from MYSQL_DATABASE

In [94]:
DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMYSQL_DRIVER,
                username=MYSQL_USERNAME,
                password=MYSQL_PASSWORD,
                database=MYSQL_DATABASE,
                host=MYSQL_HOST,
                port=MYSQL_PORT
              )
            )
with DB.connect() as conn:
    for table in MYSQLTABLE_LIST:
        primary_keys = []
        if table not in SPANNER_TABLE_PRIMARY_KEYS:
            results = DB.execute("SHOW KEYS FROM {} WHERE Key_name = 'PRIMARY'".format(table)).fetchall()
            for row in results:
                primary_keys.append(row[4])
            if primary_keys:
                SPANNER_TABLE_PRIMARY_KEYS[table] = ",".join(primary_keys)
            else:
                SPANNER_TABLE_PRIMARY_KEYS[table] = ""

In [95]:
pkDF = pd.DataFrame({"table" : MYSQLTABLE_LIST, "primary_keys": list(SPANNER_TABLE_PRIMARY_KEYS.values())})
print("Below are identified primary keys for migrating mysql table to spanner:")
pkDF

Below are identified primary keys for migrating mysql table to spanner:


Unnamed: 0,table,primary_keys
0,employees,id
1,employees_nop,id
2,employees_mup,"id,fname"


## Step 6: Create JAR files and Upload to GCS
#### Run Step 6 one time for each new notebook instance

In [96]:
%cd $WORKING_DIRECTORY

/home/jupyter/dataproc-templates/java


In [97]:
%%bash
wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.29.tar.gz
tar -xf mysql-connector-java-8.0.29.tar.gz
sudo apt-get update -y
sudo apt-get install default-jdk -y
export JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64"
export PATH="$JAVA_HOME/bin:$PATH"
export MAVEN_HOME="/usr/bin/apache-maven-3.6.3"
export PATH="$MAVEN_HOME/bin:$PATH"
export PATH
mvn clean spotless:apply install -DskipTests 
mvn dependency:get -Dartifact=io.grpc:grpc-grpclb:1.40.1 -Dmaven.repo.local=./grpc_lb 

Hit:1 http://packages.cloud.google.com/apt cloud-sdk-buster InRelease
Hit:2 http://deb.debian.org/debian buster InRelease
Hit:3 http://security.debian.org/debian-security buster/updates InRelease
Get:4 http://deb.debian.org/debian buster-updates InRelease [56.6 kB]
Get:5 http://deb.debian.org/debian buster-backports InRelease [51.4 kB]
Hit:7 https://download.docker.com/linux/debian buster InRelease
Hit:6 https://packages.cloud.google.com/apt kubernetes-xenial InRelease
Hit:8 http://packages.cloud.google.com/apt google-cloud-packages-archive-keyring-buster InRelease
Hit:9 http://packages.cloud.google.com/apt gcsfuse-buster InRelease
Hit:10 http://packages.cloud.google.com/apt google-compute-engine-buster-stable InRelease
Fetched 108 kB in 1s (95.0 kB/s)
Reading package lists...
Reading package lists...
Building dependency tree...
Reading state information...
default-jdk is already the newest version (2:1.11-71).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.
[INFO] Scann

--2022-09-01 02:58:43--  https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.29.tar.gz
Resolving downloads.mysql.com (downloads.mysql.com)... 104.86.241.75, 2600:1409:9800:993::2e31, 2600:1409:9800:98b::2e31
Connecting to downloads.mysql.com (downloads.mysql.com)|104.86.241.75|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://cdn.mysql.com/archives/mysql-connector-java-8.0/mysql-connector-java-8.0.29.tar.gz [following]
--2022-09-01 02:58:43--  https://cdn.mysql.com/archives/mysql-connector-java-8.0/mysql-connector-java-8.0.29.tar.gz
Resolving cdn.mysql.com (cdn.mysql.com)... 104.97.44.231
Connecting to cdn.mysql.com (cdn.mysql.com)|104.97.44.231|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4273713 (4.1M) [application/x-tar-gz]
Saving to: ‘mysql-connector-java-8.0.29.tar.gz’

     0K .......... .......... .......... .......... ..........  1% 3.08M 1s
    50K .......... .......... .......

In [98]:
!gsutil cp target/$JAR_FILE $GCS_STAGING_LOCATION/$JAR_FILE
!gsutil cp $GRPC_JAR_PATH/$GRPC_JAR $GCS_STAGING_LOCATION/$GRPC_JAR
!gsutil cp $LOG4J_PROPERTIES_PATH/$LOG4J_PROPERTIES $GCS_STAGING_LOCATION/$LOG4J_PROPERTIES
!gsutil cp mysql-connector-java-8.0.29/mysql-connector-java-8.0.29.jar $GCS_STAGING_LOCATION/jars/mysql-connector-java-8.0.29.jar

Copying file://target/dataproc-templates-1.0-SNAPSHOT.jar [Content-Type=application/java-archive]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\ [1 files][158.5 MiB/158.5 MiB]                                                
Operation completed over 1 objects/158.5 MiB.                                    
Copying file://./grpc_lb/io/grpc/grpc-grpclb/1.40.

## Step 7: Calculate Parallel Jobs for MySQL to GCS
This step uses MAX_PARALLELISM parameter to calculate number of parallel jobs to run

In [99]:
# calculate parallel jobs:
COMPLETE_LIST = copy.deepcopy(MYSQLTABLE_LIST)
PARALLEL_JOBS = len(MYSQLTABLE_LIST)//MAX_PARALLELISM
JOB_LIST = []
while len(COMPLETE_LIST) > 0:
    SUB_LIST = []
    for i in range(MAX_PARALLELISM):
        if len(COMPLETE_LIST)>0 :
            SUB_LIST.append(COMPLETE_LIST[0].lower())
            COMPLETE_LIST.pop(0)
        else:
            break
    JOB_LIST.append(SUB_LIST)
print("list of tables for execution : ")
print(JOB_LIST)

list of tables for execution : 
[['employees', 'employees_nop'], ['employees_mup']]


## Step 8: Execute Pipeline to Migrate tables from MySQL to GCS

In [100]:
mysql_to_gcs_jobs = []

In [101]:
def migrate_mysql_to_gcs(EXECUTION_LIST):
    EXECUTION_LIST = EXECUTION_LIST
    aiplatform.init(project=PROJECT,staging_bucket=GCS_STAGING_LOCATION)
    
    @dsl.pipeline(
        name="java-mysql-to-gcs-pyspark",
        description="Pipeline to get data from mysql to gcs",
    )
    def pipeline(
        PROJECT_ID: str = PROJECT,
        LOCATION: str = REGION,
        MAIN_CLASS: str = MAIN_CLASS,
        JAR_FILE_URIS: list = JARS,
        SUBNETWORK_URI: str = SUBNET,
        FILE_URIS: list = [GCS_STAGING_LOCATION + "/" + LOG4J_PROPERTIES]
    ):
        for table in EXECUTION_LIST:
            BATCH_ID = "mysql2gcs-{}".format(datetime.now().strftime("%s"))
            mysql_to_gcs_jobs.append(BATCH_ID)
            TEMPLATE_SPARK_ARGS = [
            "--template=JDBCTOGCS",
            "--templateProperty", "project.id={}".format(PROJECT),
            "-templateProperty", "jdbctogcs.jdbc.url={}".format(JDBC_URL),
            "--templateProperty", "jdbctogcs.jdbc.driver.class.name={}".format(JDBC_DRIVER),
            "--templateProperty","jdbctogcs.output.location={}/{}".format(MYSQL_OUTPUT_GCS_LOCATION,table),
            "--templateProperty", "jdbctogcs.output.format={}".format(MYSQL_OUTPUT_GCS_FORMAT),
            "--templateProperty", "jdbctogcs.write.mode={}".format(MYSQL_OUTPUT_GCS_MODE),
            "--templateProperty", "jdbctogcs.sql=select * from {}".format(table),
            ]

            _ = DataprocSparkBatchOp(
                project=PROJECT_ID,
                location=LOCATION,
                batch_id=BATCH_ID,
                main_class=MAIN_CLASS,
                jar_file_uris=JAR_FILE_URIS,
                file_uris=FILE_URIS,
                subnetwork_uri=SUBNETWORK_URI,
                args=TEMPLATE_SPARK_ARGS
            )
            time.sleep(3)

    compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

    pipeline = aiplatform.PipelineJob(
            display_name="pipeline",
        template_path="pipeline.json",
        pipeline_root=PIPELINE_ROOT,
        enable_caching=False,
        )
    pipeline.run()

In [102]:
for execution_list in JOB_LIST:
    print(execution_list)
    migrate_mysql_to_gcs(execution_list)

['employees', 'employees_nop']




Creating PipelineJob
PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901030258
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901030258')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/java-mysql-to-gcs-pyspark-20220901030258?project=617357862702
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901030258 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901030258 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-mysql-to-gcs-pyspark-20220901030258 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob

## Step 9: Get status for tables migrated from MySql to GCS

In [103]:
def get_bearer_token():
    
    try:
        #Defining Scope
        CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]

        #Assining credentials and project value
        credentials, project_id = google.auth.default(scopes=CREDENTIAL_SCOPES)

        #Refreshing credentials data
        credentials.refresh(requests.Request())

        #Get refreshed token
        token = credentials.token
        if token:
            return (token,200)
        else:
            return "Bearer token not generated"
    except Exception as error:
        return ("Bearer token not generated. Error : {}".format(error),500)

In [104]:
from google.auth.transport import requests
import google
token = get_bearer_token()
if token[1] == 200:
    print("Bearer token generated")
else:
    print(token)

Bearer token generated


In [105]:
import requests

mysql_to_gcs_status = []
job_status_url = "https://dataproc.googleapis.com/v1/projects/{}/locations/{}/batches/{}"
for job in mysql_to_gcs_jobs:
    auth = "Bearer " + token[0]
    url = job_status_url.format(PROJECT,REGION,job)
    headers = {
      'Content-Type': 'application/json; charset=UTF-8',
      'Authorization': auth 
    }
    response = requests.get(url, headers=headers)
    mysql_to_gcs_status.append(response.json()['state'])

In [106]:
statusDF = pd.DataFrame({"table" : MYSQLTABLE_LIST,"mysql_to_gcs_job" : mysql_to_gcs_jobs, "mysql_to_gcs_status" : mysql_to_gcs_status})
statusDF

Unnamed: 0,table,mysql_to_gcs_job,mysql_to_gcs_status
0,employees,mysql2gcs-1662001372,SUCCEEDED
1,employees_nop,mysql2gcs-1662001375,SUCCEEDED
2,employees_mup,mysql2gcs-1662001712,SUCCEEDED


## Step 10: Execute Pipeline to Migrate tables from GCS to SPANNER

In [107]:
gcs_to_spanner_jobs = []

In [108]:
def migrate_gcs_to_spanner(EXECUTION_LIST):
    EXECUTION_LIST = EXECUTION_LIST
    aiplatform.init(project=PROJECT, staging_bucket=GCS_STAGING_LOCATION)


    @dsl.pipeline(
        name="java-gcs-to-spanner-pyspark",
        description="Pipeline to get data from gcs to spanner",
    )
    def pipeline(
        PROJECT_ID: str = PROJECT,
        LOCATION: str = REGION,
        MAIN_CLASS: str = MAIN_CLASS,
        JAR_FILE_URIS: list = JARS,
        SUBNETWORK_URIS: str = SUBNET,
        FILE_URIS: list = [GCS_STAGING_LOCATION + "/" + LOG4J_PROPERTIES]
    ):
        for table in EXECUTION_LIST:
            BATCH_ID = "gcs2spanner-{}".format(datetime.now().strftime("%s"))
            gcs_to_spanner_jobs.append(BATCH_ID)
            TEMPLATE_SPARK_ARGS = [
            "--template=GCSTOSPANNER",
            "--templateProperty", "project.id={}".format(PROJECT),
            "-templateProperty",  "gcs.spanner.input.format={}".format(MYSQL_OUTPUT_GCS_FORMAT),
            "--templateProperty", "gcs.spanner.input.location={}/{}/".format(MYSQL_OUTPUT_GCS_LOCATION,table),
            "--templateProperty", "gcs.spanner.output.instance={}".format(SPANNER_INSTANCE),
            "--templateProperty", "gcs.spanner.output.database={}".format(SPANNER_DATABASE),
            "--templateProperty", "gcs.spanner.output.table={}".format(table),
            "--templateProperty", "gcs.spanner.output.saveMode={}".format(MYSQL_OUTPUT_GCS_MODE.capitalize()),
            "--templateProperty", "gcs.spanner.output.primaryKey={}".format(SPANNER_TABLE_PRIMARY_KEYS[table])
            ]
            _ = DataprocSparkBatchOp(
                project=PROJECT_ID,
                location=LOCATION,
                batch_id=BATCH_ID,
                main_class=MAIN_CLASS,
                jar_file_uris=JAR_FILE_URIS,
                file_uris=FILE_URIS,
                subnetwork_uri=SUBNETWORK_URIS,
                args=TEMPLATE_SPARK_ARGS
            )
            time.sleep(3)
                                                    

    compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

    pipeline = aiplatform.PipelineJob(
            display_name="pipeline",
            template_path="pipeline.json",
            pipeline_root=PIPELINE_ROOT,
            enable_caching=False,
            )
    pipeline.run()

In [112]:
for execution_list in JOB_LIST:
    print(execution_list)
    migrate_gcs_to_spanner(execution_list)

Creating PipelineJob
PipelineJob created. Resource name: projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901031448
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901031448')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/java-gcs-to-spanner-pyspark-20220901031448?project=617357862702
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901031448 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901031448 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/617357862702/locations/us-central1/pipelineJobs/java-gcs-to-spanner-pyspark-20220901031448 current state:
PipelineState.PIPELINE_STATE_RUNNING

## Step 11: Get status for tables migrated from GCS to SPANNER

In [113]:
from google.auth.transport import requests
import google

token = get_bearer_token()
if token[1] == 200:
    print("Bearer token generated")
else:
    print(token)

Bearer token generated


In [118]:
import requests

gcs_to_spanner_status = []
job_status_url = "https://dataproc.googleapis.com/v1/projects/{}/locations/{}/batches/{}"
for job in gcs_to_spanner_jobs:
    auth = "Bearer " + token[0]
    url = job_status_url.format(PROJECT,REGION,job)
    headers = {
      'Content-Type': 'application/json; charset=UTF-8',
      'Authorization': auth 
    }
    response = requests.get(url, headers=headers)
    gcs_to_spanner_status.append(response.json()['state'])

In [119]:
statusDF['gcs_to_spanner_job'] = gcs_to_spanner_jobs
statusDF['gcs_to_spanner_status'] = gcs_to_spanner_status
statusDF

Unnamed: 0,table,mysql_to_gcs_job,mysql_to_gcs_status,gcs_to_spanner_job,gcs_to_spanner_status
0,employees,mysql2gcs-1662001372,SUCCEEDED,gcs2spanner-1662002079,SUCCEEDED
1,employees_nop,mysql2gcs-1662001375,SUCCEEDED,gcs2spanner-1662002082,SUCCEEDED
2,employees_mup,mysql2gcs-1662001712,SUCCEEDED,gcs2spanner-1662002085,SUCCEEDED


## Step 12: Validate row counts of migrated tables from MySQL to SPANNER

In [120]:
mysql_row_count = []
spanner_row_count = []

In [121]:
# get mysql table counts
DB = sqlalchemy.create_engine(
            sqlalchemy.engine.url.URL.create(
                drivername=PYMYSQL_DRIVER,
                username=MYSQL_USERNAME,
                password=MYSQL_PASSWORD,
                database=MYSQL_DATABASE,
                host=MYSQL_HOST,
                port=MYSQL_PORT
              )
            )
with DB.connect() as conn:
    for table in MYSQLTABLE_LIST:
        results = DB.execute("select count(*) from {}".format(table)).fetchall()
        for row in results:
            mysql_row_count.append(row[0])

In [122]:
# get spanner table counts
from google.cloud import spanner

spanner_client = spanner.Client()
instance = spanner_client.instance(SPANNER_INSTANCE)
database = instance.database(SPANNER_DATABASE)

for table in MYSQLTABLE_LIST:
    with database.snapshot() as snapshot:
        results = snapshot.execute_sql("select count(*) from {}".format(table))
        for row in results:
            spanner_row_count.append(row[0])

In [123]:
statusDF['mysql_row_count'] = mysql_row_count 
statusDF['spanner_row_count'] = spanner_row_count 
statusDF

Unnamed: 0,table,mysql_to_gcs_job,mysql_to_gcs_status,gcs_to_spanner_job,gcs_to_spanner_status,mysql_row_count,spanner_row_count
0,employees,mysql2gcs-1662001372,SUCCEEDED,gcs2spanner-1662002079,SUCCEEDED,18,18
1,employees_nop,mysql2gcs-1662001375,SUCCEEDED,gcs2spanner-1662002082,SUCCEEDED,18,18
2,employees_mup,mysql2gcs-1662001712,SUCCEEDED,gcs2spanner-1662002085,SUCCEEDED,18,18
