1. User specifies the MySQL connection.
2. Generate the list of tables from metadata. Alternatively, the user should be able to supply a list of tables.
3. Identify the current primary key column name and the partitioned-read properties.
4. Generate logic for partitioned reads such that each partition read is < 2 GB in size.
5. Run JDBCToGCS (Python or Java template) to export from MySQL to GCS.
6. Run GCSToSpanner (Java) to import into Spanner.
7. The notebook should allow both save modes, i.e. appending data or overwriting it.
8. The notebook should allow table schema generation if the table does not exist.

### Step 1: Install Libraries

In [None]:
# Install the MySQL client libraries used in Step 4 to list the source tables.
!pip install pymysql SQLAlchemy
# Google Cloud notebooks require dependencies to be installed with '--user'
! pip3 install --upgrade google-cloud-pipeline-components kfp --user -q

### Step 2: Import Libraries

In [None]:
import os
import IPython
# Restart the notebook kernel unless IS_TESTING is set to a non-empty value.
# Presumably this makes the freshly installed packages importable - confirm.
testing_flag = os.getenv("IS_TESTING")
if not testing_flag:
    IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
import copy
import time
from datetime import datetime
from urllib.parse import quote

import google.cloud.aiplatform as aiplatform
import pymysql
import sqlalchemy
from google_cloud_pipeline_components.experimental.dataproc import DataprocPySparkBatchOp
from kfp import dsl
from kfp.v2 import compiler

### Step 3: Assign Variables

In [None]:
# --- MySQL source connection ---
IP_ADDRESS = "10.203.209.12"
PORT = "3306"
USERNAME = "root"
PASSWORD = "####"
DATABASE = "INFORMATION_SCHEMA"
TABLE_LIST = ['CHARACTER_SETS', 'COLLATIONS', 'COLLATION_CHARACTER_SET_APPLICABILITY', 'COLUMNS','TABLES'] # leave list empty for migrating complete database

# --- GCS export target (passed to the JDBCTOGCS template) ---
MYSQL_OUTPUT_GCS_LOCATION = "gs://python-dataproc-templates/mysql-gcs-output"
MYSQL_OUTPUT_GCS_MODE = "overwrite"
MYSQL_OUTPUT_GCS_FORMAT = "csv"

# Maximum number of tables exported together in one pipeline run (see Step 6).
MAX_PARALLELISM = 2

# --- Google Cloud project / Dataproc batch settings ---
PROJECT_ID = "yadavaja-sandbox"
REGION = "us-west1"
GCS_STAGING_LOCATION = "gs://python-dataproc-templates-temp/mysql-to-spanner-staging"
SUBNET = "projects/yadavaja-sandbox/regions/us-west1/subnetworks/test-subnet1"
JARS = ["gs://datproc_template_nk/jars/mysql-connector-java-8.0.29.jar"]

# Please update below variables only when required
PYMYSQL_DRIVER = "mysql+pymysql"
JDBC_DRIVER = "com.mysql.cj.jdbc.Driver"
# Percent-encode the credentials: reserved URL characters such as '#', '&',
# '=' or '%' in the user name or password would otherwise corrupt the URL.
JDBC_URL = "jdbc:mysql://{}:{}/{}?user={}&password={}".format(
    IP_ADDRESS,
    PORT,
    DATABASE,
    quote(USERNAME, safe=""),
    quote(PASSWORD, safe=""),
)
WORKING_DIRECTORY = "/home/jupyter/dataproc-templates/python/"
PACKAGE_EGG_FILE = "dist/dataproc_templates_distribution.egg"


### Step 4: Generate MySQL Table List

In [None]:
# Discover the tables from the database only when the user did not supply a
# list; a non-empty TABLE_LIST is used as-is.
if not TABLE_LIST:
    DB = sqlalchemy.create_engine(
        sqlalchemy.engine.url.URL.create(
            drivername=PYMYSQL_DRIVER,
            username=USERNAME,
            password=PASSWORD,
            database=DATABASE,
            host=IP_ADDRESS,
            port=PORT,
        )
    )
    try:
        with DB.connect() as conn:
            print("connected to database")
            # Run the statement on the opened connection and wrap it in
            # text(): engine-level execute() and raw SQL strings are not
            # accepted by SQLAlchemy 2.x.
            results = conn.execute(sqlalchemy.text('show tables;')).fetchall()
    finally:
        # Release the pooled connections; the engine is not needed afterwards.
        DB.dispose()
    print("Total Tables = ", len(results))
    # The table name is the first column of every returned row.
    TABLE_LIST.extend(row[0] for row in results)

print("list of tables for migration :")
print(TABLE_LIST)

### Step 5: Create Package Egg file and Upload to GCS

In [None]:
# Switch to WORKING_DIRECTORY and build the templates package as an egg
# written to PACKAGE_EGG_FILE.
%cd $WORKING_DIRECTORY
! python ./setup.py bdist_egg --output=$PACKAGE_EGG_FILE

In [None]:
# Stage the template entry point (main.py) and the egg under
# GCS_STAGING_LOCATION; Step 7 builds the batch's file URIs from these paths.
! gsutil cp main.py $GCS_STAGING_LOCATION/
! gsutil cp -r $PACKAGE_EGG_FILE $GCS_STAGING_LOCATION/dist/

### Step 6: Calculate Parallel Job for MySQL to GCS

In [None]:
# calculate parallel jobs:
COMPLETE_LIST = copy.deepcopy(TABLE_LIST)
PARALLEL_JOBS = len(TABLE_LIST)//MAX_PARALLELISM
JOB_LIST = []
while len(COMPLETE_LIST) > 0:
    SUB_LIST = []
    for i in range(MAX_PARALLELISM):
        if len(COMPLETE_LIST)>0 :
            SUB_LIST.append(COMPLETE_LIST[0])
            COMPLETE_LIST.pop(0)
        else:
            break
    JOB_LIST.append(SUB_LIST)
print("list of tables for execution : ")
print(JOB_LIST)

### Step 7: Execute Pipeline to Migrate tables from MySQL to GCS

In [None]:
# Locations below GCS_STAGING_LOCATION that the pipeline reads from / writes to.
PIPELINE_ROOT = GCS_STAGING_LOCATION + "/pipeline_root/dataproc_pyspark"
MAIN_PYTHON_FILE = GCS_STAGING_LOCATION + "/main.py"
# Step 5 copies the egg into "<staging>/dist/", so only its file name is kept.
# Deriving it from PACKAGE_EGG_FILE keeps this URI in sync with the build step.
PYTHON_FILE_URIS = [
    "{}/dist/{}".format(GCS_STAGING_LOCATION, PACKAGE_EGG_FILE.rsplit("/", 1)[-1])
]

In [None]:
def migrate_mysql(EXECUTION_LIST):
    """Export one batch of MySQL tables to GCS through a single pipeline run.

    Defines a KFP pipeline with one ``DataprocPySparkBatchOp`` (JDBCTOGCS
    template) per table, compiles it to ``pipeline.json`` in the current
    directory and runs it as a Vertex AI ``PipelineJob``. The ops are created
    without dependencies on each other.

    Args:
        EXECUTION_LIST: Names of the tables to export in this run. When it is
            empty nothing is compiled or submitted.

    Returns:
        None.
    """
    if not EXECUTION_LIST:
        # A pipeline without any task has nothing to run; skip it entirely.
        print("no tables to migrate in this batch")
        return

    aiplatform.init(project=PROJECT_ID, staging_bucket=GCS_STAGING_LOCATION)

    package_path = "pipeline.json"
    # Millisecond timestamp shared by every batch of this run. Combined with
    # the table's position it gives each batch a distinct id without pausing
    # between ops or relying on the platform-specific "%s" strftime directive.
    run_stamp = int(time.time() * 1000)

    @dsl.pipeline(
        name="mysql-to-gcs-pyspark",
        description="Pipeline to get data from mysql to gcs",
    )
    def pipeline(
        project_id: str = PROJECT_ID,
        location: str = REGION,
        main_python_file_uri: str = MAIN_PYTHON_FILE,
        python_file_uris: list = PYTHON_FILE_URIS,
        jar_file_uris: list = JARS,
        subnetwork_uri: str = SUBNET
    ):
        for index, table in enumerate(EXECUTION_LIST):
            batch_id = "mysql2gcs-{}-{}".format(run_stamp, index)
            template_spark_args = [
                "--template=JDBCTOGCS",
                "--jdbctogcs.input.url={}".format(JDBC_URL),
                "--jdbctogcs.input.driver={}".format(JDBC_DRIVER),
                "--jdbctogcs.input.table={}".format(table),
                "--jdbctogcs.output.location={}/{}".format(MYSQL_OUTPUT_GCS_LOCATION, table.lower()),
                "--jdbctogcs.output.mode={}".format(MYSQL_OUTPUT_GCS_MODE),
                "--jdbctogcs.output.format={}".format(MYSQL_OUTPUT_GCS_FORMAT),
            ]
            # Log only the table and batch id: the argument list contains the
            # JDBC URL, which embeds the database password.
            print("adding export task for table {} (batch id {})".format(table, batch_id))
            _ = DataprocPySparkBatchOp(
                project=project_id,
                location=location,
                batch_id=batch_id,
                main_python_file_uri=main_python_file_uri,
                python_file_uris=python_file_uris,
                jar_file_uris=jar_file_uris,
                subnetwork_uri=subnetwork_uri,
                args=template_spark_args,
            )

    compiler.Compiler().compile(pipeline_func=pipeline, package_path=package_path)

    # Separate name so the pipeline function above is not shadowed.
    pipeline_job = aiplatform.PipelineJob(
        display_name="pipeline",
        template_path=package_path,
        pipeline_root=PIPELINE_ROOT,
        enable_caching=False,
    )
    pipeline_job.run()


In [None]:
# Run one pipeline per batch of tables, in the order the batches were built.
for table_batch in JOB_LIST:
    print(table_batch)
    migrate_mysql(table_batch)

4. Identify current primary key column name, and partitioned read properties.

5. Should generate logic for partitioned read, such that each partition read is <2 GB in size.

7. Create the Spanner schema if it does not exist.

8. Run the GCS-to-Spanner import.

In [None]:
def build_jdbc_to_gcs_args(table):
    """Return the JDBCTOGCS template arguments for exporting one table.

    The draft list this replaces called ``str.format`` without a table name,
    which raises ``IndexError``; the table is now an explicit parameter. The
    arguments match the ones built inside ``migrate_mysql``.

    Args:
        table: Name of the MySQL table to export. Its lower-cased name is
            used as the sub-folder of MYSQL_OUTPUT_GCS_LOCATION.

    Returns:
        A list of ``--key=value`` argument strings.
    """
    return [
        "--template=JDBCTOGCS",
        "--jdbctogcs.input.url={}".format(JDBC_URL),
        "--jdbctogcs.input.driver={}".format(JDBC_DRIVER),
        "--jdbctogcs.input.table={}".format(table),
        "--jdbctogcs.output.location={}/{}".format(MYSQL_OUTPUT_GCS_LOCATION, table.lower()),
        "--jdbctogcs.output.mode={}".format(MYSQL_OUTPUT_GCS_MODE),
        "--jdbctogcs.output.format={}".format(MYSQL_OUTPUT_GCS_FORMAT),
    ]
# --jdbctogcs.input.partitioncolumn="id" \
# --jdbctogcs.input.lowerbound="11" \
# --jdbctogcs.input.upperbound="20" \
# --jdbctogcs.numpartitions="4" \
# --jdbctogcs.output.partitioncolumn="department_id"

In [None]:
# MySQL to GCS: pending work
# 3. Identify the current primary key column name and the partitioned-read properties.
# 4. Generate logic for partitioned reads such that each partition read is < 2 GB in size.
# Lower bound / upper bound for the partition column
# Parallel execution