In [0]:
from datetime import datetime, timedelta
from delta.tables import DeltaTable
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

In [0]:
%sql
-- Target table for the incremental-load examples in this notebook.
-- Running this cell again replaces any existing table of the same name.
CREATE OR REPLACE TABLE demo_catalog.demo_schema.members (
    id                  LONG,
    member_id           STRING,
    name                STRING,
    email               STRING,
    phone               STRING,
    favorite_store_id   INT,
    last_purchase_date  DATE,
    member_type_rfm     STRING,
    member_category     STRING,
    status              INT,
    verified            INT,
    photo_id            INT,
    created_at          TIMESTAMP,
    updated_at          TIMESTAMP,
    deleted_at          TIMESTAMP
)

In [0]:
def read_from_mysql(database_name, query):
    """Run ``query`` against MySQL over JDBC and return the result as a DataFrame.

    The query is embedded as a derived table (``(<query>) as result``) in the
    JDBC ``dbtable`` option, so it must be a single SELECT statement. A
    trailing ``;`` would break that wrapping and is therefore stripped.

    Notes:
        * ``useLegacyDatetimeCode=false`` is set on the connection URL; per the
          original author's note this is what yields UTC values for timestamp
          columns.
        * The username and password come from the Databricks secret scope
          ``jdbc``; never hard-code them here.

    Args:
        database_name: Name of the MySQL database appended to the JDBC URL.
        query: SELECT statement to execute.

    Returns:
        The DataFrame produced by the Spark JDBC reader.

    Raises:
        Exception: Any error raised by the secrets lookup or the JDBC reader
            propagates to the caller. (It used to be *returned*, which made
            the caller fail later with an unrelated AttributeError.)
    """
    # A statement terminator inside the wrapped sub-query is a syntax error.
    subquery = query.strip().rstrip(";").rstrip()
    jdbc_url = f"host_address/{database_name}?useLegacyDatetimeCode=false"

    return (
        spark.read.format("jdbc")
        .option("driver", "org.mariadb.jdbc.Driver")
        .option("url", jdbc_url)
        .option("dbtable", f"({subquery}) as result")
        .option("user", dbutils.secrets.get("jdbc", "demo_username"))
        .option("password", dbutils.secrets.get("jdbc", "demo_password"))
        .load()
    )


Example 1 <br> 
When rows in the source table can be updated, add an `updated_at` column to the source table and use it to read only the rows that have changed.

In [0]:
# Upper bound of this run's extraction window.
current_timestamp = datetime.now()

# Lower bound: the newest updated_at already stored in the target table.
df_current = spark.sql("select max(updated_at) as max_timestamp from demo_catalog.demo_schema.members")
max_timestamp = df_current.first()["max_timestamp"]

# max() over an empty table is NULL, which arrives here as None and would be
# rendered as the literal 'None' when interpolated into a SQL string.
# Fall back to an early timestamp so the first run loads everything.
# NOTE(review): 1970-01-01 00:00:01 is the start of MySQL's TIMESTAMP range;
# pick an earlier value if the source column is a DATETIME with older rows.
if max_timestamp is None:
    max_timestamp = datetime(1970, 1, 1, 0, 0, 1)

print("current timestamp:", current_timestamp)
print("max timestamp:", max_timestamp)

In [0]:
# Pull only the source rows whose updated_at falls inside the
# [max_timestamp, current_timestamp] window (both ends inclusive).
query = (
    "select * from demo_database.members "
    f"where updated_at <= '{current_timestamp}' "
    f"and updated_at >= '{max_timestamp}'"
)

new_data = read_from_mysql("demo_database", query)
new_data.display()

In [0]:
# Make sure Delta schema auto-merge is switched on for this session,
# reporting what was found.
AUTO_MERGE_KEY = "spark.databricks.delta.schema.autoMerge.enabled"
auto_merge_setting = spark.conf.get(AUTO_MERGE_KEY)

if auto_merge_setting == "true":
    print("true")
elif auto_merge_setting == "false":
    spark.conf.set(AUTO_MERGE_KEY, "true")
    print("set to true")
else:
    print("unknown value")

In [0]:
# Upsert the freshly read rows into the target table, matching on id.
deltaTable = DeltaTable.forName(spark, "demo_catalog.demo_schema.members")

merge_builder = deltaTable.alias("current").merge(
    new_data.alias("new"),
    "new.id = current.id",
)

# Existing ids are overwritten with the new values; unseen ids are inserted.
merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
# Peek at up to 10 rows of the members table.
df = spark.table("demo_catalog.demo_schema.members")
members_preview = df.limit(10)
members_preview.display()

Join with sales tables and get more info.

In [0]:
# Template: write a table back over JDBC, truncating the target instead of
# dropping and recreating it. The empty strings are placeholders and must be
# filled in before this cell can run.

driver = "org.mariadb.jdbc.Driver"
table = ""
user = dbutils.secrets.get("jdbc", "username")
password = dbutils.secrets.get("jdbc", "password")
url = ""

df = spark.table("")

(
    df.write.format("jdbc")
    .options(
        driver=driver,
        url=url,
        dbtable=table,
        truncate="true",
        user=user,
        password=password,
    )
    .mode("overwrite")
    .save()
)

In [0]:
#

Example 2 <br>
When the source table is insert-only, we can use a watermark table (or any other mechanism) to keep track of the current max id. That way, we read only the newly inserted rows.

In [0]:
%sql
-- Watermark table: one max_id value per table_name.
-- Running this cell again replaces any existing table of the same name.
CREATE OR REPLACE TABLE demo_catalog.demo_schema.members_watermark (
    table_name  STRING,
    max_id      LONG
)

In [0]:
# Get the last loaded id (the watermark) for the members table.
df_current = spark.sql("select max_id from demo_catalog.demo_schema.members_watermark where table_name = 'members' ")

# No watermark row (or a NULL value) means nothing has been loaded yet.
# NOTE(review): starting from 0 assumes ids are positive — confirm.
watermark_row = df_current.first()
if watermark_row is None or watermark_row["max_id"] is None:
    max_id = 0
else:
    max_id = watermark_row["max_id"]

# NOTE(review): this cell printed current_id and max_id without ever assigning
# them (NameError). By analogy with Example 1, current_id is presumed to be the
# newest id currently in the source table — confirm the intended meaning.
source_max = read_from_mysql(
    "demo_database",
    "select max(id) as current_id from demo_database.members",
)
current_id = source_max.first()["current_id"]

print("current id:", current_id)
print("max id:", max_id)

In [0]:
#