In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

In [0]:
# function to read data from MySQL, return DataFrame
# don't add ; at the end of query
# set useLegacyDatetimeCode to false if you want the utc timestamp from timestamp columns
# use Databricks secrets for the username and password

def read_from_mysql(database_name, query):
    """Run a query against MySQL over JDBC and return the result as a DataFrame.

    Args:
        database_name: Name of the database appended to the JDBC URL.
        query: SQL text without a trailing semicolon. It is wrapped as a
            derived table, ``(<query>) as result``, for the ``dbtable`` option.

    Returns:
        The DataFrame produced by the JDBC reader.

    Raises:
        Exception: Any error raised while building or loading the JDBC source
            propagates to the caller. Errors are deliberately not caught and
            returned as a value, because callers immediately use the result as
            a DataFrame and would otherwise fail later with a misleading
            AttributeError on the exception object.
    """
    return (
        spark.read.format("jdbc")
        .option("driver", "org.mariadb.jdbc.Driver")
        .option(
            "url",
            # NOTE(review): "host_address" looks like a placeholder for the real
            # JDBC URL prefix -- confirm before running.
            f"host_address/{database_name}?useLegacyDatetimeCode=false",
        )
        .option("dbtable", f"({query}) as result")
        # Credentials come from the "jdbc" secret scope, never from literals.
        .option("user", dbutils.secrets.get("jdbc", "demo_username"))
        .option("password", dbutils.secrets.get("jdbc", "demo_password"))
        .load()
    )


Example 2 <br>
When the source table is insert-only, we can use a watermark table (or any other mechanism) to track the current max id. That way, we read only the new rows.

In [0]:
%sql
-- Target table that the incremental load merges new rows into.
-- Every column needs a data type for the DDL to be valid.
-- NOTE(review): the types below are assumptions -- confirm them against the
-- MySQL source table demo_database.user_events before relying on them.
create or replace table demo_catalog.demo_schema.user_events (
  id bigint,
  user_id bigint,
  event_id bigint,
  value string,
  `timestamp` timestamp
)

In [0]:
%sql
-- Tracks, per source table name, the highest id that has been loaded so far.
CREATE OR REPLACE TABLE demo_catalog.demo_schema.watermark_table (
  table_name STRING,
  max_id     BIGINT
);

In [0]:
%sql
-- Seed the watermark at 0 so the first run reads every source row with id > 0.
INSERT INTO demo_catalog.demo_schema.watermark_table (table_name, max_id)
VALUES ("user_events", 0);

In [0]:
# Get the last ingested id from the watermark table.
watermark_rows = spark.sql(
    "select max_id from demo_catalog.demo_schema.watermark_table where table_name = 'user_events' "
).collect()
if not watermark_rows:
    # Fail with a clear message instead of an IndexError on an empty result.
    raise ValueError("no watermark row found for table 'user_events'")
current_max_id = watermark_rows[0]["max_id"]

# Get the current maximum id from the source table.
source_row = read_from_mysql(
    "demo_database", "select max(id) as max_id from demo_database.user_events "
).collect()[0]
source_max_id = source_row["max_id"]
if source_max_id is None:
    # max(id) is NULL when the source table is empty. Fall back to the
    # watermark so the id range is empty and the watermark stays unchanged.
    source_max_id = current_max_id

print("current max id:", current_max_id)
print("source max id:", source_max_id)

In [0]:
# Read only the rows added since the last run: watermark < id <= source max.
# The bounds are written as bare integer literals. Quoting them would make
# MySQL compare a number with a string, which goes through floating point and
# is inexact for large BIGINT values. int() also fails fast if a bound is None.
lower_id = int(current_max_id)
upper_id = int(source_max_id)
query = f"select * from demo_database.user_events where id > {lower_id} and id <= {upper_id} "

new_data = read_from_mysql("demo_database", query)
# new_data.display()  # uncomment to inspect the fetched rows

In [0]:
# Enable Delta schema auto-merge for the merge step, so schema differences
# between the source rows and the target table do not have to be handled by hand.
AUTO_MERGE_KEY = "spark.databricks.delta.schema.autoMerge.enabled"

# Read the setting once. The default avoids an error when the key is unset, and
# the value is normalised so "True"/"TRUE" are recognised as enabled.
auto_merge_value = spark.conf.get(AUTO_MERGE_KEY, "false").strip().lower()

if auto_merge_value == "false":
    spark.conf.set(AUTO_MERGE_KEY, "true")
    print("set to true")
elif auto_merge_value == "true":
    print("true")
else:
    print("unknown value")

In [0]:
# DeltaTable is not imported in the notebook's import cell, so bring it into
# scope here; without it this cell fails with a NameError on a fresh kernel.
from delta.tables import DeltaTable

deltaTable = DeltaTable.forName(spark, "demo_catalog.demo_schema.user_events")

# Upsert keyed on id: rows whose id already exists in the target are
# overwritten and all other rows are inserted, so re-running the cell with the
# same new_data does not create duplicates.
(
    deltaTable.alias("current")
    .merge(new_data.alias("new"), "new.id = current.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
# Advance the watermark to the source max id once the merge has succeeded.
# max_id is a numeric column, so the value is written as a bare integer
# literal. int() also rejects None instead of writing the string 'None'.
new_watermark = int(source_max_id)
query = f"update demo_catalog.demo_schema.watermark_table set max_id = {new_watermark} where table_name = 'user_events' "

spark.sql(query)

DataFrame[num_affected_rows: bigint]

In [0]:
#