In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) a Spark session that can talk to SQL Server over JDBC,
# read/write Delta tables, and register tables in the Hive metastore.
_spark_packages = ",".join([
    "com.microsoft.sqlserver:mssql-jdbc:9.4.1.jre8",  # SQL Server JDBC driver
    "io.delta:delta-core_2.12:1.1.0",                 # Delta Lake
])

spark = (
    SparkSession.builder
    .appName("sql-server-cdc-with-pyspark")
    .config("spark.jars.packages", _spark_packages)
    # the two settings below enable Delta's SQL extension and catalog
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Connection settings for the source SQL Server database.
# secrets included for readability; normally they would be in KeyVault, etc.
SRC_USER = "XXXXXX"
SRC_PWD  = "XXXXXX"
SRC_HOST = "XXXXXX"
SRC_PORT = 1433  # SQL Server's default TCP port; interpolated into the JDBC URL
SRC_DB   = "XXXXXX"

# Source table to replicate and the column named as its key.
src_table     = "customers"
src_table_key = "customer_id"

# Filesystem location where the Delta copy of the source table is stored.
delta_table_path = f"/tmp/{src_table}"

In [None]:
# Get the data and schema of the source table from SQL Server over JDBC.
# NOTE: be sure your IP is allowed in the db firewall
jdbc_url = f"jdbc:sqlserver://{SRC_HOST}:{SRC_PORT};database={SRC_DB}"

df = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", f"dbo.{src_table}")
    .option("user", SRC_USER)
    .option("password", SRC_PWD)
    # `fetchsize` is a Spark JDBC data source option (rows fetched per round
    # trip), not a SQL Server connection-string property, so it is set here
    # rather than inside the URL.
    .option("fetchsize", "20000")
    .option("encrypt", "true")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .option("hostNameInCertificate", "*.database.windows.net")
    .load()
)

# Persist the snapshot in Delta format, replacing any previous copy.
df.write.format("delta").mode("overwrite").save(delta_table_path)

# Drop any existing table of the same name so the CREATE below can be re-run.
spark.sql(f"""
  DROP TABLE IF EXISTS {src_table}
""")

# Create a Hive table using the data at the Delta location.
spark.sql(f"""
  CREATE TABLE {src_table}
  USING DELTA
  LOCATION '{delta_table_path}'
""")

In [None]:
# Preview the registered table; uses `src_table` so this cell follows the
# configured table name instead of a hard-coded one.
spark.sql(f"select * from {src_table}").show()