In [20]:
# IMPORT REQUIRED MODULES
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, coalesce, lit


# CREATE SPARK SESSION
# The MySQL Connector/J jar is handed to Spark through "spark.jars" so the
# JDBC read/write steps below can use the com.mysql.cj.jdbc.Driver class.
mysql_connector_jar = "/Users/manidharrao16/docs/manidocs/pyspark/jdbc/mysql-connector-j-8.4.0/mysql-connector-j-8.4.0.jar"

spark = (
    SparkSession.builder
    .appName("CDC_project_DB_to_DB")
    .config("spark.jars", mysql_connector_jar)
    .getOrCreate()
)

# READ FULL LOAD FROM SOURCE DB

# Location of the source MySQL schema that holds the existing (full load) data.
jdbc_url = "jdbc:mysql://127.0.0.1:3306/pyspark_sql_mani_cdc_schema"

# MySQL connection properties (username, password, driver class).
# NOTE(review): the credentials are hard-coded in the notebook; consider
# loading them from environment variables or a secrets store instead.
properties = {
    "user": "root",
    "password": "Manidharrao@777",
    "driver": "com.mysql.cj.jdbc.Driver",
}

# Load the complete "Persons" table into a Spark DataFrame.
source_df = spark.read.jdbc(jdbc_url, "Persons", properties=properties)

# NOTE: the columns are renamed to one consistent format (id, fullname, city)
# so they line up with the CDC columns used by the later join steps.
full_load = source_df
for old_name, new_name in (
    ("PersonID", "id"),
    ("FullName", "fullname"),
    ("City", "city"),
):
    full_load = full_load.withColumnRenamed(old_name, new_name)


# READ UPDATED / CDC DATA

# The CDC file contains the incremental changes and has no header row.
# Column order: status, id, fullname, city
# status -> I = Insert, U = Update, D = Delete
#
# inferSchema=True lets Spark derive the column types from the data. Without
# it every CSV field (including id) is read as a string, and that string id
# is later unioned with the id column read from the database.
#
# ignoreLeadingWhiteSpace=True drops blanks that follow a delimiter.
# NOTE(review): the recorded output further down in this notebook appears to
# show a padded " Jacksonville" value coming from this file - confirm against
# the CSV itself.
updated_load = spark.read.csv(
    "/Users/manidharrao16/docs/manidocs/pyspark/project_files/officeData_m.csv",
    header=False,
    inferSchema=True,
    ignoreLeadingWhiteSpace=True,
)

# Replace Spark's default CSV column names (_c0.._c3) with meaningful names
updated_load = updated_load \
    .withColumnRenamed("_c0", "status") \
    .withColumnRenamed("_c1", "id") \
    .withColumnRenamed("_c2", "fullname") \
    .withColumnRenamed("_c3", "city")


# APPLY CDC LOGIC

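# Applying CDC logic with set-based DataFrame operations (no per-row loop).
# The CDC rows are split by status and applied in this order:
# D -> Delete record (anti join against the full load)
# U -> Update existing record (left join + coalesce)
# I -> Insert new record (anti join on id, then union)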


# Split the CDC feed into one DataFrame per operation type
_status = col("status")
_payload_columns = ["id", "fullname", "city"]
u_df = updated_load.where(_status == "U").select(*_payload_columns)
i_df = updated_load.where(_status == "I").select(*_payload_columns)
d_df = updated_load.where(_status == "D").select("id")

# 1) DELETE: keep only the full-load rows whose id has no "D" record
full_after_delete = full_load.join(d_df, "id", "left_anti")

# 2) UPDATE: left join the "U" rows onto the surviving rows. coalesce() takes
#    the updated value when it is non-null and falls back to the existing
#    value otherwise, so a null field in an update leaves that field unchanged.
#    NOTE(review): several "U" rows for the same id would each produce an
#    output row from the left join.
_existing = full_after_delete.alias("f")
_updates = u_df.alias("u")
full_after_update = _existing.join(_updates, on="id", how="left").select(
    "id",
    *[
        coalesce(col(f"u.{name}"), col(f"f.{name}")).alias(name)
        for name in ("fullname", "city")
    ],
)

# 3) INSERT: only ids that are not already present are added (avoids duplicates)
_known_ids = full_after_update.select("id")
new_inserts = i_df.join(_known_ids, on="id", how="left_anti")

# Final result after CDC: updated existing rows plus the new inserts
final_df = full_after_update.unionByName(new_inserts)


# WRITE FINAL DATA TO TARGET DB

# Writing the final processed data into the target MySQL table.
# mode="overwrite" -> the existing contents of the target table are replaced.
#
# The target table lives in the same schema as the source, so the jdbc_url
# and properties defined for the source read are reused here instead of
# being declared a second time with identical values.
#
# DataFrameWriter.jdbc() returns None, so its result is intentionally not
# assigned; binding it to source_df would replace the source DataFrame
# with None.
final_df.write.jdbc(
    url=jdbc_url,
    table="Update_Persons",
    mode="overwrite",
    properties=properties,
)


In [None]:
# -------------------------
# IMPORT REQUIRED MODULES
# -------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, coalesce

# -------------------------
# CREATE SPARK SESSION
# The MySQL Connector/J jar is supplied via "spark.jars" for the JDBC steps
# -------------------------
connector_jar = r"C:\Users\chint\OneDrive\Desktop\connector\mysql-connector-j-8.0.33\mysql-connector-j-8.0.33.jar"

spark = (
    SparkSession.builder
    .appName("PySpark CDC Project")
    .config("spark.jars", connector_jar)
    .getOrCreate()
)

# -------------------------
# READ FULL LOAD FROM SOURCE DB
# -------------------------
# NOTE(review): the credentials are hard-coded; consider reading them from
# the environment or a secrets store.
source_options = {
    "url": "jdbc:mysql://localhost:3306/source_db",
    "dbtable": "source_db_Persons",
    "user": "root",
    "password": "manikanta396@",
    "driver": "com.mysql.cj.jdbc.Driver",
}
full_load = spark.read.format("jdbc").options(**source_options).load()

# Rename the source columns to the names shared with the CDC data
for source_name, target_name in (
    ("PersonID", "id"),
    ("FullName", "fullname"),
    ("City", "city"),
):
    full_load = full_load.withColumnRenamed(source_name, target_name)

# -------------------------
# READ UPDATED / CDC DATA (CSV FILE)
# -------------------------
# The file has no header row; column types are inferred from the data.
# NOTE(review): the path has no file extension - confirm it points at the
# intended CSV file or directory.
cdc_path = r"C:\Users\chint\PycharmProjects\PythonProject\officeD"
updated_load = spark.read.csv(cdc_path, header=False, inferSchema=True)

# Give the positional CSV columns meaningful names
# status -> I (Insert), U (Update), D (Delete)
for default_name, cdc_name in (
    ("_c0", "status"),
    ("_c1", "id"),
    ("_c2", "fullname"),
    ("_c3", "city"),
):
    updated_load = updated_load.withColumnRenamed(default_name, cdc_name)

# -------------------------
# APPLY CDC LOGIC (WITHOUT FOR LOOP)
# Set-based: JOIN + ANTI JOIN + COALESCE
# -------------------------

# One DataFrame per CDC operation type
status_col = col("status")
record_columns = ("id", "fullname", "city")
u_df = updated_load.where(status_col == "U").select(*record_columns)
i_df = updated_load.where(status_col == "I").select(*record_columns)
d_df = updated_load.where(status_col == "D").select("id")

# 1) DELETE: drop every full-load row whose id appears in the "D" records
full_after_delete = full_load.join(d_df, "id", "left_anti")

# 2) UPDATE: left join the "U" records; coalesce() prefers the updated value
#    and keeps the existing one when the update has no (null) value
joined_with_updates = full_after_delete.alias("f").join(
    u_df.alias("u"), on="id", how="left"
)
full_after_update = joined_with_updates.select(
    "id",
    *[
        coalesce(col(f"u.{field}"), col(f"f.{field}")).alias(field)
        for field in ("fullname", "city")
    ],
)

# 3) INSERT: keep only "I" records whose id is not already present
existing_ids = full_after_update.select("id")
new_inserts = i_df.join(existing_ids, on="id", how="left_anti")

# Final result after CDC
final_df = full_after_update.unionByName(new_inserts)

# -------------------------
# WRITE FINAL DATA TO TARGET DB
# mode("overwrite") replaces the existing contents of the target table
# -------------------------
target_options = {
    "url": "jdbc:mysql://localhost:3306/target_db",
    "dbtable": "Update_Persons",
    "user": "root",
    "password": "manikanta396@",
    "driver": "com.mysql.cj.jdbc.Driver",
}
(
    final_df.write
    .format("jdbc")
    .options(**target_options)
    .mode("overwrite")
    .save()
)

In [12]:
# IMPORT REQUIRED MODULES
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, coalesce

# CREATE SPARK SESSION
# "spark.jars" points at the MySQL Connector/J jar used by the JDBC steps
jdbc_driver_jar = "/Users/manidharrao16/docs/manidocs/pyspark/jdbc/mysql-connector-j-8.4.0/mysql-connector-j-8.4.0.jar"

spark = (
    SparkSession.builder
    .appName("CDC_project_DB_to_DB")
    .config("spark.jars", jdbc_driver_jar)
    .getOrCreate()
)

# READ FULL LOAD FROM SOURCE DB
jdbc_url = "jdbc:mysql://127.0.0.1:3306/cdc_db"

# NOTE(review): the credentials are hard-coded; consider reading them from
# the environment or a secrets store.
properties = {
    "user": "root",
    "password": "Manidharrao@777",
    "driver": "com.mysql.cj.jdbc.Driver",
}

# Load the whole "employee_source" table
source_df = spark.read.jdbc(jdbc_url, "employee_source", properties=properties)

# Project the source columns onto the shared (id, fullname, city) layout.
# "department" is carried in the "city" column (optional mapping, change if
# needed).
column_mapping = (
    ("emp_id", "id"),
    ("emp_name", "fullname"),
    ("department", "city"),
)
full_load = source_df.select(
    *[col(source_name).alias(target_name) for source_name, target_name in column_mapping]
)

# READ UPDATED / CDC DATA
# The CDC file has no header row; column order is status, id, fullname, city.
# inferSchema=True derives the column types from the data.
#
# ignoreLeadingWhiteSpace=True drops blanks that follow a delimiter so the
# values are stored without padding.
# NOTE(review): the output recorded for this cell appears to show a padded
# " Jacksonville" value (the city column is one character wider than its
# longest visible value) - confirm against the CSV itself.
updated_load = spark.read.csv(
    "/Users/manidharrao16/docs/manidocs/pyspark/project_files/officeData_m.csv",
    header=False,
    inferSchema=True,
    ignoreLeadingWhiteSpace=True,
)

# Rename CDC columns properly
# status -> I (Insert), U (Update), D (Delete)
updated_load = updated_load \
    .withColumnRenamed("_c0", "status") \
    .withColumnRenamed("_c1", "id") \
    .withColumnRenamed("_c2", "fullname") \
    .withColumnRenamed("_c3", "city")

# Split the CDC feed into one DataFrame per operation type
op = col("status")
row_fields = ["id", "fullname", "city"]
u_df = updated_load.where(op == "U").select(*row_fields)
i_df = updated_load.where(op == "I").select(*row_fields)
d_df = updated_load.where(op == "D").select("id").distinct()

# 1) DELETE: remove the rows whose id has a "D" record
full_after_delete = full_load.join(d_df, "id", "left_anti")

# 2) UPDATE: the updated value wins when it is non-null, otherwise the
#    existing value is kept
base_rows = full_after_delete.alias("f")
update_rows = u_df.alias("u")
full_after_update = base_rows.join(update_rows, on="id", how="left").select(
    "id",
    *[
        coalesce(col(f"u.{field}"), col(f"f.{field}")).alias(field)
        for field in ("fullname", "city")
    ],
)

# 3) INSERT: skip ids that already exist (avoid duplicates)
present_ids = full_after_update.select("id")
new_inserts = i_df.join(present_ids, on="id", how="left_anti")

# Final CDC result
final_df = full_after_update.unionByName(new_inserts)

# WRITE FINAL DATA TO TARGET DB
# The target table is replaced ("overwrite") in the same database the source
# was read from, reusing jdbc_url and properties.
final_df.write.jdbc(jdbc_url, "Update_Persons", mode="overwrite", properties=properties)

# Show the first rows of the result and confirm completion
final_df.show()
print(" CDC Applied Successfully and Data Written to Update_Persons")


+---+--------------+-------------+
| id|      fullname|         city|
+---+--------------+-------------+
|  1|    Ravi Kumar|           IT|
|  2|       ABC XYZ| Jacksonville|
|  4|       ABC XYZ|      Detroit|
|  5|   Priya Mehta|           HR|
|  6|   Rahul Verma|      Finance|
|  7|     Vijay Rao|           IT|
|  8|    Neha Singh|           HR|
|  9|   Arjun Patel|      Finance|
| 10|    Pooja Nair|           IT|
| 11|   Manoj Kumar|           HR|
| 12|    Divya Iyer|      Finance|
| 13|  Sanjay Gupta|           IT|
| 14|     Kavya Rao|           HR|
| 15|     Amit Shah|      Finance|
| 16|   Ramesh Babu|           IT|
| 17|  Swathi Reddy|           HR|
| 18|   Nikhil Jain|      Finance|
| 19|   Prakash Das|           IT|
| 20|Sneha Kulkarni|           HR|
| 21| Deepak Mishra|      Finance|
+---+--------------+-------------+
only showing top 20 rows
 CDC Applied Successfully and Data Written to Update_Persons
