In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from datetime import datetime

from utils.mysql_spark_connection import table_df, spark

In [2]:
# Obtain the Spark session for this notebook under the app name below.
# NOTE(review): this rebinds the `spark` name imported from
# utils.mysql_spark_connection in the imports cell — confirm that is intended.
session_builder = SparkSession.builder.appName("master db creation")
spark = session_builder.getOrCreate()

In [3]:
# Load the three account tables from the `client_rw` schema, in this order:
# the incoming table, the current master, and the master's archive.
rw_account_master, md_account_master, md_account_master_archive = (
    table_df(schema_name='client_rw', table_name=source_table)
    for source_table in (
        'rw_account_master',
        'md_account_master',
        'md_account_master_archive',
    )
)

In [4]:
# Preview the incoming table loaded from client_rw.rw_account_master.
rw_account_master.show()

+--------------+-------------+-------------+---------------+-------+
|account_number|mobile_number|acc_open_date|acc_closed_date|address|
+--------------+-------------+-------------+---------------+-------+
|        123ABC|         9840|   2024-01-01|     2024-12-31|      X|
|        123PQR|         9851|   2024-01-01|     2024-12-31|      Y|
|        123XYZ|         9842|   2024-01-01|     2024-12-31|      Z|
+--------------+-------------+-------------+---------------+-------+



In [5]:
# Preview the current master table (client_rw.md_account_master) before it is rebuilt.
md_account_master.show()

+--------------+-------------+-------------+---------------+-------+-------------+
|account_number|mobile_number|acc_open_date|acc_closed_date|address|modified_date|
+--------------+-------------+-------------+---------------+-------+-------------+
|        123ABC|         9840|   2024-01-01|     2024-12-31|      X|         NULL|
|        123PQR|         9841|   2024-01-01|     2024-12-31|      Y|         NULL|
+--------------+-------------+-------------+---------------+-------+-------------+



In [6]:
# Preview the archive table (client_rw.md_account_master_archive) as loaded.
md_account_master_archive.show()

+--------------+-------------+-------------+---------------+-------+------------+
|account_number|mobile_number|acc_open_date|acc_closed_date|address|created_date|
+--------------+-------------+-------------+---------------+-------+------------+
+--------------+-------------+-------------+---------------+-------+------------+



### Step 1: Archive the current md_account_master rows into md_account_master_archive

In [7]:
# Snapshot the current master rows into the archive layout: drop the
# master-only `modified_date` column and stamp every row with the archival time.
archive_columns = md_account_master_archive.columns
archive_key_columns = [name for name in archive_columns if name != 'created_date']

part_md_account_master_archive = (
    md_account_master
    .drop('modified_date')
    .withColumn('created_date', F.lit(datetime.now()))
)

# unionByName aligns columns by name, so a differing column order between the
# two frames cannot silently mix values (plain `union` matches by position).
# Grouping on every non-timestamp column and keeping min(created_date) removes
# duplicates like dropDuplicates(subset=...) did, but deterministically keeps
# the earliest archival timestamp instead of an arbitrary one.
md_account_master_archive = (
    md_account_master_archive
    .unionByName(part_md_account_master_archive)
    .groupBy(*archive_key_columns)
    .agg(F.min('created_date').alias('created_date'))
    .select(*archive_columns)  # restore the archive's original column order
)
md_account_master_archive.show()

+--------------+-------------+-------------+---------------+-------+--------------------+
|account_number|mobile_number|acc_open_date|acc_closed_date|address|        created_date|
+--------------+-------------+-------------+---------------+-------+--------------------+
|        123PQR|         9841|   2024-01-01|     2024-12-31|      Y|2024-07-14 13:23:...|
|        123ABC|         9840|   2024-01-01|     2024-12-31|      X|2024-07-14 13:23:...|
+--------------+-------------+-------------+---------------+-------+--------------------+



### Step 2: Find out New and Changed Records in rw_account_master compared to md_account_master

In [8]:
# Columns present in both tables, kept in rw_account_master's column order.
md_column_names = set(md_account_master.columns)
common_columns = [name for name in rw_account_master.columns if name in md_column_names]

# A left-anti join keeps only the rw_account_master rows that have no exact
# match in md_account_master across all common columns, i.e. new or changed
# accounts. Each of them is stamped with the current time as `modified_date`.
# NOTE: the join compares with plain equality, so a row holding NULL in any
# common column never matches and is always reported here.
changed_new = (
    rw_account_master
    .join(md_account_master, on=common_columns, how='left_anti')
    .withColumn('modified_date', F.lit(datetime.now()))
)

In [9]:
# Inspect the rw_account_master rows that had no exact match in md_account_master.
changed_new.show()

+--------------+-------------+-------------+---------------+-------+--------------------+
|account_number|mobile_number|acc_open_date|acc_closed_date|address|       modified_date|
+--------------+-------------+-------------+---------------+-------+--------------------+
|        123XYZ|         9842|   2024-01-01|     2024-12-31|      Z|2024-07-14 13:23:...|
|        123PQR|         9851|   2024-01-01|     2024-12-31|      Y|2024-07-14 13:23:...|
+--------------+-------------+-------------+---------------+-------+--------------------+



### Step 3: Find unchanged records (identical in rw_account_master and md_account_master)

In [10]:
# Rows that are identical in both tables on every common column. The inner
# join carries over the remaining md_account_master columns (`modified_date`),
# so unchanged accounts keep their existing modification stamp.
# NOTE: like the left-anti join that builds `changed_new`, this uses plain
# equality, so rows with NULL in any common column never match here.
# An earlier alternative (filtering md_account_master to the account numbers
# absent from `changed_new`) is not used.
unchanged = rw_account_master.join(
    md_account_master,
    on=common_columns,
    how='inner',
)
unchanged.show()

+--------------+-------------+-------------+---------------+-------+-------------+
|account_number|mobile_number|acc_open_date|acc_closed_date|address|modified_date|
+--------------+-------------+-------------+---------------+-------+-------------+
|        123ABC|         9840|   2024-01-01|     2024-12-31|      X|         NULL|
+--------------+-------------+-------------+---------------+-------+-------------+



### Step 4: Create/update md_account_master by combining unchanged + (changed + new) records

In [11]:
# Rebuild the master as unchanged rows plus new/changed rows.
# unionByName matches columns by name; a positional `union` would silently put
# values in the wrong columns if the two frames ever differ in column order.
# NOTE: this rebinds `md_account_master`, so re-running the earlier comparison
# cells afterwards compares against the rebuilt frame, not the loaded table.
md_account_master = unchanged.unionByName(changed_new)

In [12]:
# Inspect the rebuilt master: unchanged rows combined with new/changed rows.
md_account_master.show()

+--------------+-------------+-------------+---------------+-------+--------------------+
|account_number|mobile_number|acc_open_date|acc_closed_date|address|       modified_date|
+--------------+-------------+-------------+---------------+-------+--------------------+
|        123ABC|         9840|   2024-01-01|     2024-12-31|      X|                NULL|
|        123XYZ|         9842|   2024-01-01|     2024-12-31|      Z|2024-07-14 13:23:...|
|        123PQR|         9851|   2024-01-01|     2024-12-31|      Y|2024-07-14 13:23:...|
+--------------+-------------+-------------+---------------+-------+--------------------+



### Step 5: Update md_account_master_archive 

In [13]:
# Snapshot the rebuilt master into the archive layout: drop the master-only
# `modified_date` column and stamp every row with the archival time.
archive_columns = md_account_master_archive.columns
archive_key_columns = [name for name in archive_columns if name != 'created_date']

part_md_account_master_archive = (
    md_account_master
    .drop('modified_date')
    .withColumn('created_date', F.lit(datetime.now()))
)

# unionByName aligns columns by name, so a differing column order between the
# two frames cannot silently mix values (plain `union` matches by position).
# Rows already in the archive reappear in this snapshot with a newer timestamp;
# grouping on every non-timestamp column and keeping min(created_date) retains
# their original archival time, whereas dropDuplicates(subset=...) keeps an
# arbitrary one of the duplicates.
md_account_master_archive = (
    md_account_master_archive
    .unionByName(part_md_account_master_archive)
    .groupBy(*archive_key_columns)
    .agg(F.min('created_date').alias('created_date'))
    .select(*archive_columns)  # restore the archive's original column order
)
md_account_master_archive.show()

+--------------+-------------+-------------+---------------+-------+--------------------+
|account_number|mobile_number|acc_open_date|acc_closed_date|address|        created_date|
+--------------+-------------+-------------+---------------+-------+--------------------+
|        123PQR|         9841|   2024-01-01|     2024-12-31|      Y|2024-07-14 13:23:...|
|        123ABC|         9840|   2024-01-01|     2024-12-31|      X|2024-07-14 13:23:...|
|        123XYZ|         9842|   2024-01-01|     2024-12-31|      Z|2024-07-14 13:23:...|
|        123PQR|         9851|   2024-01-01|     2024-12-31|      Y|2024-07-14 13:23:...|
+--------------+-------------+-------------+---------------+-------+--------------------+



### Playground