In [11]:
# Step 1: Import the original DIFProcessedTableHelper
from pyspark.sql import SparkSession
import sys, datetime
import pandas as pd
# Put the local LoadTypes and LoadTypes/Libs directories on the import path
# so the project imports that follow can be resolved.
sys.path.extend(
    [
        "/Users/kulyashdahiya/STUDY/DataEngineering/PysparkLearning/Basics/ELC_Try/LoadTypes/",
        "/Users/kulyashdahiya/STUDY/DataEngineering/PysparkLearning/Basics/ELC_Try/LoadTypes/Libs/",
    ]
)
import DIFProcessedHelper3 as DIFProcessedTableHelper3
from DatabaseConstants import AssetColumns

In [12]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType,  FloatType
import os

# Mock asset configuration. Every field keeps its name and its data type
# unchanged between source and target, so each entry is expanded from a compact
# (name, is_key, type_code) triple; Field_Sequence_Number is the 1-based position.
mockAsset = {
    "Processed_Table_Name": "T1SubsetAppend",
    "Load_Type_Code": "T1-Subset-Append",  # Example load type
    "BusinessDate": "2025-04-17",  # Example business date
    "IngestFile": "file1.csv",  # Example source file name
    # Folder for raw Parquet files
    "Raw_Table_Name": "/Users/kulyashdahiya/STUDY/DataEngineering/PysparkLearning/Basics/ELC_Try/LoadTypes/Raw/",
    "Fields": [
        {
            "Source_Field_Name": name,
            "Field_Sequence_Number": sequence,
            "Target_Field_Name": name,
            "IsKey_Indicator": is_key,
            "Source_Data_Type_Code": type_code,
            "Target_Data_Type_Code": type_code,
        }
        for sequence, (name, is_key, type_code) in enumerate(
            [
                ("SalesDate", "Y", "STRING"),
                ("SKU", "Y", "STRING"),
                ("Units", "N", "INTEGER"),
                ("DIFSourceFile", "N", "STRING"),
                ("BusinessDate", "N", "STRING"),
            ],
            start=1,
        )
    ],
}

# Local Spark session used to build the simulated raw table
spark = (
    SparkSession.builder
    .master("local")
    .appName("SimulateRawTable")
    .getOrCreate()
)

# Mocked raw rows. In this sample BusinessDate always equals SalesDate, so the
# last tuple element is derived from the first instead of being typed twice.
data = [
    (sales_date, sku, units, source_file, sales_date)
    for sales_date, sku, units, source_file in [
        ("2024-01-12", "A", 100, "file1.csv"),
        ("2024-01-12", "B", 200, "file1.csv"),
        ("2024-01-13", "A", 150, "file2.csv"),
        ("2024-01-13", "B", 250, "file2.csv"),
    ]
]

# Raw DataFrame; only column names are given here, so Spark infers the types
df = spark.createDataFrame(
    data,
    ["SalesDate", "SKU", "Units", "DIFSourceFile", "BusinessDate"],
)

# Step 1: Cast the columns according to the data types specified in the mockAsset
def cast_columns_based_on_data_type(df, mockAsset):
    """Cast each configured target column of ``df`` to its declared Spark type.

    Args:
        df: DataFrame that already contains every "Target_Field_Name" column.
        mockAsset: Asset dict whose "Fields" entries provide
            "Target_Field_Name" and "Target_Data_Type_Code".

    Returns:
        A new DataFrame with the listed columns cast, processed in list order.

    Raises:
        ValueError: If a type code is not STRING, INTEGER, DATE or FLOAT.
    """
    # Type code -> Spark type class; instantiated only when a field needs it.
    spark_type_by_code = {
        "STRING": StringType,
        "INTEGER": IntegerType,
        "DATE": DateType,
        "FLOAT": FloatType,
    }

    casted = df
    for spec in mockAsset["Fields"]:
        column_name = spec["Target_Field_Name"]
        type_code = spec["Target_Data_Type_Code"]

        type_factory = spark_type_by_code.get(type_code)
        if type_factory is None:
            raise ValueError(f"Unsupported data type: {type_code} for column {column_name}")

        casted = casted.withColumn(column_name, casted[column_name].cast(type_factory()))

    return casted

# Apply the data types declared in mockAsset["Fields"] to the simulated rows
df_casted = cast_columns_based_on_data_type(df, mockAsset)

# Step 2: Save the DataFrame as Parquet under Raw folder
# The raw folder comes from the asset configuration; the dataset is written
# beneath it as "raw_data.parquet".
raw_table_path = mockAsset["Raw_Table_Name"]
parquet_path = os.path.join(raw_table_path, "raw_data.parquet")

# Create the raw folder if it is not there yet
os.makedirs(raw_table_path, exist_ok=True)

# Overwrite mode replaces any dataset left behind by a previous run
(
    df_casted.write
    .mode("overwrite")
    .parquet(parquet_path)
)

25/04/17 15:49:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [15]:
# Step 2: Mock the necessary configurations
from pyspark.sql.functions import *

# Mock the DIFLogger class to simply print logs
class DIFLogger:
    """Console stand-in for the project logger: every call prints "<LEVEL>: <message>"."""

    def __init__(self):
        """Nothing to configure; the mock holds no state."""

    def _emit(self, level, message):
        """Print one log line prefixed with its level label."""
        print(f"{level}: {message}")

    def info(self, message):
        """Print ``message`` with the INFO prefix."""
        self._emit("INFO", message)

    def debug(self, message):
        """Print ``message`` with the DEBUG prefix."""
        self._emit("DEBUG", message)

    def error(self, message):
        """Print ``message`` with the ERROR prefix."""
        self._emit("ERROR", message)

    def warn(self, message):
        """Print ``message`` with the WARNING prefix."""
        self._emit("WARNING", message)

# Spark session for the helper test. getOrCreate() hands back an already
# running session if one exists rather than starting a second one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("DIFProcessedHelperTest")
    .getOrCreate()
)

# Step 3: Use the real DIFProcessedTableHelper class for testing
class DIFProcessedTableHelperTest(DIFProcessedTableHelper3.DIFProcessedTableHelper):
    """Notebook test harness built on the real ``DIFProcessedTableHelper``.

    ``getRawDataFrame`` returns hard-coded sample rows instead of reading the
    raw table, and ``applyTempRecordsToTTable`` works on a caller-supplied
    target DataFrame and returns the result instead of writing it back.

    NOTE(review): both methods presumably override parent methods of the same
    name, and ``aLogger`` / ``aSpark`` / ``aAsset`` / ``aInstructions`` are
    presumably set by the parent constructor - confirm against
    DIFProcessedHelper3.
    """

    def __init__(self, pEnvConfig, pAssetGroupConfig, pLogger, pSpark, pAsset):
        """Forward all arguments unchanged to the parent constructor."""
        super().__init__(pEnvConfig, pAssetGroupConfig, pLogger, pSpark, pAsset)

    def define_schema(self, mockAsset):
        """Build a Spark ``StructType`` from the asset's "Fields" list.

        Args:
            mockAsset: Asset dict whose "Fields" entries provide
                "Target_Field_Name" and "Target_Data_Type_Code".

        Returns:
            StructType with one nullable StructField per entry, in list order.

        Raises:
            ValueError: If a type code is not STRING, INTEGER, DATE or FLOAT.
        """
        spark_types = {
            "STRING": StringType,
            "INTEGER": IntegerType,
            "DATE": DateType,
            # cast_columns_based_on_data_type accepts FLOAT, so the schema
            # builder must accept it too or a FLOAT asset would fail only here.
            "FLOAT": FloatType,
        }

        fields = []
        for field in mockAsset["Fields"]:
            field_name = field["Target_Field_Name"]
            data_type = field["Target_Data_Type_Code"]

            if data_type not in spark_types:
                raise ValueError(f"Unsupported data type: {data_type} for field {field_name}")
            fields.append(StructField(field_name, spark_types[data_type](), True))

        return StructType(fields)

    def getRawDataFrame(self, pTimeStamp):
        """Return a hard-coded raw DataFrame instead of reading the raw table.

        Args:
            pTimeStamp: Value stored as a literal in the LOAD_TS column.

        Returns:
            DataFrame with the asset's fields plus CDC_LOAD_CODE (always "I")
            and LOAD_TS.

        Raises:
            Exception: Any failure is logged and re-raised as a plain
                Exception chained to the original error.
        """
        try:
            self.aLogger.info("DIFTableHelper.getRawDataFrame")

            # Schema comes from the asset configuration held by the helper
            schema = self.define_schema(self.aAsset)

            # Simulated raw rows (built in memory instead of reading a Parquet file)
            data = [
                ("2024-01-12", "A", 100, "file1.csv", "2024-01-12"),
                ("2024-01-12", "B", 200, "file1.csv", "2024-01-12"),
                ("2024-01-13", "A", 150, "file2.csv", "2024-01-13"),
                ("2024-01-13", "B", 250, "file2.csv", "2024-01-13"),
            ]

            dfRaw = self.aSpark.createDataFrame(data, schema)

            # Every simulated row is tagged as an insert
            dfRaw = dfRaw.withColumn("CDC_LOAD_CODE", lit("I")) \
                         .withColumn("LOAD_TS", lit(pTimeStamp))

            self.aLogger.debug("This is the data from the Raw table to be processed")
            dfRaw.show()

            self.aLogger.info("getRawDataFrame.End.25.02.25")
            return dfRaw
        except Exception as ex:
            self.aLogger.error("getRawDataFrame.Error:" + str(ex))
            # Chain the original error so its traceback is not lost
            raise Exception(str(ex)) from ex

    def _stampClosedRows(self, dfRows, cdcCode, pTimeStamp):
        """Set CDC_LOAD_CODE, END_TS (asset BusinessDate) and LOAD_TS on ``dfRows``."""
        return (
            dfRows.withColumn("CDC_LOAD_CODE", lit(cdcCode))
            .withColumn("END_TS", lit(self.aAsset["BusinessDate"]))
            .withColumn("LOAD_TS", to_timestamp(lit(pTimeStamp), 'yyyy-MM-dd HH:mm:ss'))
        )

    def applyTempRecordsToTTable(self, pTimeStamp, dfTarget):
        """Apply the incoming (adjusted raw) rows to ``dfTarget`` and return it.

        Steps, each gated by ``self.aInstructions``:
          1. "FromToColumns": open target rows (END_TS = '2999-12-31') whose key
             arrives again as an incoming 'I' row are closed and re-stamped 'U'.
          2. "FromToColumns": the original delete handling (see NOTE below).
          3. "HashColumns" without "FromToColumns": target rows whose key
             arrives with CDC_LOAD_CODE 'U' or 'D' are removed.
          4. "Delete" == "MissingKeys": target rows whose key is absent from
             the incoming data are closed and re-stamped 'D'.
          5. All incoming rows are appended with CDC_LOAD_CODE 'I'.

        Rows are matched on KEY_CHECKSUM_TXT. Nothing is written; the result is
        returned lazily, so Spark runs the plan when the caller triggers an
        action (e.g. ``show()``). The earlier ``collect()`` was removed: it
        pulled every row to the driver only to discard the result.

        Args:
            pTimeStamp: Load timestamp string in 'yyyy-MM-dd HH:mm:ss' format.
            dfTarget: Current processed-table DataFrame supplied by the caller.

        Returns:
            The updated target DataFrame.

        Raises:
            Exception: Any failure is logged and re-raised as a plain
                Exception chained to the original error.
        """
        try:
            self.aLogger.info("applyTempRecordsToTTable.Start")

            # Get the incoming adjusted data
            dfRaw = self.getRawDataFrame(pTimeStamp)
            dfIncoming = self.getAdjustedRawDataFrame(dfRaw)

            # The table name is only logged; the harness uses the caller's
            # dfTarget rather than self.aSpark.table(mainTable).
            mainTable = self.aAsset[f'{AssetColumns.Processed_Table_Name}']
            self.aLogger.info("Processing table: " + mainTable)

            keyColumn = "KEY_CHECKSUM_TXT"
            # A using-column left_semi/left_anti join moves the key column to
            # the front; re-selecting this list restores the target's order.
            targetColumns = dfTarget.columns

            if self.aInstructions["FromToColumns"]:
                # --- From/To updates ---
                insertKeys = dfIncoming.filter(dfIncoming["CDC_LOAD_CODE"] == "I").select(keyColumn)
                dfSuperseded = (
                    dfTarget.filter(dfTarget["END_TS"] == "2999-12-31")
                    .join(insertKeys, on=keyColumn, how="left_semi")
                    .select(*targetColumns)
                )
                dfUpdates = self._stampClosedRows(dfSuperseded, "U", pTimeStamp)

                # Subtract the ORIGINAL rows (not the re-stamped copies, which
                # no longer match anything in the target), then add the copies.
                dfTarget = dfTarget.subtract(dfSuperseded).unionByName(dfUpdates)

                # --- From/To deletes ---
                # NOTE(review): logic left exactly as written. A left_anti join
                # keeps only dfIncoming's columns, so the filter on
                # dfTarget["CDC_LOAD_CODE"] likely cannot be resolved, and the
                # selected rows are incoming rows that are absent from the
                # target. The intended delete rule cannot be determined from
                # this notebook - confirm before relying on this branch.
                dfDeletes = dfIncoming.join(dfTarget, on="KEY_CHECKSUM_TXT", how="left_anti")\
                    .filter(dfTarget["CDC_LOAD_CODE"] == "I")

                dfDeletes = dfDeletes.withColumn("CDC_LOAD_CODE", lit("D"))\
                    .withColumn("END_TS", lit(self.aAsset["BusinessDate"]))\
                    .withColumn("LOAD_TS", to_timestamp(lit(pTimeStamp), 'yyyy-MM-dd HH:mm:ss'))

                dfTarget = dfTarget.subtract(dfDeletes)
                dfTarget = dfTarget.union(dfDeletes)

            # Handle HashColumns: updated/deleted keys are physically removed
            if not self.aInstructions["FromToColumns"] and self.aInstructions["HashColumns"]:
                changedKeys = dfIncoming.filter(
                    dfIncoming["CDC_LOAD_CODE"].isin("U", "D")
                ).select(keyColumn)

                # Anti-join on the key; subtracting the joined frame would fail
                # because it carries the columns of both sides.
                dfTarget = (
                    dfTarget.join(changedKeys, on=keyColumn, how="left_anti")
                    .select(*targetColumns)
                )

            # Handle Missing Keys: soft-delete target rows absent from the incoming data
            if self.aInstructions["Delete"] == "MissingKeys":
                dfMissing = (
                    dfTarget.join(dfIncoming.select(keyColumn), on=keyColumn, how="left_anti")
                    .select(*targetColumns)
                )
                dfMissingDeleted = self._stampClosedRows(dfMissing, "D", pTimeStamp)

                # Same rule as for updates: remove the original rows, add the stamped ones
                dfTarget = dfTarget.subtract(dfMissing).unionByName(dfMissingDeleted)

            # Finally, add the new data to the T Table (Insert)
            dfInserts = dfIncoming.withColumn("CDC_LOAD_CODE", lit("I"))\
                .withColumn("LOAD_TS", to_timestamp(lit(pTimeStamp), 'yyyy-MM-dd HH:mm:ss'))

            # Print both schemas so a mismatch is visible before the union
            dfTarget.printSchema()
            dfInserts.printSchema()

            # Align by column name: a positional union would silently mix
            # columns if the two frames list them in a different order.
            dfTarget = dfTarget.unionByName(dfInserts)

            # Logged before returning; placed after the return it never ran
            self.aLogger.info("applyTempRecordsToTTable.End")
            return dfTarget
        except Exception as ex:
            self.aLogger.error("applyTempRecordsToTTable.Error: " + str(ex))
            # Chain the original error so its traceback is not lost
            raise Exception(str(ex)) from ex

In [14]:
# Mock environment configuration (pEnvConfig) as a dictionary
mockEnvConfig = {
    "config": {"some_key": "some_value"}  # You can add more keys here as needed
}

# Mock asset group configuration (pAssetGroupConfig) as a dictionary
mockAssetGroupConfig = {
    "AssetGroupName": "MockAssetGroup",
    "some_config": "MockConfig"  # Add other config variables as needed
}

# Create an instance of DIFLogger
mockLogger = DIFLogger()

# Create an instance of the real DIFProcessedTableHelper (subclassed for testing)
helper = DIFProcessedTableHelperTest(mockEnvConfig, mockAssetGroupConfig, mockLogger, spark, mockAsset)

# dfTarget was passed to the helper without being defined in any visible cell,
# so this cell only ran on leftover kernel state. It is defined here explicitly
# so the notebook survives Restart & Run All.
# NOTE(review): rows and columns are reconstructed from the table recorded in
# this cell's output - replace with the real processed-table contents and confirm
# the columns the helper expects (e.g. KEY_CHECKSUM_TXT, CDC_LOAD_CODE).
dfTarget = spark.createDataFrame(
    [
        ("2024-01-12", "A", 110, "file1.csv", "2024-01-12"),
        ("2024-01-13", "B", 220, "file1.csv", "2024-01-13"),
        ("2024-01-14", "C", 300, "file2.csv", "2024-01-14"),
    ],
    ["SalesDate", "SKU", "Units", "DIFSourceFile", "BusinessDate"],
)
dfTarget.show()

# Step 4: Run a test example using the real class
processedFinal = helper.applyTempRecordsToTTable("2024-01-15 00:00:00", dfTarget)
processedFinal.show()

+----------+---+-----+-------------+------------+
| SalesDate|SKU|Units|DIFSourceFile|BusinessDate|
+----------+---+-----+-------------+------------+
|2024-01-12|  A|  110|    file1.csv|  2024-01-12|
|2024-01-13|  B|  220|    file1.csv|  2024-01-13|
|2024-01-14|  C|  300|    file2.csv|  2024-01-14|
+----------+---+-----+-------------+------------+

INFO: applyTempRecordsToTTable.Start
ERROR: applyTempRecordsToTTable.Error: local variable 'dfIncoming' referenced before assignment


Exception: local variable 'dfIncoming' referenced before assignment