In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import col, trim, length

In [0]:
# Start from the bronze CRM sales-details table; the cells below reassign
# df_silver step by step as the data is cleaned.
df_silver = spark.read.table("workspace.bronze.crm_sales_details")

In [0]:
# Quick completeness check: print the "count" summary statistic for each column
# (Spark reports the number of non-null values per column).
df_silver.summary("count").show()

In [0]:
# Strip leading/trailing whitespace from every string-typed column.
# The list of names is taken from the schema before any column is rewritten;
# non-string columns are left untouched.
string_column_names = [
    f.name for f in df_silver.schema.fields if isinstance(f.dataType, StringType)
]
for column_name in string_column_names:
    df_silver = df_silver.withColumn(column_name, trim(col(column_name)))

In [0]:
# Columns that hold dates encoded as yyyyMMdd values in the bronze table.
DATE_COLUMNS = ("sls_order_dt", "sls_ship_dt", "sls_due_dt")


def _parse_yyyymmdd(column_name):
    """Build a Column that converts a yyyyMMdd-encoded value into a date.

    The result is NULL when the raw value equals 0 or when its string form is
    not exactly 8 characters long; otherwise the string form is parsed with
    the ``yyyyMMdd`` pattern.

    Args:
        column_name: Name of the raw column to convert.

    Returns:
        A ``pyspark.sql.Column`` expression yielding the parsed date or NULL.
    """
    raw = col(column_name)
    as_text = raw.cast("string")
    # Explicit cast so the length check does not depend on implicit coercion
    # when the raw column is numeric.
    is_placeholder = (raw == 0) | (length(as_text) != 8)
    # NOTE(review): 8-character values that are not real calendar dates are
    # handed to to_date as-is; whether that yields NULL or raises depends on
    # the session's ANSI setting -- confirm for the target runtime.
    return F.when(is_placeholder, None).otherwise(F.to_date(as_text, "yyyyMMdd"))


for date_column in DATE_COLUMNS:
    # Re-running this cell after a successful conversion would compare a date
    # column with the integer 0; skip columns that are already dates so the
    # cell is safe to execute more than once.
    if isinstance(df_silver.schema[date_column].dataType, DateType):
        continue
    df_silver = df_silver.withColumn(date_column, _parse_yyyymmdd(date_column))

In [0]:
# Render a 10-row sample so the result of the date conversion can be inspected.
df_silver.limit(10).display()

In [0]:
# Repair unusable prices.
# A price is kept only when it is non-null and strictly positive. Otherwise it
# is re-derived as sls_sales / sls_quantity, and left NULL when the quantity is
# 0 or NULL (the inner `when` has no fallback branch, so it yields NULL).
price_is_usable = col("sls_price").isNotNull() & (col("sls_price") > 0)
derived_price = F.when(
    col("sls_quantity") != 0,
    col("sls_sales") / col("sls_quantity"),
)

df_silver = df_silver.withColumn(
    "sls_price",
    F.when(price_is_usable, col("sls_price")).otherwise(derived_price),
)
df_silver.limit(10).display()

In [0]:


# Repair the sales amount.
# The reference value is quantity * |price|. The stored sales figure is
# replaced by it when the figure is NULL, not strictly positive, or different
# from the reference value; otherwise the stored figure is kept.
# NOTE(review): sls_price may have been re-derived by a division in the
# previous cell, in which case the `!=` below compares floating-point values
# exactly -- confirm whether rounding or a tolerance is wanted.
expected_sales = F.col("sls_quantity") * F.abs(F.col("sls_price"))
sales_is_suspect = (
    F.col("sls_sales").isNull()
    | (F.col("sls_sales") <= 0)
    | (F.col("sls_sales") != expected_sales)
)

df_silver = df_silver.withColumn(
    "sls_sales",
    F.when(sales_is_suspect, expected_sales).otherwise(F.col("sls_sales")),
)

# Show the full frame and its summary statistics after the repair.
display(df_silver)
df_silver.summary().show()

In [0]:

# Source column name (sls_* prefix) -> column name used in the silver table.
RENAME_MAP = {
    "sls_ord_num": "order_number",
    "sls_prd_key": "product_number",
    "sls_cust_id": "customer_id",
    "sls_order_dt": "order_date",
    "sls_ship_dt": "ship_date",
    "sls_due_dt": "due_date",
    "sls_sales": "sales_amount",
    "sls_quantity": "quantity",
    "sls_price": "price",
}

# Apply the renames one at a time, in the order the mapping lists them.
# withColumnRenamed leaves the frame unchanged when the source column is absent.
for source_name in RENAME_MAP:
    df_silver = df_silver.withColumnRenamed(source_name, RENAME_MAP[source_name])

In [0]:
# Persist the cleaned frame as a Delta table, replacing whatever the target
# table currently contains.
# NOTE(review): earlier cells change column types (date parsing, price
# division); if the target already exists with a different schema the
# overwrite may need the `overwriteSchema` option -- confirm.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.crm_sales")
)