In [0]:
# Imports
from pyspark.sql import Window
from pyspark.sql import functions as F

In [0]:
# Read Bronze loyalty segments
df_loyalty_segments_bronze = spark.table("retail_project.bronze.loyalty_segments")

In [0]:
# Preview the first 10 Bronze rows, then print the column names and types
df_loyalty_segments_bronze.limit(10).display()
df_loyalty_segments_bronze.printSchema()

loyalty_segment_id,loyalty_segment_description,unit_threshold,valid_from,valid_to,_read_timestamp,_source_path,_file_size
0,level_0,0,2017-01-01,,2025-12-13T16:15:02.069Z,dbfs:/databricks-datasets/retail-org/loyalty_segments/loyalty_segment.csv,181
1,level_1,10,2017-01-01,,2025-12-13T16:15:02.069Z,dbfs:/databricks-datasets/retail-org/loyalty_segments/loyalty_segment.csv,181
2,level_2,30,2017-01-01,,2025-12-13T16:15:02.069Z,dbfs:/databricks-datasets/retail-org/loyalty_segments/loyalty_segment.csv,181
3,level_3,70,2017-01-01,,2025-12-13T16:15:02.069Z,dbfs:/databricks-datasets/retail-org/loyalty_segments/loyalty_segment.csv,181


root
 |-- loyalty_segment_id: integer (nullable = true)
 |-- loyalty_segment_description: string (nullable = true)
 |-- unit_threshold: integer (nullable = true)
 |-- valid_from: date (nullable = true)
 |-- valid_to: string (nullable = true)
 |-- _read_timestamp: timestamp (nullable = true)
 |-- _source_path: string (nullable = true)
 |-- _file_size: long (nullable = true)



In [0]:
# Validate loyalty_segment_id: list every id that occurs more than once.
# An empty result means the id is unique in Bronze.
display(
    df_loyalty_segments_bronze
    .groupBy("loyalty_segment_id")
    .count()
    .where(F.col("count") > 1)
)

loyalty_segment_id,count


In [0]:
# Discovery check: number of NULL values in each Bronze column.
# when() without otherwise() yields NULL for non-matching rows, and count()
# skips NULLs, so each aggregate counts only the rows where the column is NULL.
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), F.lit(1))).alias(column_name)
    for column_name in df_loyalty_segments_bronze.columns
]
display(df_loyalty_segments_bronze.select(*null_count_exprs))

loyalty_segment_id,loyalty_segment_description,unit_threshold,valid_from,valid_to,_read_timestamp,_source_path,_file_size
0,0,0,0,4,0,0,0


In [0]:
# De-duplicate on loyalty_segment_id, keeping exactly one row per id.
#
# dropDuplicates(["loyalty_segment_id"]) keeps an arbitrary row for each key,
# so if Bronze ever contains repeated ids the surviving record could change
# between runs. Ranking the rows inside each id makes the choice reproducible:
# the most recently read row (_read_timestamp) wins, and the remaining columns
# act as tie-breakers so rows with equal timestamps are still ordered the
# same way on every run.
dedup_key = "loyalty_segment_id"
dedup_ordering = [F.col("_read_timestamp").desc_nulls_last()] + [
    F.col(column_name).asc_nulls_last()
    for column_name in df_loyalty_segments_bronze.columns
    if column_name not in (dedup_key, "_read_timestamp")
]
dedup_window = Window.partitionBy(dedup_key).orderBy(*dedup_ordering)

df_loyalty_segments_deduped = (
    df_loyalty_segments_bronze
    .withColumn("_dedup_rank", F.row_number().over(dedup_window))
    .filter(F.col("_dedup_rank") == 1)
    # Drop the helper column so the schema matches the Bronze input
    .drop("_dedup_rank")
)


In [0]:
# Silver cleaning & standardization
# Target type for every business column; dict order is the output column order.
silver_column_types = {
    "loyalty_segment_id": "int",
    "loyalty_segment_description": "string",
    "unit_threshold": "int",
    "valid_from": "date",
    "valid_to": "date",
}
silver_typed_columns = [
    F.col(column_name).cast(target_type)
    for column_name, target_type in silver_column_types.items()
]

# Keep the Bronze ingestion time under a lineage-style name; the other
# Bronze metadata columns (_source_path, _file_size) are not selected.
df_loyalty_segments_silver = df_loyalty_segments_deduped.select(
    *silver_typed_columns,
    F.col("_read_timestamp").alias("bronze_read_timestamp"),
)


In [0]:
# Inspect the Silver DataFrame before it is written
display(df_loyalty_segments_silver)

loyalty_segment_id,loyalty_segment_description,unit_threshold,valid_from,valid_to,bronze_read_timestamp
0,level_0,0,2017-01-01,,2025-12-13T16:15:02.069Z
1,level_1,10,2017-01-01,,2025-12-13T16:15:02.069Z
2,level_2,30,2017-01-01,,2025-12-13T16:15:02.069Z
3,level_3,70,2017-01-01,,2025-12-13T16:15:02.069Z


In [0]:
# Persist the Silver DataFrame as a Delta table; "overwrite" together with
# overwriteSchema replaces both the existing rows and the existing schema.
silver_writer = (
    df_loyalty_segments_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable("retail_project.silver.loyalty_segments")

In [0]:
# Read the table back to confirm what was written
display(spark.table("retail_project.silver.loyalty_segments"))

loyalty_segment_id,loyalty_segment_description,unit_threshold,valid_from,valid_to,bronze_read_timestamp
1,level_1,10,2017-01-01,,2025-12-13T16:15:02.069Z
3,level_3,70,2017-01-01,,2025-12-13T16:15:02.069Z
0,level_0,0,2017-01-01,,2025-12-13T16:15:02.069Z
2,level_2,30,2017-01-01,,2025-12-13T16:15:02.069Z
