
## Silver Layer Products

The Silver layer in a data lakehouse architecture typically contains cleaned, conformed, and enriched data, ready for analytics and reporting. Silver layer products may include:

- **Curated Data Tables:** Standardized datasets with business logic applied.
- **Aggregated Views:** Summaries and rollups for faster querying.
- **Data Marts:** Subject-oriented collections for specific business domains.
- **Enriched Datasets:** Data with added features or joined from multiple sources.

These products serve as the foundation for advanced analytics, dashboards, and machine learning models.

In [0]:
# Setup: import Spark helpers and read the bronze products table
from pyspark.sql.functions import (
    col, upper, trim, current_timestamp, lit, when,
    regexp_replace, length, coalesce, concat_ws, 
    year, datediff, to_date
)
from pyspark.sql.types import IntegerType, StringType
# Load the bronze products table and report basic key-quality metrics
# (row count, null keys, duplicated keys) before any cleaning is applied.
products_bronze = spark.table("bronze.products")

# Run each Spark action once and reuse the result; every .count() call
# triggers a separate job over the bronze table.
total_products_bronze = products_bronze.count()
unique_products = products_bronze.select("product_id").distinct()
unique_products_bronze_count = unique_products.count()
null_product_id_count = products_bronze.filter(col("product_id").isNull()).count()
# Number of product_id values that occur on more than one row.
duplicate_product_id_count = (
    products_bronze.groupBy("product_id")
    .count()
    .filter(col("count") > 1)
    .count()
)

# The counts are computed outside the f-strings on purpose: reusing double
# quotes inside a double-quoted f-string is a SyntaxError before Python 3.12.
print(f"Total rows: {total_products_bronze}")
print(f"Null product_id : {null_product_id_count}")
print(f"Duplicates product_id : {duplicate_product_id_count}")
if total_products_bronze == unique_products_bronze_count:
    print("All product_id are unique")
else:
    print("product_id are not unique")

display(products_bronze.limit(5))
products_bronze.printSchema()

In [0]:
# Normalize the category label: upper-case it, then strip surrounding whitespace.
normalized_category = trim(upper(col("product_category_name")))
products_clean = products_bronze.withColumn(
    "product_category_name", normalized_category
)

display(products_clean.limit(5))


In [0]:
# Keep only products whose name length, description length and photo count
# are all strictly positive, and report how many rows were rejected.
# The "lenght" spelling is the column name as referenced by this notebook;
# it must match the table schema, so it is left as-is.
is_valid_product = (
    (col("product_name_lenght") > 0)
    & (col("product_description_lenght") > 0)
    & (col("product_photos_qty") > 0)
)
products_validated = products_clean.filter(is_valid_product)

# Rejected rows are the exact complement of the filter above. A comparison
# against NULL evaluates to NULL (not False), so coalesce() maps "unknown" to
# False before negating. Unlike subtract() (EXCEPT DISTINCT), this keeps
# duplicate rows, so the count and percentage are not understated.
filtered_products = products_clean.filter(~coalesce(is_valid_product, lit(False)))

# Each count is a Spark job; compute once and reuse.
total_clean_count = products_clean.count()
filtered_count = filtered_products.count()
# Guard against an empty input table instead of raising ZeroDivisionError.
filtered_products_perc = (
    filtered_count / total_clean_count * 100 if total_clean_count else 0.0
)

display(filtered_products)
print(f"Total rows: {total_clean_count}")
print(f"Filtered rows: {filtered_count}")
print(f"Filtered rows percentage: {filtered_products_perc:.2f}%")

display(products_validated.limit(5))


In [0]:
# Stamp the validated rows with processing metadata, then persist them as the
# silver Delta table, replacing any previous contents and schema.
products_final = (
    products_validated
    .withColumn("processed_at", current_timestamp())
    .withColumn("data_source", lit("olist"))
    .withColumn("data_layer", lit("silver"))
    .withColumn("data_status", lit("cleaned"))
)

silver_writer = (
    products_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable("silver.products")

print("Silver products table is now created")
# Read the table back so the reported count reflects what was actually written.
silver_row_count = spark.table('silver.products').count()
print(f"Total rows: {silver_row_count}")
