
## Silver Layer: Order Items

The Silver layer contains validated, cleaned, and enriched order item data.  
This layer is used for analytics, reporting, and downstream processing.

**Source Table:** `workspace.default.olist_order_items_dataset`  
**Key Columns:**  
- `order_id`
- `order_item_id`
- `product_id`
- `seller_id`
- `shipping_limit_date`
- `price`
- `freight_value`

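Data in this layer is checked for duplicate (`order_id`, `order_item_id`) pairs, filtered to rows where `product_id`, `seller_id`, `shipping_limit_date`, `price`, and `freight_value` are all populated, and tagged with processing metadata (`processed_at`, `data_source`, `data_layer`, `data_status`) so it is ready for business use.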

In [0]:
#1. Setup

from pyspark.sql.functions import (
    col, upper, trim, current_timestamp, lit, when,
    regexp_replace, length, coalesce, concat_ws, 
    year, datediff, to_date
)
from pyspark.sql.types import IntegerType, StringType

# Load the bronze order items and profile them before building the silver table.
order_items_bronze = spark.table("bronze.order_items")

# Every count() below triggers a full Spark job, so each figure is computed
# exactly once and reused in the messages that follow.
total_order_items_bronze = order_items_bronze.count()
unique_order_items_bronze = (
    order_items_bronze.select("order_id", "order_item_id").distinct().count()
)
null_order_id_count = order_items_bronze.filter(col("order_id").isNull()).count()
# Number of (order_id, order_item_id) pairs that occur more than once.
duplicate_key_count = (
    order_items_bronze.groupBy("order_id", "order_item_id")
    .count()
    .filter(col("count") > 1)
    .count()
)

# The values are interpolated from plain variables: reusing double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
print(f"Total rows: {total_order_items_bronze}")
print(f"Null order_id : {null_order_id_count}")
print(f"Duplicates order_id : {duplicate_key_count}")

# NOTE: uniqueness is evaluated on the (order_id, order_item_id) pair, even
# though the messages only mention order_id.
if total_order_items_bronze == unique_order_items_bronze:
    print("All order_id are unique")
else:
    print("order_id are not unique")


# Create the target database once; IF NOT EXISTS makes re-running the cell safe.
print("Creating silver database")
spark.sql("CREATE DATABASE IF NOT EXISTS silver")
print(" Silver database ready\n")

print(f"Total order items: {total_order_items_bronze}")
display(order_items_bronze.limit(5))

In [0]:
# Validate bronze order items: a row is kept only when every required column
# is populated. Rows with a zero price or zero freight are NOT removed; they
# are only reported further down for inspection.
required_columns = [
    "product_id",
    "seller_id",
    "shipping_limit_date",
    "price",
    "freight_value",
]

# Fold the per-column null checks into one predicate so the kept rows and the
# removed rows are derived from exactly the same rule.
is_complete = lit(True)
for column_name in required_columns:
    is_complete = is_complete & col(column_name).isNotNull()

order_item_validate = order_items_bronze.filter(is_complete)

print(f"Before validation: {order_items_bronze.count()}")
print(f"After validation: {order_item_validate.count()}")
display(order_item_validate.limit(5))

# Rows actually dropped by validation, i.e. those with a null in at least one
# required column (isNotNull never yields NULL, so the negation is exact).
deleted_rows = order_items_bronze.filter(~is_complete)
print(f"Deleted rows: {deleted_rows.count()}")
display(deleted_rows.limit(20))

# Diagnostics only: zero-valued amounts stay in the validated data set.
zero_price_or_freight = order_items_bronze.filter(
    (col("price") == 0) | (col("freight_value") == 0)
)
print(f"Zero price or freight rows (kept): {zero_price_or_freight.count()}")
display(zero_price_or_freight.limit(20))

price_zero = order_items_bronze.filter(col("price") == 0)
print(f"Price zero: {price_zero.count()}")
display(price_zero.limit(20))

freight_zero = order_items_bronze.filter(col("freight_value") == 0)
print(f"Freight zero: {freight_zero.count()}")
display(freight_zero.limit(20))

In [0]:
# Metadata columns appended to every validated row, listed in the order in
# which they are added to the DataFrame.
metadata_columns = {
    "processed_at": current_timestamp(),
    "data_source": lit("olist"),
    "data_layer": lit("silver"),
    "data_status": lit("cleaned"),
}

order_items_final = order_item_validate
for metadata_name, metadata_value in metadata_columns.items():
    order_items_final = order_items_final.withColumn(metadata_name, metadata_value)

# Write the result as a Delta table in overwrite mode, with overwriteSchema
# enabled so an existing table's schema is replaced along with its data.
silver_writer = (
    order_items_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable("silver.order_items")

print("Silver order items table is now created")
# Read the table back so the reported figure reflects what was persisted.
silver_row_count = spark.table('silver.order_items').count()
print(f"Total rows: {silver_row_count}")
