In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Initialize the Spark session.
# The time parser policy is set to LEGACY, presumably so the explicit
# timestamp patterns used later in this notebook are handled by the older
# parser -- confirm this is still required.
spark = (
    SparkSession.builder
    .appName("Olist Data Processing")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

# 1. Define the CSV loader
def read_olist_dataset(file_name: str, base_path: str = "Files"):
    """Load one Olist CSV file into a Spark DataFrame.

    The file is read from ``{base_path}/{file_name}.csv`` with a header row,
    schema inference and multi-line field support enabled. The quote and the
    escape character are both set to ``"``.

    Args:
        file_name: File name without the ``.csv`` extension.
        base_path: Directory that holds the CSV files. Defaults to
            ``"Files"``, so existing single-argument calls read from the
            same location as before.

    Returns:
        The DataFrame returned by ``spark.read.csv`` for that file.
    """
    reader = (
        spark.read
        .option("header", True)
        .option("inferSchema", True)
        .option("multiLine", True)
        .option("quote", "\"")
        .option("escape", "\"")
    )
    return reader.csv(f"{base_path}/{file_name}.csv")

# 2. Load the datasets
# Each key maps to the file "olist_<key>_dataset" and is loaded through
# read_olist_dataset, in the order listed here.
datasets = {
    key: read_olist_dataset(f"olist_{key}_dataset")
    for key in (
        "customers",
        "geolocation",
        "order_items",
        "order_payments",
        "orders",
        "products",
    )
}

# 3. Clean and transform
# 3.1 Clean customer data
# Cast the zip-code prefix to an integer and keep a single row per
# customer_id.
customers_clean = (
    datasets["customers"]
    .withColumn(
        "customer_zip_code_prefix",
        col("customer_zip_code_prefix").cast("int"),
    )
    .dropDuplicates(["customer_id"])
)

# 3.2 Clean order data
# Parse the purchase and customer-delivery columns as timestamps, keep only
# orders whose status is "delivered", and drop the approval column.
orders_clean = (
    datasets["orders"]
    .withColumn(
        "order_purchase_timestamp",
        to_timestamp(col("order_purchase_timestamp"), "yyyy-MM-dd HH:mm:ss"),
    )
    .withColumn(
        "order_delivered_customer_date",
        to_timestamp(col("order_delivered_customer_date"), "yyyy-MM-dd HH:mm:ss"),
    )
    .where(col("order_status") == "delivered")
    .drop("order_approved_at")
)


# 3.3 Clean payment data
# Collapse payments to one row per order_id: the summed payment value and
# the number of non-null payment_sequential entries.
# NOTE: `sum` and `count` here are the names brought in by the star import
# from pyspark.sql.functions, not the Python builtins.
payments_clean = (
    datasets["order_payments"]
    .groupBy("order_id")
    .agg(
        sum("payment_value").alias("total_payment"),
        count("payment_sequential").alias("payment_count"),
    )
)

# 3.4 Clean product data
products_enhanced = (
    datasets["products"]
    # Correct the "lenght" spelling in the two column names.
    .withColumnRenamed("product_name_lenght", "product_name_length")
    .withColumnRenamed("product_description_lenght", "product_description_length")
    # Product volume from its three dimensions; the result is NULL whenever
    # any of the dimensions is NULL.
    .withColumn(
        "volume_cm3",
        col("product_length_cm") * col("product_height_cm") * col("product_width_cm"),
    )
    # fillna ignores `subset` when the value is a dict, so only the dict is
    # passed; it already names the single column to fill.
    .fillna({"product_category_name": "unknown_category"})
)

# 4. Merge the main tables
# Every join is a left join starting from the delivered orders, so orders
# without a match on the right-hand side are kept with NULLs.
#
# Geolocation is reduced to one row per zip-code prefix before joining. The
# join key is not guaranteed to be unique in that dataset (the public Olist
# geolocation file is understood to hold several rows per prefix -- confirm
# against the data), and a left join on a non-unique key would duplicate
# every matching order row. dropDuplicates keeps one arbitrary row per
# prefix; replace it with an aggregation if a specific row is required.
master_table = (
    orders_clean
    .join(datasets["order_items"], "order_id", "left")
    .join(products_enhanced, "product_id", "left")
    .join(customers_clean, "customer_id", "left")
    # payments_clean holds one row per order_id, so the order totals are
    # repeated on each item row of the same order.
    .join(payments_clean, "order_id", "left")
    .join(
        datasets["geolocation"].dropDuplicates(["geolocation_zip_code_prefix"]),
        [col("customer_zip_code_prefix") == col("geolocation_zip_code_prefix")],
        "left",
    )
)

# 5. Add derived columns to the merged data
final_table = (
    master_table
    # Number of days from the purchase timestamp to the customer delivery
    # date.
    .withColumn(
        "delivery_days",
        datediff(
            col("order_delivered_customer_date"),
            col("order_purchase_timestamp"),
        ),
    )
    # Price buckets: below 50 is "Low", 50 to 200 inclusive is "Medium",
    # above 200 is "High". The "High" bucket is an explicit comparison
    # rather than otherwise(): order_items is left-joined, so price can be
    # NULL, and a NULL price must stay uncategorised (NULL) instead of
    # being labelled "High".
    .withColumn(
        "price_category",
        when(col("price") < 50, "Low")
        .when(col("price").between(50, 200), "Medium")
        .when(col("price") > 200, "High"),
    )
)

# 6. Save as a Delta table
# Overwrite the existing table, replacing its schema as well as its data.
table_writer = final_table.write.format("delta")
table_writer = table_writer.mode("overwrite").option("overwriteSchema", "true")
table_writer.saveAsTable("olist_master_table")

# Verify the saved table by printing its Delta metadata without truncating
# the column values.
table_details = spark.sql("DESCRIBE DETAIL olist_master_table")
table_details.show(truncate=False)

StatementMeta(, 7ddfc75a-e4fb-4526-a203-1ef55ff67c8c, 4, Finished, Available, Finished)

+------+------------------------------------+--------------------------------------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+-----------------------+----------------+-----------------+--------+-----------+---------------------------------------------------------------------------+----------------+----------------+------------------------+
|format|id                                  |name                                  |description|location                                                                                                                                    |createdAt              |lastModified           |partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties                                                                 |minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+-