In [0]:
# 01_Ingest_Bronze
#
# Bronze ingestion: read each raw CSV listed in `datasets` from the source
# volume and rewrite it as Parquet under the bronze directory, one
# sub-directory per dataset. `spark` is not defined in this cell; it is
# presumably the session supplied by the notebook runtime.

# Define paths
source_volume_path = "/Volumes/main/ecommerce/lakehouse_vol/"
target_bronze_path = "/Volumes/main/ecommerce/lakehouse_vol/bronze/"

# Both roots already end with "/", so strip it once here and join with a
# single separator below; otherwise every path (and log line) contains "//".
source_root = source_volume_path.rstrip("/")
bronze_root = target_bronze_path.rstrip("/")

# Logical dataset name -> CSV file name inside the source volume
datasets = {
    "orders": "olist_orders_dataset.csv",
    "order_items": "olist_order_items_dataset.csv",
    "customers": "olist_customers_dataset.csv",
    "products": "olist_products_dataset.csv",
    "payments": "olist_order_payments_dataset.csv",
}

# Loop through each dataset
for dataset_name, file_name in datasets.items():
    print(f"Processing: {dataset_name}...")

    # 1. Read the CSV. The first row is the header; column types are inferred
    #    from the data instead of being left as plain strings.
    # NOTE(review): inference may type digit-only text columns as integers,
    # which would drop leading zeros -- confirm no column relies on them, or
    # pass an explicit schema.
    df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(f"{source_root}/{file_name}")
    )

    # 2. Define output path (e.g., .../bronze/orders/)
    output_path = f"{bronze_root}/{dataset_name}/"

    # 3. Write to Parquet, replacing any previous run's output for this
    #    dataset. No column-level transformations are applied here.
    df.write.mode("overwrite").parquet(output_path)

    print(f"--> Saved to {output_path}")

print("All Bronze datasets created successfully!")