In [0]:
# 03_gold_ml_dataset

**Purpose:**  
Prepare a clean, machine-learning-ready Gold dataset by selecting the final feature columns,  
imputing nulls in the traffic, weather, and delay-history features, and casting the `is_late` label to an integer.

**Layer:** Gold  
**Input Table:** logistics_silver.shipment_features  
**Output Table:** logistics_gold.ml_delay_dataset

In [0]:
%sql
-- Make sure the Gold schema exists before the table is written to it;
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS logistics_gold;

In [0]:
%python
from pyspark.sql.functions import col

In [0]:
# Load the Silver-layer shipment feature table that feeds the Gold dataset.
silver_df = spark.read.table("logistics_silver.shipment_features")

In [0]:
# Columns carried into the Gold ML table: shipment_id, the feature columns,
# and is_late (the label that is cast to int later in this notebook).
gold_columns = [
    "shipment_id",
    "distance_km",
    "shipment_weight_kg",
    "avg_traffic_index",
    "weather_severity_score",
    "historical_delay_rate",
    "avg_delay_days",
    "is_weekend",
    "is_peak_season",
    "is_late",
]

# Project the Silver table down to exactly those columns, in that order.
ml_df = silver_df.select(*gold_columns)

In [0]:
# Per-column replacement values for nulls. Only these four feature columns are
# imputed; every other column is left untouched.
# NOTE(review): the constants are hard-coded here — confirm where they come from.
imputation_defaults = {
    "avg_traffic_index": 0.5,
    "weather_severity_score": 0.3,
    "historical_delay_rate": 0.2,
    "avg_delay_days": 1.0,
}

# DataFrame.na.fill is an alias of DataFrame.fillna.
ml_df_clean = ml_df.na.fill(imputation_defaults)

In [0]:
# Cast the is_late label to an integer, then drop any row whose label is null
# (missing upstream, or not castable to int). The fillna step above only
# imputes feature columns, so without this guard a null label would be written
# to the Gold table, where it would break or bias supervised training.
ml_df_clean = (
    ml_df_clean
    .withColumn("is_late", col("is_late").cast("int"))
    .na.drop(subset=["is_late"])
)

In [0]:
# Configure a Delta writer that replaces the table contents on every run and
# allows the table schema itself to be replaced (overwriteSchema).
gold_writer = (
    ml_df_clean.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)

# Persist the cleaned dataset as the Gold table.
gold_writer.saveAsTable("logistics_gold.ml_delay_dataset")

In [0]:
%sql
-- Sanity check: total number of rows in the Gold table after the write.
SELECT COUNT(*) FROM logistics_gold.ml_delay_dataset;

COUNT(*)
10000


In [0]:
%sql
-- Label distribution: number of rows per is_late value in the Gold table.
SELECT
  is_late,
  COUNT(*)
FROM
  logistics_gold.ml_delay_dataset
GROUP BY
  is_late;

is_late,COUNT(*)
1,3543
0,6457


In [0]:
%sql
-- Post-write validation: count the nulls remaining in two of the imputed
-- feature columns. Both should be 0 if the imputation step took effect.
SELECT
  SUM(IF(avg_traffic_index IS NULL, 1, 0))      AS traffic_nulls,
  SUM(IF(weather_severity_score IS NULL, 1, 0)) AS weather_nulls
FROM
  logistics_gold.ml_delay_dataset;

traffic_nulls,weather_nulls
0,0
