In [0]:
"""# 06 ML Feature Preparation

This notebook prepares ML-ready features from Silver and Gold tables.

Objectives:
- Feature engineering
- Label preparation
- Null handling
- Train/test readiness

Target use cases:
- Sales prediction
- Stock risk prediction
"""

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [0]:
# Source tables: the Silver pharmacy events and the Gold category metrics.
silver_df = spark.table("silver_pharmacy_events")
gold_category_df = spark.table("gold_category_metrics")

# Report the size of each input; every count() call runs a Spark job.
for label, frame in (
    ("Silver rows:", silver_df),
    ("Gold category rows:", gold_category_df),
):
    print(label, frame.count())


In [0]:
# Date-derived features added to the Silver rows.
mfg_date = F.col("mfg_date_dt")
expiry_date = F.col("expiry_date_dt")

# Months between the manufacture date and current_date(); because it is
# measured against the run date, this value changes from one run to the next.
stock_age_months = F.months_between(F.current_date(), mfg_date)

silver_features = (
    silver_df
    .withColumn("mfg_year", F.year(mfg_date))
    .withColumn("expiry_year", F.year(expiry_date))
    .withColumn("current_stock_age_months", stock_age_months)
)


In [0]:
# Binary indicator columns (1 when the threshold is met, otherwise 0).
HIGH_VALUE_MIN_PRICE = 300        # lower bound on price_including_gst for is_high_value
LONG_SHELF_LIFE_MIN_MONTHS = 24   # lower bound on shelf_life_months for is_long_shelf_life

price_is_high = F.col("price_including_gst") >= HIGH_VALUE_MIN_PRICE
shelf_life_is_long = F.col("shelf_life_months") >= LONG_SHELF_LIFE_MIN_MONTHS

# A null comparison does not match the when() branch, so such rows get 0.
silver_features = (
    silver_features
    .withColumn("is_high_value", F.when(price_is_high, 1).otherwise(0))
    .withColumn("is_long_shelf_life", F.when(shelf_life_is_long, 1).otherwise(0))
)


In [0]:
# Collapse the row-level features to one row per (medicine_brand_name, category).
# NOTE(review): mfg_year, expiry_year, current_stock_age_months, is_high_value
# and is_long_shelf_life are not aggregated here, so they do not appear in
# ml_base -- confirm that dropping them is intended.
product_keys = ["medicine_brand_name", "category"]

product_aggregates = [
    F.count("*").alias("stock_count"),
    F.avg("price_including_gst").alias("avg_price"),
    F.avg("shelf_life_months").alias("avg_shelf_life"),
    # assumes near_expiry_flag is a numeric 0/1 column -- TODO confirm
    F.sum("near_expiry_flag").alias("near_expiry_items"),
]

ml_base = silver_features.groupBy(*product_keys).agg(*product_aggregates)


In [0]:
# Label column: near_expiry_items divided by stock_count for each product row.
near_expiry_ratio = F.col("near_expiry_items") / F.col("stock_count")

ml_labeled = ml_base.withColumn("stock_risk_score", near_expiry_ratio)


In [0]:
# Fill nulls with 0 (an integer fill value is applied to numeric columns only),
# then keep only rows whose stock_count is positive.
has_stock = F.col("stock_count") > 0
ml_final = ml_labeled.na.fill(0).where(has_stock)

# Preview the first five rows of the final feature set.
ml_final.show(5)


In [0]:
# Save the feature set as the ml_pharmacy_features table in Delta format,
# overwriting whatever the table currently holds.
(
    ml_final.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ml_pharmacy_features")
)


In [0]:
%sql
-- Average stock_risk_score per category, highest average first.
SELECT
  f.category,
  AVG(f.stock_risk_score) AS avg_stock_risk
FROM ml_pharmacy_features AS f
GROUP BY f.category
ORDER BY avg_stock_risk DESC;


In [0]:
"""## ML Feature Contract

✔ Aggregated product-level features  
✔ Business-aligned labels  
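✔ No data leakage, provided `near_expiry_items` and `stock_count` are excluded from the model inputs when predicting `stock_risk_score` (the label is computed directly from these two columns)  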
✔ Null-safe dataset (nulls in numeric columns are filled with 0; string columns are left unchanged)  
✔ Ready for regression & classification  

This dataset is the foundation for ML models.
"""