In [0]:
"""# 07 ML Model Training & Evaluation

This notebook trains ML models on pharmacy analytics data.

Models:
- Regression → stock risk score prediction
- Classification → high-risk stock identification

Framework: Spark ML
"""

In [0]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator


In [0]:
# Load the feature table that every later cell reads from.
ml_df = spark.table("ml_pharmacy_features")

# Materialise the row count once, then report it and preview a few rows.
row_total = ml_df.count()
print("ML dataset size:", row_total)
ml_df.show(5)


In [0]:
# Numeric input columns shared by the regression and classification models.
# The order matters: it fixes the position of each value in the assembled
# "features" vector.
feature_cols = ["stock_count", "avg_price", "avg_shelf_life", "near_expiry_items"]

# Packs the columns above into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")


In [0]:
# Regression dataset: the assembled feature vector plus the continuous
# stock_risk_score, exposed under the column name "label".
regression_df = (
    assembler.transform(ml_df)
    .select("features", F.col("stock_risk_score").alias("label"))
)


In [0]:
# 80/20 train/test split for the regression model; the seed is fixed so the
# split uses the same random sequence on every run.
reg_split_weights = [0.8, 0.2]
train_reg, test_reg = regression_df.randomSplit(reg_split_weights, seed=42)


In [0]:
# Linear regression estimator reading the "features" vector and predicting "label";
# all other hyper-parameters are left at their Spark ML defaults.
lr = LinearRegression().setFeaturesCol("features").setLabelCol("label")

# Fit on the training portion only.
lr_model = lr.fit(train_reg)


In [0]:
# Score the held-out regression rows.
predictions = lr_model.transform(test_reg)

# Evaluate each metric with its own evaluator (rmse first, then r2).
reg_metrics = {
    metric: RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    ).evaluate(predictions)
    for metric in ("rmse", "r2")
}
rmse = reg_metrics["rmse"]
r2 = reg_metrics["r2"]

print(f"Regression RMSE: {rmse}")
print(f"Regression R2: {r2}")


In [0]:
# Score at or above which a row is labelled high risk. Named so the cut-off is
# visible and can be tuned in one place instead of being buried in the
# expression below.
HIGH_RISK_THRESHOLD = 0.3

# Binary target derived from the continuous score:
#   1 -> stock_risk_score >= HIGH_RISK_THRESHOLD
#   0 -> everything else
# NOTE(review): a null stock_risk_score does not satisfy the comparison and
# therefore falls into the `otherwise(0)` branch (labelled low risk) — confirm
# the feature table never contains null scores.
clf_df = ml_df.withColumn(
    "risk_label",
    F.when(F.col("stock_risk_score") >= HIGH_RISK_THRESHOLD, 1).otherwise(0),
)

# Classification dataset: assembled feature vector plus the binary target,
# exposed under the column name "label".
clf_features = (
    assembler.transform(clf_df)
    .select("features", "risk_label")
    .withColumnRenamed("risk_label", "label")
)


In [0]:
# 80/20 train/test split for the classifier, seeded for a repeatable random sequence.
clf_split_weights = [0.8, 0.2]
train_clf, test_clf = clf_features.randomSplit(clf_split_weights, seed=42)

# Logistic regression estimator reading "features" and predicting the binary
# "label"; other hyper-parameters stay at their Spark ML defaults.
log_reg = LogisticRegression().setFeaturesCol("features").setLabelCol("label")

# Fit on the training portion only.
log_model = log_reg.fit(train_clf)


In [0]:
# Score the held-out classification rows.
clf_predictions = log_model.transform(test_clf)

# Area under the ROC curve, computed from the model's rawPrediction column.
auc_evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
auc = auc_evaluator.evaluate(clf_predictions)

print(f"Classification AUC: {auc}")


In [0]:
# Pair each feature name with the corresponding logistic-regression coefficient.
# Pairing is positional, so it relies on feature_cols being in the same order
# as the columns fed to the assembler.
coeffs = [
    (feature_name, weight)
    for feature_name, weight in zip(feature_cols, log_model.coefficients.toArray())
]
coeffs


In [0]:
"""## ML Training Contract

✔ Regression model for stock risk prediction  
✔ Classification model for high-risk detection  
✔ Explainable features only  
✔ Spark ML (scalable & production-ready)  
✔ Metrics (RMSE, R², AUC) printed and interpretable  

Models are suitable for analytics-driven decision support.
"""