In [0]:
# 04_ml_training_mlflow

**Purpose:**  
Train baseline and final ML models for late delivery risk prediction  
and track experiments using MLflow with Unity Catalog compliance.

**Input Table:** logistics_gold.ml_delay_dataset  
**Models:** Logistic Regression (baseline), Random Forest (final)

In [0]:
%python
import os
import mlflow
import mlflow.spark

from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/default/mlflow_tmp"

In [0]:
ml_df = spark.table("logistics_gold.ml_delay_dataset")

In [0]:
feature_cols = [
    "distance_km",
    "shipment_weight_kg",
    "avg_traffic_index",
    "weather_severity_score",
    "historical_delay_rate",
    "avg_delay_days",
    "is_weekend",
    "is_peak_season"
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)

final_df = assembler.transform(ml_df).select(
    "shipment_id",
    "features",
    "is_late"
)

In [0]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

In [0]:
mlflow.set_experiment("/Shared/Late_Delivery_Risk_Prediction")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/2440674745671958', creation_time=1769870860132, experiment_id='2440674745671958', last_update_time=1769881972041, lifecycle_stage='active', name='/Shared/Late_Delivery_Risk_Prediction', tags={'mlflow.experiment.sourceName': '/Shared/Late_Delivery_Risk_Prediction',
 'mlflow.experimentKind': 'custom_model_development',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'kushangashukla1@gmail.com',
 'mlflow.ownerId': '75809046495647'}>

In [0]:
mlflow.end_run()  # safety in case a run is already active

In [0]:
with mlflow.start_run(run_name="Logistic_Regression_Baseline"):
    
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="is_late",
        family="binomial"
    )
    
    lr_model = lr.fit(train_df)
    lr_preds = lr_model.transform(test_df)
    
    evaluator = BinaryClassificationEvaluator(
        labelCol="is_late",
        metricName="areaUnderROC"
    )
    
    auc_lr = evaluator.evaluate(lr_preds)
    
    mlflow.log_metric("ROC_AUC", auc_lr)
    mlflow.spark.log_model(lr_model, "logistic_regression_model")
    
    print("Logistic Regression ROC-AUC:", auc_lr)



Logistic Regression ROC-AUC: 0.49320421664918923


In [0]:
mlflow.end_run()

In [0]:
with mlflow.start_run(run_name="Random_Forest_Final_Model"):
    
    rf = RandomForestClassifier(
        featuresCol="features",
        labelCol="is_late",
        numTrees=100,
        maxDepth=8,
        featureSubsetStrategy="sqrt",
        seed=42
    )
    
    rf_model = rf.fit(train_df)
    rf_preds = rf_model.transform(test_df)
    
    evaluator = BinaryClassificationEvaluator(
        labelCol="is_late",
        metricName="areaUnderROC"
    )
    
    auc_rf = evaluator.evaluate(rf_preds)
    
    mlflow.log_metric("ROC_AUC", auc_rf)
    mlflow.log_params({
        "numTrees": 100,
        "maxDepth": 8,
        "featureSubsetStrategy": "sqrt"
    })
    
    mlflow.spark.log_model(rf_model, "random_forest_model")
    
    print("Random Forest ROC-AUC:", auc_rf)



Random Forest ROC-AUC: 0.5042461647737482


In [0]:
import pandas as pd

importances = rf_model.featureImportances.toArray()

feature_importance_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": importances
}).sort_values(by="importance", ascending=False)

feature_importance_df

Unnamed: 0,feature,importance
0,distance_km,0.246355
1,shipment_weight_kg,0.227875
2,avg_traffic_index,0.206785
3,weather_severity_score,0.113712
4,historical_delay_rate,0.063438
5,avg_delay_days,0.057929
6,is_weekend,0.045042
7,is_peak_season,0.038865


In [0]:
print(f"Baseline LR AUC: {auc_lr}")
print(f"Final RF AUC: {auc_rf}")

Baseline LR AUC: 0.49320421664918923
Final RF AUC: 0.5042461647737482


In [0]:
%sql
CREATE VOLUME IF NOT EXISTS workspace.default.ml_models;

In [0]:
%sql
SHOW VOLUMES IN workspace.default;

database,volume_name
default,ml_models
default,mlflow_tmp


In [0]:
rf_model.write().overwrite().save(
    "/Volumes/workspace/default/ml_models/random_forest_delay_model"
)