In [0]:
%sql
-- Create the volume `mlflow_tmp` under capstone_project.logistics.
-- IF NOT EXISTS makes the statement a no-op when the volume is already there,
-- so the notebook can be re-run from the top without failing on this cell.
-- NOTE(review): the volume is not referenced by name in the cells below;
-- presumably it serves as MLflow's temporary storage location -- confirm.
CREATE VOLUME IF NOT EXISTS capstone_project.logistics.mlflow_tmp;

### Loading ML Feature Data

The Gold ML feature table is loaded from Delta Lake and converted to a pandas DataFrame for scikit-learn model training. The `missing_pickup_gps` and `high_risk_delivery` columns are cast to float to ensure compatibility with the ML library.


In [0]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Load Data
# Read the Gold feature table through Spark and collect it into a pandas
# DataFrame on the driver.
feature_table = "capstone_project.logistics.gold_ml_last_mile_features"
df = spark.table(feature_table).toPandas()

# Cast both columns to float in a single call; every other column keeps the
# dtype produced by toPandas().
df = df.astype(
    {
        "missing_pickup_gps": float,
        "high_risk_delivery": float,
    }
)


### Feature Selection and Label Definition

The model uses operational delay metrics and GPS availability as input features.
The target variable represents whether a delivery is classified as high risk.

Features were intentionally kept minimal to maintain model interpretability and
reduce overfitting.


In [0]:
# Predictor columns, listed in the order they are handed to the model.
feature_columns = [
    "accept_to_pickup_minutes",
    "pickup_delay_minutes",
    "missing_pickup_gps",
]

# Feature matrix (one column per predictor) and the target column.
X = df[feature_columns]
y = df["high_risk_delivery"]

### Train-Test Split

The dataset is split into training and testing subsets so that model performance
can be evaluated on unseen data, which makes overfitting detectable.


In [0]:

# Hold out 20% of the rows for evaluation.
#
# The split is stratified on the label so the training and test subsets keep
# the same class proportions as the full dataset. Without stratification the
# class ratio in the test set is left to chance, and a test set with very few
# (or no) rows of one class makes ROC AUC unstable or undefined.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible across runs
    stratify=y,
)

# A handful of training rows, kept as the example input that is logged with
# the model.
input_example = X_train.head(5)

### Model Training and Experiment Tracking

A Logistic Regression model is trained as a baseline, prioritizing explainability
and operational trust. MLflow is used to log parameters, metrics, and artifacts,
ensuring experiment reproducibility and traceability.


In [0]:
# Train the baseline classifier inside one MLflow run so its parameters, its
# evaluation metric, and the fitted model are recorded together.
with mlflow.start_run(run_name="last_mile_risk_logistic_regression"):

    # Hyperparameters live in a single dict that is both logged and used to
    # build the estimator, so the values recorded in MLflow cannot drift from
    # the values the model was actually trained with.
    hyperparams = {"max_iter": 1000}

    mlflow.log_param("model_type", "LogisticRegression")
    # Must be kept equal to the test_size passed to train_test_split.
    mlflow.log_param("test_size", 0.2)
    mlflow.log_params(hyperparams)

    model = LogisticRegression(**hyperparams)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second entry in
    # model.classes_ (the larger label value; presumably 1.0 = high risk).
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_prob)

    mlflow.log_metric("AUC", auc)

    # The input example is stored alongside the logged model.
    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        input_example=input_example
    )

# `model` and `auc` stay in the notebook namespace after the run closes.
print(f"AUC: {auc:.4f}")


### Feature Importance Interpretation

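Model coefficients are examined to understand which operational factors contribute
most strongly to delivery risk. This improves explainability and stakeholder trust
in model outputs. Because the input features are not standardized, each coefficient
is expressed per unit of its own feature (for example, per minute of delay), so
magnitudes should be compared across features with care.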


In [0]:
# Inspect the fitted coefficients to see which inputs push risk up or down.
# coef_ is two-dimensional; row 0 holds the weights used here (for a binary
# label it is the only row).
coefficients = model.coef_[0]

# Pair every feature name with its weight, in the column order of X.
# NOTE(review): the features are fed to the model unscaled in the visible
# cells, so raw magnitudes are per-unit effects and are not directly
# comparable across features -- confirm before ranking them.
feature_importance = [
    (feature_name, weight)
    for feature_name, weight in zip(X.columns, coefficients)
]
feature_importance