- Train 3 Different Models

In [0]:
import os
import mlflow
import mlflow.spark
import pandas as pd

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import (
    LinearRegression,
    DecisionTreeRegressor,
    RandomForestRegressor
)
from pyspark.ml.evaluation import RegressionEvaluator

# Unity Catalog requirement
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/ecommerce/mlflow_tmp"

# Set MLflow Experiment
mlflow.set_experiment("/day13-mlflow-model-comparison")

# Load data
data = (
    spark.table("ecommerce.silver.daily_sales")
    .select("total_events", "total_revenue")
    .dropna()
)

train_df, test_df = data.randomSplit([0.8, 0.2], seed=42)

# Feature Engineering
assembler = VectorAssembler(
    inputCols=["total_events"],
    outputCol="features"
)

train_vec = assembler.transform(train_df)
test_vec = assembler.transform(test_df)

# Models
models = {
    "LinearRegression": LinearRegression(
        featuresCol="features",
        labelCol="total_revenue"
    ),
    "DecisionTree": DecisionTreeRegressor(
        featuresCol="features",
        labelCol="total_revenue",
        maxDepth=5
    ),
    "RandomForest": RandomForestRegressor(
        featuresCol="features",
        labelCol="total_revenue",
        numTrees=50
    )
}

# Evaluator
evaluator = RegressionEvaluator(
    labelCol="total_revenue",
    predictionCol="prediction",
    metricName="rmse"
)

# Train & log
for name, model in models.items():

    with mlflow.start_run(run_name=name):

        mlflow.log_param("model_type", name)
        mlflow.log_param("features", "total_events")

        fitted_model = model.fit(train_vec)
        predictions = fitted_model.transform(test_vec)

        rmse = evaluator.evaluate(predictions)
        mlflow.log_metric("rmse", rmse)

        #  FIX: Convert DenseVector â†’ list
        sample_rows = (
            train_vec
            .select("features")
            .limit(5)
            .collect()
        )

        input_example = pd.DataFrame({
            "features": [row["features"].toArray().tolist() for row in sample_rows]
        })

        mlflow.spark.log_model(
            spark_model=fitted_model,
            artifact_path="model",
            input_example=input_example
        )

        print(f"{name} | RMSE = {rmse:.2f}")


{"ts": "2026-01-21 08:41:19.277", "level": "ERROR", "logger": "pyspark.sql.connect.logging", "msg": "GRPC Error received", "context": {}, "exception": {"class": "_InactiveRpcError", "msg": "<_InactiveRpcError of RPC that terminated with:\n\tstatus = StatusCode.INTERNAL\n\tdetails = \"requirement failed: Column features must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.ArrayType:array<double>.\"\n\tdebug_error_string = \"UNKNOWN:Error received from peer  {grpc_message:\"requirement failed: Column features must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.ArrayType:array<double>.\", grpc_status:13, created_time:\"2026-01-21T08:41:19.277004163+00:00\"}\"\n>", "stacktrace": [{"class": null, "method": "_analyze", "file": "/databricks/pyt

LinearRegression | RMSE = 19570667.46


{"ts": "2026-01-21 08:41:55.704", "level": "ERROR", "logger": "pyspark.sql.connect.logging", "msg": "GRPC Error received", "context": {}, "exception": {"class": "_InactiveRpcError", "msg": "<_InactiveRpcError of RPC that terminated with:\n\tstatus = StatusCode.INTERNAL\n\tdetails = \"requirement failed: Column features must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.ArrayType:array<double>.\"\n\tdebug_error_string = \"UNKNOWN:Error received from peer  {created_time:\"2026-01-21T08:41:55.703602162+00:00\", grpc_status:13, grpc_message:\"requirement failed: Column features must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.ArrayType:array<double>.\"}\"\n>", "stacktrace": [{"class": null, "method": "_analyze", "file": "/databricks/pyt

DecisionTree | RMSE = 79878445.21


{"ts": "2026-01-21 08:42:16.749", "level": "ERROR", "logger": "pyspark.sql.connect.logging", "msg": "GRPC Error received", "context": {}, "exception": {"class": "_InactiveRpcError", "msg": "<_InactiveRpcError of RPC that terminated with:\n\tstatus = StatusCode.INTERNAL\n\tdetails = \"requirement failed: Column features must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.ArrayType:array<double>.\"\n\tdebug_error_string = \"UNKNOWN:Error received from peer  {grpc_message:\"requirement failed: Column features must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.ArrayType:array<double>.\", grpc_status:13, created_time:\"2026-01-21T08:42:16.748863706+00:00\"}\"\n>", "stacktrace": [{"class": null, "method": "_analyze", "file": "/databricks/pyt

RandomForest | RMSE = 68778849.78


In [0]:
import mlflow

runs = mlflow.search_runs()

runs[[
    "tags.mlflow.runName",
    "metrics.rmse",
    "params.model_type"
]]


Unnamed: 0,tags.mlflow.runName,metrics.rmse,params.model_type
0,RandomForest,68778850.0,RandomForest
1,DecisionTree,79878450.0,DecisionTree
2,LinearRegression,19570670.0,LinearRegression
3,LinearRegression,19570670.0,LinearRegression
4,LinearRegression,19570670.0,LinearRegression


In [0]:
clean_runs = runs[[
    "tags.mlflow.runName",
    "metrics.rmse",
    "params.model_type"
]].rename(columns={
    "tags.mlflow.runName": "run_name",
    "metrics.rmse": "rmse",
    "params.model_type": "model"
})

clean_runs

Unnamed: 0,run_name,rmse,model
0,RandomForest,68778850.0,RandomForest
1,DecisionTree,79878450.0,DecisionTree
2,LinearRegression,19570670.0,LinearRegression
3,LinearRegression,19570670.0,LinearRegression
4,LinearRegression,19570670.0,LinearRegression


- Build Spark ML Pipeline

In [0]:
import os
import mlflow
import mlflow.spark

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Unity Catalog requirement
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/ecommerce/mlflow_tmp"

# Set MLflow Experiment
mlflow.set_experiment("/day13-mlflow-pipeline")

# Load data from Unity Catalog
data = (
    spark.table("ecommerce.silver.daily_sales")
    .select("total_events", "total_revenue")
    .dropna()
)

# Train-test split
train, test = data.randomSplit([0.8, 0.2], seed=42)

# Feature engineering
assembler = VectorAssembler(
    inputCols=["total_events"],
    outputCol="features"
)

# Model
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="total_revenue",
    numTrees=50
)

# Pipeline
pipeline = Pipeline(stages=[assembler, rf])

with mlflow.start_run(run_name="RandomForest_Pipeline"):

    pipeline_model = pipeline.fit(train)
    predictions = pipeline_model.transform(test)

    evaluator = RegressionEvaluator(
        labelCol="total_revenue",
        predictionCol="prediction",
        metricName="rmse"
    )

    rmse = evaluator.evaluate(predictions)

    # Log metadata
    mlflow.log_param("model_type", "RandomForestPipeline")
    mlflow.log_param("features", "total_events")
    mlflow.log_metric("rmse", rmse)

    # Log pipeline model (UC-safe)
    mlflow.spark.log_model(
        spark_model=pipeline_model,
        artifact_path="pipeline_model"
    )

    print("Pipeline RMSE:", rmse)




Pipeline RMSE: 68778849.78263079


- Select Best Model

In [0]:
best_run = clean_runs.sort_values("rmse").iloc[0]

print("Best Model:", best_run["model"])
print("Best RMSE:", best_run["rmse"])

Best Model: LinearRegression
Best RMSE: 19570667.462254103
