In [0]:
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from mlflow.models.signature import infer_signature

import os
import mlflow
import mlflow.spark
import logging

# Keep cell output readable: only ERROR-level messages from MLflow's logger get through.
mlflow_logger = logging.getLogger("mlflow")
mlflow_logger.setLevel(logging.ERROR)

# Unity Catalog volume that MLflow uses as its temp area for Spark model artifacts
# (per the original author's note, this is required on serverless compute).
spark.sql("""CREATE VOLUME IF NOT EXISTS workspace.ecommerce.mlflow_tmp""")
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/ecommerce/mlflow_tmp"

# Every run started in this notebook is recorded under this shared experiment.
mlflow.set_experiment("/Shared/ecommerce_price_experiment")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/972906041565883', creation_time=1769888538803, experiment_id='972906041565883', last_update_time=1769891661645, lifecycle_stage='active', name='/Shared/ecommerce_price_experiment', tags={'mlflow.experiment.sourceName': '/Shared/ecommerce_price_experiment',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': '<redacted>',
 'mlflow.ownerId': '78221769891557'}>

In [0]:
# Read the silver-layer events Delta table from the Unity Catalog volume.
events_path = "dbfs:/Volumes/workspace/ecommerce/silver/events_delta"
df = spark.read.format("delta").load(events_path)

# Keep only the label (price) and the single predictor (category_id), then drop
# any row where either value is null.
# NOTE(review): casting price to "int" discards any fractional part — confirm
# that whole-number prices are intended for the regression target.
price_as_int = col("price").cast("int")
category_as_text = col("category_id").cast("string")
df = df.select(price_as_int, category_as_text).dropna()

# Quick sanity check of the projected rows.
df.show(3)

+-----+-------------------+
|price|        category_id|
+-----+-------------------+
|  842|2053013555631882655|
|   90|2053013556202308035|
|  270|2053013561579406073|
+-----+-------------------+
only showing top 3 rows


In [0]:
# Modelling frame: the label plus the one categorical predictor.
modeling_columns = ["price", "category_id"]
df_ml = df.select(*modeling_columns)

# 80/20 train/test split with a fixed seed.
splits = df_ml.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits

In [0]:
# Stage 1: map each category_id string to a numeric index. handleInvalid="keep"
# puts labels unseen at fit time into an extra bucket instead of failing.
category_indexer = StringIndexer(
    inputCol="category_id",
    outputCol="category_index",
    handleInvalid="keep",
)

# Stage 2: pack the index into the single-element "features" vector the regressors read.
# NOTE(review): the index is passed to the linear models as one numeric value,
# not one-hot encoded — confirm this is intended.
assembler = VectorAssembler(
    inputCols=["category_index"],
    outputCol="features",
)

In [0]:
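# NOTE: superseded — the MLflow temp volume and MLFLOW_DFS_TMP are already set up in the first cell.
# The commented-out lines below are an earlier copy of that setup, kept for reference only.
# import os
# display(spark.sql("CREATE VOLUME IF NOT EXISTS workspace.ecommerce.mlflow_tmp"))
# os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/ecommerce/mlflow_tmp"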

In [0]:
# Column wiring shared by every candidate regressor.
shared_lr_kwargs = dict(labelCol="price", featuresCol="features")

# Candidate estimators keyed by the name used for their MLflow run.
# elasticNetParam=0.0 selects a pure L2 penalty (ridge); 1.0 selects pure L1 (lasso).
models = {
    "LinearRegression": LinearRegression(**shared_lr_kwargs),
    "RidgeRegression": LinearRegression(regParam=0.1, elasticNetParam=0.0, **shared_lr_kwargs),
    "LassoRegression": LinearRegression(regParam=0.1, elasticNetParam=1.0, **shared_lr_kwargs),
}

In [0]:
# RMSE of the "prediction" column against the "price" label.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)

In [0]:
# Train each candidate regressor in its own MLflow run, score it on the held-out
# split, and log the fitted pipeline. (model name, RMSE) pairs are collected in
# `results` for the comparison that follows.
mlflow.set_experiment("/Shared/ecommerce_price_experiment")
results = []
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Same preprocessing stages for every candidate; only the final estimator differs.
        pipeline = Pipeline(stages=[category_indexer, assembler, model])
        fitted_model = pipeline.fit(train_df)
        predictions = fitted_model.transform(test_df)
        rmse = evaluator.evaluate(predictions)

        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("rmse", rmse)

        # Small pandas samples describing the serving contract:
        # raw category_id in -> numeric prediction out.
        input_example = (
            train_df.select("category_id").limit(5).toPandas().astype({"category_id": "string"})
        )
        output_example = (
            predictions.select("prediction").limit(5).toPandas().astype({"prediction": "float64"})
        )
        signature = infer_signature(input_example, output_example)

        # Fix: the signature and input example were previously built but never
        # handed to log_model, so the models were logged without them.
        mlflow.spark.log_model(
            fitted_model,
            artifact_path="model",
            input_example=input_example,
            signature=signature,
        )
        results.append((model_name, rmse))

In [0]:
# Rank the (model name, RMSE) pairs from lowest to highest error; the first entry wins.
ranked_results = sorted(results, key=lambda entry: entry[1])
best_model_name, best_rmse = ranked_results[0]
display(best_model_name, best_rmse)

'LassoRegression'

348.1669572711918

In [0]:
# Tabulate every candidate's RMSE, lowest error first.
results_df = spark.createDataFrame(results, schema=["model", "rmse"])
results_df.orderBy("rmse").show()

+----------------+------------------+
|           model|              rmse|
+----------------+------------------+
| LassoRegression| 348.1669572711918|
| RidgeRegression|348.16696203873227|
|LinearRegression|348.16696620410744|
+----------------+------------------+



In [0]:
# Refit the winning estimator in a dedicated run and register the resulting pipeline.
best_estimator = models[best_model_name]
final_run_name = f"{best_model_name}_final"

with mlflow.start_run(run_name=final_run_name):
    final_pipeline = Pipeline(stages=[category_indexer, assembler, best_estimator])
    final_model = final_pipeline.fit(train_df)

    # Score on the held-out split and record the result alongside the model name.
    final_predictions = final_model.transform(test_df)
    final_rmse = evaluator.evaluate(final_predictions)
    mlflow.log_param("best_model", best_model_name)
    mlflow.log_metric("rmse", final_rmse)

    # Input example: only the column a caller supplies, cast to pandas string dtype.
    sample_inputs_pdf = train_df.select("category_id").limit(10).toPandas()
    input_example = sample_inputs_pdf.astype({"category_id": "string"})

    # Output example: only the prediction column, cast to float64.
    sample_outputs_pdf = final_predictions.select("prediction").limit(10).toPandas()
    output_example = sample_outputs_pdf.astype({"prediction": "float64"})

    signature = infer_signature(input_example, output_example)

    # Log the fitted pipeline and register it under "ecommerce_price_model".
    mlflow.spark.log_model(
        final_model,
        artifact_path="model",
        registered_model_name="ecommerce_price_model",
        input_example=input_example,
        signature=signature,
    )

Registered model 'ecommerce_price_model' already exists. Creating a new version of this model...
Created version '2' of model 'workspace.default.ecommerce_price_model'.


In [0]:
# Hyperparameter search over one LinearRegression estimator: a 3 x 3 grid of
# regularisation strength (regParam) and L1/L2 mix (elasticNetParam).
lr = LinearRegression(featuresCol="features", labelCol="price")
pipeline_lr = Pipeline(stages=[category_indexer, assembler, lr])
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.0, 0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# TrainValidationSplit randomly splits its input 80/20 into train and validation.
# Fix: pin the seed (same value as the train/test split) so the validation
# partition, and therefore the selected hyperparameters, is reproducible.
tvs = TrainValidationSplit(
    estimator=pipeline_lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    trainRatio=0.8,
    seed=42,
)

In [0]:
# `os` is already imported in the notebook's import cell, so the redundant
# mid-notebook import was removed.

# Create the Unity Catalog volume for Spark ML temp files if it does not exist yet.
spark.sql("CREATE VOLUME IF NOT EXISTS workspace.ecommerce.sparkml_tmp")

# Temp paths for serverless + MLflow + Spark ML (per the original author's note).
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/ecommerce/mlflow_tmp"
os.environ["SPARKML_TEMP_DFS_PATH"] = "/Volumes/workspace/ecommerce/sparkml_tmp"
print("Temp paths configured")

Temp paths configured


In [0]:
# Run the train/validation search, score the best pipeline on the held-out
# split, and register it as "ecommerce_price_model_tuned".
with mlflow.start_run(run_name="LinearRegression_Tuned"):
    tvs_model = tvs.fit(train_df)
    tuned_model = tvs_model.bestModel  # pipeline fitted with the best-scoring param map
    tuned_preds = tuned_model.transform(test_df)
    tuned_rmse = evaluator.evaluate(tuned_preds)
    mlflow.log_metric("rmse", tuned_rmse)

    # Fix: the examples previously used every column. The input example
    # contained the label ("price") and the output example contained the label,
    # the raw input, the intermediate index and the features vector. Restrict
    # them to the real serving contract (category_id in -> prediction out),
    # matching how the untuned final model is registered.
    example_rows = train_df.limit(5)
    input_example = (
        example_rows.select("category_id").toPandas().astype({"category_id": "string"})
    )
    output_example = (
        tuned_model.transform(example_rows)
        .select("prediction")
        .toPandas()
        .astype({"prediction": "float64"})
    )
    signature = infer_signature(input_example, output_example)

    mlflow.spark.log_model(
        tuned_model,
        artifact_path="model",
        registered_model_name="ecommerce_price_model_tuned",
        input_example=input_example,
        signature=signature,
    )

Registered model 'ecommerce_price_model_tuned' already exists. Creating a new version of this model...
Created version '1' of model 'workspace.default.ecommerce_price_model_tuned'.
