In [0]:
from pyspark.sql import functions as F

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator


import mlflow
import mlflow.spark

In [0]:
# Load the `ecommerce_events_delta` table; the label and feature cells below build on `df`.
df = spark.read.table("workspace.ecommerce.ecommerce_events_delta")

In [0]:
# Binary target: 1 when event_type equals "purchase", 0 for every other row.
purchase_flag = F.when(F.col("event_type") == "purchase", F.lit(1)).otherwise(F.lit(0))
df = df.withColumn("label", purchase_flag)

In [0]:

# Columns handed to every model, in the order they are packed into the vector.
feature_cols = ["price", "product_id", "category_id"]

# Packs the feature columns into a single "features" vector column; this is the
# first stage of each pipeline built in the training loop.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Keep only the model inputs plus the target, then split with weights 0.8 / 0.2
# using a fixed seed so the same split is produced on every run.
model_input = df.select(*feature_cols, "label")
train_df, test_df = model_input.randomSplit(weights=[0.8, 0.2], seed=42)
     

In [0]:
# Candidate classifiers, keyed by the name used for the MLflow run and in `results`.
models = {
    "logistic_regression": LogisticRegression(),
    "decision_tree": DecisionTreeClassifier(maxDepth=5),
    # Random forests are stochastic, so pin the seed explicitly instead of relying
    # on the library default; 42 matches the seed already used for the train/test split.
    "random_forest": RandomForestClassifier(numTrees=50, seed=42),
}
   

In [0]:
# Primary metric: area under the ROC curve, scored against the "label" column.
evaluator = (
    BinaryClassificationEvaluator()
    .setLabelCol("label")
    .setMetricName("areaUnderROC")
)
   

In [0]:
# Secondary metric: accuracy, scored against the "label" column.
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label")

In [0]:
import os

# Export the scratch directory under MLFLOW_DFS_TMP (presumably consulted by MLflow
# when staging Spark models; confirm against the MLflow docs). It is the same Volume
# path that the training loop passes as dfs_tmpdir.
os.environ.update(
    MLFLOW_DFS_TMP="/Volumes/workspace/ecommerce/ecommerce_data/mlflow_tmp"
)


In [0]:
# Train every candidate on the same split, score it on the held-out set, and record
# each fit as its own MLflow run. `results` collects (name, auc, accuracy) tuples
# for the model-selection cell that follows.
# NOTE(review): the output saved with this cell shows pipeline.fit(train_df) raising
# AnalysisException; the traceback is cut off, so the root cause still needs confirming.
results = []

# Scratch directory handed to mlflow.spark.log_model; identical for every run.
dfs_tmpdir = "/Volumes/workspace/ecommerce/ecommerce_data/mlflow_tmp"

for name, model in models.items():
    # Fresh two-stage pipeline per candidate: assemble the feature vector, then fit.
    pipeline = Pipeline(stages=[assembler, model])

    with mlflow.start_run(run_name=name, nested=True):
        mlflow.log_param("model_name", name)

        trained_model = pipeline.fit(train_df)
        predictions = trained_model.transform(test_df)

        # Both metrics are computed from the same test-set predictions.
        auc = evaluator.evaluate(predictions)
        accuracy = accuracy_evaluator.evaluate(predictions)

        mlflow.log_metric("AUC", auc)
        mlflow.log_metric("Accuracy", accuracy)

        mlflow.spark.log_model(
            trained_model,
            artifact_path="model",
            dfs_tmpdir=dfs_tmpdir,
        )

        results.append((name, auc, accuracy))
        # The arrow was previously mis-encoded ("â†’"); emit the intended character.
        print(f"{name} → AUC = {auc:.4f}, Accuracy = {accuracy:.4f}")

---------------------------------------------------------------------------
AnalysisException                         Traceback (most recent call last)
File <command-6426400283418349>, line 11
      7 with mlflow.start_run(run_name=name, nested=True):
      9     mlflow.log_param("model_name", name)
---> 11     trained_model = pipeline.fit(train_df)
     12     predictions = trained_model.transform(test_df)
     14     auc = evaluator.evaluate(predictions)

File /databricks/python_shell/lib/dbruntime/MLWorkloadsInstrumentation/_pyspark.py:30, in _create_patch_function.<locals>.patched_method(self, *args, **kwargs)
[1;32m    

In [0]:
# Pick the winner by AUC (tuple index 1 of each (name, auc, accuracy) entry).
# `results` is empty when the training loop did not finish a single run, and max()
# on an empty list fails with an unhelpful message, so report that case explicitly
# (same exception type as before, clearer text).
if not results:
    raise ValueError(
        "No trained models to compare: `results` is empty. "
        "Re-run the training cell and resolve any errors it reports first."
    )

best_model = max(results, key=lambda entry: entry[1])  # based on AUC

print("\n🏆 Best Model Selected:")
print(f"Model Name : {best_model[0]}")
print(f"AUC        : {best_model[1]:.4f}")
print(f"Accuracy   : {best_model[2]:.4f}")