In [0]:
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline

# Source table of per-user aggregates used for both features and the label.
# NOTE(review): presumably one row per user_id — confirm with the upstream job.
silver_table_name = "workspace.ecommerce.user_features_silver"
df_silver = spark.table(silver_table_name)

# Build the features and the label in ONE projection over df_silver.
# The previous version selected the two halves separately and inner-joined
# them back together on user_id. That self-join added an extra join stage for
# no benefit and, if a user_id ever appears k times, emits k*k rows for it.
df_model_ready = (
    df_silver
    # An inner equi-join never matches NULL keys, so the old code dropped
    # rows with a NULL user_id; keep that behaviour explicitly.
    .where(col("user_id").isNotNull())
    .select(
        "user_id",
        "total_views",
        "total_cart_adds",
        # Binary target: 1 when total_purchases > 0, otherwise 0
        # (a NULL total_purchases also falls through to 0).
        when(col("total_purchases") > 0, 1).otherwise(0).alias("label"),
    )
)

# Retained (lazily evaluated) in case later cells still reference these names.
df_features = df_model_ready.drop("label")
df_labels = df_model_ready.select("user_id", "label")

# 80/20 train/test split with a fixed seed.
train_df, test_df = df_model_ready.randomSplit([0.8, 0.2], seed=42)

# Pack the two numeric input columns into the single vector column that the
# classifiers below read from.
assembler = VectorAssembler(
    inputCols=["total_views", "total_cart_adds"],
    outputCol="features",
)

# Two classifiers trained on the same "features" / "label" columns.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    seed=42,
)

# One pipeline per classifier; both start with the shared assembler stage.
pipeline_lr = Pipeline(stages=[assembler, lr])
pipeline_rf = Pipeline(stages=[assembler, rf])

In [0]:
import os
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create a scratch directory on the volume and expose it through the
# MLFLOW_DFS_TMP environment variable.
# NOTE(review): presumably this is the temp location mlflow.spark uses while
# saving Spark models — confirm against the MLflow Spark flavor docs.
mlflow_temp_path = "/Volumes/workspace/ecommerce/ecommerce_data/mlflow_tmp"
dbutils.fs.mkdirs(mlflow_temp_path)
os.environ["MLFLOW_DFS_TMP"] = mlflow_temp_path

# Single evaluator shared by every run below: area under the ROC curve,
# computed from the "rawPrediction" column against "label".
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="label",
    rawPredictionCol="rawPrediction",
)

# --- Run: Logistic Regression ---
# Fit the pipeline on the training split, score the test split, and record
# the model type, the AUC and the fitted pipeline in one MLflow run.
with mlflow.start_run(run_name="Day7_Logistic_Regression"):
    model_lr = pipeline_lr.fit(train_df)
    preds_lr = model_lr.transform(test_df)
    auc_lr = evaluator.evaluate(preds_lr)

    # What was trained and how well it scored on the test split.
    mlflow.log_param(key="Model_Type", value="Logistic Regression")
    mlflow.log_metric(key="AUC_Score", value=auc_lr)

    # Store the fitted pipeline as a run artifact.
    mlflow.spark.log_model(model_lr, "logistic_regression_model")

    print(f"Logistic Regression Logged. AUC: {auc_lr:.4f}")

# Random Forest
# Fit the random-forest pipeline on the training split, score the test split,
# and record the model type, tree depth, AUC and fitted pipeline in MLflow.
with mlflow.start_run(run_name="Day7_Random_Forest"):
    model_rf = pipeline_rf.fit(train_df)
    preds_rf = model_rf.transform(test_df)
    auc_rf = evaluator.evaluate(preds_rf)

    # Logging Parameters & Metrics
    mlflow.log_param("Model_Type", "Random Forest")
    # Read the depth from the estimator instead of logging a literal, so the
    # tracked value cannot drift from what `rf` was actually configured with.
    mlflow.log_param("maxDepth", rf.getMaxDepth())
    mlflow.log_metric("AUC_Score", auc_rf)

    # Store the fitted pipeline as a run artifact.
    mlflow.spark.log_model(model_rf, "random_forest_model")
    print(f"Random Forest Logged. AUC: {auc_rf:.4f}")