In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline

# Pack the per-user activity counts into a single "features" vector column.
# NOTE(review): total_interactions presumably also counts purchase events, which
# would leak the purchase-based label into the features — confirm against the
# silver table definition. These names are reassigned in a later cell.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["total_interactions", "total_views", "total_cart_adds"],
)

# Two candidate classifiers reading the same feature and label columns.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
lr = LogisticRegression(labelCol="label", featuresCol="features")

# One pipeline per classifier; both start with the shared assembler stage.
pipeline_rf = Pipeline(stages=[assembler, rf])
pipeline_lr = Pipeline(stages=[assembler, lr])

In [0]:
from pyspark.sql.functions import col, when

silver_table_name = "workspace.ecommerce.user_features_silver"
df_silver = spark.table(silver_table_name)

# Build the modelling frame with one projection of the silver table.
# Features and label both come from the same row, so joining the table back to
# itself on user_id is unnecessary: it adds a shuffle, multiplies rows whenever
# user_id is not unique, and hands randomSplit a join result instead of a plain
# projection of the table.
# The isNotNull filter keeps the one thing the previous inner join did
# implicitly: rows with a null user_id never matched and were dropped.
df_model_ready = (
    df_silver
    .where(col("user_id").isNotNull())
    .select(
        "user_id",
        "total_interactions",
        "total_views",
        "total_cart_adds",
        # Binary target: 1 when total_purchases > 0, otherwise 0 (a null count
        # falls through to the otherwise branch).
        when(col("total_purchases") > 0, 1).otherwise(0).alias("label"),
    )
)

# Kept so any cell that still refers to the separate feature/label frames
# continues to work; both are plain projections of df_model_ready.
df_features = df_model_ready.select("user_id", "total_interactions", "total_views", "total_cart_adds")
df_labels = df_model_ready.select("user_id", "label")

# Seeded 80/20 split into training and hold-out sets.
train_df, test_df = df_model_ready.randomSplit([0.8, 0.2], seed=42)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import os

# Feature vector built from the view and cart-add counts only.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["total_views", "total_cart_adds"],
)

# Candidate classifiers; the random forest is seeded.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Each pipeline assembles the features and then fits its classifier.
pipeline_rf = Pipeline(stages=[assembler, rf])
pipeline_lr = Pipeline(stages=[assembler, lr])

# Shared metric: area under the ROC curve computed from the rawPrediction column.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)

# Logistic regression is fit once, with no hyperparameter search.
model_lr_fixed = pipeline_lr.fit(train_df)
print("Logistic Regression Trained")

# Random forest search space: two tree depths at a single forest size.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5, 7])
    .addGrid(rf.numTrees, [20])
    .build()
)

# 2-fold cross-validation over the grid, scored with the evaluator above.
cv = CrossValidator(
    estimator=pipeline_rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
    seed=42,
)

cv_model_rf_fixed = cv.fit(train_df)
print("Random Forest Trained")

In [0]:
# Score the held-out split with each fitted model, one model at a time.
preds_lr_fixed = model_lr_fixed.transform(test_df)
auc_lr_fixed = evaluator.evaluate(preds_lr_fixed)

preds_rf_fixed = cv_model_rf_fixed.transform(test_df)
auc_rf_fixed = evaluator.evaluate(preds_rf_fixed)

print(f"Logistic Regression AUC: {auc_lr_fixed:.4f}")
print(f"Random Forest (Tuned) AUC: {auc_rf_fixed:.4f}")

# Random forest is declared the winner only on a strictly higher score;
# a tie is reported as logistic regression.
print("\nWinner: Random Forest" if auc_rf_fixed > auc_lr_fixed else "\nWinner: Logistic Regression")