In [0]:
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Load the per-user feature table that both training and scoring read from.
df_silver = spark.table("workspace.ecommerce.user_features_silver")

# Build the training frame in a single projection of df_silver.
# The features and the label live on the same row, so the label is derived
# in place instead of splitting df_silver into two frames and inner-joining
# them back on user_id. That self-join added a shuffle, would pair features
# with labels from *other* rows whenever a user_id appears more than once,
# and silently dropped rows whose user_id is null.
# label = 1 when the user has at least one purchase, else 0 (a null
# total_purchases does not satisfy the condition and therefore maps to 0).
df_model_ready = df_silver.select(
    "user_id",
    "total_views",
    "total_cart_adds",
    when(col("total_purchases") > 0, 1).otherwise(0).alias("label"),
)

# Kept as lightweight (lazy) views so any later cell that still refers to
# these names continues to work; they are no longer used to build the
# training frame.
df_features = df_model_ready.select("user_id", "total_views", "total_cart_adds")
df_labels = df_model_ready.select("user_id", "label")

# Pipeline: pack the two numeric columns into a feature vector, then fit a
# logistic regression on the binary label.
assembler = VectorAssembler(inputCols=["total_views", "total_cart_adds"], outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label")
pipeline_lr = Pipeline(stages=[assembler, lr])

# NOTE(review): the model is fit on every available row with no held-out
# split, so nothing in this cell measures generalisation.
model_lr = pipeline_lr.fit(df_model_ready)

In [0]:
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col, desc


# Run the fitted pipeline over every row of the silver table.
predictions_df = model_lr.transform(df_silver)

# Columns derived from the model output:
#   - the predicted class, cast to an integer flag
#   - the element at index 1 of the probability vector (vector_to_array
#     converts the ML vector into a plain array so it can be indexed)
predicted_to_buy_col = col("prediction").cast("int").alias("predicted_to_buy")
buy_probability_col = (
    vector_to_array(col("probability")).getItem(1).alias("buy_probability")
)

# Keep the identifying/input columns alongside the two derived columns.
final_predictions = predictions_df.select(
    "user_id",
    "total_views",
    "total_cart_adds",
    "total_purchases",
    predicted_to_buy_col,
    buy_probability_col,
)

In [0]:
# Persist the scored users as a Delta table, replacing any existing contents.
gold_table_name = "workspace.ecommerce.streaming_predictions_gold"
gold_writer = final_predictions.write.format("delta").mode("overwrite")
gold_writer.saveAsTable(gold_table_name)

# Users the model predicts as buyers but who have zero recorded purchases,
# ordered from highest to lowest predicted probability.
flagged_as_buyer = col("predicted_to_buy") == 1
no_purchases_yet = col("total_purchases") == 0
top_buyers = (
    final_predictions
    .where(flagged_as_buyer & no_purchases_yet)
    .sort(col("buy_probability").desc())
)

# Show only the ten highest-probability rows.
print("\nTop Predicted Future Buyers :")
display(top_buyers.limit(10))