This notebook builds reusable features for the ranking model by joining candidate products with user, product, and interaction attributes.

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Dotted prefix prepended to every table name this notebook reads.
gold = "kusha_solutions.products_recommendation_online_ml_temp"

# Candidate rows; later cells read CustomerID, ProductID and
# candidate_source from this frame.
candidates = spark.table(
    f"{gold}.gold_candidate_products1"
)

# Preview a handful of rows to confirm the table loaded.
candidates_preview = candidates.limit(10)
display(candidates_preview)


In [0]:
# Collapse the candidate rows to one row per (CustomerID, ProductID) pair,
# recording how many distinct sources proposed the pair and which ones.
pair_keys = ["CustomerID", "ProductID"]
source_count = F.countDistinct("candidate_source").alias("num_sources")
source_set = F.collect_set("candidate_source").alias("candidate_sources")

candidate_agg = candidates.groupBy(*pair_keys).agg(source_count, source_set)

display(candidate_agg.limit(100))


In [0]:
from pyspark.sql import functions as F

ui = spark.table(f"{gold}.gold_user_product_interactions")

# Attach each pair's interaction history to the aggregated candidates.
# The left join keeps every candidate; pairs with no matching row in `ui`
# come back with NULL in both interaction columns.
# NOTE(review): assumes `ui` holds at most one row per (CustomerID, ProductID);
# otherwise this join duplicates candidate rows -- TODO confirm.
candidate_with_ui = (
    candidate_agg
      .join(
          ui.select(
              "CustomerID",
              "ProductID",
              "interaction_events",
              "last_interaction_ts"
          ),
          ["CustomerID", "ProductID"],
          "left"
      )
      # No matching interaction row means zero events: replace NULL with 0.
      .withColumn(
          "interaction_events",
          F.coalesce(F.col("interaction_events"), F.lit(0))
      )
      # last_interaction_ts is deliberately left NULL for pairs without history.
      # The previous coalesce(col, NULL) substituted nothing, and the
      # recency_days logic tests isNotNull() on this column, so NULL is the
      # "never interacted" marker.
      # Repeat-interaction flag: 1 when the pair has at least one event, else 0.
      .withColumn(
          "is_repeat_interaction",
          F.when(F.col("interaction_events") > 0, 1).otherwise(0)
      )
)

display(candidate_with_ui.limit(100))


In [0]:
# Sanity check on the NULL-filled interaction counts.
# interaction_events has already had NULL replaced with 0, so pairs without
# history must be counted with `== 0`; an isNull() filter would always match
# nothing at this point.
zero_count = candidate_with_ui.filter(F.col("interaction_events") == 0).count()
one_count = candidate_with_ui.filter(F.col("interaction_events") == 1).count()

# Print the variables actually defined above (the old code referenced an
# undefined `null_count`, which raised NameError).
print(f"Zero count: {zero_count}")
print(f"One count: {one_count}")

In [0]:
# recency_days = whole days between today and the pair's last interaction.
# Pairs whose last_interaction_ts is NULL receive the sentinel value 999.
has_history = F.col("last_interaction_ts").isNotNull()
days_since_last = F.datediff(F.current_date(), F.col("last_interaction_ts"))

candidate_with_recency = candidate_with_ui.withColumn(
    "recency_days",
    F.when(has_history, days_since_last).otherwise(999),
)

display(candidate_with_recency.limit(10))


In [0]:
# Show every distinct recency_days value, including the 999 sentinel if present.
recency_values = candidate_with_recency.select("recency_days").distinct()
display(recency_values)

In [0]:
pf = spark.table(f"{gold}.gold_product_features")

# Keep only the join key and the product attributes carried into the features.
product_cols = pf.select(
    "ProductID", "ProductRating", "ReviewsCount", "DiscountPercent"
)

# Left join: every candidate row is kept; a ProductID missing from `pf`
# yields NULL in the product columns.
# NOTE(review): assumes one row per ProductID in `pf` -- TODO confirm.
candidate_with_product = candidate_with_recency.join(
    product_cols, on="ProductID", how="left"
)

display(candidate_with_product.limit(10))


In [0]:
cu = spark.table(f"{gold}.gold_customers_with_age_group")

# Keep only the join key and the customer attributes carried into the features.
customer_cols = cu.select(
    "CustomerID", "AgeGroup", "AvgReviewRating", "PreviousPurchases"
)

# Left join: every candidate row is kept; a CustomerID missing from `cu`
# yields NULL in the customer columns.
# NOTE(review): assumes one row per CustomerID in `cu` -- TODO confirm.
final_features = candidate_with_product.join(
    customer_cols, on="CustomerID", how="left"
)

display(final_features.limit(10))


**How ranking uses these features**

num_sources	More reasons → higher rank

is_repeat_interaction	Seen before → strong signal

recency_days	More recent → stronger

ProductRating	Quality bias

DiscountPercent	Price sensitivity

AgeGroup	Personalization