In [0]:
# Both gold tables are read through Spark and then collected into pandas.
# NOTE(review): toPandas() brings every row to the driver — confirm both
# tables are small enough for that.

# Candidate table (joined later on CustomerID / ProductID).
candidates_sdf = spark.read.table(
    "kusha_solutions.products_recommendation_online_ml_temp.__gold_candidate_products"
)
candidates = candidates_sdf.toPandas()

# Interaction table (source of the per-user and per-pair counts).
user_interactions_sdf = spark.read.table(
    "kusha_solutions.products_recommendation_online_ml_temp.gold_user_product_interactions"
)
user_interactions = user_interactions_sdf.toPandas()

# -----------------------------
# USER FEATURES
# -----------------------------
# One row per CustomerID. user_total_interactions counts that customer's
# interaction rows whose ProductID is non-null.
user_stats = (
    user_interactions
    .groupby("CustomerID")["ProductID"]
    .count()
    .rename("user_total_interactions")
    .reset_index()
)

# -----------------------------
# USER-PRODUCT FEATURES
# -----------------------------
# One row per (CustomerID, ProductID) pair found in the interaction data,
# with the number of rows observed for that pair.
pair_counts = user_interactions.groupby(["CustomerID", "ProductID"]).agg(
    interaction_count=("ProductID", "count")
)

# Every pair listed here was seen at least once, so it gets a constant flag of 1;
# pairs missing from this frame are the "never interacted" case after the join.
interaction_features = pair_counts.reset_index().assign(has_interacted=1)

# -----------------------------
# MERGE FEATURES
# -----------------------------
# Left joins keep every candidate row. Candidates whose user (or user/product
# pair) never appears in the interaction data come back with NaN in the
# joined-in columns.
df = candidates.merge(user_stats, on="CustomerID", how="left").merge(
    interaction_features,
    on=["CustomerID", "ProductID"],
    how="left",
)

# Zero-fill ONLY the columns introduced by the joins above. A blanket
# fillna(0) would also overwrite nulls in whatever columns the candidate table
# carries (turning missing values into 0 and putting an int into string
# columns). Here "no match" legitimately means a count/flag of 0, so the fill
# is restricted to those three columns and done without inplace mutation.
df = df.fillna(
    {
        "user_total_interactions": 0,
        "interaction_count": 0,
        "has_interacted": 0,
    }
)

# -----------------------------
# EXTRA FEATURES (SAFE & USEFUL)
# -----------------------------
# Ratio of the pair's interaction count to the user's total interaction count.
# The +1 on the denominator keeps it non-zero when the total is 0.
smoothed_user_total = df["user_total_interactions"].add(1)
df["user_product_affinity"] = df["interaction_count"].div(smoothed_user_total)

# 1 when the pair has more than one interaction, else 0.
df["is_repeat_interaction"] = df["interaction_count"].gt(1).astype(int)

# -----------------------------
# LABEL CREATION
# -----------------------------
# The label is a straight copy of has_interacted.
# NOTE(review): has_interacted, interaction_count and the two features above
# all derive from the same interactions as the label — confirm they are
# excluded from the model inputs downstream, otherwise the label leaks.
df["label"] = df["has_interacted"]

print(df.head())

# -----------------------------
# SAVE AS GOLD FEATURE TABLE
# -----------------------------
# Convert the pandas frame back into a Spark DataFrame, then replace the
# contents of the Delta table with it (mode "overwrite").
features_spark = spark.createDataFrame(df)

(
    features_spark.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(
        "kusha_solutions.products_recommendation_online_ml_temp._gold_candidate_features_labeled"
    )
)


In [0]:
# Read the labeled candidate-feature table back from the catalog and render it.
labeled_features_table = "kusha_solutions.products_recommendation_online_ml_temp._gold_candidate_features_labeled"
df = spark.table(labeled_features_table)
display(df)

In [0]:
# Show only the rows whose label equals 1, limited to the two key columns
# and the label itself.
df = spark.read.table("kusha_solutions.products_recommendation_online_ml_temp._gold_candidate_features_labeled")
is_positive = df["label"] == 1
filtered_df = df.where(is_positive)
display(filtered_df.select("ProductID", "CustomerID", "label"))

In [0]:
# Sanity check: list the distinct values of each engineered column in the
# labeled feature table, one display per column.
df = spark.read.table("kusha_solutions.products_recommendation_online_ml_temp._gold_candidate_features_labeled")

# Column names must match what the feature-building cell writes.
# Fixed: "interactiyukon_count" was a typo for "interaction_count", and a
# missing comma after "is_repeat_interaction" silently concatenated it with
# "label" into the non-existent column "is_repeat_interactionlabel".
columns = [
    "user_total_interactions",
    "interaction_count",
    "has_interacted",
    "user_product_affinity",
    "is_repeat_interaction",
    "label",
]

# The loop variable is deliberately not named `col`: at notebook top level it
# would stay in the global namespace and shadow pyspark.sql.functions.col.
for column_name in columns:
    display(df.select(column_name).distinct())