### Configuration and imports

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# --- Table locations -------------------------------------------------------
# Backtick-quoted because the first name part contains hyphens.
GOLD_DATABASE = "`bigdata-and-bi`.gold"
GOLD_INTERACTIONS_TABLE = f"{GOLD_DATABASE}.star_interactions"
COLLAB_MATRIX_TABLE = f"{GOLD_DATABASE}.star_collab_matrix"

# --- Tunable parameters ----------------------------------------------------
# Cap on interactions kept per user, so power users do not dominate.
MAX_INTERACTIONS_PER_USER = 200
# Number of highest-scoring neighbors kept per item (consistent with R_S).
TOP_K_NEIGHBORS = 200

# --- Echo the effective configuration --------------------------------------
print("=" * 60)
print("Configuration:")
print("=" * 60)
print(f"GOLD_INTERACTIONS_TABLE: {GOLD_INTERACTIONS_TABLE}")
print(f"COLLAB_MATRIX_TABLE    : {COLLAB_MATRIX_TABLE}")
print(f"\nMAX_INTERACTIONS_PER_USER: {MAX_INTERACTIONS_PER_USER}")
print(f"TOP_K_NEIGHBORS          : {TOP_K_NEIGHBORS}")
print("=" * 60)

Configuration:
GOLD_INTERACTIONS_TABLE: `bigdata-and-bi`.gold.star_interactions
COLLAB_MATRIX_TABLE    : `bigdata-and-bi`.gold.star_collab_matrix

MAX_INTERACTIONS_PER_USER: 200
TOP_K_NEIGHBORS          : 200


### Load & normalize interactions (implicit = 1)

In [0]:
# Load the gold interactions table, keep the four columns used by this
# notebook, and cap every user's history at MAX_INTERACTIONS_PER_USER rows.
print("\nLoading interactions from", GOLD_INTERACTIONS_TABLE)
interactions_raw = spark.table(GOLD_INTERACTIONS_TABLE)

print("\nSchema:")
interactions_raw.printSchema()

print("\nSample:")
interactions_raw.show(5, truncate=False)

# Clean and prepare interactions: cast to explicit types and drop rows that
# are missing the user, the item, or the timestamp (rating may stay null).
interactions_df = (
    interactions_raw
    .select(
        F.col("user_id").cast("string"),
        F.col("item_id").cast("string"),
        F.col("unixReviewTime").cast("bigint"),
        F.col("rating").cast("double")
    )
    .dropna(subset=["user_id", "item_id", "unixReviewTime"])
)

# Limit to most recent MAX_INTERACTIONS_PER_USER per user
print(f"\nLimiting to {MAX_INTERACTIONS_PER_USER} most recent interactions per user...")
# row_number() is only reproducible when rows cannot tie in the ordering.
# Ordering by timestamp alone leaves ties at the cut-off, so which items
# survive could change between runs; item_id breaks those ties.
w_time = Window.partitionBy("user_id").orderBy(
    F.col("unixReviewTime").desc(),
    F.col("item_id").asc(),
)

interactions_limited = (
    interactions_df
    .withColumn("rn", F.row_number().over(w_time))
    .filter(F.col("rn") <= MAX_INTERACTIONS_PER_USER)
    .drop("rn")
)

raw_count = interactions_df.count()
limited_count = interactions_limited.count()
filtered_count = raw_count - limited_count
# Guard the percentage so an empty source table reports 0.0% instead of
# raising ZeroDivisionError.
filtered_pct = filtered_count / raw_count * 100 if raw_count else 0.0

print("\nInteraction Counts:")
print(f"  Total interactions (raw)     : {raw_count:,}")
print(f"  Total interactions (limited) : {limited_count:,}")
print(f"  Filtered out                 : {filtered_count:,} ({filtered_pct:.1f}%)")


📥 Loading interactions from `bigdata-and-bi`.gold.star_interactions

📋 Schema:
root
 |-- item_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)
 |-- rating: double (nullable = true)


📊 Sample:
+----------+----------------------------+--------------+------+
|item_id   |user_id                     |unixReviewTime|rating|
+----------+----------------------------+--------------+------+
|B00I80NDLC|AHAT63DTBDKHFAJE2X5H6ODTM4FQ|1391881096    |5.0   |
|0310321913|AHATU2DHZYLLEOYMJBQWAEW7TZ4A|1394968052    |5.0   |
|0486408027|AHAUDYY467756ZP4VSEBSKTEJGVQ|1523705319    |3.0   |
|0590930028|AHAVNTRRREZNW6BBYDYKY7NBX6EA|1042685157    |4.0   |
|1421558874|AHAWIH6FH27UZ4MSBVJKYYXI477Q|1627968875    |5.0   |
+----------+----------------------------+--------------+------+
only showing top 5 rows

🔧 Limiting to 200 most recent interactions per user...

📊 Interaction Counts:
  Total interactions (raw)     : 8,719,087
  Total interactions 

In [0]:
print("\nComputing item normalization: 1/sqrt(#users_per_item)")

# Per-item popularity: number of distinct users that touched the item, plus
# the derived damping factor 1/sqrt(user_count).
users_per_item = interactions_limited.groupBy("item_id").agg(
    F.countDistinct("user_id").alias("user_count")
)
item_activity = users_per_item.withColumn(
    "item_norm_factor", 1.0 / F.sqrt(F.col("user_count"))
)

# Attach the factor to every interaction row. With implicit feedback the
# raw interaction value is 1, so the normalized value is the factor itself.
normalized_df = (
    interactions_limited
    .join(item_activity, "item_id")
    .select(
        "user_id",
        "item_id",
        F.col("item_norm_factor").alias("normalized_interaction"),
    )
)

print("Normalized interactions ready (by ITEM).")
print("\nSample normalized interactions:")
normalized_df.show(10, truncate=False)

# Sanity check: the most popular items should carry the smallest factors.
print("\nVerification: Sample item norms (most popular items)")
item_activity.orderBy(F.col("user_count").desc()).show(10, truncate=False)


📐 Computing item normalization: 1/sqrt(#users_per_item)
✅ Normalized interactions ready (by ITEM).

📊 Sample normalized interactions:
+----------------------------+----------+----------------------+
|user_id                     |item_id   |normalized_interaction|
+----------------------------+----------+----------------------+
|AE2MNSXQDAPAHIPCRSRGZ6KSI5LQ|1572840897|0.4082482904638631    |
|AE2NTQFK4LFFDGVNIBV4GKL6UA2A|B00HU5KLR0|0.1796053020267749    |
|AE2X37LDNKEX77Y4QY6KRSP7PWGA|0998863831|0.4472135954999579    |
|AE2X37LDNKEX77Y4QY6KRSP7PWGA|0307957276|0.10425720702853739   |
|AE37JRSQFSYOAR34XASJCV2JSHXQ|0307460681|0.22360679774997896   |
|AE3FM3HGYR5644KV4CQ3MF2F75PA|0500516952|0.4082482904638631    |
|AE3KHEJ76PDZ3JX2Y56OS2PAU7RA|0062200577|0.07432941462471664   |
|AE3O5IH6MT74GCDIMHZOBHZT3VBQ|B00AQMH0FA|0.25                  |
|AE4MDI5AGHBBBVMIUIPQUHF2O53A|B095L3S271|0.14744195615489714   |
|AE4YFIO53YVMGRBCA6FAK5M6OQWA|1335081283|0.11785113019775793   |
+-------------------

### Build R_C and save to Delta

In [0]:
print("\nComputing collaborative similarity (cosine similarity based on co-occurrence)...")

# Collapse repeat rows so each (user, item) pair appears exactly once.
# A user with several rows for the same item would otherwise multiply the
# self-join output; after deduplication a plain count equals the number of
# distinct users, so the resulting counts are unchanged.
user_item_pairs = interactions_limited.select("user_id", "item_id").distinct()

# Self-join to find items purchased by same users
df_a = user_item_pairs.alias("a")
df_b = user_item_pairs.alias("b")

print("  Step 1: Finding co-occurrences...")
co_occurrence = (
    df_a.join(df_b, on="user_id")
    .filter(F.col("a.item_id") < F.col("b.item_id"))  # Only upper triangle to avoid duplicates
    .groupBy(
        F.col("a.item_id").alias("item_i"),
        F.col("b.item_id").alias("item_j")
    )
    .agg(
        # One joined row per user per item pair, so count == distinct users.
        F.count("user_id").alias("common_users")
    )
)

co_count = co_occurrence.count()
print(f"Found {co_count:,} co-occurrence pairs")

# Count total users per item
print("  Step 2: Counting users per item...")
item_user_counts = (
    user_item_pairs
    .groupBy("item_id")
    # count("user_id") skips null users, matching countDistinct on the raw rows.
    .agg(F.count("user_id").alias("total_users"))
)

item_count = item_user_counts.count()
print(f"{item_count:,} unique items")

# Join to compute cosine similarity: common_users / sqrt(users_i * users_j)
print("  Step 3: Computing similarity scores...")
collab_upper = (
    co_occurrence
    .join(item_user_counts.alias("i"), F.col("item_i") == F.col("i.item_id"))
    .join(item_user_counts.alias("j"), F.col("item_j") == F.col("j.item_id"))
    .withColumn(
        "collab_score",
        F.col("common_users") / F.sqrt(F.col("i.total_users") * F.col("j.total_users"))
    )
    .select("item_i", "item_j", "collab_score")
)

print("\nSample R_C (upper triangle only):")
collab_upper.show(10, truncate=False)

# Verify that scores are in valid range [0, 1]
print("\nVerification: Score range")
score_stats = collab_upper.select(
    F.min("collab_score").alias("min_score"),
    F.max("collab_score").alias("max_score"),
    F.mean("collab_score").alias("avg_score")
).collect()[0]

if score_stats['max_score'] is None:
    # Aggregates over zero rows come back as None; formatting them with
    # ":.6f" would raise TypeError, so report the empty result instead.
    print("WARNING: no co-occurrence pairs found; R_C is empty.")
else:
    print(f"  Min score: {score_stats['min_score']:.6f}")
    print(f"  Max score: {score_stats['max_score']:.6f}")
    print(f"  Avg score: {score_stats['avg_score']:.6f}")

    # Check both ends of the range the success message claims.
    score_too_high = score_stats['max_score'] > 1.0
    score_too_low = score_stats['min_score'] < 0.0
    if score_too_high:
        print("WARNING: collab_score > 1 detected! Formula may be incorrect.")
    if score_too_low:
        print("WARNING: collab_score < 0 detected! Formula may be incorrect.")
    if not (score_too_high or score_too_low):
        print("All collab_scores in valid range [0, 1]")


🔧 Computing collaborative similarity (cosine similarity based on co-occurrence)...
  Step 1: Finding co-occurrences...
  ✅ Found 131,048,405 co-occurrence pairs
  Step 2: Counting users per item...
  ✅ 494,860 unique items
  Step 3: Computing similarity scores...

📊 Sample R_C (upper triangle only):
+----------+----------+---------------------+
|item_i    |item_j    |collab_score         |
+----------+----------+---------------------+
|0142414972|1400207983|0.038235955645093626 |
|0670026182|0811867846|0.03592106040535498  |
|1590200551|1594633665|0.0063511588826065165|
|061899923X|0763668966|0.015278308283803521 |
|0802121969|B01N5HHWS4|0.10259783520851541  |
|0394868773|0736423478|0.020145574100634507 |
|0316038636|0836217179|0.01620722127754384  |
|076365549X|1579654584|0.014740991400388204 |
|0062687735|1559717866|0.08333333333333333  |
|0976631008|B006LSZECO|0.002518803791793962 |
+----------+----------+---------------------+
only showing top 10 rows

🔍 Verification: Score range


In [0]:
print("\nSymmetrizing R_C matrix...")

# Mirror image of the upper triangle: each (i, j, score) row becomes
# (j, i, score), keeping the same column names so the union lines up.
collab_lower = collab_upper.select(
    F.col("item_j").alias("item_i"),
    F.col("item_i").alias("item_j"),
    "collab_score",
)

# Upper triangle plus its mirror gives the full symmetric matrix.
R_C_symmetric = collab_upper.unionByName(collab_lower)

symmetric_count = R_C_symmetric.count()
print(f"Symmetric R_C: {symmetric_count:,} pairs")
print(f"   (Should be ~2x upper triangle: {co_count:,} × 2 = {co_count*2:,})")

print("\nSample symmetric R_C:")
R_C_symmetric.show(10, truncate=False)




🔄 Symmetrizing R_C matrix...
✅ Symmetric R_C: 262,096,810 pairs
   (Should be ~2x upper triangle: 131,048,405 × 2 = 262,096,810)

📊 Sample symmetric R_C:
+----------+----------+---------------------+
|item_i    |item_j    |collab_score         |
+----------+----------+---------------------+
|0142414972|1400207983|0.038235955645093626 |
|0670026182|0811867846|0.03592106040535498  |
|1590200551|1594633665|0.0063511588826065165|
|061899923X|0763668966|0.015278308283803521 |
|0802121969|B01N5HHWS4|0.10259783520851541  |
|0394868773|0736423478|0.020145574100634507 |
|0316038636|0836217179|0.01620722127754384  |
|076365549X|1579654584|0.014740991400388204 |
|0062687735|1559717866|0.08333333333333333  |
|0976631008|B006LSZECO|0.002518803791793962 |
+----------+----------+---------------------+
only showing top 10 rows


In [0]:
print(f"\nFiltering to keep only top-{TOP_K_NEIGHBORS} neighbors per item...")

# Define window: partition by item_i, order by score descending.
# collab_score is a ratio of small integers, so equal scores are common.
# Without a tie-breaker row_number() may keep a different set of neighbors
# at the cut-off on every run; item_j makes the ranking reproducible.
window_spec = Window.partitionBy("item_i").orderBy(
    F.col("collab_score").desc(),
    F.col("item_j").asc(),
)

# Apply top-K filter (the final select already drops the helper column).
R_C_top200 = (
    R_C_symmetric
    .withColumn("rank", F.row_number().over(window_spec))
    .filter(F.col("rank") <= TOP_K_NEIGHBORS)
    .select("item_i", "item_j", "collab_score")
)

before_count = symmetric_count
after_count = R_C_top200.count()
removed_count = before_count - after_count
# Guard the ratio so an empty matrix reports 0.0% instead of raising
# ZeroDivisionError.
reduction_pct = (1 - after_count / before_count) * 100 if before_count else 0.0

print("\nFiltering Results:")
print(f"  Before filtering: {before_count:,} pairs")
print(f"  After filtering : {after_count:,} pairs")
print(f"  Reduction       : {removed_count:,} pairs ({reduction_pct:.1f}%)")

# Verify top-K constraint
print(f"\nVerifying top-{TOP_K_NEIGHBORS} constraint...")
neighbors_per_item = R_C_top200.groupBy("item_i").count()

neighbor_stats = neighbors_per_item.select(
    F.min("count").alias("min_neighbors"),
    F.max("count").alias("max_neighbors"),
    F.mean("count").alias("avg_neighbors"),
    F.expr("percentile(count, 0.5)").alias("median_neighbors")
).collect()[0]

if neighbor_stats['max_neighbors'] is None:
    # Aggregates over zero rows are None; formatting or comparing them
    # would raise TypeError.
    print("WARNING: R_C is empty; no neighbors to verify.")
else:
    print(f"  Min neighbors  : {neighbor_stats['min_neighbors']}")
    print(f"  Max neighbors  : {neighbor_stats['max_neighbors']}")
    print(f"  Avg neighbors  : {neighbor_stats['avg_neighbors']:.2f}")
    print(f"  Median neighbors: {neighbor_stats['median_neighbors']:.0f}")

    if neighbor_stats['max_neighbors'] > TOP_K_NEIGHBORS:
        print(f"WARNING: Some items have more than {TOP_K_NEIGHBORS} neighbors!")
    else:
        print(f"All items have ≤ {TOP_K_NEIGHBORS} neighbors")


🎯 Filtering to keep only top-200 neighbors per item...

📊 Filtering Results:
  Before filtering: 262,096,810 pairs
  After filtering : 80,585,954 pairs
  Reduction       : 181,510,856 pairs (69.3%)

🔍 Verifying top-200 constraint...
  Min neighbors  : 3
  Max neighbors  : 200
  Avg neighbors  : 162.85
  Median neighbors: 200
  ✅ All items have ≤ 200 neighbors


In [0]:
print(f"\nSaving R_C matrix to {COLLAB_MATRIX_TABLE}...")

# Persist the top-K matrix as a Delta table, replacing both the data and
# the schema of any table already stored under that name.
delta_writer = (
    R_C_top200.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.saveAsTable(COLLAB_MATRIX_TABLE)

print(f"Saved collaborative matrix R_C to table: {COLLAB_MATRIX_TABLE}")

# Read the table back and compare its row count with the count taken
# before the write.
final_count = spark.table(COLLAB_MATRIX_TABLE).count()
print(f"\nFinal verification:")
print(f"  Pairs in Delta table: {final_count:,}")

if final_count != after_count:
    print(f"WARNING: Row count mismatch! Expected {after_count:,}, got {final_count:,}")
else:
    print(" Row count matches!")


💾 Saving R_C matrix to `bigdata-and-bi`.gold.star_collab_matrix...
✅ Saved collaborative matrix R_C to table: `bigdata-and-bi`.gold.star_collab_matrix

📊 Final verification:
  Pairs in Delta table: 80,585,954
  ✅ Row count matches!


In [0]:
print("\nOptimizing Delta table...")

# Compact the table and Z-order it by item_i, the column used to look up an
# item's neighbors, for better join performance.
optimize_sql = f"""
    OPTIMIZE {COLLAB_MATRIX_TABLE}
    ZORDER BY (item_i)
"""
spark.sql(optimize_sql)

print("Optimization complete!")

# Refresh column-level statistics on the table.
analyze_sql = f"ANALYZE TABLE {COLLAB_MATRIX_TABLE} COMPUTE STATISTICS FOR ALL COLUMNS"
spark.sql(analyze_sql)
print("Statistics collected")


🔧 Optimizing Delta table...
✅ Optimization complete!
✅ Statistics collected
