## STAR Retrieval & Scoring Pipeline
### Calculate Semantic Score (R_S), Collaborative Score (R_C) and Final Ranking

score(x) = (1/n) * Σ[j=1 to n] r_j * λ^t_j * [a*R_S_xj + (1-a)*R_C_xj]
Where:
- R_S_xj: Semantic similarity (cosine similarity of embeddings)
- R_C_xj: Collaborative score (from pre-computed collab matrix)
- r_j: User rating (set to 1 for implicit feedback)
- λ^t_j: Temporal decay factor
- a: Weight factor between semantic and collaborative

### Configuration and Imports


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType
import time

In [0]:
# Database configuration
GOLD_DATABASE = "`bigdata-and-bi`.gold"

# Input tables
INTERACTIONS_TABLE = f"{GOLD_DATABASE}.star_interactions"
SEMANTIC_MATRIX_TABLE = f"{GOLD_DATABASE}.star_semantic_matrix"  # Pre-computed R_S
COLLAB_MATRIX_TABLE = f"{GOLD_DATABASE}.star_collab_matrix"      # Pre-computed R_C

# Output table
USER_RECOMMENDATIONS_TABLE = f"{GOLD_DATABASE}.star_user_recommendations"

# STAR parameters (from paper experiments)
HISTORY_LENGTH = 3
LAMBDA_DECAY = 0.7
ALPHA_WEIGHT = 0.5  # 0.5 means equal weight for semantic + collaborative
TOP_K_RECOMMENDATIONS = 30

In [0]:
print("=" * 60)
print("STAR Configuration:")
print("=" * 60)
print(f"  History length (l): {HISTORY_LENGTH}")
print(f"  Decay factor (λ): {LAMBDA_DECAY}")
print(f"  Semantic weight (α): {ALPHA_WEIGHT}")
print(f"  Collaborative weight (1-α): {1 - ALPHA_WEIGHT}")
print(f"  Top-K output: {TOP_K_RECOMMENDATIONS}")
print("=" * 60)

STAR Configuration:
  History length (l): 3
  Decay factor (λ): 0.7
  Semantic weight (α): 0.5
  Collaborative weight (1-α): 0.5
  Top-K output: 30


### Load Data


In [0]:
print("\nLoading pre-computed matrices...")
start_time = time.time()

# Load R_S (semantic similarity matrix)
semantic_matrix_df = spark.table(SEMANTIC_MATRIX_TABLE).select(
    F.col("item_i").alias("history_item"),
    F.col("item_j").alias("candidate_item"),
    F.col("semantic_score")
)
rs_count = semantic_matrix_df.count()
print(f"R_S matrix: {rs_count:,} pairs")


📥 Loading pre-computed matrices...
  ✅ R_S matrix: 88,510,600 pairs


In [0]:
# Load R_C (collaborative similarity matrix)
collab_matrix_df = spark.table(COLLAB_MATRIX_TABLE).select(
    F.col("item_i").alias("history_item"),
    F.col("item_j").alias("candidate_item"),
    F.col("collab_score")
)
rc_count = collab_matrix_df.count()
print(f"R_C matrix: {rc_count:,} pairs")

# Combine R_S and R_C into single matrix with proper null handling
print("\nCombining R_S and R_C matrices...")

  ✅ R_C matrix: 80,585,954 pairs

🔗 Combining R_S and R_C matrices...


In [0]:

# Full outer join to keep all pairs from both matrices
combined_matrix_df = semantic_matrix_df.join(
    collab_matrix_df,
    on=["history_item", "candidate_item"],
    how="outer"  # Keep all pairs from both matrices
)

# Check coverage before filling nulls
print("\nMatrix Coverage Analysis:")
coverage_stats = combined_matrix_df.select(
    F.count("*").alias("total_pairs"),
    F.sum(F.when(F.col("semantic_score").isNotNull(), 1).otherwise(0)).alias("has_semantic"),
    F.sum(F.when(F.col("collab_score").isNotNull(), 1).otherwise(0)).alias("has_collab"),
    F.sum(F.when(F.col("semantic_score").isNotNull() & F.col("collab_score").isNotNull(), 1).otherwise(0)).alias("has_both"),
    F.sum(F.when(F.col("semantic_score").isNull() & F.col("collab_score").isNotNull(), 1).otherwise(0)).alias("only_collab"),
    F.sum(F.when(F.col("semantic_score").isNotNull() & F.col("collab_score").isNull(), 1).otherwise(0)).alias("only_semantic")
).collect()[0]

print(f"  Total pairs: {coverage_stats['total_pairs']:,}")
print(f"  Has R_S: {coverage_stats['has_semantic']:,} ({coverage_stats['has_semantic']/coverage_stats['total_pairs']*100:.1f}%)")
print(f"  Has R_C: {coverage_stats['has_collab']:,} ({coverage_stats['has_collab']/coverage_stats['total_pairs']*100:.1f}%)")
print(f"  Has both: {coverage_stats['has_both']:,} ({coverage_stats['has_both']/coverage_stats['total_pairs']*100:.1f}%)")
print(f"  Only R_S: {coverage_stats['only_semantic']:,}")
print(f"  Only R_C: {coverage_stats['only_collab']:,}")

# Fill nulls with 0.0 (missing score means no similarity)
combined_matrix_df = combined_matrix_df \
    .fillna(0.0, subset=["semantic_score", "collab_score"])

# Add flags to track data source
combined_matrix_df = combined_matrix_df \
    .withColumn("has_semantic", F.col("semantic_score") > 0) \
    .withColumn("has_collab", F.col("collab_score") > 0)

# Pre-compute combined similarity score
# score = α * R_S + (1-α) * R_C
combined_matrix_df = combined_matrix_df.withColumn(
    "combined_score",
    ALPHA_WEIGHT * F.col("semantic_score") + (1 - ALPHA_WEIGHT) * F.col("collab_score")
).select(
    "history_item", 
    "candidate_item", 
    "combined_score",
    "semantic_score",
    "collab_score",
    "has_semantic",
    "has_collab"
)

combined_count = combined_matrix_df.count()

print(f"\nCombined matrix: {combined_count:,} pairs")

# Show distribution of combined scores
print("\nCombined Score Distribution:")
combined_matrix_df.select(
    F.min("combined_score").alias("min"),
    F.max("combined_score").alias("max"),
    F.mean("combined_score").alias("mean"),
    F.stddev("combined_score").alias("stddev")
).show()

print(f"Loading time: {time.time() - start_time:.2f}s")


📊 Matrix Coverage Analysis:
  Total pairs: 167,022,875
  Has R_S: 88,510,600 (53.0%)
  Has R_C: 80,585,954 (48.2%)
  Has both: 2,073,679 (1.2%)
  Only R_S: 86,436,921
  Only R_C: 78,512,275

  ✅ Combined matrix: 167,022,875 pairs

📈 Combined Score Distribution:
+--------------------+------------------+------------------+-------------------+
|                 min|               max|              mean|             stddev|
+--------------------+------------------+------------------+-------------------+
|7.031542100033996E-4|0.9994059801101685|0.1735841389391148|0.11926603580588531|
+--------------------+------------------+------------------+-------------------+

⏱️  Loading time: 36.50s


### Prepare User History with Temporal Info


In [0]:
print("\nPreparing user histories...")
start_time = time.time()

# Load interactions
interactions_df = spark.table(INTERACTIONS_TABLE).select(
    F.col("user_id"),
    F.col("item_id"),
    F.col("unixReviewTime").alias("timestamp"),
    F.coalesce(F.col("rating"), F.lit(1.0)).alias("rating")
)

print(f"Total interactions: {interactions_df.count():,}")

# Get most recent HISTORY_LENGTH items per user with temporal weights
w_time = Window.partitionBy("user_id").orderBy(F.col("timestamp").desc())

user_history_df = interactions_df \
    .withColumn("recency_rank", F.row_number().over(w_time)) \
    .filter(F.col("recency_rank") <= HISTORY_LENGTH) \
    .withColumn(
        "temporal_weight", 
        F.pow(F.lit(LAMBDA_DECAY), F.col("recency_rank") - 1)
    ) \
    .select(
        "user_id",
        F.col("item_id").alias("history_item"),
        "rating",
        "recency_rank",
        "temporal_weight"
    )

history_count = user_history_df.count()
users_count = user_history_df.select("user_id").distinct().count()

print(f"User-history pairs: {history_count:,}")
print(f"Unique users: {users_count:,}")
print(f"Avg history per user: {history_count / users_count:.2f}")
print(f"Preparation time: {time.time() - start_time:.2f}s")

# Show sample
print("\nSample user histories:")
user_history_df.show(10, truncate=False)


🔄 Preparing user histories...
  Total interactions: 8,719,087
  ✅ User-history pairs: 2,329,110
  ✅ Unique users: 776,370
  ✅ Avg history per user: 3.00
⏱️  Preparation time: 3.94s

📋 Sample user histories:
+----------------------------+------------+------+------------+-------------------+
|user_id                     |history_item|rating|recency_rank|temporal_weight    |
+----------------------------+------------+------+------------+-------------------+
|AE2224D3S4GTKVFJ5V7ZRQJ7P4FQ|B01N5XZCTV  |2.0   |1           |1.0                |
|AE2224D3S4GTKVFJ5V7ZRQJ7P4FQ|B002U3CBLQ  |5.0   |2           |0.7                |
|AE2224D3S4GTKVFJ5V7ZRQJ7P4FQ|B003KK5RR8  |4.0   |3           |0.48999999999999994|
|AE222H3FGXWLHRFUMGMS2RR57NDQ|0307090302  |5.0   |1           |1.0                |
|AE222H3FGXWLHRFUMGMS2RR57NDQ|0307090299  |5.0   |2           |0.7                |
|AE222H3FGXWLHRFUMGMS2RR57NDQ|B01C1LUFFK  |4.0   |3           |0.48999999999999994|
|AE222SFRFOAFCUJ3DKA3MVPDP3IA|043913

### Define Scoring UDFs


In [0]:
print("\nJoining user history with similarity matrix...")
start_time = time.time()

# Join user history with combined matrix to get candidates
# This is efficient because combined_matrix only has top-200 neighbors per item
candidates_with_scores = user_history_df.join(
    combined_matrix_df,
    on="history_item",
    how="inner"
).select(
    "user_id",
    "history_item",
    "candidate_item",
    "rating",
    "temporal_weight",
    "combined_score",
    "semantic_score",
    "collab_score",
    "has_semantic",
    "has_collab"
)

candidates_count = candidates_with_scores.count()
print(f"Candidate pairs: {candidates_count:,}")
print(f"Join time: {time.time() - start_time:.2f}s")

# Check how many candidates come from each source
print("\nCandidate Sources:")
candidates_with_scores.select(
    F.sum(F.when(F.col("has_semantic") & F.col("has_collab"), 1).otherwise(0)).alias("both"),
    F.sum(F.when(F.col("has_semantic") & ~F.col("has_collab"), 1).otherwise(0)).alias("only_semantic"),
    F.sum(F.when(~F.col("has_semantic") & F.col("has_collab"), 1).otherwise(0)).alias("only_collab")
).show()

# Show sample
print("\nSample candidates:")
candidates_with_scores.show(10, truncate=False)


🔗 Joining user history with similarity matrix...
  ✅ Candidate pairs: 829,311,090
⏱️  Join time: 15.84s

📊 Candidate Sources:
+--------+-------------+-----------+
|    both|only_semantic|only_collab|
+--------+-------------+-----------+
|13654694|    402975706|  412680690|
+--------+-------------+-----------+


📋 Sample candidates:
+----------------------------+------------+--------------+------+-------------------+--------------------+--------------+--------------------+------------+----------+
|user_id                     |history_item|candidate_item|rating|temporal_weight    |combined_score      |semantic_score|collab_score        |has_semantic|has_collab|
+----------------------------+------------+--------------+------+-------------------+--------------------+--------------+--------------------+------------+----------+
|AHQLSN5RFYDHTYEVY535JVREVV7Q|B005MWKB2I  |0670026107    |3.0   |1.0                |0.03571428571428571 |0.0           |0.07142857142857142 |false       |true     

In [0]:
print("\nCalculating final STAR scores...")
start_time = time.time()

# Apply rating and temporal weight to similarity score
weighted_scores = candidates_with_scores.withColumn(
    "weighted_score",
    F.col("rating") * F.col("temporal_weight") * F.col("combined_score")
)

# Aggregate by (user_id, candidate_item) - this is the core STAR formula
# score(x) = (1/n) * Σ[weighted_score]
final_scores = weighted_scores.groupBy("user_id", "candidate_item") \
    .agg(
        F.sum("weighted_score").alias("score_sum"),
        F.count("*").alias("history_count"),
        F.max("combined_score").alias("max_similarity"),
        F.avg("combined_score").alias("avg_similarity"),
        F.max("semantic_score").alias("max_semantic"),
        F.max("collab_score").alias("max_collab"),
        F.sum(F.when(F.col("has_semantic"), 1).otherwise(0)).alias("semantic_sources"),
        F.sum(F.when(F.col("has_collab"), 1).otherwise(0)).alias("collab_sources")
    ) \
    .withColumn(
        "final_score",
        F.col("score_sum") / F.col("history_count")  # Average per history item
    ) \
    .select(
        "user_id",
        F.col("candidate_item").alias("item_id"),
        "final_score",
        "max_similarity",
        "avg_similarity",
        "max_semantic",
        "max_collab",
        "history_count",
        "semantic_sources",
        "collab_sources"
    )

final_count = final_scores.count()

print(f"Final scores computed: {final_count:,}")
print(f"Scoring time: {time.time() - start_time:.2f}s")

# Analyze data source distribution
print("\nData Source Analysis:")
source_dist = final_scores.select(
    F.sum(F.when((F.col("semantic_sources") > 0) & (F.col("collab_sources") > 0), 1).otherwise(0)).alias("both_sources"),
    F.sum(F.when((F.col("semantic_sources") > 0) & (F.col("collab_sources") == 0), 1).otherwise(0)).alias("only_semantic"),
    F.sum(F.when((F.col("semantic_sources") == 0) & (F.col("collab_sources") > 0), 1).otherwise(0)).alias("only_collab"),
    F.count("*").alias("total")
).collect()[0]

print(f"  Both R_S & R_C: {source_dist['both_sources']:,} ({source_dist['both_sources']/source_dist['total']*100:.1f}%)")
print(f"  Only R_S: {source_dist['only_semantic']:,} ({source_dist['only_semantic']/source_dist['total']*100:.1f}%)")
print(f"  Only R_C: {source_dist['only_collab']:,} ({source_dist['only_collab']/source_dist['total']*100:.1f}%)")

# Show top scores
print("\nTop scored items (across all users):")
final_scores.orderBy(F.col("final_score").desc()).show(20, truncate=False)

# Score distribution
print("\nScore statistics:")
final_scores.select(
    F.min("final_score").alias("min"),
    F.max("final_score").alias("max"),
    F.mean("final_score").alias("mean"),
    F.stddev("final_score").alias("stddev")
).show()


🎯 Calculating final STAR scores...
  ✅ Final scores computed: 788,772,912
⏱️  Scoring time: 106.48s

📊 Data Source Analysis:
  Both R_S & R_C: 14,774,746 (1.9%)
  Only R_S: 383,689,993 (48.6%)
  Only R_C: 390,308,173 (49.5%)

🏆 Top scored items (across all users):
+----------------------------+----------+------------------+------------------+------------------+------------+------------------+-------------+----------------+--------------+
|user_id                     |item_id   |final_score       |max_similarity    |avg_similarity    |max_semantic|max_collab        |history_count|semantic_sources|collab_sources|
+----------------------------+----------+------------------+------------------+------------------+------------+------------------+-------------+----------------+--------------+
|AH2ZQQT4L3CKJHLQKZOSG5JXWNEA|1974707180|4.834704033132715 |0.966940806626543 |0.966940806626543 |0.99846727  |0.9354143466934853|1            |1               |1             |
|AHA4ZLYRQR6JEQ577N3MUYET2

### Generate Candidate Items


In [0]:
print(f"\nRanking and selecting top-{TOP_K_RECOMMENDATIONS} per user...")
start_time = time.time()

# Rank by final_score within each user
w_rank = Window.partitionBy("user_id").orderBy(F.col("final_score").desc())

recommendations_df = final_scores \
    .withColumn("rank", F.row_number().over(w_rank)) \
    .filter(F.col("rank") <= TOP_K_RECOMMENDATIONS) \
    .orderBy("user_id", "rank")

rec_count = recommendations_df.count()

print(f"Total recommendations: {rec_count:,}")
print(f"Users with recommendations: {recommendations_df.select('user_id').distinct().count():,}")
print(f"Ranking time: {time.time() - start_time:.2f}s")

# Show sample recommendations for one user
sample_user = recommendations_df.select("user_id").first()[0]
print(f"\n👤 Sample recommendations for user: {sample_user}")
recommendations_df.filter(F.col("user_id") == sample_user).show(20, truncate=False)


🏅 Ranking and selecting top-30 per user...
  ✅ Total recommendations: 23,291,100
  ✅ Users with recommendations: 776,370
⏱️  Ranking time: 550.31s

👤 Sample recommendations for user: AE22236AFRRSMQIKGG7TPTB75QEA
+----------------------------+----------+------------------+-------------------+-------------------+------------+--------------------+-------------+----------------+--------------+----+
|user_id                     |item_id   |final_score       |max_similarity     |avg_similarity     |max_semantic|max_collab          |history_count|semantic_sources|collab_sources|rank|
+----------------------------+----------+------------------+-------------------+-------------------+------------+--------------------+-------------+----------------+--------------+----+
|AE22236AFRRSMQIKGG7TPTB75QEA|B0030DHPAW|2.6990791310972817|0.5398158262194563 |0.5398158262194563 |0.901844    |0.17778762778070956 |1            |1               |1             |1   |
|AE22236AFRRSMQIKGG7TPTB75QEA|B000UZNS0O|2.

In [0]:
### Calculate Semantic Score (R_S)




In [0]:
print(f"\nSaving recommendations to {USER_RECOMMENDATIONS_TABLE}...")
start_time = time.time()

# Select only the essential 4 columns
recommendations_final = recommendations_df.select(
    "user_id",                      # STRING - User identifier
    "item_id",                      # STRING - Recommended item
    "rank",                         # INT - Rank position (1 to TOP_K)
    "final_score"                   # FLOAT - STAR computed score
)

# Save to Delta
recommendations_final.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(USER_RECOMMENDATIONS_TABLE)

print(f"Saved successfully!")
print(f"Save time: {time.time() - start_time:.2f}s")

# Show schema
print("\nTable Schema:")
recommendations_final.printSchema()

# Show sample
print("\nSample recommendations:")
recommendations_final.show(10, truncate=False)


💾 Saving recommendations to `bigdata-and-bi`.gold.star_user_recommendations...
✅ Saved successfully!
⏱️  Save time: 467.56s

📋 Table Schema:
root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- rank: integer (nullable = false)
 |-- final_score: double (nullable = true)


📊 Sample recommendations:
+----------------------------+----------+----+------------------+
|user_id                     |item_id   |rank|final_score       |
+----------------------------+----------+----+------------------+
|AE22236AFRRSMQIKGG7TPTB75QEA|B0030DHPAW|1   |2.6990791310972817|
|AE22236AFRRSMQIKGG7TPTB75QEA|B000UZNS0O|2   |2.688416745027133 |
|AE22236AFRRSMQIKGG7TPTB75QEA|B000PC0SBY|3   |2.595545660483002 |
|AE22236AFRRSMQIKGG7TPTB75QEA|B000PDYVQ6|4   |2.584702193423582 |
|AE22236AFRRSMQIKGG7TPTB75QEA|B002YKOX3E|5   |2.5051297221797504|
|AE22236AFRRSMQIKGG7TPTB75QEA|045141912X|6   |2.48974472284317  |
|AE22236AFRRSMQIKGG7TPTB75QEA|B001650UDA|7   |2.4826291442807182|
|AE222