In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
from datetime import datetime
import time

### Configuration

In [0]:
# Qualified prefix shared by every table this notebook reads or writes
GOLD_DATABASE = "`bigdata-and-bi`.gold"

# Tables the evaluation reads from
INTERACTIONS_TABLE = GOLD_DATABASE + ".star_interactions"
RECOMMENDATIONS_TABLE = GOLD_DATABASE + ".star_user_recommendations"

# Table the evaluation results are appended to
EVALUATION_METRICS_TABLE = GOLD_DATABASE + ".star_evaluation_metrics"

# Evaluation knobs
TOP_K = 30  # how many ranked recommendations per user are scored
MIN_TOTAL_INTERACTIONS = 2  # a user needs at least 2 interactions (1 train + 1 test)

# Emit the run banner in a single call; each list entry is one output line.
print("\n".join([
    "=" * 80,
    "STAR Recommender System - Evaluation Pipeline",
    "=" * 80,
    f"Interactions table: {INTERACTIONS_TABLE}",
    f"Recommendations table: {RECOMMENDATIONS_TABLE}",
    f"Evaluation metrics table: {EVALUATION_METRICS_TABLE}",
    f"Top-K evaluation: {TOP_K}",
    "Test strategy: Last interaction per user",
    "=" * 80,
]))

STAR Recommender System - Evaluation Pipeline
📊 Interactions table: `bigdata-and-bi`.gold.star_interactions
🎯 Recommendations table: `bigdata-and-bi`.gold.star_user_recommendations
📈 Evaluation metrics table: `bigdata-and-bi`.gold.star_evaluation_metrics
🔢 Top-K evaluation: 30
📝 Test strategy: Last interaction per user


### Helper function

In [0]:
def create_evaluation_table_if_not_exists():
    """
    Create the evaluation metrics Delta table if it does not exist yet.

    The table accumulates one row per evaluation run, so an existing table
    must never be replaced. Existence is checked through the catalog rather
    than by catching every exception from a table lookup, and the creating
    write uses save mode "ignore" (a no-op when the table already exists)
    instead of "overwrite". Together these guarantee that a failed or
    misreporting existence check can never wipe previously stored results.

    Reads the notebook globals ``spark`` and ``EVALUATION_METRICS_TABLE``.

    Returns:
        None. Progress is reported with ``print``.
    """
    schema = StructType([
        StructField("evaluation_id", StringType(), False),
        StructField("evaluation_timestamp", TimestampType(), False),
        StructField("test_users_count", IntegerType(), True),
        StructField("test_interactions_count", IntegerType(), True),
        StructField("users_with_recommendations", IntegerType(), True),
        StructField("precision_at_k", DoubleType(), True),
        StructField("recall_at_k", DoubleType(), True),
        StructField("f1_at_k", DoubleType(), True),
        StructField("hit_rate", DoubleType(), True),
        StructField("mrr", DoubleType(), True),  # Mean Reciprocal Rank
        StructField("map_score", DoubleType(), True),  # Mean Average Precision
        StructField("ndcg", DoubleType(), True),  # Normalized DCG
        StructField("avg_hit_position", DoubleType(), True),  # Average rank of hits
        StructField("coverage", DoubleType(), True),
        StructField("avg_recommendations_per_user", DoubleType(), True),
        StructField("processing_time_seconds", DoubleType(), True)
    ])

    if spark.catalog.tableExists(EVALUATION_METRICS_TABLE):
        print(f"Evaluation metrics table already exists: {EVALUATION_METRICS_TABLE}")
        return

    print(f"Creating evaluation metrics table: {EVALUATION_METRICS_TABLE}")
    # "ignore" keeps this safe even if the table appeared (or was missed by
    # the check) in the meantime: existing data is left untouched.
    (spark.createDataFrame([], schema)
     .write
     .format("delta")
     .mode("ignore")
     .saveAsTable(EVALUATION_METRICS_TABLE))
    print("Created evaluation metrics table")

### Load data

In [0]:
print("\n" + "\n".join(("=" * 80, "STEP 1: LOAD DATA", "=" * 80)))

# Started here and read again by the results cell to report total processing time
start_time = time.time()

# Interaction log: row count plus distinct users and items
all_interactions_df = spark.table(INTERACTIONS_TABLE)
total_interactions = all_interactions_df.count()
total_users, total_items = (
    all_interactions_df.select(id_column).distinct().count()
    for id_column in ("user_id", "item_id")
)

print(
    f"All interactions loaded: {total_interactions:,}\n"
    f"   Total users: {total_users:,}\n"
    f"   Total items: {total_items:,}"
)

# Precomputed recommendations: row count plus number of distinct users covered
recommendations_df = spark.table(RECOMMENDATIONS_TABLE)
total_recommendations = recommendations_df.count()
unique_users_with_recs = recommendations_df.select("user_id").distinct().count()

print(
    f"Recommendations loaded: {total_recommendations:,}\n"
    f"Users with recommendations: {unique_users_with_recs:,}"
)

elapsed_seconds = time.time() - start_time
print(f"Loading time: {elapsed_seconds:.2f}s")


STEP 1: LOAD DATA
📦 All interactions loaded: 9,489,569
   Total users: 776,370
   Total items: 495,063
🎯 Recommendations loaded: 23,291,100
👥 Users with recommendations: 776,370
⏱️  Loading time: 4.15s


### Identify test interactions

In [0]:
print("\n" + "=" * 80)
print("STEP 2: EXTRACT TEST INTERACTIONS")
print("=" * 80)

print(f"\nTest Strategy: Using LAST interaction of each user as test case")
print(f"   - Rationale: Last interaction represents user's most recent preference")
print(f"   - All previous interactions were used for training")
print(f"   - This simulates real-world prediction scenario")

# Count interactions per user
user_interaction_counts = all_interactions_df.groupBy("user_id").agg(
    F.count("*").alias("total_interactions")
)

# Keep only users with at least MIN_TOTAL_INTERACTIONS interactions
users_with_enough_data = user_interaction_counts.filter(
    F.col("total_interactions") >= MIN_TOTAL_INTERACTIONS
)

users_for_eval = users_with_enough_data.count()
print(f"\nUsers with >= {MIN_TOTAL_INTERACTIONS} interactions: {users_for_eval:,} / {total_users:,}")

# Rank each user's interactions from newest to oldest.
# The extra sort keys break ties between interactions that share the same
# timestamp. Without them row_number() picks an arbitrary row among the ties,
# and because this DataFrame is lazy it is re-evaluated by every later action,
# so the "last" interaction could differ between the counts, the samples and
# the metric computation.
w_last = Window.partitionBy("user_id").orderBy(
    F.col("unixReviewTime").desc(),
    F.col("item_id").asc(),
    F.col("rating").desc(),
)

test_interactions_df = (all_interactions_df
    .join(users_with_enough_data.select("user_id"), "user_id", "inner")
    .withColumn("interaction_rank", F.row_number().over(w_last))
    .filter(F.col("interaction_rank") == 1)  # Only keep the last interaction
    .select("user_id", "item_id", "unixReviewTime", "rating")
)

test_count = test_interactions_df.count()
test_users = test_interactions_df.select("user_id").distinct().count()
test_items = test_interactions_df.select("item_id").distinct().count()

print(f"\nTest Set Statistics:")
print(f"   Test interactions: {test_count:,} (1 per user)")
print(f"   Test users       : {test_users:,}")
print(f"   Test items       : {test_items:,}")

# Nothing to evaluate: end the notebook run through dbutils
if test_count == 0:
    print("No test interactions found. Exiting evaluation.")
    dbutils.notebook.exit("No test data to evaluate")

# Show temporal distribution (earliest / latest test interaction)
print(f"\nTest Data Temporal Distribution:")
test_interactions_df.select(
    F.from_unixtime("unixReviewTime").alias("review_time")
).agg(
    F.min("review_time").alias("earliest"),
    F.max("review_time").alias("latest")
).show(truncate=False)

# Show sample test interactions.
# The per-user totals were already aggregated into user_interaction_counts
# earlier in this cell; reuse them (renamed for display) instead of running
# the same groupBy over the full interactions table a second time.
print(f"\nSample test interactions (last interaction per user):")
test_interactions_df.join(
    user_interaction_counts.withColumnRenamed(
        "total_interactions", "total_user_interactions"
    ),
    "user_id"
).select(
    "user_id",
    "total_user_interactions",
    "item_id",
    F.from_unixtime("unixReviewTime").alias("review_time"),
    "rating"
).show(10, truncate=False)

# Verify: show example of a user's full history vs test
sample_user = test_interactions_df.select("user_id").first()[0]
print(f"\nExample - Full history for user: {sample_user}")

user_history = (all_interactions_df
    .filter(F.col("user_id") == sample_user)
    .select(
        "item_id",
        F.from_unixtime("unixReviewTime").alias("review_time"),
        "rating"
    )
    .orderBy("unixReviewTime")
)

print(f"   Total interactions: {user_history.count()}")
user_history.show(truncate=False)

# The same user's single held-out interaction, for comparison with the history above
user_test = test_interactions_df.filter(F.col("user_id") == sample_user)
print(f"\n   Test interaction (last one):")
user_test.select(
    "item_id",
    F.from_unixtime("unixReviewTime").alias("review_time"),
    "rating"
).show(truncate=False)

print(f"\nEvaluation Mode: OFFLINE (Last Interaction)")
print(f"   - Training: All interactions except last per user")
print(f"   - Testing: Last interaction per user")
print(f"   - Model evaluated on unseen most recent preferences")


STEP 2: EXTRACT TEST INTERACTIONS

🎯 Test Strategy: Using LAST interaction of each user as test case
   - Rationale: Last interaction represents user's most recent preference
   - All previous interactions were used for training
   - This simulates real-world prediction scenario

✅ Users with >= 2 interactions: 776,370 / 776,370

📊 Test Set Statistics:
   Test interactions: 776,370 (1 per user)
   Test users       : 776,370
   Test items       : 256,256

📅 Test Data Temporal Distribution:
+-------------------+-------------------+
|earliest           |latest             |
+-------------------+-------------------+
|1997-09-26 23:58:29|2023-09-12 00:22:22|
+-------------------+-------------------+


📋 Sample test interactions (last interaction per user):
+----------------------------+-----------------------+----------+-------------------+------+
|user_id                     |total_user_interactions|item_id   |review_time        |rating|
+----------------------------+---------------------

### Prepare ground truth and recommendations

In [0]:
print("\n" + "\n".join((
    "=" * 80,
    "STEP 3: PREPARE GROUND TRUTH AND RECOMMENDATIONS",
    "=" * 80,
)))

# Ground truth: one row per test user, the held-out item wrapped in a
# one-element array so it has the same shape as a list of relevant items.
ground_truth = test_interactions_df.select(
    "user_id",
    F.array("item_id").alias("actual_items"),
)

print(f"Ground truth prepared for {ground_truth.count():,} users")

# Recommended items per user, limited to ranks 1..TOP_K
top_k_recommendations = (
    recommendations_df
    .where(F.col("rank") <= TOP_K)
    .groupBy("user_id")
    .agg(F.collect_list("item_id").alias("recommended_items"))
)

print(f"Top-{TOP_K} recommendations prepared for {top_k_recommendations.count():,} users")

# Inner join: a user is evaluated only when they have both a test
# interaction and at least one recommendation.
evaluation_df = ground_truth.join(top_k_recommendations, on="user_id", how="inner")

users_to_evaluate = evaluation_df.count()
print(f"Users to evaluate (have both test interaction and recommendations): {users_to_evaluate:,}")

# Guard: stop the run when the two user sets do not overlap at all
if not users_to_evaluate:
    print("No users to evaluate (no overlap between test users and users with recommendations)")
    dbutils.notebook.exit("No users to evaluate")



STEP 3: PREPARE GROUND TRUTH AND RECOMMENDATIONS
📝 Ground truth prepared for 776,370 users
🎯 Top-30 recommendations prepared for 776,370 users
✅ Users to evaluate (have both test interaction and recommendations): 776,370


### Calculate metrics

In [0]:

print("\n" + "\n".join((
    "=" * 80,
    "STEP 4: CALCULATE EVALUATION METRICS",
    "=" * 80,
)))

# One row per (user, recommended item) with its rank, limited to the top K
recommendations_with_rank = (
    recommendations_df
    .select("user_id", "item_id", "rank")
    .where(F.col("rank") <= TOP_K)
)

# Pair every ranked recommendation with its user's ground-truth items
evaluation_with_ranks = ground_truth.join(
    recommendations_with_rank,
    on="user_id",
    how="inner",
)

print(f"Evaluating {evaluation_with_ranks.select('user_id').dropDuplicates().count():,} users")


STEP 4: CALCULATE EVALUATION METRICS
📊 Evaluating 776,370 users


### Rank-based metrics

In [0]:
print("\nCalculating rank-based metrics...")

# UDF for calculating comprehensive metrics including rank-based ones
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, ArrayType

# Return type of the UDF: one struct of per-user metrics.
# Field order must match the tuple returned by calculate_comprehensive_metrics.
metrics_schema = StructType([
    StructField("hits", IntegerType(), False),
    StructField("precision", DoubleType(), False),
    StructField("recall", DoubleType(), False),
    StructField("reciprocal_rank", DoubleType(), False),  # For MRR
    StructField("average_precision", DoubleType(), False),  # For MAP
    StructField("ndcg", DoubleType(), False),  # NDCG
    StructField("hit_positions", ArrayType(IntegerType()), False)  # Positions of hits
])

@F.udf(metrics_schema)
def calculate_comprehensive_metrics(actual_items, recommended_items_with_ranks):
    """
    Compute per-user ranking metrics for one list of recommendations.

    Args:
        actual_items: list of relevant (ground-truth) item IDs for the user.
        recommended_items_with_ranks: list of (item_id, rank) pairs; each
            element is unpacked positionally, and the list is sorted by rank
            before scoring. The DCG term uses ``log2(rank + 1)``, so ranks
            are assumed to be 1-based (a rank of 0 would divide by zero).

    Returns:
        Tuple in ``metrics_schema`` order:
        (hits, precision, recall, reciprocal_rank, average_precision, ndcg,
        hit_positions), where
          - hits: number of distinct relevant items found in the list,
          - precision: hits / number of recommendations,
          - recall: hits / number of distinct relevant items,
          - reciprocal_rank: 1 / rank of the first hit (0.0 if no hit),
          - average_precision: sum of precision at each hit position,
            divided by the number of distinct relevant items,
          - ndcg: DCG with binary relevance divided by the ideal DCG,
          - hit_positions: ranks at which the hits occurred.
        All metrics are zero (and hit_positions empty) when either input is
        empty or None.

    A relevant item that appears more than once in the recommendation list is
    counted only at its best (lowest-rank) occurrence, so recall, average
    precision and NDCG cannot exceed 1.
    """
    import math

    if not actual_items or not recommended_items_with_ranks:
        return (0, 0.0, 0.0, 0.0, 0.0, 0.0, [])

    actual_set = set(actual_items)

    # Sort by rank to ensure correct order
    sorted_recs = sorted(recommended_items_with_ranks, key=lambda x: x[1])

    hits = 0
    hit_positions = []
    first_hit_rank = None
    precision_sum = 0.0
    dcg = 0.0
    matched_items = set()  # relevant items already credited as a hit

    # Calculate metrics in one pass
    for idx, (item_id, rank) in enumerate(sorted_recs, 1):
        # Credit each relevant item once, even if it is recommended repeatedly
        is_hit = item_id in actual_set and item_id not in matched_items

        if is_hit:
            matched_items.add(item_id)
            hits += 1
            hit_positions.append(rank)

            # For MRR: record first hit position
            if first_hit_rank is None:
                first_hit_rank = rank

            # For MAP: accumulate precision at each hit position
            precision_at_i = hits / idx
            precision_sum += precision_at_i

            # For DCG: rel=1 for hits, rel=0 for misses
            dcg += 1.0 / math.log2(rank + 1)

    # Basic metrics
    precision = hits / len(sorted_recs) if len(sorted_recs) > 0 else 0.0
    recall = hits / len(actual_set) if len(actual_set) > 0 else 0.0

    # MRR (Mean Reciprocal Rank)
    reciprocal_rank = 1.0 / first_hit_rank if first_hit_rank else 0.0

    # MAP (Mean Average Precision)
    average_precision = precision_sum / len(actual_set) if len(actual_set) > 0 else 0.0

    # NDCG (Normalized Discounted Cumulative Gain)
    # IDCG: ideal DCG if all relevant items were at top
    ideal_hits = min(len(actual_set), len(sorted_recs))
    idcg = sum(1.0 / math.log2(i + 2) for i in range(ideal_hits))
    ndcg = dcg / idcg if idcg > 0 else 0.0

    return (hits, precision, recall, reciprocal_rank, average_precision, ndcg, hit_positions)

# Collect each user's top-K recommendations as a list of (item_id, rank) structs
recommendations_collected = (recommendations_with_rank
    .groupBy("user_id")
    .agg(
        F.collect_list(
            F.struct("item_id", "rank")
        ).alias("recommended_items_with_ranks")
    )
)

# Join ground truth with collected recommendations.
# Inner join: users without any recommendation are not scored.
evaluation_df = ground_truth.join(
    recommendations_collected,
    "user_id",
    "inner"
)

# Calculate per-user metrics: run the UDF once per user and flatten the
# returned struct into top-level columns.
user_metrics_df = evaluation_df.withColumn(
    "metrics",
    calculate_comprehensive_metrics(
        F.col("actual_items"),
        F.col("recommended_items_with_ranks")
    )
).select(
    "user_id",
    F.col("metrics.hits").alias("hits"),
    F.col("metrics.precision").alias("precision"),
    F.col("metrics.recall").alias("recall"),
    F.col("metrics.reciprocal_rank").alias("reciprocal_rank"),
    F.col("metrics.average_precision").alias("average_precision"),
    F.col("metrics.ndcg").alias("ndcg"),
    F.col("metrics.hit_positions").alias("hit_positions"),
    F.size("actual_items").alias("num_actual")
)

# Show sample of detailed metrics
print("\nSample user-level metrics:")
user_metrics_df.select(
    "user_id",
    "hits",
    F.round("precision", 4).alias("precision"),
    F.round("recall", 4).alias("recall"),
    F.round("reciprocal_rank", 4).alias("mrr"),
    F.round("average_precision", 4).alias("map"),
    F.round("ndcg", 4).alias("ndcg"),
    "hit_positions"
).show(5, truncate=False)

# Aggregate metrics across all users (single-row result)
aggregated_metrics = user_metrics_df.agg(
    F.avg("precision").alias("avg_precision"),
    F.avg("recall").alias("avg_recall"),
    F.avg("reciprocal_rank").alias("avg_mrr"),
    F.avg("average_precision").alias("avg_map"),
    F.avg("ndcg").alias("avg_ndcg"),
    F.sum(F.when(F.col("hits") > 0, 1).otherwise(0)).alias("users_with_hits"),
    F.count("*").alias("total_users")
).collect()[0]

# Calculate aggregate metrics
precision_at_k = aggregated_metrics["avg_precision"]
recall_at_k = aggregated_metrics["avg_recall"]
mrr = aggregated_metrics["avg_mrr"]
map_score = aggregated_metrics["avg_map"]
ndcg_score = aggregated_metrics["avg_ndcg"]
users_with_hits = aggregated_metrics["users_with_hits"]
total_users_evaluated = aggregated_metrics["total_users"]

# F1 Score: harmonic mean of the averaged precision and recall
f1_at_k = (2 * precision_at_k * recall_at_k) / (precision_at_k + recall_at_k) \
    if (precision_at_k + recall_at_k) > 0 else 0.0

# Hit Rate (fraction of evaluated users with at least one hit)
hit_rate = users_with_hits / total_users_evaluated if total_users_evaluated > 0 else 0.0

# Coverage (fraction of catalogue items that appear in any recommendation).
# The distinct item count of the interactions table was already computed as
# total_items when the data was loaded (identical expression), so reuse it
# instead of scanning the full table again.
all_items = total_items
recommended_items_unique = recommendations_df.select("item_id").distinct().count()
coverage = recommended_items_unique / all_items if all_items > 0 else 0.0

# Average recommendations per user (over all stored recommendations)
avg_recs_per_user = recommendations_df.groupBy("user_id").count().agg(
    F.avg("count")
).collect()[0][0]


🎯 Calculating rank-based metrics...

📋 Sample user-level metrics:
+----------------------------+----+---------+------+----+----+------+-------------+
|user_id                     |hits|precision|recall|mrr |map |ndcg  |hit_positions|
+----------------------------+----+---------+------+----+----+------+-------------+
|AF44OEMEO7EDENNDT5Y456V47ONQ|0   |0.0      |0.0   |0.0 |0.0 |0.0   |[]           |
|AF47PGDGREYUAPBVUC33URD65ZYA|0   |0.0      |0.0   |0.0 |0.0 |0.0   |[]           |
|AF4ACGTK5AZW4VFSJ6F6MN7QFSOA|0   |0.0      |0.0   |0.0 |0.0 |0.0   |[]           |
|AF4CU3MFF6IPJLU6RMT3EEIWTPDQ|0   |0.0      |0.0   |0.0 |0.0 |0.0   |[]           |
|AF4HSDERS7EQJP5FZCOQBIIPWYFA|1   |0.0333   |1.0   |0.25|0.25|0.4307|[4]          |
+----------------------------+----+---------+------+----+----+------+-------------+
only showing top 5 rows


### Rank position analysis

In [0]:
print("\nAnalyzing hit positions...")

# Explode hit positions to analyze distribution: one row per hit, then count
# how many hits landed on each rank.
hit_position_analysis = (user_metrics_df
    .select("user_id", F.explode("hit_positions").alias("hit_rank"))
    .groupBy("hit_rank")
    .agg(F.count("*").alias("hit_count"))
    .orderBy("hit_rank")
)

print("\nHit Distribution by Rank Position:")
hit_position_analysis.show(20)

# Calculate average hit position.
# The average over zero rows is NULL, so this is None when no user had a hit.
avg_hit_position = (user_metrics_df
    .select(F.explode("hit_positions").alias("hit_rank"))
    .agg(F.avg("hit_rank").alias("avg_position"))
    .collect()[0]["avg_position"]
)

# Formatting None with ":.2f" raises TypeError, so report the no-hit case explicitly
if avg_hit_position is None:
    print("Average hit position: n/a (no hits)")
else:
    print(f"Average hit position: {avg_hit_position:.2f}")


📊 Analyzing hit positions...

📈 Hit Distribution by Rank Position:
+--------+---------+
|hit_rank|hit_count|
+--------+---------+
|       1|    10863|
|       2|     7034|
|       3|     4942|
|       4|     3685|
|       5|     2662|
|       6|     2164|
|       7|     1640|
|       8|     1424|
|       9|     1313|
|      10|     1025|
|      11|      953|
|      12|      812|
|      13|      741|
|      14|      670|
|      15|      649|
|      16|      569|
|      17|      544|
|      18|      511|
|      19|      483|
|      20|      449|
+--------+---------+
only showing top 20 rows
📍 Average hit position: 6.57


### Results

In [0]:
print("\n" + "=" * 80)
print("EVALUATION RESULTS")
print("=" * 80)

print(f"\nDataset Statistics:")
print(f"  Test users evaluated        : {total_users_evaluated:,}")
print(f"  Test interactions (1 per user): {test_count:,}")
print(f"  Users with recommendations  : {users_to_evaluate:,}")

print(f"\nRecommendation Quality Metrics:")
print(f"  Precision@{TOP_K}             : {precision_at_k:.4f}")
print(f"  Recall@{TOP_K}                : {recall_at_k:.4f} (Note: 1 test item per user)")
print(f"  F1-Score@{TOP_K}              : {f1_at_k:.4f}")
print(f"  Hit Rate                    : {hit_rate:.4f} ({users_with_hits:,}/{total_users_evaluated:,})")

# avg_hit_position is None when there were no hits at all; formatting None
# with ":.2f" would raise TypeError, so render a placeholder instead.
avg_hit_position_text = f"{avg_hit_position:.2f}" if avg_hit_position is not None else "n/a"

print(f"\nRank-Based Metrics (considering position):")
print(f"  MRR (Mean Reciprocal Rank)  : {mrr:.4f}")
print(f"  MAP (Mean Average Precision): {map_score:.4f}")
print(f"  NDCG@{TOP_K}                  : {ndcg_score:.4f}")
print(f"  Avg Hit Position            : {avg_hit_position_text}")

print(f"\nSystem Metrics:")
print(f"  Coverage                    : {coverage:.4f} ({recommended_items_unique:,}/{all_items:,} items)")
print(f"  Avg recommendations/user    : {avg_recs_per_user:.2f}")

print(f"\nInterpretation:")
print(f"  - Lower Avg Hit Position is better (hits closer to top)")
print(f"  - MRR focuses on first hit position")
# With a single relevant item per user, per-user recall is 1 on a hit and 0
# otherwise, so its mean is the hit rate; precision divides the same hit by
# the number of recommendations (K).
print(f"  - Since we have 1 test item per user, Recall = Hit Rate and Precision = Hit Rate / K")
print(f"  - Hit Rate is the most intuitive metric here")

# Total wall-clock time since start_time was set in the data-loading cell
processing_time = time.time() - start_time
print(f"\nProcessing time: {processing_time:.2f}s")


EVALUATION RESULTS

📊 Dataset Statistics:
  Test users evaluated        : 776,370
  Test interactions (1 per user): 776,370
  Users with recommendations  : 776,370

🎯 Recommendation Quality Metrics:
  Precision@30             : 0.0020
  Recall@30                : 0.0599 (Note: 1 test item per user)
  F1-Score@30              : 0.0039
  Hit Rate                    : 0.0599 (46,522/776,370)

🎖️  Rank-Based Metrics (considering position):
  MRR (Mean Reciprocal Rank)  : 0.0246
  MAP (Mean Average Precision): 0.0246
  NDCG@30                  : 0.0324
  Avg Hit Position            : 6.57

📈 System Metrics:
  Coverage                    : 0.8882 (439,705/495,063 items)
  Avg recommendations/user    : 30.00

💡 Interpretation:
  - Lower Avg Hit Position is better (hits closer to top)
  - MRR focuses on first hit position
  - Since we have 1 test item per user, Recall = Hit Rate / K
  - Hit Rate is the most intuitive metric here

⏱️  Processing time: 159.23s


In [0]:
print("\n" + "=" * 80)
print("STEP 5: SAVE EVALUATION RESULTS")
print("=" * 80)

create_evaluation_table_if_not_exists()

# Create evaluation record.
# Read the clock once so the id and the stored timestamp always describe the
# same instant (two separate now() calls can straddle a second boundary).
evaluation_timestamp = datetime.now()
evaluation_id = f"eval_{evaluation_timestamp.strftime('%Y%m%d_%H%M%S')}"

# Build the row against the target table's own schema instead of relying on
# type inference: inference turns Python ints into LongType (the table
# declares IntegerType counts) and fails outright when a value is None
# (avg_hit_position is None when there were no hits). The table is guaranteed
# to exist here because create_evaluation_table_if_not_exists() ran above.
target_schema = spark.table(EVALUATION_METRICS_TABLE).schema

evaluation_record = spark.createDataFrame([{
    "evaluation_id": evaluation_id,
    "evaluation_timestamp": evaluation_timestamp,
    "test_users_count": total_users_evaluated,
    "test_interactions_count": test_count,
    "users_with_recommendations": users_to_evaluate,
    "precision_at_k": precision_at_k,
    "recall_at_k": recall_at_k,
    "f1_at_k": f1_at_k,
    "hit_rate": hit_rate,
    "mrr": mrr,
    "map_score": map_score,
    "ndcg": ndcg_score,
    "avg_hit_position": avg_hit_position,
    "coverage": coverage,
    "avg_recommendations_per_user": avg_recs_per_user,
    "processing_time_seconds": processing_time
}], schema=target_schema)

# Append to evaluation metrics table (history is kept; nothing is overwritten)
evaluation_record.write \
    .format("delta") \
    .mode("append") \
    .saveAsTable(EVALUATION_METRICS_TABLE)

print(f"Evaluation results saved to {EVALUATION_METRICS_TABLE}")
print(f"Evaluation ID: {evaluation_id}")


STEP 5: SAVE EVALUATION RESULTS
✅ Evaluation metrics table already exists: `bigdata-and-bi`.gold.star_evaluation_metrics
✅ Evaluation results saved to `bigdata-and-bi`.gold.star_evaluation_metrics
📝 Evaluation ID: eval_20251115_095047


In [0]:
print("\n" + "\n".join(("=" * 80, "HISTORICAL EVALUATION TRENDS", "=" * 80)))

# Five most recent evaluation runs, newest first
historical_metrics = (
    spark.table(EVALUATION_METRICS_TABLE)
    .orderBy(F.desc("evaluation_timestamp"))
    .limit(5)
)

print("\nLast 5 evaluations:")
# Identifier columns are shown as stored; each metric column is rounded to
# 4 decimals and given a shorter display name.
historical_metrics.select(
    "evaluation_id",
    "evaluation_timestamp",
    "test_users_count",
    *(
        F.round(source_column, 4).alias(display_name)
        for source_column, display_name in (
            ("precision_at_k", "precision"),
            ("recall_at_k", "recall"),
            ("f1_at_k", "f1"),
            ("hit_rate", "hit_rate"),
            ("mrr", "mrr"),
            ("ndcg", "ndcg"),
        )
    ),
).show(truncate=False)

print("\n" + "\n".join(("=" * 80, "EVALUATION COMPLETE", "=" * 80)))


HISTORICAL EVALUATION TRENDS

📊 Last 5 evaluations:
+--------------------+--------------------------+----------------+---------+------+------+--------+------+------+
|evaluation_id       |evaluation_timestamp      |test_users_count|precision|recall|f1    |hit_rate|mrr   |ndcg  |
+--------------------+--------------------------+----------------+---------+------+------+--------+------+------+
|eval_20251115_095047|2025-11-15 09:50:47.007022|776370          |0.002    |0.0599|0.0039|0.0599  |0.0246|0.0324|
+--------------------+--------------------------+----------------+---------+------+------+--------+------+------+


✅ EVALUATION COMPLETE
