# Event Recommendation System - Random Baseline

Tests whether geographic filtering alone (random sampling from nearby events, with no learning) provides good recommendations.

In [9]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

sys.path.append(str(Path.cwd().parent))

from utils.metrics import evaluate_recommendations
from utils.temporal_split import temporal_split_per_user, print_split_stats
from utils.geo_filter import haversine_distance

## 1. Load and Split Data

In [10]:
# Directory holding the raw CSV inputs, relative to the notebook location.
raw_dir = Path("../data/raw")

# Read both raw tables in one pass: user-event interactions first, then
# the event catalogue.
train_raw, events_raw = (
    pd.read_csv(raw_dir / csv_name) for csv_name in ("train.csv", "events.csv")
)

# Report row counts as a quick sanity check on the load.
print(f"Raw train data: {len(train_raw)} interactions")
print(f"Events: {len(events_raw)}")

Raw train data: 15398 interactions
Events: 3137972


In [11]:
# Split each user's interactions into a train part and a validation part.
# NOTE(review): presumably ordered by interaction time, as the helper's name
# suggests - confirm in utils.temporal_split.
split_options = {"train_ratio": 0.5, "min_interactions": 3}
train_df, val_df = temporal_split_per_user(train_raw, **split_options)

# Print per-split interaction/user/event counts and the overlap.
print_split_stats(train_df, val_df)

TEMPORAL SPLIT STATISTICS

TRAIN SET:
  Total interactions: 7393
  Unique users: 2034
  Unique events: 4733
  Interested=1: 1337

VALIDATION SET:
  Total interactions: 8005
  Unique users: 2034
  Unique events: 5127
  Interested=1: 2794

OVERLAP:
  Users in both: 2034
  Events in both: 1014


## 2. Load Processed Events (for geo coordinates)

In [12]:
from utils.preprocessing import EventFeatureExtractor

# Cached output of the feature extraction; reused on later runs so the
# extraction only has to happen once.
processed_events_path = Path("../data/processed/events_processed.csv")

if processed_events_path.exists():
    print("Loading cached processed events...")
    events = pd.read_csv(processed_events_path)
else:
    print("Processing events...")
    # Make sure the cache directory exists *before* running the extraction:
    # otherwise a missing ../data/processed folder would only surface at the
    # final to_csv call, after all the work has already been done.
    processed_events_path.parent.mkdir(parents=True, exist_ok=True)
    extractor = EventFeatureExtractor(n_clusters=30)
    events = extractor.fit_transform(events_raw)
    events.to_csv(processed_events_path, index=False)

print(f"Processed events shape: {events.shape}")

Loading cached processed events...
Processed events shape: (3137972, 113)


## 3. Random Baseline with Geographic Filtering

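This baseline:
1. Finds the user's median location (median latitude and longitude) from the events in their training interactions
2. Keeps only the `geo_top_k` events nearest to that location (users with no known location get a random pool of `geo_top_k` events instead)
3. Removes events the user already interacted with in the training set
4. **Randomly samples 200 events** from this pool (or returns the whole pool if it is smaller)
5. No learning - just geographic proximity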

In [13]:
def get_user_median_location(user_id, train_df, events):
    """Estimate a user's location as the median coordinates of their train events.

    Args:
        user_id: Value matched against the ``user`` column of ``train_df``.
        train_df: Interactions frame with ``user`` and ``event`` columns.
        events: Event frame with ``event_id``, ``lat`` and ``lng`` columns.

    Returns:
        ``(median_lat, median_lng)`` computed over the user's events that have
        both coordinates present, or ``(None, None)`` when the user has no
        interactions in ``train_df`` or none of their events has coordinates.
    """
    interacted_ids = train_df.loc[train_df["user"] == user_id, "event"].tolist()
    if len(interacted_ids) == 0:
        return None, None

    # Keep only this user's events, then discard rows missing lat or lng.
    belongs_to_user = events["event_id"].isin(interacted_ids)
    coords = events.loc[belongs_to_user, ["lat", "lng"]].dropna()
    if coords.empty:
        return None, None

    return coords["lat"].median(), coords["lng"].median()


def random_recommend_with_geo(user_id, train_df, events, geo_top_k=3000, n=200, exclude_seen=True):
    """Recommend up to ``n`` events sampled at random from a geographic pool.

    The pool is the ``geo_top_k`` events with the smallest
    ``haversine_distance`` to the user's median location. When no location
    can be derived for the user, the pool is a random sample of
    ``geo_top_k`` events (or all events, if there are fewer) instead.

    Args:
        user_id: Value matched against the ``user`` column of ``train_df``.
        train_df: Interactions frame with ``user`` and ``event`` columns.
        events: Event frame with ``event_id``, ``lat`` and ``lng`` columns.
        geo_top_k: Size of the candidate pool before seen events are removed.
        n: Maximum number of recommendations to return.
        exclude_seen: If true, drop events the user has in ``train_df``.

    Returns:
        A list of event ids. If at most ``n`` candidates remain, all of them
        are returned in pool order; otherwise ``n`` of them are drawn without
        replacement via ``np.random.choice`` (global NumPy random state).
    """
    user_lat, user_lng = get_user_median_location(user_id, train_df, events)

    if user_lat is None or pd.isna(user_lat) or pd.isna(user_lng):
        # Fallback: no usable location, so draw the pool from all events.
        candidates = events["event_id"].sample(min(geo_top_k, len(events))).tolist()
    else:
        # Project to the three columns actually needed *before* filtering.
        # The events frame can be very wide, and copying every column on each
        # call is pure overhead; the selected rows and their order are
        # unchanged by the projection.
        located = events[["event_id", "lat", "lng"]].dropna(subset=["lat", "lng"])

        # Vectorized distance from the user's location to every located event.
        distances = haversine_distance(
            user_lat, user_lng,
            located["lat"].values,
            located["lng"].values,
        )

        # assign() returns a new frame, so the caller's `events` is never mutated.
        nearest_events = located.assign(distance=distances).nsmallest(geo_top_k, "distance")
        candidates = nearest_events["event_id"].tolist()

    if exclude_seen:
        seen_events = set(train_df.loc[train_df["user"] == user_id, "event"])
        candidates = [e for e in candidates if e not in seen_events]

    # Nothing to sample when the pool is already small enough.
    if len(candidates) <= n:
        return candidates
    return list(np.random.choice(candidates, size=n, replace=False))


# Confirmation message printed once the two helper definitions above have run.
print("Random baseline function ready")

Random baseline function ready


## 4. Evaluate Random Baseline

In [14]:
# Evaluation settings.
K = 200              # recommendation list length / metric cutoff
N_TEST_USERS = 100   # falsy value => evaluate on every labelled user
GEO_TOP_K = 3000     # size of the geographic candidate pool
RANDOM_SEED = 42

# Seed the global NumPy RNG used by the random sampling in the baseline.
np.random.seed(RANDOM_SEED)

# Validation rows that carry an explicit positive or negative label.
has_label = (val_df["interested"] == 1) | (val_df["not_interested"] == 1)
val_with_labels = val_df[has_label]
users_with_labels = val_with_labels["user"].unique()

print(f"Users with labels in validation: {len(users_with_labels)}")

# Take the first N_TEST_USERS labelled users (in order of appearance).
test_users = users_with_labels[:N_TEST_USERS] if N_TEST_USERS else users_with_labels

print(f"Evaluating on {len(test_users)} users...")

random_predictions = {}
actuals = {}
not_interested = {}

for user in test_users:
    random_predictions[user] = random_recommend_with_geo(
        user, train_df, events, geo_top_k=GEO_TOP_K, n=K, exclude_seen=True
    )
    # Ground truth for this user: validation events marked interested /
    # not interested.
    user_val = val_df[val_df["user"] == user]
    actuals[user] = user_val.loc[user_val["interested"] == 1, "event"].tolist()
    not_interested[user] = user_val.loc[user_val["not_interested"] == 1, "event"].tolist()

metrics = evaluate_recommendations(actuals, random_predictions, not_interested, k=K)

print(f"\n{'='*50}")
print(f"RANDOM BASELINE (geo_top_k={GEO_TOP_K}) @ K={K}")
print(f"{'='*50}")
for metric, value in metrics.items():
    print(f"{metric:20s}: {value:.5f}")
print(f"{'='*50}")

Users with labels in validation: 1501
Evaluating on 100 users...

RANDOM BASELINE (geo_top_k=3000) @ K=200
Recall@K            : 0.03833
Hit_Rate@K          : 0.06000
Contamination@K     : 0.00000


## 5. Test Different geo_top_k Values

In [15]:
import time

# Candidate pool sizes to compare; one result row is collected per size.
geo_values = [500, 1000, 2000, 3000, 5000]
results = []

for geo_k in geo_values:
    print(f"\n{'='*60}")
    print(f"Testing Random with geo_top_k = {geo_k}")
    print(f"{'='*60}")

    # Re-seed before every run so each pool size starts from the same RNG state.
    np.random.seed(RANDOM_SEED)
    start_time = time.time()

    # Recommendations for every test user, generated in test_users order.
    predictions = {
        user: random_recommend_with_geo(
            user, train_df, events, geo_top_k=geo_k, n=K, exclude_seen=True
        )
        for user in test_users
    }

    metrics = evaluate_recommendations(actuals, predictions, not_interested, k=K)
    # The elapsed time covers both recommendation and metric computation.
    elapsed = time.time() - start_time

    row = {
        "geo_top_k": geo_k,
        "recall": metrics["Recall@K"],
        "hit_rate": metrics["Hit_Rate@K"],
        "contamination": metrics["Contamination@K"]
    }
    results.append(row)

    print(f"Recall@{K}: {metrics['Recall@K']:.5f}")
    print(f"Hit_Rate@{K}: {metrics['Hit_Rate@K']:.5f}")
    print(f"Time: {elapsed:.1f}s")

# Summary table across all pool sizes.
print(f"\n{'='*60}")
print("SUMMARY - Random Baseline with Different geo_top_k")
print(f"{'='*60}")
print(f"{'geo_top_k':<15} {'Recall@K':<15} {'Hit_Rate@K':<15}")
print(f"{'-'*45}")
for r in results:
    print(f"{r['geo_top_k']:<15} {r['recall']:<15.5f} {r['hit_rate']:<15.5f}")


Testing Random with geo_top_k = 500
Recall@200: 0.02760
Hit_Rate@200: 0.07000
Time: 195.9s

Testing Random with geo_top_k = 1000
Recall@200: 0.02176
Hit_Rate@200: 0.06000
Time: 239.6s

Testing Random with geo_top_k = 2000
Recall@200: 0.05310
Hit_Rate@200: 0.10000
Time: 250.4s

Testing Random with geo_top_k = 3000
Recall@200: 0.03833
Hit_Rate@200: 0.06000
Time: 255.3s

Testing Random with geo_top_k = 5000
Recall@200: 0.00667
Hit_Rate@200: 0.02000
Time: 251.0s

SUMMARY - Random Baseline with Different geo_top_k
geo_top_k       Recall@K        Hit_Rate@K     
---------------------------------------------
500             0.02760         0.07000        
1000            0.02176         0.06000        
2000            0.05310         0.10000        
3000            0.03833         0.06000        
5000            0.00667         0.02000        


## 6. Comparison with Real Models

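Compare the random baseline with the learned models to see how much learning adds. The scores for the learned models are hard-coded in the cell below rather than computed in this notebook.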

In [16]:
# Best results from each model (using optimal geo_top_k for each).
# All scores are entered by hand; the two random rows repeat the sweep
# output printed earlier in this notebook.
comparison = [
    {"model": "Random (geo_top_k=1000)", "recall": 0.02176, "hit_rate": 0.06000, "geo_k": 1000},
    {"model": "Random (geo_top_k=3000)", "recall": 0.03833, "hit_rate": 0.06000, "geo_k": 3000},
    {"model": "Content-Based (geo_top_k=1000)", "recall": 0.10376, "hit_rate": 0.16000, "geo_k": 1000},
    {"model": "Collaborative (geo_top_k=3000)", "recall": 0.27231, "hit_rate": 0.34000, "geo_k": 3000},
    {"model": "Social (no geo filter)", "recall": 0.46997, "hit_rate": 0.59184, "geo_k": None},
]

# Look rows up by model label instead of by list position.
recall_by_model = {item["model"]: item["recall"] for item in comparison}

# Each model is compared with the random baseline at the same geo_top_k;
# Social has no geo filter, so it is compared with random@3000.
random_cb = recall_by_model["Random (geo_top_k=1000)"]
random_cf = recall_by_model["Random (geo_top_k=3000)"]
cb_recall = recall_by_model["Content-Based (geo_top_k=1000)"]
cf_recall = recall_by_model["Collaborative (geo_top_k=3000)"]
social_recall = recall_by_model["Social (no geo filter)"]

# Improvement is the ratio of model recall to baseline recall.
cb_improvement = cb_recall / random_cb
cf_improvement = cf_recall / random_cf
social_improvement = social_recall / random_cf

print(f"\n{'='*70}")
print("COMPARISON: Random Baseline vs Real Models")
print(f"{'='*70}")
print(f"{'Model':<40} {'Recall@200':<15} {'Hit_Rate@200':<15}")
print(f"{'-'*70}")

for item in comparison:
    print(f"{item['model']:<40} {item['recall']:<15.5f} {item['hit_rate']:<15.5f}")

print(f"\n{'='*70}")
print("ANALYSIS: How much do models improve over random?")
print(f"{'='*70}")

# Content-Based vs Random @ geo_top_k=1000
print(f"\nContent-Based (geo_top_k=1000):")
print(f"  Random baseline:  {random_cb:.5f}")
print(f"  Content-Based:    {cb_recall:.5f}")
print(f"  Improvement:      {cb_improvement:.1f}x")

# Collaborative vs Random @ geo_top_k=3000
print(f"\nCollaborative (geo_top_k=3000):")
print(f"  Random baseline:  {random_cf:.5f}")
print(f"  Collaborative:    {cf_recall:.5f}")
print(f"  Improvement:      {cf_improvement:.1f}x")

# Social vs Random @ geo_top_k=3000 (for comparison)
print(f"\nSocial (no geo filter, compared to random@3000):")
print(f"  Random baseline:  {random_cf:.5f}")
print(f"  Social:           {social_recall:.5f}")
print(f"  Improvement:      {social_improvement:.1f}x")

print(f"\n{'='*70}")
print("CONCLUSION:")
print(f"{'='*70}")
print("✓ All models significantly beat random baseline!")
print(f"  - Content-Based learns useful patterns ({cb_improvement:.1f}x improvement)")
print(f"  - Collaborative learns strong patterns ({cf_improvement:.1f}x improvement)")
print(f"  - Social learns exceptional patterns ({social_improvement:.1f}x improvement)")
print()
print("This proves that models are NOT just benefiting from geographic filtering.")
print("They successfully learn meaningful user preferences and event characteristics.")
print(f"{'='*70}")


COMPARISON: Random Baseline vs Real Models
Model                                    Recall@200      Hit_Rate@200   
----------------------------------------------------------------------
Random (geo_top_k=1000)                  0.02176         0.06000        
Random (geo_top_k=3000)                  0.03833         0.06000        
Content-Based (geo_top_k=1000)           0.10376         0.16000        
Collaborative (geo_top_k=3000)           0.27231         0.34000        
Social (no geo filter)                   0.46997         0.59184        

ANALYSIS: How much do models improve over random?

Content-Based (geo_top_k=1000):
  Random baseline:  0.02176
  Content-Based:    0.10376
  Improvement:      4.8x

Collaborative (geo_top_k=3000):
  Random baseline:  0.03833
  Collaborative:    0.27231
  Improvement:      7.1x

Social (no geo filter, compared to random@3000):
  Random baseline:  0.03833
  Social:           0.46997
  Improvement:      12.3x

CONCLUSION:
✓ All models significan