# Event Recommendation System - Temporal Evaluation

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

sys.path.append(str(Path.cwd().parent))

from models.content_based import ContentBasedRecommender
from utils.metrics import evaluate_recommendations
from utils.temporal_split import temporal_split_per_user, print_split_stats

## 1. Load and Split Data

In [2]:
# Folder with the raw CSV files, relative to the notebook's working directory.
raw_dir = Path("../data/raw")

# Read the raw tables one by one, in the same order as before.
train_raw = pd.read_csv(raw_dir.joinpath("train.csv"))
events_raw = pd.read_csv(raw_dir.joinpath("events.csv"))
event_attendees = pd.read_csv(raw_dir.joinpath("event_attendees.csv"))
user_friends = pd.read_csv(raw_dir.joinpath("user_friends.csv"))

# Row counts as a quick check that each table actually loaded.
print(f"Raw train data: {len(train_raw)} interactions")
print(f"Events: {len(events_raw)}")
print(f"Event attendees: {len(event_attendees)}")

Raw train data: 15398 interactions
Events: 3137972
Event attendees: 24144


In [3]:
# Split the raw interactions into a training frame and a validation frame.
# The split rule lives in utils.temporal_split (presumably chronological,
# applied per user — confirm in that module).
train_df, val_df = temporal_split_per_user(
    train_raw,
    train_ratio=0.5,
    min_interactions=3,
)

# Print the size and overlap summary for the two splits.
print_split_stats(train_df, val_df)

TEMPORAL SPLIT STATISTICS

TRAIN SET:
  Total interactions: 7393
  Unique users: 2034
  Unique events: 4733
  Interested=1: 1337

VALIDATION SET:
  Total interactions: 8005
  Unique users: 2034
  Unique events: 5127
  Interested=1: 2794

OVERLAP:
  Users in both: 2034
  Events in both: 1014


## 2. Preprocess Events Data

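The next cell runs event preprocessing (K-means clustering and feature extraction) only when no cached result exists at `../data/processed/events_processed.csv`; subsequent runs load the cached file instead. The cache is keyed on file existence only, so delete the file to force reprocessing (for example, after changing `n_clusters`).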

In [4]:
from utils.preprocessing import EventFeatureExtractor

# Cache file for the processed events table. Its mere existence decides whether
# the expensive extraction below is skipped; nothing invalidates it when the
# raw data or the extractor settings change.
processed_events_path = Path("../data/processed/events_processed.csv")

if processed_events_path.exists():
    print("Loading cached processed events...")
    events = pd.read_csv(processed_events_path)
else:
    # Create the cache directory before the slow step: DataFrame.to_csv does not
    # create missing parent directories, so on a fresh checkout the write would
    # fail only after the minutes-long processing had already finished.
    processed_events_path.parent.mkdir(parents=True, exist_ok=True)

    print("Processing events (this will take a few minutes)...")
    extractor = EventFeatureExtractor(n_clusters=30)
    events = extractor.fit_transform(events_raw)
    events.to_csv(processed_events_path, index=False)
    print("Events processed and cached!")

# NOTE(review): a cache hit yields a CSV round-tripped frame while a cache miss
# yields the in-memory frame; column dtypes may differ between the two paths —
# confirm downstream code does not depend on them.
print(f"Processed events shape: {events.shape}")
print(f"Event categories: {events['event_category'].nunique()}")

Processing events (this will take a few minutes)...
Events processed and cached!
Processed events shape: (3137972, 113)
Event categories: 30


## 3. Hyperparameters

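The model uses geographic filtering to reduce the search space from roughly 3M events to the `geo_top_k` events nearest to each user's median location. Note that `geo_top_k` (the candidate-set size) is distinct from `K`, the length of the recommendation list used for evaluation.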

In [None]:
# Length of each recommendation list: passed as n= to recommend() and as k= to
# the metric computation in the evaluation cell.
K = 200

# Upper bound on the number of validation users evaluated; a falsy value
# (None or 0) makes the evaluation cell use every labelled user.
N_TEST_USERS = 100

# Keyword arguments unpacked into ContentBasedRecommender(...).
# The exact meaning of each value is defined in models.content_based.
CONTENT_BASED_PARAMS = {
    # Presumably the relative weights of the two interaction signals — confirm in the model.
    "weight_purchase": 3.0,
    "weight_interested": 1.0,
    # Presumably a decay rate applied over time — units not visible here; confirm in the model.
    "temporal_decay": 0.01,
    # Size of the geographic candidate set kept per user.
    "geo_top_k": 3000,
}

## 4. Train Content-Based Model

In [6]:
# Build the content-based recommender from the hyperparameter dict and fit it.
print("Training Content-Based model...")
cb_model = ContentBasedRecommender(**CONTENT_BASED_PARAMS)
# fit() is given the processed events table, the training-split interactions,
# and the raw attendee table; the validation split is not passed in.
# NOTE(review): event_attendees is the unsplit raw table — confirm it cannot
# leak validation-period attendance into the model.
cb_model.fit(events, train_df, event_attendees)
print("Training complete!")

Training Content-Based model...
Training complete!


## 5. Evaluate on Validation Set

In [7]:
# Validation rows that carry an explicit label (interested or not interested),
# and the distinct users behind them, in order of first appearance.
val_with_labels = val_df[(val_df["interested"] == 1) | (val_df["not_interested"] == 1)]
users_with_labels = val_with_labels["user"].unique()

print(f"Users with labels in validation: {len(users_with_labels)}")

# A falsy N_TEST_USERS (None or 0) means "evaluate every labelled user".
# NOTE(review): the subset is the first N users in file order, not a random
# sample — confirm this is representative enough for the reported metrics.
if N_TEST_USERS:
    test_users = users_with_labels[:N_TEST_USERS]
else:
    test_users = users_with_labels

print(f"Evaluating on {len(test_users)} users...")

# Group the labelled events per user once, up front, instead of re-scanning the
# whole validation frame twice for every user inside the loop. groupby keeps the
# original row order within each group, so each list matches what a per-user
# boolean filter on val_df would return.
liked_by_user = {
    uid: event_ids.tolist()
    for uid, event_ids in val_df[val_df["interested"] == 1].groupby("user", sort=False)["event"]
}
disliked_by_user = {
    uid: event_ids.tolist()
    for uid, event_ids in val_df[val_df["not_interested"] == 1].groupby("user", sort=False)["event"]
}

cb_predictions = {}
actuals = {}
not_interested = {}

for user in test_users:
    cb_predictions[user] = cb_model.recommend(user, n=K, exclude_seen=True)
    # Users with labels of only one kind get an empty list for the other kind.
    actuals[user] = liked_by_user.get(user, [])
    not_interested[user] = disliked_by_user.get(user, [])

# NOTE(review): users whose only labels are "not interested" have empty
# `actuals`; confirm how evaluate_recommendations treats them in Recall/Hit_Rate.
metrics = evaluate_recommendations(actuals, cb_predictions, not_interested, k=K)

print(f"\n{'='*50}")
print(f"CONTENT-BASED RESULTS @ K={K}")
print(f"{'='*50}")
for metric, value in metrics.items():
    print(f"{metric:20s}: {value:.5f}")
print(f"{'='*50}")

Users with labels in validation: 1501
Evaluating on 100 users...

CONTENT-BASED RESULTS @ K=200
Recall@K            : 0.10376
Hit_Rate@K          : 0.16000
Contamination@K     : 0.00000
