In [None]:
import pandas as pd
import os

# 切换到项目根目录或目标数据目录
os.chdir("/Users/dallylovely/Desktop/CCGG/Projects/recommendation_system")


In [None]:
# Step 2: 加载行为数据
behavior_df = pd.read_csv("data/raw/user_behavior_log_info.csv")

In [None]:
# Step 3: 构建 dt 时间列
mmd_str = behavior_df["time_stamp"].astype(str).str.zfill(4)
month = mmd_str.str[:2]
day = mmd_str.str[2:]
behavior_df["dt"] = pd.to_datetime(
    behavior_df["year"].astype(str) + "-" + month + "-" + day,
    errors='coerce'
) + pd.to_timedelta(behavior_df["timestamp"], unit="s")

In [None]:
# Step 4: 只取最近 7 天点击
max_dt = behavior_df["dt"].max()
min_dt = max_dt - pd.Timedelta(days=7)
click_df = behavior_df[
    (behavior_df["dt"] >= min_dt) &
    (behavior_df["dt"] <= max_dt) &
    (behavior_df["action_type"].str.lower() == "click")
]

In [None]:
# Step 5: 判断点击的 item 是否出现在 item_metadata 中
total_gt = len(click_df)
in_vocab = click_df["item_id"].astype(str).isin(valid_items).sum()
ratio = round(in_vocab / total_gt, 4) if total_gt > 0 else 0

print(f"✅ Ground Truth 总点击数: {total_gt}")
print(f"📦 其中出现在 item_metadata 的 item 数: {in_vocab}")
print(f"📉 命中比例: {ratio * 100:.2f}%")


✅ Ground Truth 总点击数: 3132633
📦 其中出现在 item_metadata 的 item 数: 89019
📉 命中比例: 2.84%


In [32]:
recall_df = pd.read_csv("output/youtube_dnn/youtube_dnn_faiss_recall.csv")
behavior_df = pd.read_csv("data/raw/user_behavior_log_info.csv")
item_df = pd.read_csv("data/raw/item_metadata.csv")

In [34]:
from src.data.load_behavior_log import build_datetime


In [35]:
# Ground truth 用户（点击行为）
behavior_df['dt'] = build_datetime(behavior_df['year'], behavior_df['time_stamp'], behavior_df['timestamp'])
gt_max = behavior_df["dt"].max()
gt_min = gt_max - pd.Timedelta(days=7)
click_df = behavior_df[
    (behavior_df["dt"] >= gt_min) &
    (behavior_df["dt"] <= gt_max) &
    (behavior_df["action_type"].str.lower() == "click")
]
click_df["item_id"] = click_df["item_id"].astype(str)
valid_item_ids = set(item_df["item_id"].astype(str).unique())
click_df = click_df[click_df["item_id"].isin(valid_item_ids)]
gt_users = set(click_df["user_id"].astype(str).unique())

# Recall 用户
recall_users = set(recall_df["user_id"].astype(str).unique())

# 打印交集情况
print("✅ Recall user count:", len(recall_users))
print("✅ Ground Truth user count:", len(gt_users))
print("🔍 Overlap user count:", len(recall_users & gt_users))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  click_df["item_id"] = click_df["item_id"].astype(str)


✅ Recall user count: 424170
✅ Ground Truth user count: 26350
🔍 Overlap user count: 26350


In [36]:
import pandas as pd
from collections import defaultdict
from src.data.load_behavior_log import build_datetime

# ----------- Config -----------
RECALL_CSV = "output/youtube_dnn/youtube_dnn_faiss_recall.csv"
BEHAVIOR_CSV = "data/raw/user_behavior_log_info.csv"
ITEM_CSV = "data/raw/item_metadata.csv"
TOP_K = 10
GROUND_TRUTH_WINDOW_DAYS = 7
NUM_USERS_TO_CHECK = 10

# ----------- 1. Load recall results -----------
recall_df = pd.read_csv(RECALL_CSV)
recall_df["user_id"] = recall_df["user_id"].astype(str)
recall_df["item_id"] = recall_df["item_id"].astype(str)

# ----------- 2. Load item metadata -----------
item_df = pd.read_csv(ITEM_CSV)
valid_item_ids = set(item_df["item_id"].astype(str).unique())

# ----------- 3. Load ground truth -----------
df = pd.read_csv(BEHAVIOR_CSV)
df["dt"] = build_datetime(df["year"], df["time_stamp"], df["timestamp"])
df["item_id"] = df["item_id"].astype(str)
df["user_id"] = df["user_id"].astype(str)

max_dt = df["dt"].max()
min_dt = max_dt - pd.Timedelta(days=GROUND_TRUTH_WINDOW_DAYS)
click_df = df[
    (df["dt"] >= min_dt) &
    (df["dt"] <= max_dt) &
    (df["action_type"].str.lower() == "click") &
    (df["item_id"].isin(valid_item_ids))
].copy()

# Build ground truth dict
gt_dict = defaultdict(set)
for row in click_df.itertuples(index=False):
    gt_dict[row.user_id].add(row.item_id)

# ----------- 4. Analyze first N users with GT -----------
print(f"🧪 分析前 {NUM_USERS_TO_CHECK} 个 Ground Truth 用户:")

checked = 0
for user_id in gt_dict:
    if user_id not in recall_df["user_id"].values:
        continue

    recalled_items = set(
        recall_df[(recall_df["user_id"] == user_id) & (recall_df["rank"] <= TOP_K)]["item_id"]
    )
    clicked_items = gt_dict[user_id]
    hit_items = recalled_items & clicked_items
    missed_items = clicked_items - recalled_items

    print("=" * 40)
    print(f"👤 用户: {user_id}")
    print(f"✅ Ground Truth Clicked Items ({len(clicked_items)}): {clicked_items}")
    print(f"🎯 Recalled Items ({len(recalled_items)}): {recalled_items}")
    print(f"🔥 Hit Items: {hit_items}")
    print(f"❌ Missed Items: {missed_items}")

    checked += 1
    if checked >= NUM_USERS_TO_CHECK:
        break


🧪 分析前 10 个 Ground Truth 用户:
👤 用户: 328862
✅ Ground Truth Clicked Items (1): {'406349'}
🎯 Recalled Items (10): {'413046', '215089', '201361', '81766', '1089499', '67897', '333159', '671759', '492131', '815434'}
🔥 Hit Items: set()
❌ Missed Items: {'406349'}
👤 用户: 234512
✅ Ground Truth Clicked Items (4): {'524779', '758374', '233274', '137298'}
🎯 Recalled Items (10): {'413046', '215089', '201361', '81766', '1089499', '67897', '333159', '671759', '492131', '815434'}
🔥 Hit Items: set()
❌ Missed Items: {'524779', '758374', '137298', '233274'}
👤 用户: 356311
✅ Ground Truth Clicked Items (11): {'685857', '215289', '931066', '335377', '1016560', '758374', '913290', '998103', '830436', '492131', '269842'}
🎯 Recalled Items (10): {'413046', '215089', '201361', '487805', '922514', '1089499', '67897', '333159', '671759', '492131'}
🔥 Hit Items: {'492131'}
❌ Missed Items: {'685857', '215289', '931066', '335377', '1016560', '758374', '913290', '998103', '830436', '269842'}
👤 用户: 272389
✅ Ground Truth Clic