In [41]:
import pandas as pd
import os

# 切换到项目根目录或目标数据目录
os.chdir("/Users/dallylovely/Desktop/CCGG/Projects/recommendation_system")


In [44]:
import pandas as pd
from src.data.load_behavior_log import find_window_bounds
from src.recall.youtube_dnn.dataset_youtube_dnn import YoutubeDNNDataset

In [None]:
# 配置路径
USER_PROFILE_CSV = "data/processed/user_profile_feature.csv"
ITEM_METADATA_CSV = "data/raw/item_metadata.csv"
BEHAVIOR_CSV = "data/raw/user_behavior_log_info.csv"

# 时间窗口
_, cutoff_date, upper_date = find_window_bounds(
    BEHAVIOR_CSV, chunksize=100_000, days_window=30
)

# 初始化 Dataset
dataset = YoutubeDNNDataset(
    user_profile_csv=USER_PROFILE_CSV,
    item_csv=ITEM_METADATA_CSV,
    behavior_csv=BEHAVIOR_CSV,
    cutoff_date=cutoff_date,
    upper_date=upper_date,
    chunksize=100_000,
    num_negatives=4,
    max_text_feat_dim=300,
    filter_item_ids=None  # ✅ 默认取 metadata ∩ behavior
)

print("✅ Dataset 初始化完成")


2025-08-26 14:50:00,415 INFO:Max behavior date found: 2024-11-12 00:00:00, window from 2024-10-14 00:00:00 to 2024-11-13 00:00:00
2025-08-26 14:50:01,892 INFO:Loaded user profile from data/processed/user_profile_feature.csv, shape = (424170, 34)
2025-08-26 14:50:02,465 INFO:Successfully loaded file: data/raw/item_metadata.csv
2025-08-26 14:50:02,467 INFO:Dropped 0 rows with invalid item_id
2025-08-26 14:50:03,534 INFO:Filtered chunk by valid actions: 46421 → 46421
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk["user_id"] = chunk["user_id"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

In [46]:
# ----------------------------
# 打印数据信息
# ----------------------------

print("✅ Dataset 初始化完成")
print(f"样本总数 (正样本 + 负样本): {len(dataset)}")

# 用户数、物品数
n_users, n_items = dataset.get_num_users_items()
print(f"用户总数: {n_users}")
print(f"物品总数: {n_items}")

# 用户特征 vocab
print("用户类别特征 vocab sizes:", dataset.get_user_categorical_vocab_sizes())

# 数值特征均值和方差
print("用户数值特征 norm:", dataset.get_user_numeric_norm())

# item 文本特征维度
print(f"Item 文本 TF-IDF 向量维度: {dataset.get_item_text_dim()}")

✅ Dataset 初始化完成
样本总数 (正样本 + 负样本): 561910
用户总数: 424170
物品总数: 419
用户类别特征 vocab sizes: {'gender': 3, 'age_range': 9, 'city': 15, 'cluster_id': 4}
用户数值特征 norm: {'mean': array([ 1.4401301, 12.538259 ,  3.1723707], dtype=float32), 'std': array([ 2.690012 , 17.5567   ,  3.3856869], dtype=float32), 'cols': ['recency', 'frequency', 'actions_per_active_day_30d']}
Item 文本 TF-IDF 向量维度: 300


In [47]:
# ----------------------------
# 打印用户数对比
# ----------------------------
profile_users = dataset.users["user_id"].nunique()
print(f"用户画像文件 (profile.csv) 用户数: {profile_users}")

# 从原始行为日志中统计用户数（不过滤）
raw_behavior_users = pd.read_csv(
    BEHAVIOR_CSV,
    usecols=["user_id"],
    dtype=str,
    nrows=200_000  # ⚠️ 可以先采样部分加速
)["user_id"].nunique()
print(f"行为日志 (behavior.csv) 用户数 (采样): {raw_behavior_users}")

# 真实参与训练的用户数（三方交集后）
active_users = dataset.interactions["user_id"].nunique()
print(f"三方交集后 (有效交互用户) 用户数: {active_users}")

# ----------------------------
# 打印物品数对比
# ----------------------------
metadata_items = dataset.items["item_id"].nunique()
print(f"物品元数据 (metadata.csv) 物品数: {metadata_items}")

active_items = dataset.interactions["item_id"].nunique()
print(f"三方交集后 (有效交互物品) 物品数: {active_items}")

# ----------------------------
# 打印一个样本
# ----------------------------
print("\n👀 随机查看一个训练样本:")
print(dataset[0])

用户画像文件 (profile.csv) 用户数: 424170
行为日志 (behavior.csv) 用户数 (采样): 6601
三方交集后 (有效交互用户) 用户数: 39983
物品元数据 (metadata.csv) 物品数: 419
三方交集后 (有效交互物品) 物品数: 335

👀 随机查看一个训练样本:
(tensor(141468), tensor(25), tensor(1.), tensor(1), tensor(6), tensor(14), tensor(1), tensor([-0.1636, -0.3154, -0.4201]), tensor([0.0197, 0.0000, 0.0000, 0.0098, 0.0000, 0.0196, 0.0275, 0.0000, 0.0199,
        0.0179, 0.0000, 0.0000, 0.0000, 0.0000, 0.0758, 0.0294, 0.0474, 0.0427,
        0.0580, 0.2065, 0.0000, 0.0216, 0.0000, 0.0098, 0.0774, 0.0000, 0.0208,
        0.0581, 0.0432, 0.0207, 0.0196, 0.0414, 0.0593, 0.0228, 0.0000, 0.0684,
        0.0000, 0.0223, 0.0000, 0.0000, 0.0000, 0.0000, 0.0948, 0.0553, 0.0000,
        0.0145, 0.0230, 0.0000, 0.1084, 0.0000, 0.0000, 0.0000, 0.0000, 0.0904,
        0.0000, 0.0254, 0.0098, 0.0098, 0.0149, 0.0705, 0.0248, 0.0000, 0.0775,
        0.0220, 0.0242, 0.0000, 0.0000, 0.0192, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0233, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0474, 0.

In [39]:
# Step 2: 加载行为数据
behavior_df = pd.read_csv("data/raw/user_behavior_log_info.csv")

In [None]:
# Step 3: 构建 dt 时间列
mmd_str = behavior_df["time_stamp"].astype(str).str.zfill(4)
month = mmd_str.str[:2]
day = mmd_str.str[2:]
behavior_df["dt"] = pd.to_datetime(
    behavior_df["year"].astype(str) + "-" + month + "-" + day,
    errors='coerce'
) + pd.to_timedelta(behavior_df["timestamp"], unit="s")

In [None]:
# Step 4: 只取最近 7 天点击
max_dt = behavior_df["dt"].max()
min_dt = max_dt - pd.Timedelta(days=7)
click_df = behavior_df[
    (behavior_df["dt"] >= min_dt) &
    (behavior_df["dt"] <= max_dt) &
    (behavior_df["action_type"].str.lower() == "click")
]

In [None]:
# Step 5: 判断点击的 item 是否出现在 item_metadata 中
total_gt = len(click_df)
in_vocab = click_df["item_id"].astype(str).isin(valid_items).sum()
ratio = round(in_vocab / total_gt, 4) if total_gt > 0 else 0

print(f"✅ Ground Truth 总点击数: {total_gt}")
print(f"📦 其中出现在 item_metadata 的 item 数: {in_vocab}")
print(f"📉 命中比例: {ratio * 100:.2f}%")


✅ Ground Truth 总点击数: 3132633
📦 其中出现在 item_metadata 的 item 数: 89019
📉 命中比例: 2.84%


In [32]:
recall_df = pd.read_csv("output/youtube_dnn/youtube_dnn_faiss_recall.csv")
behavior_df = pd.read_csv("data/raw/user_behavior_log_info.csv")
item_df = pd.read_csv("data/raw/item_metadata.csv")

In [34]:
from src.data.load_behavior_log import build_datetime


In [35]:
# Ground truth 用户（点击行为）
behavior_df['dt'] = build_datetime(behavior_df['year'], behavior_df['time_stamp'], behavior_df['timestamp'])
gt_max = behavior_df["dt"].max()
gt_min = gt_max - pd.Timedelta(days=7)
click_df = behavior_df[
    (behavior_df["dt"] >= gt_min) &
    (behavior_df["dt"] <= gt_max) &
    (behavior_df["action_type"].str.lower() == "click")
]
click_df["item_id"] = click_df["item_id"].astype(str)
valid_item_ids = set(item_df["item_id"].astype(str).unique())
click_df = click_df[click_df["item_id"].isin(valid_item_ids)]
gt_users = set(click_df["user_id"].astype(str).unique())

# Recall 用户
recall_users = set(recall_df["user_id"].astype(str).unique())

# 打印交集情况
print("✅ Recall user count:", len(recall_users))
print("✅ Ground Truth user count:", len(gt_users))
print("🔍 Overlap user count:", len(recall_users & gt_users))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  click_df["item_id"] = click_df["item_id"].astype(str)


✅ Recall user count: 424170
✅ Ground Truth user count: 26350
🔍 Overlap user count: 26350


In [36]:
import pandas as pd
from collections import defaultdict
from src.data.load_behavior_log import build_datetime

# ----------- Config -----------
RECALL_CSV = "output/youtube_dnn/youtube_dnn_faiss_recall.csv"
BEHAVIOR_CSV = "data/raw/user_behavior_log_info.csv"
ITEM_CSV = "data/raw/item_metadata.csv"
TOP_K = 10
GROUND_TRUTH_WINDOW_DAYS = 7
NUM_USERS_TO_CHECK = 10

# ----------- 1. Load recall results -----------
recall_df = pd.read_csv(RECALL_CSV)
recall_df["user_id"] = recall_df["user_id"].astype(str)
recall_df["item_id"] = recall_df["item_id"].astype(str)

# ----------- 2. Load item metadata -----------
item_df = pd.read_csv(ITEM_CSV)
valid_item_ids = set(item_df["item_id"].astype(str).unique())

# ----------- 3. Load ground truth -----------
df = pd.read_csv(BEHAVIOR_CSV)
df["dt"] = build_datetime(df["year"], df["time_stamp"], df["timestamp"])
df["item_id"] = df["item_id"].astype(str)
df["user_id"] = df["user_id"].astype(str)

max_dt = df["dt"].max()
min_dt = max_dt - pd.Timedelta(days=GROUND_TRUTH_WINDOW_DAYS)
click_df = df[
    (df["dt"] >= min_dt) &
    (df["dt"] <= max_dt) &
    (df["action_type"].str.lower() == "click") &
    (df["item_id"].isin(valid_item_ids))
].copy()

# Build ground truth dict
gt_dict = defaultdict(set)
for row in click_df.itertuples(index=False):
    gt_dict[row.user_id].add(row.item_id)

# ----------- 4. Analyze first N users with GT -----------
print(f"🧪 分析前 {NUM_USERS_TO_CHECK} 个 Ground Truth 用户:")

checked = 0
for user_id in gt_dict:
    if user_id not in recall_df["user_id"].values:
        continue

    recalled_items = set(
        recall_df[(recall_df["user_id"] == user_id) & (recall_df["rank"] <= TOP_K)]["item_id"]
    )
    clicked_items = gt_dict[user_id]
    hit_items = recalled_items & clicked_items
    missed_items = clicked_items - recalled_items

    print("=" * 40)
    print(f"👤 用户: {user_id}")
    print(f"✅ Ground Truth Clicked Items ({len(clicked_items)}): {clicked_items}")
    print(f"🎯 Recalled Items ({len(recalled_items)}): {recalled_items}")
    print(f"🔥 Hit Items: {hit_items}")
    print(f"❌ Missed Items: {missed_items}")

    checked += 1
    if checked >= NUM_USERS_TO_CHECK:
        break


🧪 分析前 10 个 Ground Truth 用户:
👤 用户: 328862
✅ Ground Truth Clicked Items (1): {'406349'}
🎯 Recalled Items (10): {'413046', '215089', '201361', '81766', '1089499', '67897', '333159', '671759', '492131', '815434'}
🔥 Hit Items: set()
❌ Missed Items: {'406349'}
👤 用户: 234512
✅ Ground Truth Clicked Items (4): {'524779', '758374', '233274', '137298'}
🎯 Recalled Items (10): {'413046', '215089', '201361', '81766', '1089499', '67897', '333159', '671759', '492131', '815434'}
🔥 Hit Items: set()
❌ Missed Items: {'524779', '758374', '137298', '233274'}
👤 用户: 356311
✅ Ground Truth Clicked Items (11): {'685857', '215289', '931066', '335377', '1016560', '758374', '913290', '998103', '830436', '492131', '269842'}
🎯 Recalled Items (10): {'413046', '215089', '201361', '487805', '922514', '1089499', '67897', '333159', '671759', '492131'}
🔥 Hit Items: {'492131'}
❌ Missed Items: {'685857', '215289', '931066', '335377', '1016560', '758374', '913290', '998103', '830436', '269842'}
👤 用户: 272389
✅ Ground Truth Clic

In [48]:
import pandas as pd
import random

# 配置文件路径
RECALL_CSV = "output/youtube_dnn/youtube_dnn_faiss_recall.csv"
BEHAVIOR_CSV = "data/raw/user_behavior_log_info.csv"
ITEM_CSV = "data/raw/item_metadata.csv"

from src.data.load_behavior_log import build_datetime

# -------- 1. 加载数据 --------
recall_df = pd.read_csv(RECALL_CSV)
print(f"召回结果: {recall_df.shape}")

# ground truth 构建
behavior_df = pd.read_csv(BEHAVIOR_CSV)
item_df = pd.read_csv(ITEM_CSV)
valid_item_ids = set(item_df["item_id"].astype(str).unique())

behavior_df["dt"] = build_datetime(
    behavior_df["year"], behavior_df["time_stamp"], behavior_df["timestamp"]
)
max_dt = behavior_df["dt"].max()
min_dt = max_dt - pd.Timedelta(days=30)

click_df = behavior_df[
    (behavior_df["dt"] >= min_dt) &
    (behavior_df["dt"] <= max_dt) &
    (behavior_df["action_type"].str.lower() == "click")
].copy()
click_df["item_id"] = click_df["item_id"].astype(str)
click_df = click_df[click_df["item_id"].isin(valid_item_ids)]

gt_dict = click_df.groupby("user_id")["item_id"].apply(set).to_dict()
print(f"Ground Truth 用户数: {len(gt_dict)}")

# -------- 2. 抽样几个用户 --------
overlap_users = set(recall_df["user_id"].astype(str).unique()) & set(gt_dict.keys())
print(f"交集用户数: {len(overlap_users)}")

sample_users = random.sample(list(overlap_users), 5)

# -------- 3. 打印对比 --------
for u in sample_users:
    gt_items = list(gt_dict[u])[:10]
    rec_items = recall_df[recall_df["user_id"] == u].sort_values("rank")["item_id"].tolist()[:10]
    print(f"\n👤 User {u}")
    print(f"  Ground Truth items: {gt_items}")
    print(f"  Recall items: {rec_items}")
    print(f"  Hit: {set(gt_items) & set(rec_items)}")


召回结果: (399830, 5)
Ground Truth 用户数: 36029
交集用户数: 0


ValueError: Sample larger than population or is negative