In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from recbole.quick_start import run_recbole
from recbole.quick_start import load_data_and_model
import pandas as pd
import os

# Invoke RecBole's quick-start entry point with the settings in ml-1m-base.yaml.
# NOTE(review): presumably this trains/evaluates the model and writes the
# checkpoint that later cells load — confirm against the YAML config.
run_recbole(config_file_list=['ml-1m-base.yaml'])

In [None]:
def save_interaction_for_training(inter_feat, dataset, save_path):
    """Write an interaction table to a tab-separated file with typed headers.

    The user and item ID columns are converted from internal IDs back to
    their original tokens via ``dataset.id2token``; the time and label
    columns are written as-is under ``timestamp:float`` / ``label:float``.

    The time/label field names are taken from ``dataset`` itself (like the
    user/item field names) rather than from a notebook-level ``config``
    global, so the function no longer depends on a variable that is only
    defined in a later statement of the notebook.

    Args:
        inter_feat: Object exposing ``.interaction`` (a mapping of field name
            to column) and ``inter_feat[field]`` returning something with a
            ``.numpy()`` method.
        dataset: Object exposing ``uid_field``, ``iid_field``, ``time_field``,
            ``label_field`` and ``id2token(field, ids)``.
        save_path: Destination file path; the file is overwritten.
    """
    time_field = dataset.time_field
    label_field = dataset.label_field

    columns = {}
    for field in inter_feat.interaction.keys():
        values = inter_feat[field].numpy()
        if field == dataset.uid_field:
            columns['user_id:token'] = dataset.id2token(field, values.tolist())
        elif field == dataset.iid_field:
            columns['item_id:token'] = dataset.id2token(field, values.tolist())
        elif field == time_field:
            columns['timestamp:float'] = values
        elif field == label_field:
            columns['label:float'] = values
        else:
            # NOTE(review): any other field is written under its bare name,
            # without a ":type" suffix — confirm downstream readers accept that.
            columns[field] = values
    pd.DataFrame(columns).to_csv(save_path, sep='\t', index=False)

# Load the saved checkpoint together with its config, dataset and data loaders.
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='../checkpoint_saved/ml-1m/BPR-Jun-08-2025_14-26-19.pth'
)

# Pull the interaction table out of each split's data loader.
train_inter, valid_inter, test_inter = (
    loader.dataset.inter_feat for loader in (train_data, valid_data, test_data)
)

# Destination directory for the exported split files.
save_dir = '../datasets/split_datasets/ml-1m'
os.makedirs(save_dir, exist_ok=True)

# Write one .inter file per split.
for split_inter, split_file in (
    (train_inter, 'ml-1m.train.inter'),
    (valid_inter, 'ml-1m.valid.inter'),
    (test_inter, 'ml-1m.test.inter'),
):
    save_interaction_for_training(split_inter, dataset, os.path.join(save_dir, split_file))
print("Train, valid, test datasets saved successfully.")

In [None]:
from recbole.quick_start import load_data_and_model
from recbole.utils.case_study import full_sort_topk
from tqdm import tqdm
import pandas as pd

import os

import numpy as np

# 1. Load the saved checkpoint together with its config, dataset and data loaders.
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='../checkpoint_saved/ml-1m/BPR-Jun-08-2025_20-35-43.pth'
)

# Step 1: Enumerate every internal user ID.
all_uids = list(range(dataset.user_num))

# Step 2: Keep only users whose uid2history_item entry is set.
# NOTE(review): presumably the entry is None for users absent from the test
# split (and for the padding ID) — confirm against the data loader.
valid_uids = [uid for uid in tqdm(all_uids) if test_data.uid2history_item[uid] is not None]

# Step 3: Convert to an integer array; the explicit dtype keeps an empty
# selection usable as an index instead of defaulting to float64.
uid_series = np.array(valid_uids, dtype=np.int64)

# Step 4: Score all items for the selected users and keep the top 30.
topk_scores, topk_index = full_sort_topk(uid_series, model, test_data, k=30, device=config['device'])

# Step 5: Convert internal item/user IDs back to their external tokens.
external_item_lists = [dataset.id2token(dataset.iid_field, row.cpu().tolist()) for row in topk_index]
# One vectorised lookup instead of one id2token call per user.
external_user_list = list(dataset.id2token(dataset.uid_field, uid_series))

# Step 6: Assemble one row per user with a comma-joined item list and save it.
df = pd.DataFrame({
    'user_id': external_user_list,
    'topk_items': [','.join(items) for items in external_item_lists]
})
display(df.head())
output_path = 'outputs/ml_all_user_top30.csv'
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print("save ml_all_user_top30 successfully")

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Step 1: Load the saved recommendations and the user -> gender lookup.
topk_df = pd.read_csv('outputs/ml_all_user_top30.csv')
user_df = pd.read_csv('../datasets/atomic_datasets/ml-1m/ml-1m.user', sep='\t')
user2gender = dict(zip(user_df['user_id:token'], user_df['gender:token']))

# Step 2: Load the test split and keep positive feedback only (label == 1.0),
# then collect each user's ground-truth item set.
# NOTE(review): this path differs from '../datasets/split_datasets/ml-1m',
# where the split files are written earlier in the notebook — confirm it
# points at the intended test file.
test_df = pd.read_csv('datasets/ml-1m/ml-1m.test.inter', sep='\t')
test_df = test_df.loc[test_df['label:float'].eq(1.0)]
user2ground_truth = (
    test_df
    .groupby('user_id:token')['item_id:token']
    .agg(set)
    .to_dict()
)

# Step 3: NDCG@k helper.
def ndcg_at_k(preds, true_items, k=10):
    """Compute binary-relevance NDCG over the first ``k`` predictions.

    Args:
        preds: Ranked sequence of predicted items (best first).
        true_items: Sized container of relevant items supporting ``in``.
        k: Cut-off rank.

    Returns:
        DCG of the hits within ``preds[:k]`` divided by the ideal DCG for
        ``min(len(true_items), k)`` relevant items, or ``0.0`` when the
        ideal DCG is not positive.
    """
    # Discounted gain of every hit, in rank order (rank 0 -> 1 / log2(2)).
    hit_gains = [
        1.0 / np.log2(rank + 2)
        for rank, candidate in enumerate(preds[:k])
        if candidate in true_items
    ]
    achieved = sum(hit_gains, 0.0)

    # Best achievable DCG: all relevant items packed at the top ranks.
    best_hits = min(len(true_items), k)
    ideal = sum(1.0 / np.log2(rank + 2) for rank in range(best_hits))

    if ideal > 0:
        return achieved / ideal
    return 0.0

# Step 4: Per-gender aggregation of NDCG@10.
def _safe_mean(values):
    """Return the mean of ``values``, or NaN for an empty list (no warning)."""
    return float(np.mean(values)) if len(values) else float('nan')


ndcg_male, ndcg_female, ndcg_all = [], [], []

for _, row in topk_df.iterrows():
    uid = row['user_id']
    gender = user2gender.get(uid, None)
    # Compare items as strings so the match does not depend on how pandas
    # inferred the ID column dtypes in the two CSV files.
    true_items = {str(item) for item in user2ground_truth.get(uid, set())}

    # Skip users without a known gender or without any positive test item.
    if gender not in ('M', 'F') or not true_items:
        continue

    # 'topk_items' is a comma-joined token list; split it explicitly rather
    # than eval()-ing file content, which is unsafe and breaks on single-item
    # rows, non-numeric tokens, or IDs with leading zeros.
    pred_items = [tok.strip() for tok in str(row['topk_items']).split(',') if tok.strip()]

    ndcg = ndcg_at_k(pred_items, true_items, k=10)
    (ndcg_male if gender == 'M' else ndcg_female).append(ndcg)
    ndcg_all.append(ndcg)

# Step 5: Report the averages and the number of users in each group.
print(f'NDCG@10 (All):    {_safe_mean(ndcg_all):.4f} over {len(ndcg_all)} users')
print(f'NDCG@10 (Male):   {_safe_mean(ndcg_male):.4f} over {len(ndcg_male)} users')
print(f'NDCG@10 (Female): {_safe_mean(ndcg_female):.4f} over {len(ndcg_female)} users')