# 🎯 排序模型训练与评估模块 (2_rank.ipynb)

## 📋 模块功能
实现**Learning to Rank**排序模型，将召回的候选商品进行精准排序，提升推荐质量。

## 🚀 核心功能
1. **🔧 特征工程**: 构造用户-商品交互特征
2. **🤖 模型训练**: LightGBM优先，RandomForest备选
3. **📊 模型评估**: Recall@50、NDCG@50等标准指标
4. **🔬 消融实验**: 量化各组件的贡献度
5. **💾 模型保存**: 完整的模型持久化

## 🔬 消融实验设计
- **A1**: 仅协同过滤 vs 多路召回（无排序）
- **A2**: 多路召回无排序 vs 多路召回+排序模型

## 🔧 输出文件
### 评估结果
- `metrics_ablation.csv`: 消融实验指标对比
- `feature_importance.csv`: 特征重要性分析

### 预测结果
- `mm_norank.parquet`: 多路召回无排序预测
- `mm_rank.parquet`: 多路召回+排序预测  
- `cm_norank.parquet`: 协同过滤无排序预测

### 模型文件
- `model_lgb.pkl`: 训练好的排序模型（LightGBM；不可用时为 RandomForest 备选模型，文件名保持不变；使用 joblib 序列化，供线上使用）

## 1️⃣ 环境配置与数据加载


In [None]:

# =============================================================================
# 环境配置与依赖导入
# =============================================================================
import os
import math
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# 机器学习相关
try:
    import lightgbm as lgb
    HAS_LIGHTGBM = True
    print("✅ LightGBM 可用")
except ImportError:
    HAS_LIGHTGBM = False
    print("⚠️  LightGBM 不可用，将使用 RandomForest")

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pickle

# Configuration: directory holding the parquet inputs; all outputs are written here too.
OUTDIR = '../x'
print(f'📁 输出目录: {OUTDIR}')
print(f'⏰ 开始时间: {datetime.now():%Y-%m-%d %H:%M:%S}')

# Inputs this notebook reads; each must already exist under OUTDIR.
required_files = [
    'train_vis.parquet',
    'label_df.parquet',
    'cands_multi.parquet',
    'cands_covisit_only.parquet',
]

print("🔍 检查数据文件...")
for file in required_files:
    file_path = f'{OUTDIR}/{file}'
    if os.path.exists(file_path):
        print(f"  ✅ {file}")
        continue
    # Fail fast on the first missing input instead of erroring later mid-pipeline.
    raise FileNotFoundError(f"❌ 缺少文件: {file}，请先运行前序notebook")

print("✅ 环境配置完成")
# Load the frames needed right away. train_vis has to be read here because the
# count statistics below are computed immediately, and its only other
# assignment appears further down in the file (running top to bottom would
# otherwise raise NameError).
train_vis = pd.read_parquet(f'{OUTDIR}/train_vis.parquet')
cands_covisit = pd.read_parquet(f'{OUTDIR}/cands_covisit_only.parquet')

# Generic count features derived from train_vis:
#   user_hist_cnt - number of train_vis rows per buyer_admin_id
#   item_pop_cnt  - number of train_vis rows per item_id
user_cnt = train_vis.groupby('buyer_admin_id').size().rename('user_hist_cnt')
item_cnt = train_vis.groupby('item_id').size().rename('item_pop_cnt')

def add_common_feats(df):
    """Attach the user-history and item-popularity counts to a candidate frame.

    Left-joins the module-level ``user_cnt`` series on ``buyer_admin_id`` and
    ``item_cnt`` on ``item_id``. Every missing value in the joined frame
    (not only in the two new columns) is then replaced by 0.

    Args:
        df: frame with ``buyer_admin_id`` and ``item_id`` columns.

    Returns:
        A new frame with ``user_hist_cnt`` and ``item_pop_cnt`` added.
    """
    with_user = df.merge(user_cnt, on='buyer_admin_id', how='left')
    with_item = with_user.merge(item_cnt, on='item_id', how='left')
    return with_item.fillna(0)

def add_label(df):
    """Mark each candidate row with a binary ``label`` column.

    Left-joins the module-level ``label_df`` on ``buyer_admin_id`` and sets
    ``label`` to 1 where the row's ``item_id`` equals the joined
    ``label_item``, else 0. The helper ``label_item`` column is dropped
    before returning.

    Args:
        df: frame with ``buyer_admin_id`` and ``item_id`` columns.

    Returns:
        A new frame with an integer ``label`` column.
    """
    joined = df.merge(label_df, on='buyer_admin_id', how='left')
    is_target = joined['item_id'] == joined['label_item']
    joined['label'] = is_target.astype(int)
    return joined.drop(columns=['label_item'])

def recall_at_k(df, k=50):
    """Recall@K over a scored candidate frame.

    For every ``buyer_admin_id`` the rows are ordered by ``pred`` (descending)
    and the first ``k`` are kept; the user counts as a hit when the largest
    ``label`` among those rows is 1.

    Args:
        df: frame with ``buyer_admin_id``, ``pred`` and ``label`` columns.
        k: number of top-scored rows inspected per user.

    Returns:
        Hits divided by the number of distinct users in ``df`` (the
        denominator is clamped to at least 1, so an empty frame yields 0).
    """
    n_users = df['buyer_admin_id'].nunique()
    hits = sum(
        rows.sort_values('pred', ascending=False).head(k)['label'].max()
        for _, rows in df.groupby('buyer_admin_id')
    )
    return hits / max(n_users, 1)

def ndcg_at_k(df, k=50):
    """NDCG@K over a scored candidate frame.

    For every ``buyer_admin_id`` the rows are ordered by ``pred`` (descending),
    the first ``k`` labels are discounted by ``1 / log2(position + 2)`` and
    summed. No explicit normalisation is applied: the ideal DCG is taken to
    be 1, as the original author noted for the leave-one-out setup.

    Args:
        df: frame with ``buyer_admin_id``, ``pred`` and ``label`` columns.
        k: number of top-scored rows inspected per user.

    Returns:
        Sum of the per-user DCG values divided by the number of distinct
        users (denominator clamped to at least 1).
    """
    n_users = df['buyer_admin_id'].nunique()
    total = 0.0
    for _, rows in df.groupby('buyer_admin_id'):
        top_labels = rows.sort_values('pred', ascending=False).head(k)['label'].tolist()
        gains = [rel / math.log2(pos + 2) for pos, rel in enumerate(top_labels)]
        total += sum(gains)
    return total / max(n_users, 1)
    

In [None]:
# =============================================================================
# Data loading
# =============================================================================
print("📖 正在加载数据...")

# Base tables: training interactions and per-user labels.
train_vis = pd.read_parquet(f'{OUTDIR}/train_vis.parquet')
label_df = pd.read_parquet(f'{OUTDIR}/label_df.parquet')

# Candidate tables: multi-source candidates and covisit-only candidates.
cands_multi = pd.read_parquet(f'{OUTDIR}/cands_multi.parquet')
cands_covisit = pd.read_parquet(f'{OUTDIR}/cands_covisit_only.parquet')

print("✅ 数据加载完成")
print("📊 数据规模:")
for caption, frame in (
    ("训练数据", train_vis),
    ("标签数据", label_df),
    ("多路候选", cands_multi),
    ("协同候选", cands_covisit),
):
    print(f"  {caption}: {frame.shape}")

# Data-quality summary: distinct users in each table.
print("\n🔍 数据质量检查:")
for caption, frame in (
    ("多路候选用户数", cands_multi),
    ("协同候选用户数", cands_covisit),
    ("标签用户数", label_df),
):
    print(f"  {caption}: {frame['buyer_admin_id'].nunique():,}")

# Consistency check: how many labelled users also appear in the multi-source candidates.
multi_users = set(cands_multi['buyer_admin_id'].unique())
label_users = set(label_df['buyer_admin_id'].unique())
overlap = len(multi_users.intersection(label_users))
n_label_users = len(label_users)
print(f"  用户重叠度: {overlap:,}/{n_label_users:,} ({overlap/n_label_users:.1%})")


## 2️⃣ 评估函数定义


In [None]:
# =============================================================================
# 推荐系统评估函数
# =============================================================================
def _recall_at_k_frame(df, k):
    """Recall@K for a scored frame with buyer_admin_id / pred / label columns."""
    n_users = df['buyer_admin_id'].nunique()
    hits = 0
    for _, rows in df.groupby('buyer_admin_id'):
        # A user is a hit when any of its k best-scored rows carries label 1.
        hits += rows.sort_values('pred', ascending=False).head(k)['label'].max()
    return hits / max(n_users, 1)


def recall_at_k(y_true, y_pred=None, k=50):
    """
    Compute Recall@K.

    Two call forms are supported, because this definition replaces the
    earlier DataFrame-based ``recall_at_k(df, k)`` of the same name while
    later cells still call it that way (e.g. ``recall_at_k(cm_nr, 50)``):

    * ``recall_at_k(y_true, y_pred, k)`` - mapping form.
    * ``recall_at_k(df, k)`` - frame form; ``df`` has ``buyer_admin_id``,
      ``pred`` and ``label`` columns, and the second positional argument
      (or ``k=``) is the cut-off.

    Args:
        y_true: ground truth (user_id -> item_id), or a scored DataFrame.
        y_pred: predictions (user_id -> [item_ids], best first); in the
            frame form this slot optionally carries K.
        k: Top-K cut-off.

    Returns:
        float: mapping form - hits / number of users in ``y_true`` (users
        missing from ``y_pred`` count as misses; 0.0 for empty ``y_true``).
        Frame form - hits / number of distinct users in the frame.

    Raises:
        TypeError: mapping form called without ``y_pred``.
    """
    if isinstance(y_true, pd.DataFrame):
        return _recall_at_k_frame(y_true, k if y_pred is None else y_pred)

    if y_pred is None:
        raise TypeError("recall_at_k() requires y_pred when y_true is a mapping")

    if len(y_true) == 0:
        return 0.0

    hits = sum(
        1
        for user_id, true_item in y_true.items()
        if user_id in y_pred and true_item in y_pred[user_id][:k]
    )
    return hits / len(y_true)

def _ndcg_at_k_frame(df, k):
    """NDCG@K for a scored frame with buyer_admin_id / pred / label columns."""
    n_users = df['buyer_admin_id'].nunique()
    total = 0.0
    for _, rows in df.groupby('buyer_admin_id'):
        labels = rows.sort_values('pred', ascending=False).head(k)['label'].tolist()
        # Ideal DCG is taken as 1 (single relevant item), so DCG is used directly.
        total += sum(rel / math.log2(pos + 2) for pos, rel in enumerate(labels))
    return total / max(n_users, 1)


def ndcg_at_k(y_true, y_pred=None, k=50):
    """
    Compute NDCG@K.

    Two call forms are supported, because this definition replaces the
    earlier DataFrame-based ``ndcg_at_k(df, k)`` of the same name while
    later cells still call it that way (e.g. ``ndcg_at_k(cm_nr, 50)``):

    * ``ndcg_at_k(y_true, y_pred, k)`` - mapping form.
    * ``ndcg_at_k(df, k)`` - frame form; ``df`` has ``buyer_admin_id``,
      ``pred`` and ``label`` columns, and the second positional argument
      (or ``k=``) is the cut-off.

    Args:
        y_true: ground truth (user_id -> item_id), or a scored DataFrame.
        y_pred: predictions (user_id -> [item_ids], best first); in the
            frame form this slot optionally carries K.
        k: Top-K cut-off.

    Returns:
        float: mapping form - mean over the users in ``y_true`` of
        ``1 / log2(rank + 2)`` for the first position where the true item
        appears in the top K (0 when absent or the user has no
        predictions); 0.0 for empty ``y_true``. Frame form - summed
        per-user DCG / number of distinct users in the frame.

    Raises:
        TypeError: mapping form called without ``y_pred``.
    """
    if isinstance(y_true, pd.DataFrame):
        return _ndcg_at_k_frame(y_true, k if y_pred is None else y_pred)

    if y_pred is None:
        raise TypeError("ndcg_at_k() requires y_pred when y_true is a mapping")

    if len(y_true) == 0:
        return 0.0

    ndcg_sum = 0.0
    for user_id, true_item in y_true.items():
        if user_id not in y_pred:
            continue
        for rank, item in enumerate(y_pred[user_id][:k]):
            if item == true_item:
                # rank + 2 because ranks are 0-based and log2(1) == 0.
                # There is exactly one relevant item, so IDCG == 1 and
                # the discounted gain already is the NDCG contribution.
                ndcg_sum += 1.0 / math.log2(rank + 2)
                break

    return ndcg_sum / len(y_true)

def evaluate_predictions(cands_df, label_df, score_col='pre_score', k=50):
    """
    Evaluate ranked candidates against the ground-truth labels.

    Candidates are ordered per user by ``score_col`` (descending), turned
    into ``user_id -> [item_ids]`` lists and scored with ``recall_at_k`` and
    ``ndcg_at_k``. Progress and results are also printed.

    Args:
        cands_df: candidate DataFrame with buyer_admin_id, item_id and
            ``score_col`` columns.
        label_df: label DataFrame with buyer_admin_id and label_item columns.
        score_col: name of the score column used for ordering.
        k: Top-K cut-off.

    Returns:
        dict: ``recall@{k}``, ``ndcg@{k}``, ``coverage`` (share of labelled
        users that have at least one candidate; 0.0 when there are no
        labelled users), ``total_users`` and ``pred_users``.
    """
    print(f"📊 评估预测结果 (K={k}, score_col={score_col})...")

    # Ground truth: user_id -> label item.
    y_true = dict(zip(label_df['buyer_admin_id'], label_df['label_item']))

    # Predictions: user_id -> item ids, best score first.
    cands_sorted = cands_df.sort_values(['buyer_admin_id', score_col], ascending=[True, False])
    y_pred = {
        user_id: group['item_id'].tolist()
        for user_id, group in cands_sorted.groupby('buyer_admin_id')
    }

    recall = recall_at_k(y_true, y_pred, k)
    ndcg = ndcg_at_k(y_true, y_pred, k)

    # Coverage; guarded so an empty label frame yields 0.0 instead of a
    # ZeroDivisionError, matching the metric helpers' empty-input result.
    pred_users = set(y_pred.keys())
    true_users = set(y_true.keys())
    coverage = len(pred_users & true_users) / len(true_users) if true_users else 0.0

    metrics = {
        f'recall@{k}': recall,
        f'ndcg@{k}': ndcg,
        'coverage': coverage,
        'total_users': len(true_users),
        'pred_users': len(pred_users)
    }

    print(f"  📈 Recall@{k}: {recall:.4f}")
    print(f"  📈 NDCG@{k}: {ndcg:.4f}")
    print(f"  📊 覆盖率: {coverage:.4f}")

    return metrics

print("✅ 评估函数定义完成")


In [None]:

# === Prepare data: label the candidates, then attach the common count features ===
cm = add_common_feats(add_label(cands_covisit.copy()))
mm = add_common_feats(add_label(cands_multi.copy()))

# A1: no-rank baselines - the recall-stage pre_score is used directly as pred.
cm_nr = cm.assign(pred=cm['pre_score'])
mm_nr = mm.assign(pred=mm['pre_score'])

mA1 = {
    setting: {
        'recall_at_50': round(recall_at_k(scored, 50), 6),
        'ndcg_at_50': round(ndcg_at_k(scored, 50), 6),
    }
    for setting, scored in (
        ('covisit_only_NoRank', cm_nr),
        ('multi_NoRank', mm_nr),
    )
}
mA1
    

In [None]:

# === A2: Multi + Ranker ===
# Columns of mm fed to the ranker.
feat_cols = ['score_rebuy','score_covisit','is_cate_hot','is_store_hot','is_global_pop',
             'src_count','user_hist_cnt','item_pop_cnt','pre_score']

# Prefer LightGBM; fall back to RandomForest when it cannot be imported.
# The broad catch is kept as a deliberate best-effort fallback (presumably to
# also cover non-ImportError failures while loading the package).
try:
    from lightgbm import LGBMClassifier
    has_lgbm = True
except Exception:
    has_lgbm = False
    from sklearn.ensemble import RandomForestClassifier as RFC

if has_lgbm:
    model = LGBMClassifier(n_estimators=400, learning_rate=0.05,
                           num_leaves=63, subsample=0.8, colsample_bytree=0.8,
                           random_state=2025)
else:
    model = RFC(n_estimators=300, random_state=2025, n_jobs=-1)

X = mm[feat_cols].values
y = mm['label'].values
model.fit(X, y)

# NOTE(review): the model is scored below on the same rows it was fitted on,
# so the A2 metrics are in-sample; confirm this is intended for the ablation.
# Score with the same ndarray used for fitting: passing a DataFrame to a model
# fitted without feature names triggers a feature-name mismatch warning
# (hidden here by the global warnings filter).
mm_rank = mm.copy()
if hasattr(model, 'predict_proba'):
    mm_rank['pred'] = model.predict_proba(X)[:,1]
elif hasattr(model, 'decision_function'):
    v = model.decision_function(X)
    # Min-max scale raw margins into [0, 1); epsilon avoids division by zero.
    mm_rank['pred'] = (v - v.min())/(v.max()-v.min()+1e-9)
else:
    mm_rank['pred'] = model.predict(X).astype(float)

mA2 = {
    'multi_Ranker': dict(
        recall_at_50 = round(recall_at_k(mm_rank, 50), 6),
        ndcg_at_50   = round(ndcg_at_k(mm_rank, 50), 6)
    )
}
mA2
    

In [None]:

# === Persist metrics, per-row predictions, the model and feature importances ===
import pandas as pd
import numpy as np
import os
import joblib

# One row per ablation setting.
metrics = pd.DataFrame([
    {'setting': 'A1_covisit_only_NoRank', **mA1['covisit_only_NoRank']},
    {'setting': 'A1_multi_NoRank', **mA1['multi_NoRank']},
    {'setting': 'A2_multi_Ranker', **mA2['multi_Ranker']},
])
metrics.to_csv(f'{OUTDIR}/metrics_ablation.csv', index=False)

# Row-level predictions for each setting.
for scored_frame, parquet_name in (
    (mm_nr, 'mm_norank.parquet'),
    (mm_rank, 'mm_rank.parquet'),
    (cm_nr, 'cm_norank.parquet'),
):
    scored_frame.to_parquet(f'{OUTDIR}/{parquet_name}', index=False)

# Save the model under one fixed file name so that 4_online can load it automatically.
model_path = os.path.join(OUTDIR, 'model_lgb.pkl')
joblib.dump(model, model_path)
print('Model saved to', model_path)

# Feature importances, when the fitted model exposes them.
fi_path = os.path.join(OUTDIR, 'feature_importance.csv')
if not hasattr(model, 'feature_importances_'):
    print('Model has no feature_importances_; skip.')
else:
    importances = model.feature_importances_
    importance_table = pd.DataFrame({'feature': feat_cols, 'importance': importances})
    importance_table.sort_values('importance', ascending=False).to_csv(fi_path, index=False)
    print('Feature importance saved to', fi_path)

print('\nMetrics:')
print(metrics)
    