# 2) 排序训练 + 线下评估 & 消融（含模型保存）

- 读取第 0、1 步的产物：`train_vis.parquet`、`label_df.parquet`、`cands_multi.parquet`、`cands_covisit_only.parquet`
- 构造特征并训练排序模型（LightGBM 优先，无法使用则降级 RandomForest）
- 评估：Recall@50 / NDCG@50
- 消融：
  - A1：仅共现（No-Rank） vs 多路（No-Rank）
  - A2：多路（No-Rank） vs 多路（Ranker）
- **保存**：
  - 指标：`metrics_ablation.csv`
  - 预测明细：`mm_norank.parquet / mm_rank.parquet / cm_norank.parquet`
  - 模型：`model_lgb.pkl`（无论训练的是 LightGBM 还是降级后的 RandomForest，都保存为该文件名，便于 4_online 统一加载）
  - 特征重要性（若可用）：`feature_importance.csv`

In [None]:

import os, math, pandas as pd, numpy as np

# === Output directory (edit this one constant to relocate every input and output) ===
OUTDIR = '/Users/ringscherry/Desktop'
print('OUTDIR =', OUTDIR)

# Artifacts read from OUTDIR, loaded in a fixed order and unpacked into the
# four module-level frames used by the rest of the notebook.
train_vis, label_df, cands_multi, cands_covisit = (
    pd.read_parquet(f'{OUTDIR}/{stem}.parquet')
    for stem in ('train_vis', 'label_df', 'cands_multi', 'cands_covisit_only')
)

# Generic count features derived from train_vis: number of rows per buyer and
# number of rows per item. The Series names become the feature column names
# once they are merged onto a candidate frame.
user_cnt = train_vis.groupby('buyer_admin_id').size()
user_cnt.name = 'user_hist_cnt'
item_cnt = train_vis.groupby('item_id').size()
item_cnt.name = 'item_pop_cnt'

def add_common_feats(df):
    """Attach the per-buyer and per-item count features to a candidate frame.

    Left-joins the module-level ``user_cnt`` (keyed by ``buyer_admin_id``) and
    ``item_cnt`` (keyed by ``item_id``) onto ``df``. After both joins, every
    missing value in the whole frame -- not only in the two new columns -- is
    replaced with 0.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    with_user = df.merge(user_cnt, how='left', on='buyer_admin_id')
    with_item = with_user.merge(item_cnt, how='left', on='item_id')
    return with_item.fillna(0)

def add_label(df):
    """Mark which candidate rows match the buyer's labelled item.

    Left-joins the module-level ``label_df`` onto ``df`` by ``buyer_admin_id``
    and adds an integer ``label`` column that is 1 where ``item_id`` equals the
    joined ``label_item`` and 0 otherwise (including buyers with no label row).
    The helper column ``label_item`` is removed from the result.
    """
    joined = pd.merge(df, label_df, how='left', on='buyer_admin_id')
    is_hit = joined['item_id'].eq(joined['label_item'])
    joined['label'] = is_hit.astype(int)
    return joined.drop(columns='label_item')

def recall_at_k(df, k=50):
    """Return the share of buyers with a positive row among their top-k predictions.

    For each ``buyer_admin_id`` the rows are ordered by ``pred`` (highest
    first), the first ``k`` are kept, and the maximum ``label`` among them is
    added to the hit total (1 when a positive is present for 0/1 labels).

    The denominator is the number of distinct buyers present in ``df``; buyers
    that have no rows in ``df`` are therefore not counted. An empty frame
    yields 0 rather than a division error.
    """
    n_buyers = df['buyer_admin_id'].nunique()
    hits = sum(
        rows.sort_values('pred', ascending=False)['label'].iloc[:k].max()
        for _, rows in df.groupby('buyer_admin_id')
    )
    return hits / max(n_buyers, 1)

def ndcg_at_k(df, k=50):
    """Return the mean DCG@k over buyers, used as NDCG@k.

    For each ``buyer_admin_id`` the rows are ordered by ``pred`` (highest
    first) and the first ``k`` labels contribute ``label / log2(rank + 1)``
    with ranks starting at 1. The per-buyer DCG is not divided by an ideal
    DCG: the code treats IDCG as 1 (leave-one-out setup, a single relevant
    item per buyer).

    The denominator is the number of distinct buyers present in ``df``; an
    empty frame yields 0.0.
    """
    n_buyers = df['buyer_admin_id'].nunique()
    dcg_total = 0.0
    for _, rows in df.groupby('buyer_admin_id'):
        ranked = rows.sort_values('pred', ascending=False)
        top_labels = ranked['label'].iloc[:k].tolist()
        # enumerate from 2 so the first position is discounted by log2(2) == 1.
        dcg_total += sum(rel / math.log2(pos) for pos, rel in enumerate(top_labels, start=2))
    return dcg_total / max(n_buyers, 1)
    

In [None]:

# === Prepare data ===
# Label each candidate set, then attach the shared count features.
cm = add_common_feats(add_label(cands_covisit.copy()))
mm = add_common_feats(add_label(cands_multi.copy()))

# A1: no-rank baselines -- the recall-stage pre_score is used directly as the prediction.
cm_nr = cm.assign(pred=cm['pre_score'])
mm_nr = mm.assign(pred=mm['pre_score'])

# Same two metrics for both candidate sets, rounded to 6 decimals.
mA1 = {
    setting: {
        'recall_at_50': round(recall_at_k(frame, 50), 6),
        'ndcg_at_50': round(ndcg_at_k(frame, 50), 6),
    }
    for setting, frame in (('covisit_only_NoRank', cm_nr), ('multi_NoRank', mm_nr))
}
mA1
    

In [None]:

# === A2: multi-source candidates + learned ranker ===
# Feature columns fed to the ranker, in the order the model is trained on.
feat_cols = ['score_rebuy', 'score_covisit', 'is_cate_hot', 'is_store_hot', 'is_global_pop',
             'src_count', 'user_hist_cnt', 'item_pop_cnt', 'pre_score']

# Prefer LightGBM; fall back to a scikit-learn random forest when it cannot be
# imported. The catch is intentionally broad (presumably so that a broken
# native install, not only a missing package, still triggers the fallback).
try:
    from lightgbm import LGBMClassifier
    has_lgbm = True
except Exception:
    has_lgbm = False
    from sklearn.ensemble import RandomForestClassifier as RFC

if has_lgbm:
    model = LGBMClassifier(n_estimators=400, learning_rate=0.05,
                           num_leaves=63, subsample=0.8, colsample_bytree=0.8,
                           random_state=2025)
else:
    model = RFC(n_estimators=300, random_state=2025, n_jobs=-1)

# The model is fitted on a plain ndarray, so it carries no feature names.
X = mm[feat_cols].values
y = mm['label'].values
model.fit(X, y)

# Score with the same ndarray that was used for fitting. Passing the DataFrame
# here instead would mix input types between fit and predict and makes
# scikit-learn warn about feature names the model was never fitted with.
#
# NOTE(review): the ranker is scored on the very rows it was fitted on, so the
# A2 metrics below are in-sample and likely optimistic compared with the A1
# baselines. Consider a buyer-level hold-out or out-of-fold predictions.
mm_rank = mm.copy()
if hasattr(model, 'predict_proba'):
    # Column 1 is taken as the positive-class probability.
    mm_rank['pred'] = model.predict_proba(X)[:, 1]
elif hasattr(model, 'decision_function'):
    # Min-max scale raw margins into [0, 1); the epsilon avoids 0/0 when all margins are equal.
    v = model.decision_function(X)
    mm_rank['pred'] = (v - v.min()) / (v.max() - v.min() + 1e-9)
else:
    mm_rank['pred'] = model.predict(X).astype(float)

mA2 = {
    'multi_Ranker': dict(
        recall_at_50 = round(recall_at_k(mm_rank, 50), 6),
        ndcg_at_50   = round(ndcg_at_k(mm_rank, 50), 6)
    )
}
mA2
    

In [None]:

# === 保存指标、明细、模型、特征重要性 ===
import pandas as pd, numpy as np, os, joblib

# One row per ablation setting: the setting name followed by its metric columns.
metrics = pd.DataFrame([
    {'setting': 'A1_covisit_only_NoRank', **mA1['covisit_only_NoRank']},
    {'setting': 'A1_multi_NoRank', **mA1['multi_NoRank']},
    {'setting': 'A2_multi_Ranker', **mA2['multi_Ranker']},
])
metrics.to_csv(f'{OUTDIR}/metrics_ablation.csv', index=False)

# Per-candidate prediction tables for each setting.
mm_nr.to_parquet(f'{OUTDIR}/mm_norank.parquet', index=False)
mm_rank.to_parquet(f'{OUTDIR}/mm_rank.parquet', index=False)
cm_nr.to_parquet(f'{OUTDIR}/cm_norank.parquet', index=False)

# Persist the model under one fixed file name, whichever estimator was trained,
# so that 4_online can load it automatically.
model_path = os.path.join(OUTDIR, 'model_lgb.pkl')
joblib.dump(model, model_path)
print('Model saved to', model_path)

# Feature importance is written only when the estimator exposes it.
fi_path = os.path.join(OUTDIR, 'feature_importance.csv')
if not hasattr(model, 'feature_importances_'):
    print('Model has no feature_importances_; skip.')
else:
    importances = model.feature_importances_
    (
        pd.DataFrame({'feature': feat_cols, 'importance': importances})
        .sort_values('importance', ascending=False)
        .to_csv(fi_path, index=False)
    )
    print('Feature importance saved to', fi_path)

print('\nMetrics:')
print(metrics)
    