# 1) 多路召回（加速版）
本 Notebook 是对原来的 `1_recall.ipynb` 的**性能优化替换**：
- 新增 **FAST_MODE**（小参数先冒烟）
- 压缩 `dtype` 降内存
- 预计算：共现邻接表 / 最近K / 用户偏好槽位 / 热榜字典 / 复购映射
- 候选构建改为**纯字典 + numpy 批处理**（避免频繁 `merge/iterrows`）
- 输出与原版一致：统计表 + 两套候选（多路 vs 仅共现）

In [1]:

import os, pandas as pd, numpy as np

# 输入/输出目录
OUTDIR = '../x'
assert os.path.exists(f'{OUTDIR}/train_vis.parquet'), "请先运行 0_prep.ipynb"
train_vis = pd.read_parquet(f'{OUTDIR}/train_vis.parquet')
label_df  = pd.read_parquet(f'{OUTDIR}/label_df.parquet')
item_attr = pd.read_parquet(f'{OUTDIR}/item_attr.parquet')

# 进度条（无则优雅退化）
try:
    from tqdm import tqdm
except Exception:
    def tqdm(x, **k): return x

# 参数
PARAMS = dict(
    covisit_window=3,           # 共现滑窗(3或5)
    covisit_top_per_a=200,      # 每个a保留TopK出边
    recent_k=5,                 # 最近K个item作为共现起点
    cand_per_recent=40,         # 每个起点取TopN下游
    tau_days=14,                # 复购时间衰减(天)
    user_top_cates=3,           # 用户偏好类目TopN
    user_top_stores=3,          # 用户偏好店铺TopN
    per_cate_pool=80,           # 每个偏好类目热门池
    per_store_pool=60,          # 每个偏好店铺热门池
    pop_pool=2000,              # 全局热门池大小
    recall_cap=600,             # 单用户候选上限
)

# —— 快车模式（先跑通，再关掉恢复全量） ——
FAST_MODE = True
if FAST_MODE:
    PARAMS.update({
        'covisit_top_per_a': 120,
        'recent_k': 3,
        'cand_per_recent': 24,
        'per_cate_pool': 40,
        'per_store_pool': 40,
        'pop_pool': 1000,
        'recall_cap': 400,
    })

# 压缩整数类型，减少内存/IO
for df in (train_vis, item_attr):
    for c in ('buyer_admin_id','item_id'):
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], downcast='integer')
    for c in ('cate_id','store_id'):
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], downcast='integer')

print("train_vis rows:", len(train_vis), " users:", train_vis['buyer_admin_id'].nunique())
    

train_vis rows: 6506700  users: 483117


# --- 复购评分 ---

In [None]:

def time_decay(days, tau=14.0):
    days = np.maximum(days, 0.0)
    return np.exp(-days / float(tau))

def build_rebuy_scores(df, tau_days=14):
    g = df.copy()
    ref = g.groupby('buyer_admin_id')['create_order_time'].transform('max')
    g['days_ago'] = (ref - g['create_order_time']).dt.days.clip(lower=0)
    g['score_rebuy'] = time_decay(g['days_ago'].to_numpy(), tau=tau_days)
    return (g.groupby(['buyer_admin_id','item_id'])['score_rebuy']
              .sum().reset_index())
    

# --- 共现图 a->b ---

In [None]:

def build_covisit(df, W=3, topk=200):
    base = df[['buyer_admin_id','item_id']].copy()
    pairs = []
    for lag in range(1, W+1):
        t = base.copy()
        t['item_b'] = t.groupby('buyer_admin_id')['item_id'].shift(-lag)
        t = t.dropna().rename(columns={'item_id':'item_a'})
        t['w'] = 1.0/lag
        pairs.append(t[['item_a','item_b','w']])
    if not pairs:
        return pd.DataFrame(columns=['item_a','item_b','w'])
    co = pd.concat(pairs, ignore_index=True)
    co = co.groupby(['item_a','item_b'])['w'].sum().reset_index()
    co['rn'] = co.groupby('item_a')['w'].rank(ascending=False, method='first')
    return co[co['rn']<=topk].drop(columns='rn')
    

# --- 热门池（全局/类目/店铺） ---

In [None]:


def build_pop_pools(df, item_attr, pop_pool=2000):
    pop = df.groupby('item_id').size().rename('pop').reset_index()

    cate_pop = (df.merge(item_attr, on='item_id', how='left')
                .groupby(['cate_id','item_id']).size().rename('pop').reset_index())
    cate_pop['rn'] = cate_pop.groupby('cate_id')['pop'].rank(ascending=False, method='first')

    store_pop = (df.merge(item_attr, on='item_id', how='left')
                 .groupby(['store_id','item_id']).size().rename('pop').reset_index())
    store_pop['rn'] = store_pop.groupby('store_id')['pop'].rank(ascending=False, method='first')

    global_pop = pop.sort_values('pop', ascending=False).head(pop_pool)
    return cate_pop, store_pop, global_pop
    

# --- 构建统计 ---

In [5]:

rebuy = build_rebuy_scores(train_vis, tau_days=PARAMS['tau_days'])
covisit = build_covisit(train_vis, W=PARAMS['covisit_window'], topk=PARAMS['covisit_top_per_a'])
cate_pop, store_pop, global_pop = build_pop_pools(train_vis, item_attr, pop_pool=PARAMS['pop_pool'])

# 保存统计表（snappy 压缩更快）
rebuy.to_parquet(f'{OUTDIR}/rebuy.parquet', index=False, compression='snappy')
covisit.to_parquet(f'{OUTDIR}/covisit.parquet', index=False, compression='snappy')
cate_pop.to_parquet(f'{OUTDIR}/cate_pop.parquet', index=False, compression='snappy')
store_pop.to_parquet(f'{OUTDIR}/store_pop.parquet', index=False, compression='snappy')
global_pop.to_parquet(f'{OUTDIR}/global_pop.parquet', index=False, compression='snappy')

print("stats saved.")
    

stats saved.


# --- 预计算映射：显著加速候选构建 ---

In [6]:


P = PARAMS  # 简写

# 1) 共现邻接 a -> (b[], w[])
cov_neighbors = {}
for a, g in covisit.groupby('item_a'):
    sub = g[['item_b','w']].head(P['cand_per_recent']).to_numpy()
    if len(sub):
        cov_neighbors[int(a)] = (sub[:,0].astype('int64'), sub[:,1].astype('float32'))

# 2) 每用户最近K个item
recent_map = (train_vis.sort_values('create_order_time')
              .groupby('buyer_admin_id')['item_id']
              .apply(lambda s: s.tail(P['recent_k']).to_numpy('int64'))
              ).to_dict()

# 3) 用户偏好类目/店铺 TopN
ua = train_vis.merge(item_attr, on='item_id', how='left')
user_topc = ua.groupby('buyer_admin_id')['cate_id']               .apply(lambda s: s.value_counts().head(P['user_top_cates']).index.to_numpy('int64')).to_dict()
user_tops = ua.groupby('buyer_admin_id')['store_id']               .apply(lambda s: s.value_counts().head(P['user_top_stores']).index.to_numpy('int64')).to_dict()

# 4) 类目/店铺热榜池预展平
cate_top = {int(c): grp.loc[grp['rn']<=P['per_cate_pool'],'item_id'].to_numpy('int64')
            for c, grp in cate_pop.groupby('cate_id')}
store_top = {int(s): grp.loc[grp['rn']<=P['per_store_pool'],'item_id'].to_numpy('int64')
             for s, grp in store_pop.groupby('store_id')}
global_items = global_pop['item_id'].to_numpy('int64')

# 5) 复购映射：user -> (items[], weights[])
rebuy_map = {}
for uid, g in rebuy.groupby('buyer_admin_id'):
    rebuy_map[int(uid)] = (g['item_id'].to_numpy('int64'),
                           g['score_rebuy'].to_numpy('float32'))

print("precomputed maps ready.")
    

precomputed maps ready.


In [7]:

# --- 快速候选构建 ---
def build_candidates_fast(uid,
                          use_rebuy=True, use_covisit=True,
                          use_cate_store=True, use_global=True):
    cand = {}

    # 路1：复购
    if use_rebuy and uid in rebuy_map:
        items, ws = rebuy_map[uid]
        for it, w in zip(items, ws):
            cand.setdefault(int(it), []).append(('rebuy', float(w)))

    # 路2：共现（最近K个起点）
    if use_covisit:
        for a in recent_map.get(uid, []):
            pair = cov_neighbors.get(int(a))
            if pair is None: 
                continue
            bs, ws = pair
            for b, w in zip(bs, ws):
                cand.setdefault(int(b), []).append(('covisit', float(w)))

    # 路3：类目/店铺热门（用户偏好槽位）
    if use_cate_store:
        for c in user_topc.get(uid, []):
            for it in cate_top.get(int(c), ()):
                cand.setdefault(int(it), []).append(('cate_hot', 1.0))
        for s in user_tops.get(uid, []):
            for it in store_top.get(int(s), ()):
                cand.setdefault(int(it), []).append(('store_hot', 1.0))

    # 路4：全局热门兜底
    if use_global:
        for it in global_items:
            cand.setdefault(int(it), []).append(('global_pop', 1.0))

    # 汇总来源与预打分
    if not cand:
        cols = ['buyer_admin_id','item_id','score_rebuy','score_covisit',
                'is_cate_hot','is_store_hot','is_global_pop','src_count','pre_score']
        return pd.DataFrame(columns=cols)

    rows = []
    for it, srcs in cand.items():
        srcset = set()
        sr = 0.0; sc = 0.0
        is_cate = 0; is_store = 0; is_glob = 0
        for tag, w in srcs:
            srcset.add(tag)
            if tag == 'rebuy': sr = max(sr, w)
            elif tag == 'covisit': sc = max(sc, w)
            elif tag == 'cate_hot': is_cate = 1
            elif tag == 'store_hot': is_store = 1
            elif tag == 'global_pop': is_glob = 1
        rows.append((int(uid), int(it), float(sr), float(sc),
                     is_cate, is_store, is_glob, len(srcset)))
    arr = np.asarray(rows, dtype=object)
    cdf = pd.DataFrame(arr, columns=[
        'buyer_admin_id','item_id','score_rebuy','score_covisit',
        'is_cate_hot','is_store_hot','is_global_pop','src_count'
    ])
    cdf['pre_score'] = (cdf['score_rebuy'] + cdf['score_covisit']
                        + 0.3*cdf['is_cate_hot'] + 0.3*cdf['is_store_hot'] + 0.1*cdf['is_global_pop'])
    return cdf.sort_values('pre_score', ascending=False).head(PARAMS['recall_cap'])
    

In [8]:

# --- 生成候选（多路 vs 仅共现，用于消融） ---
val_users = label_df['buyer_admin_id'].unique()

# 快车模式：如数据很大，先抽前 N 个用户冒烟
if FAST_MODE:
    N_SMOKE = 5000
    if len(val_users) > N_SMOKE:
        val_users = val_users[:N_SMOKE]
        print(f"FAST_MODE: only first {N_SMOKE} users for smoke test.")

multi_list, covis_list = [], []
for uid in tqdm(val_users, desc='build candidates (fast)', unit='user'):
    multi_list.append(build_candidates_fast(int(uid), True, True, True, True))
    covis_list.append(build_candidates_fast(int(uid), False, True, False, False))

cands_multi   = pd.concat(multi_list, ignore_index=True) if multi_list else                 pd.DataFrame(columns=['buyer_admin_id','item_id','score_rebuy','score_covisit','is_cate_hot','is_store_hot','is_global_pop','src_count','pre_score'])
cands_covisit = pd.concat(covis_list, ignore_index=True) if covis_list else                 pd.DataFrame(columns=['buyer_admin_id','item_id','score_rebuy','score_covisit','is_cate_hot','is_store_hot','is_global_pop','src_count','pre_score'])

# 保存候选
cands_multi.to_parquet(f'{OUTDIR}/cands_multi.parquet', index=False, compression='snappy')
cands_covisit.to_parquet(f'{OUTDIR}/cands_covisit_only.parquet', index=False, compression='snappy')

print('Saved recall artifacts to', OUTDIR)
print('multi:', cands_multi.shape, 'covisit_only:', cands_covisit.shape)
    

FAST_MODE: only first 5000 users for smoke test.


build candidates (fast): 100%|██████████| 5000/5000 [00:07<00:00, 634.89user/s]


Saved recall artifacts to ../x
multi: (2000000, 9) covisit_only: (180814, 9)
