# 🚀 线上推理与提交生成模块 (4_online.ipynb)

## 📋 模块功能
基于训练好的模型进行**线上推理**，为测试用户生成Top-30推荐结果并输出标准提交文件。

## 🎯 核心功能
1. **📊 全量统计重建**: 使用完整训练数据重新构建召回统计
2. **🎯 测试用户推理**: 为测试集用户生成推荐候选
3. **🤖 模型评分**: 加载训练好的排序模型进行精准打分
4. **📄 提交文件生成**: 输出符合比赛要求的提交格式

## 🔄 推理流程
1. **数据准备**: 加载全量训练数据 (`train_sorted.parquet`)
2. **统计重建**: 基于全量数据重新计算召回统计
3. **候选生成**: 为测试用户 (`test_sorted.parquet`) 生成候选
4. **模型打分**: 使用 `model_lgb.pkl` 进行排序（如可用）
5. **结果输出**: 生成Top-30推荐列表

## 🔧 输入文件
- `train_sorted.parquet`: 完整训练数据（用于统计重建）
- `test_sorted.parquet`: 测试用户数据
- `item_attr.parquet`: 商品属性数据（提供 `cate_id`、`store_id`，必需文件）
- `model_lgb.pkl`: 训练好的排序模型（可选）

## 📄 输出文件
- `submit_long.csv`: 长格式提交文件 (buyer_admin_id, item_id, score, rank)
- `submit_wide.csv`: 宽格式提交文件 (每行一个用户，30列推荐)

## ⚙️ 配置参数
- **TOPK**: 推荐商品数量 (默认30)
- **降级策略**: 模型不可用时使用 `pre_score` 排序

## 1️⃣ 环境配置与参数设置


In [7]:

# =============================================================================
# 环境配置与依赖导入
# =============================================================================
import os
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Machine-learning dependency: probe for LightGBM and record the outcome in
# HAS_LIGHTGBM so the notebook can report whether it is importable.
try:
    import lightgbm as lgb
except ImportError:
    HAS_LIGHTGBM = False
else:
    HAS_LIGHTGBM = True

print("✅ LightGBM 可用" if HAS_LIGHTGBM else "⚠️  LightGBM 不可用")

# Run configuration
OUTDIR = '../x'  # directory holding the input parquet files, model and outputs
TOPK = 30        # number of items recommended per user

# Start-up banner: data directory, Top-K size and the wall-clock start time.
_banner_lines = (
    '🚀 线上推理模块启动',
    f'📁 数据目录: {OUTDIR}',
    f'🎯 推荐数量: Top-{TOPK}',
    f'⏰ 推理时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
)
for _line in _banner_lines:
    print(_line)

# Input files: the required ones abort the run when missing, the optional
# model file only triggers a warning (scoring then falls back to pre_score).
required_files = [
    'train_sorted.parquet',
    'test_sorted.parquet',
    'item_attr.parquet',
]

optional_files = [
    'model_lgb.pkl',
]

print("\n🔍 检查数据文件...")
for name in required_files:
    if os.path.exists(f'{OUTDIR}/{name}'):
        print(f"  ✅ {name}")
    else:
        # Stop at the first missing required file.
        raise FileNotFoundError(f"❌ 缺少必要文件: {name}")

for name in optional_files:
    if not os.path.exists(f'{OUTDIR}/{name}'):
        print(f"  ⚠️  {name} (模型文件不存在，将使用pre_score)")
    else:
        print(f"  ✅ {name} (模型文件)")

print("✅ 环境配置完成")

# Load the three input tables used by every later cell.
train = pd.read_parquet(f'{OUTDIR}/train_sorted.parquet')
test = pd.read_parquet(f'{OUTDIR}/test_sorted.parquet')
item_attr = pd.read_parquet(f'{OUTDIR}/item_attr.parquet')
print('train rows:', len(train), ' test rows:', len(test))
    

✅ LightGBM 可用
🚀 线上推理模块启动
📁 数据目录: ../x
🎯 推荐数量: Top-30
⏰ 推理时间: 2025-09-26 20:25:16

🔍 检查数据文件...
  ✅ train_sorted.parquet
  ✅ test_sorted.parquet
  ✅ item_attr.parquet
  ✅ model_lgb.pkl (模型文件)
✅ 环境配置完成
train rows: 170699  test rows: 140380


In [8]:

# Recall hyper-parameters. Values marked "tuned" come from Bayesian
# optimisation according to the original notes; the others were kept as-is.
PARAMS = {
    'covisit_window': 4,        # tuned: max lag used when pairing items
    'covisit_top_per_a': 317,   # tuned: neighbours kept per anchor item
    'recent_k': 4,              # tuned: recent items per user used as anchors
    'cand_per_recent': 69,      # tuned: co-visit candidates per recent item
    'tau_days': 11,             # tuned: time-decay constant in days
    'user_top_cates': 3,        # kept at original value
    'user_top_stores': 3,       # kept at original value
    'per_cate_pool': 38,        # tuned: hot items kept per category
    'per_store_pool': 96,       # tuned: hot items kept per store
    'pop_pool': 4863,           # tuned: size of the global popularity pool
    'recall_cap': 866,          # tuned: max candidates kept per user
}
    

In [9]:

def time_decay(days, tau=14.0):
    """Return the exponential decay weight ``exp(-days / tau)``.

    Negative ages are clamped to zero first, so the weight never exceeds 1.

    Args:
        days: Scalar or array-like age in days.
        tau: Decay constant in days; converted with ``float``.

    Returns:
        The decay weight(s), with the same shape as ``days``.
    """
    age = np.maximum(days, 0.0)
    scale = float(tau)
    return np.exp(-age / scale)

def build_rebuy_scores(df, tau_days=14):
    """Aggregate a time-decayed repurchase score per (user, item).

    Each order is weighted by ``time_decay`` of its age in whole days,
    measured against the same user's latest ``create_order_time``. The
    weights are then summed per ``(buyer_admin_id, item_id)``.

    Args:
        df: Orders with ``buyer_admin_id``, ``item_id`` and a datetime
            ``create_order_time`` column.
        tau_days: Decay constant forwarded to ``time_decay``.

    Returns:
        DataFrame with columns ``buyer_admin_id``, ``item_id``,
        ``score_rebuy``.
    """
    orders = df[['buyer_admin_id', 'item_id', 'create_order_time']].copy()

    # Reference point is per user: that user's most recent order.
    latest = orders.groupby('buyer_admin_id')['create_order_time'].transform('max')
    age_days = (latest - orders['create_order_time']).dt.days.clip(lower=0)

    orders['score_rebuy'] = time_decay(age_days.to_numpy(), tau=tau_days)
    summed = orders.groupby(['buyer_admin_id', 'item_id'])['score_rebuy'].sum()
    return summed.reset_index()

def build_covisit(df, W=3, topk=200):
    """Build a directed item-to-item co-visit table from row order.

    For every lag ``1..W`` each row is paired with the item ``lag`` rows
    later within the same user (in the row order of ``df``), weighted
    ``1/lag``. Weights are summed per ``(item_a, item_b)`` and only the
    ``topk`` heaviest partners of each ``item_a`` are kept.

    Args:
        df: Orders with ``buyer_admin_id`` and ``item_id`` columns.
        W: Largest lag to pair; ``W < 1`` yields an empty table.
        topk: Maximum number of partners kept per ``item_a``.

    Returns:
        DataFrame with columns ``item_a``, ``item_b``, ``w``. Rows follow the
        ``(item_a, item_b)`` order of the groupby, NOT descending weight.
        ``item_b`` is produced by ``shift`` and is therefore float-typed.
    """
    seq = df[['buyer_admin_id', 'item_id']].copy()
    per_user_items = seq.groupby('buyer_admin_id')['item_id']

    frames = []
    for lag in range(1, W + 1):
        # Rows without a successor at this lag get NaN and are dropped.
        step = seq.assign(item_b=per_user_items.shift(-lag)).dropna()
        step = step.rename(columns={'item_id': 'item_a'}).assign(w=1.0 / lag)
        frames.append(step[['item_a', 'item_b', 'w']])

    if not frames:
        return pd.DataFrame(columns=['item_a', 'item_b', 'w'])

    stacked = pd.concat(frames, ignore_index=True)
    weights = stacked.groupby(['item_a', 'item_b'])['w'].sum().reset_index()

    # method='first' breaks ties by row position so exactly topk rows survive.
    position = weights.groupby('item_a')['w'].rank(ascending=False, method='first')
    return weights[position <= topk]

def build_pop_pools(df, item_attr, pop_pool=2000):
    """Compute popularity tables per category, per store and globally.

    Args:
        df: Orders with an ``item_id`` column; one row counts as one purchase.
        item_attr: Item attributes with ``item_id``, ``cate_id`` and
            ``store_id`` columns, left-joined onto ``df``.
        pop_pool: Number of globally most-purchased items to keep.

    Returns:
        Tuple ``(cate_pop, store_pop, global_pop)``:
        ``cate_pop`` / ``store_pop`` have columns ``<key>, item_id, pop, rn``
        where ``rn`` is the within-key popularity rank (1 = most popular,
        ties broken by row order); ``global_pop`` has ``item_id, pop`` for
        the ``pop_pool`` most popular items, sorted by descending ``pop``.
    """
    enriched = df.merge(item_attr, on='item_id', how='left')

    def ranked_counts(key):
        # Purchase count of each item inside each `key` group, plus its rank.
        counts = (enriched.groupby([key, 'item_id'])
                  .size().rename('pop').reset_index())
        counts['rn'] = counts.groupby(key)['pop'].rank(ascending=False,
                                                       method='first')
        return counts

    cate_pop = ranked_counts('cate_id')
    store_pop = ranked_counts('store_id')

    item_counts = df.groupby('item_id').size().rename('pop').reset_index()
    global_pop = item_counts.sort_values('pop', ascending=False).head(pop_pool)
    return cate_pop, store_pop, global_pop

# Rebuild every recall statistic from the full training table, using the
# tuned hyper-parameters in PARAMS.
rebuy = build_rebuy_scores(train, tau_days=PARAMS['tau_days'])
covisit = build_covisit(
    train,
    W=PARAMS['covisit_window'],
    topk=PARAMS['covisit_top_per_a'],
)
cate_pop, store_pop, global_pop = build_pop_pools(
    train,
    item_attr,
    pop_pool=PARAMS['pop_pool'],
)
print('full-train stats built.')
    

full-train stats built.


In [10]:

P = PARAMS

# item_a -> (neighbour item ids, co-visit weights), limited to the
# `cand_per_recent` heaviest neighbours.
# `covisit` arrives ordered by (item_a, item_b), so a plain head() would keep
# the lowest item ids rather than the strongest neighbours; sort by weight
# first. A stable sort keeps ties deterministic.
# NOTE(review): the offline/training candidate builder is not visible here --
# confirm it applies the same ordering so train and serve candidates match.
cov_neighbors = {}
for a, g in covisit.groupby('item_a'):
    top = (g.sort_values('w', ascending=False, kind='mergesort')
           .head(P['cand_per_recent']))
    if len(top):
        cov_neighbors[int(a)] = (top['item_b'].to_numpy('int64'),
                                 top['w'].to_numpy('float32'))

# user -> ids of that user's `recent_k` latest items in the test history.
recent_map = (test.sort_values('create_order_time')
              .groupby('buyer_admin_id')['item_id']
              .apply(lambda s: s.tail(P['recent_k']).to_numpy('int64'))
              ).to_dict()

# user -> most frequent categories / stores in the test history
# (value_counts ignores items without attributes after the left join).
ua = test.merge(item_attr, on='item_id', how='left')
user_topc = (ua.groupby('buyer_admin_id')['cate_id']
             .apply(lambda s: s.value_counts().head(P['user_top_cates'])
                    .index.to_numpy('int64'))
             .to_dict())
user_tops = (ua.groupby('buyer_admin_id')['store_id']
             .apply(lambda s: s.value_counts().head(P['user_top_stores'])
                    .index.to_numpy('int64'))
             .to_dict())

# category / store -> their hottest items according to the train-side ranks.
cate_top = {int(c): grp.loc[grp['rn'] <= P['per_cate_pool'], 'item_id'].to_numpy('int64')
            for c, grp in cate_pop.groupby('cate_id')}
store_top = {int(s): grp.loc[grp['rn'] <= P['per_store_pool'], 'item_id'].to_numpy('int64')
             for s, grp in store_pop.groupby('store_id')}
global_items = global_pop['item_id'].to_numpy('int64')

# user -> (item ids, repurchase scores).
# NOTE(review): `rebuy` is built from `train`, so only users present in the
# training table get entries -- confirm test users overlap with train users.
rebuy_map = {}
for uid, g in rebuy.groupby('buyer_admin_id'):
    rebuy_map[int(uid)] = (g['item_id'].to_numpy('int64'),
                           g['score_rebuy'].to_numpy('float32'))
print('precomputed maps ready (online).')
    

precomputed maps ready (online).


In [11]:

def build_candidates_fast(uid,
                          use_rebuy=True, use_covisit=True,
                          use_cate_store=True, use_global=True):
    """Collect recall candidates for one user from the precomputed maps.

    Sources, each switchable through its flag:
      * rebuy      -- items in ``rebuy_map[uid]`` with their repurchase score
      * covisit    -- co-visit neighbours of the user's recent items
      * cate/store -- hot items of the user's top categories and stores
      * global     -- the global popularity pool

    For items reached several times, ``score_rebuy`` / ``score_covisit`` hold
    the maximum weight seen and ``src_count`` is the number of distinct
    sources. ``pre_score`` is ``score_rebuy + score_covisit
    + 0.3*is_cate_hot + 0.3*is_store_hot + 0.1*is_global_pop``.

    Args:
        uid: User id used as key into the module-level lookup maps.
        use_rebuy, use_covisit, use_cate_store, use_global: Source switches.

    Returns:
        DataFrame with columns ``buyer_admin_id, item_id, score_rebuy,
        score_covisit, is_cate_hot, is_store_hot, is_global_pop, src_count,
        pre_score``, sorted by descending ``pre_score`` and truncated to
        ``PARAMS['recall_cap']`` rows; an empty frame with those columns when
        no source produced anything.
    """
    # Slot layout of the per-item accumulator.
    SR, SC, CATE, STORE, GLOB, HIT_REBUY, HIT_COVISIT = range(7)
    pool = {}

    def slot(item):
        # First touch fixes the item's position in the output row order.
        return pool.setdefault(int(item), [0.0, 0.0, 0, 0, 0, 0, 0])

    if use_rebuy and uid in rebuy_map:
        for item, weight in zip(*rebuy_map[uid]):
            entry = slot(item)
            entry[SR] = max(entry[SR], float(weight))
            entry[HIT_REBUY] = 1

    if use_covisit:
        for anchor in recent_map.get(uid, []):
            neighbours = cov_neighbors.get(int(anchor))
            if neighbours is None:
                continue
            for item, weight in zip(*neighbours):
                entry = slot(item)
                entry[SC] = max(entry[SC], float(weight))
                entry[HIT_COVISIT] = 1

    if use_cate_store:
        for cate in user_topc.get(uid, []):
            for item in cate_top.get(int(cate), ()):
                slot(item)[CATE] = 1
        for store in user_tops.get(uid, []):
            for item in store_top.get(int(store), ()):
                slot(item)[STORE] = 1

    if use_global:
        for item in global_items:
            slot(item)[GLOB] = 1

    columns = ['buyer_admin_id', 'item_id', 'score_rebuy', 'score_covisit',
               'is_cate_hot', 'is_store_hot', 'is_global_pop', 'src_count']
    if not pool:
        return pd.DataFrame(columns=columns + ['pre_score'])

    user_id = int(uid)
    records = [
        (user_id, item, e[SR], e[SC], e[CATE], e[STORE], e[GLOB],
         # distinct sources = one per source that touched this item
         e[HIT_REBUY] + e[HIT_COVISIT] + e[CATE] + e[STORE] + e[GLOB])
        for item, e in pool.items()
    ]
    df = pd.DataFrame(records, columns=columns)
    df['pre_score'] = (df['score_rebuy'] + df['score_covisit']
                       + 0.3*df['is_cate_hot'] + 0.3*df['is_store_hot'] + 0.1*df['is_global_pop'])
    ranked = df.sort_values('pre_score', ascending=False)
    return ranked.head(PARAMS['recall_cap'])
    

In [12]:

# === Add train-consistent features + load the model (fall back to pre_score
#     when no model can be loaded) ===
import os

model = None
model_path = os.path.join(OUTDIR, 'model_lgb.pkl')
try:
    import joblib
except ImportError:
    print('joblib not available; fallback to pre_score.')
else:
    load_failed = False
    if os.path.exists(model_path):
        try:
            # NOTE: joblib.load unpickles arbitrary objects -- only load model
            # files produced by this project's own training step.
            model = joblib.load(model_path)
        except Exception as exc:
            # Report the real cause (corrupt file, version mismatch, ...)
            # instead of blaming a missing joblib, then degrade gracefully.
            load_failed = True
            print(f'Failed to load {model_path}: {exc!r}; fallback to pre_score.')
    if not load_failed:
        print('Loaded model:', model_path if model is not None else 'None (fallback to pre_score)')

# Feature columns passed to the model, in this order.
feat_cols = ['score_rebuy','score_covisit','is_cate_hot','is_store_hot','is_global_pop',
             'src_count','user_hist_cnt','item_pop_cnt','pre_score']

# Row counts per user and per item in the full training table; joined onto
# the candidates as `user_hist_cnt` / `item_pop_cnt`.
user_cnt_full = train.groupby('buyer_admin_id').size().rename('user_hist_cnt')
item_cnt_full = train.groupby('item_id').size().rename('item_pop_cnt')

def score_dataframe(cdf):
    """Attach a ranking ``score`` to one user's candidate frame.

    The candidates are left-joined with ``user_cnt_full`` and
    ``item_cnt_full`` (missing counts become 0) and scored on ``feat_cols``:

      * model with ``predict_proba`` -> second column of its output;
      * model with only ``predict`` (e.g. a raw LightGBM ``Booster`` or a
        ranker) -> its ``predict`` output, so a loaded model is no longer
        silently ignored;
      * no usable model -> ``pre_score``.

    Args:
        cdf: Candidate frame as returned by ``build_candidates_fast``.

    Returns:
        DataFrame with columns ``buyer_admin_id``, ``item_id``, ``score``.
    """
    feats = (cdf.merge(user_cnt_full, on='buyer_admin_id', how='left')
                .merge(item_cnt_full, on='item_id', how='left')
                .fillna(0))

    if model is not None and hasattr(model, 'predict_proba'):
        feats['score'] = model.predict_proba(feats[feat_cols])[:, 1]
    elif model is not None and hasattr(model, 'predict'):
        # NOTE(review): assumes higher predict() output means more relevant,
        # as for LightGBM boosters/rankers -- confirm for other model types.
        feats['score'] = model.predict(feats[feat_cols])
    else:
        feats['score'] = feats['pre_score']
    return feats[['buyer_admin_id', 'item_id', 'score']]
    

Loaded model: ../x/model_lgb.pkl


In [None]:

# === Build the Top-30 submission ===
# Per the competition rules: predict the Top-30 items for each user's last
# purchase record.

print("🎯 开始为每个用户的最后一条购买记录生成推荐...")

# One row per user: the chronologically last purchase in the test table.
test['create_order_time'] = pd.to_datetime(test['create_order_time'])
last_purchases = (test.sort_values('create_order_time')
                      .groupby('buyer_admin_id')
                      .tail(1))
print(f"📊 需要预测的用户数: {len(last_purchases)}")

rows = []
processed_users = 0

# Only the user id of each last-purchase row is needed.
for raw_uid in last_purchases['buyer_admin_id']:
    uid = int(raw_uid)

    # Recall candidates from all four sources.
    cdf = build_candidates_fast(uid, True, True, True, True)
    if not len(cdf):
        print(f"⚠️ 用户 {uid} 没有候选商品")
        continue

    # Score, keep the TOPK best and number them 1..k.
    scored = score_dataframe(cdf).sort_values('score', ascending=False)
    sdf = scored.head(TOPK)
    sdf = sdf.assign(rank=np.arange(1, len(sdf) + 1))

    rows.append(sdf)
    processed_users += 1

    if processed_users % 1000 == 0:
        print(f"✅ 已处理 {processed_users}/{len(last_purchases)} 用户")

print(f"✅ 完成推荐生成，共处理 {processed_users} 个用户")

# Stack the per-user frames into the long-format submission.
if rows:
    submit_long = pd.concat(rows, ignore_index=True)
else:
    submit_long = pd.DataFrame(columns=['buyer_admin_id','item_id','score','rank'])

print(f"📊 最终提交数据: {len(submit_long)} 条推荐记录")

# Convert the long-format recommendations to one row per user.
def to_wide(df, topk=TOPK):
    """Pivot ranked recommendations into ``item_1 .. item_<topk>`` columns.

    Args:
        df: Frame with ``buyer_admin_id``, ``item_id`` and ``rank`` columns.
        topk: Number of item columns to emit per user.

    Returns:
        DataFrame with ``buyer_admin_id`` followed by ``topk`` item columns,
        filled in ``rank`` order; users with fewer than ``topk`` items are
        padded with ``None`` and longer lists are cut off.
    """
    ordered = df.sort_values(['buyer_admin_id', 'rank'])
    per_user = ordered.groupby('buyer_admin_id')['item_id'].apply(list).reset_index()

    def fixed_length(seq):
        # Pad on the right, then cut to exactly `topk` entries.
        padded = seq + [None] * topk
        return padded[:topk]

    per_user['item_list'] = per_user['item_id'].apply(fixed_length)
    wide = per_user[['buyer_admin_id', 'item_list']].copy()
    for pos in range(topk):
        wide[f'item_{pos + 1}'] = wide['item_list'].apply(lambda seq, pos=pos: seq[pos])
    return wide.drop(columns=['item_list'])

# Wide-format submission; an empty long table yields an empty frame that
# still carries the expected header.
if len(submit_long):
    submit_wide = to_wide(submit_long[['buyer_admin_id','item_id','rank']], TOPK)
else:
    submit_wide = pd.DataFrame(
        columns=['buyer_admin_id'] + [f'item_{i}' for i in range(1, TOPK+1)])

# Persist both formats next to the input data.
long_path = f'{OUTDIR}/submit_long.csv'
wide_path = f'{OUTDIR}/submit_wide.csv'
submit_long.to_csv(long_path, index=False)
submit_wide.to_csv(wide_path, index=False)

print('💾 文件已保存:')
print(f'- {long_path} ({len(submit_long)} 条记录)')
print(f'- {wide_path} ({len(submit_wide)} 用户)')

# Sanity figures: user count, mean list length and distinct recommended items.
user_count = submit_wide['buyer_admin_id'].nunique()
mean_recs = submit_long.groupby('buyer_admin_id').size().mean()
distinct_items = submit_long['item_id'].nunique()
print(f"\n📊 提交结果验证:")
print(f"- 用户数: {user_count}")
print(f"- 平均每用户推荐数: {mean_recs:.1f}")
print(f"- 推荐商品总数: {distinct_items}")
    

python(19988) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Saved:
- ../x/submit_long.csv
- ../x/submit_wide.csv
