# 0) 数据准备（排序 + 留一验证切分）

**目的**  
- 读取 `train/test/item_attr`
- 解析时间 & 排序：`buyer_admin_id, create_order_time, irank` 升序  
- 留一验证切分：每个用户最后一条作为 `label_df`，其余为 `train_vis`（线下统计一律基于它）  
- 将中间产物保存到 `OUTDIR`（本 notebook 中为 `../x`）

In [4]:
# 依赖
import pandas as pd, numpy as np, os, json

# Locations of the raw input CSVs.
TRAIN_CSV = '../data/Antai_hackathon_train.csv'
TEST_CSV  = '../data/dianshang_test.csv'
ATTR_CSV  = '../data/Antai_hackathon_attr.csv'

# Directory that receives the intermediate artifacts; created if it is missing.
OUTDIR = '../x'
os.makedirs(OUTDIR, exist_ok=True)

# Pin dtypes up front to avoid the memory cost of pandas' default
# int64/float64 columns. int32 for buyer_admin_id is only valid if the
# user IDs are all numeric. More columns can be added here, e.g.
# "irank": "int16".
dtype_item_attr = {"item_id": "int32"}
dtype_train = {"buyer_admin_id": "int32", **dtype_item_attr}
dtype_test = dict(dtype_train)

# Read the CSVs; the order timestamp is parsed straight into datetime
# for the two event tables.
train = pd.read_csv(TRAIN_CSV, dtype=dtype_train, parse_dates=['create_order_time'])
test = pd.read_csv(TEST_CSV, dtype=dtype_test, parse_dates=['create_order_time'])
item_attr = pd.read_csv(ATTR_CSV, dtype=dtype_item_attr)


# Column check: fail fast if an input table lacks a column that the
# later steps of this cell read. An explicit raise is used instead of
# `assert`, because assertions are skipped when Python runs with -O.
need_train = {'buyer_admin_id','item_id','create_order_time','irank'}
need_attr  = {'item_id','cate_id','store_id'}
for _tbl_name, _tbl, _required in (
    ('train', train, need_train),
    ('test', test, need_train),
    ('item_attr', item_attr, need_attr),
):
    _absent = _required - set(_tbl.columns)
    if _absent:
        # Name the offending table so the error is actionable on its own.
        raise ValueError(f'{_tbl_name} missing: {_absent}')


# Sort both event tables by buyer, then order time, then irank (all
# ascending), as the section header specifies. Without this step the
# `*_sorted.parquet` files written further down would hold unsorted rows.
_sort_keys = ['buyer_admin_id', 'create_order_time', 'irank']
train = train.sort_values(_sort_keys).reset_index(drop=True)
test = test.sort_values(_sort_keys).reset_index(drop=True)

# Leave-one-out validation split: for every buyer, the row with the
# smallest irank is held out as the label; all remaining rows form
# train_vis.
# NOTE(review): presumably irank == 1 marks each buyer's most recent
# order (rank counted backwards in time) — confirm against the dataset
# description before relying on "held-out row == last purchase".
last_idx = train.groupby('buyer_admin_id')['irank'].idxmin()
label_df = train.loc[last_idx, ['buyer_admin_id','item_id']].rename(columns={'item_id':'label_item'})
train_vis = train.drop(index=last_idx).copy()

# Persist the intermediate artifacts as parquet files under OUTDIR.
# Only the three attribute columns are kept for item_attr, with exact
# duplicate rows removed; the DataFrame index is never written.
_attr_unique = item_attr[['item_id','cate_id','store_id']].drop_duplicates()
for _out_frame, _out_path in (
    (train, f'{OUTDIR}/train_sorted.parquet'),
    (test, f'{OUTDIR}/test_sorted.parquet'),
    (_attr_unique, f'{OUTDIR}/item_attr.parquet'),
    (train_vis, f'{OUTDIR}/train_vis.parquet'),
    (label_df, f'{OUTDIR}/label_df.parquet'),
):
    _out_frame.to_parquet(_out_path, index=False)

# Report the output directory and the shape of each main frame.
print('Saved to', OUTDIR)
print(*(df.shape for df in (train, test, train_vis, label_df)))
    

Saved to ../x
(6989817, 5) (140380, 5) (6506700, 5) (483117, 2)
