In [1]:
# 📘 1. 导入库
import pandas as pd

# 📘 2. 定义特征构造函数
def add_behavior_features(user_df, full_log_df, end_day):
    """Attach per-user behavior-count features computed from the event log.

    Only events whose timestamp is at or before ``end_day`` are used. For every
    user present in that slice the function computes:

    * ``buy`` / ``cart`` / ``fav`` / ``pv``: event counts per behavior type;
    * ``total_act``: sum of those four counts plus ``1e-5`` (keeps the ratio
      denominators non-zero);
    * ``ratio_<behavior>``: each count divided by ``total_act``;
    * ``<behavior>_last1d`` / ``<behavior>_last3d``: counts restricted to the
      1 and 3 days leading up to ``end_day`` (window bounds inclusive).

    All four behaviors always get a column in every count table, even when a
    behavior never occurs in a window, so the output schema does not depend on
    which events happen to fall inside the window.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Must contain a ``user_id`` column; every row is kept (left join).
    full_log_df : pandas.DataFrame
        Event log with ``user_id``, ``behavior_type`` and ``timestamp`` columns.
        ``timestamp`` is parsed as epoch seconds. The frame is not modified.
    end_day : str or datetime-like
        Upper time bound, parsed with ``pd.to_datetime``; a date-only string
        therefore resolves to midnight at the start of that day.

    Returns
    -------
    pandas.DataFrame
        ``user_df`` with the feature columns appended. Every missing value in
        the result is replaced by 0, including any that were already present
        in ``user_df``'s own columns.
    """
    behaviors = ["buy", "cart", "fav", "pv"]
    end_ts = pd.to_datetime(end_day)

    # Derive event times without writing a "time" column into the caller's
    # frame: the same log object is reused for several calls.
    # NOTE(review): epoch seconds are decoded as UTC wall-clock times; confirm
    # that end_day is meant to be compared in UTC as well.
    event_time = pd.to_datetime(full_log_df["timestamp"], unit="s")
    in_scope = event_time <= end_ts
    log_df = full_log_df.loc[in_scope, ["user_id", "behavior_type"]].assign(
        time=event_time[in_scope]
    )

    def count_by_behavior(events):
        # One row per user, one column per behavior type. The four known
        # behaviors come first and are zero-filled when absent; any other
        # behavior type found in the data is kept after them.
        counts = events.groupby(["user_id", "behavior_type"]).size().unstack(fill_value=0)
        extras = [col for col in counts.columns if col not in behaviors]
        return counts.reindex(columns=behaviors + extras, fill_value=0)

    def behavior_last_n_days(n):
        # Counts for events in [end_ts - n days, end_ts].
        cutoff = end_ts - pd.Timedelta(days=n)
        recent = count_by_behavior(log_df[log_df["time"] >= cutoff])
        recent.columns = [f"{col}_last{n}d" for col in recent.columns]
        return recent.reset_index()

    agg = count_by_behavior(log_df).reset_index()
    agg["total_act"] = agg[behaviors].sum(axis=1) + 1e-5
    for col in behaviors:
        agg[f"ratio_{col}"] = agg[col] / agg["total_act"]

    last1d = behavior_last_n_days(1)
    last3d = behavior_last_n_days(3)

    # Users with no events inside a window are missing from that window's
    # table; the left joins leave NaN for them, which becomes 0.
    feat_df = agg.merge(last1d, how="left", on="user_id").merge(last3d, how="left", on="user_id")
    feat_df = feat_df.fillna(0)

    out_df = user_df.merge(feat_df, on="user_id", how="left").fillna(0)
    return out_df


In [2]:
# 📘 3. Load the full behavior log (header=None: column names are supplied here)
full_log = pd.read_csv(
    "data/UserBehavior.csv",
    header=None,
    names=["user_id", "item_id", "category_id", "behavior_type", "timestamp"],
)


In [3]:
# 📘 4. Build and save the training-set features (events up to 2017-12-01)
train_user = pd.read_csv("data/train_u.csv")
train_df = add_behavior_features(
    user_df=train_user,
    full_log_df=full_log,
    end_day="2017-12-01",
)
train_df.to_csv("data/train_f.csv", index=False)
print("✅ 训练集保存完毕")


✅ 训练集保存完毕


In [4]:
# 📘 5. Build and save the validation-set features (events up to 2017-12-02)
val_user = pd.read_csv("data/valid_u.csv")
val_df = add_behavior_features(
    user_df=val_user,
    full_log_df=full_log,
    end_day="2017-12-02",
)
val_df.to_csv("data/val_f.csv", index=False)
print("✅ 验证集保存完毕")


✅ 验证集保存完毕


In [5]:
# 📘 6. Build and save the test-set features (events up to 2017-12-03)
test_user = pd.read_csv("data/test_u.csv")
test_df = add_behavior_features(
    user_df=test_user,
    full_log_df=full_log,
    end_day="2017-12-03",
)
test_df.to_csv("data/test_f.csv", index=False)
print("✅ 测试集保存完毕")


✅ 测试集保存完毕
