In [1]:
# === 导入库 ===
import pandas as pd
import joblib
from lifelines import CoxPHFitter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm
import time


In [None]:

# === 设置路径 ===
train_path = "data/train_u.csv"
val_path = "data/valid_u.csv"
test_path = "data/test_u.csv"
model_path = "cox_model.pkl"

# === 加载数据 ===
train = pd.read_csv(train_path)
val = pd.read_csv(val_path)
test = pd.read_csv(test_path)

# === 特征列（保留所有特征）===
drop_cols = ["user_id", "event", "duration"]
feature_cols = [col for col in train.columns if col not in drop_cols]


COX:

In [9]:
print("开始训练 Cox 模型...")

cph = CoxPHFitter()
cph.fit(
    train[feature_cols + ["duration", "event"]],
    duration_col="duration",
    event_col="event",
    show_progress=True
)

cph.print_summary()
joblib.dump(cph, model_path)
print(f"模型已保存至 {model_path}")


开始训练 Cox 模型...
Iteration 1: norm_delta = 6.87e-01, step_size = 0.9500, log_lik = -285055.18108, newton_decrement = 7.18e+03, seconds_since_start = 6.6
Iteration 2: norm_delta = 9.38e-01, step_size = 0.9500, log_lik = -275859.17135, newton_decrement = 4.28e+03, seconds_since_start = 13.2
Iteration 3: norm_delta = 9.60e-01, step_size = 0.9500, log_lik = -270559.81926, newton_decrement = 1.64e+03, seconds_since_start = 19.7
Iteration 4: norm_delta = 5.72e-01, step_size = 0.9310, log_lik = -268665.96352, newton_decrement = 2.88e+02, seconds_since_start = 26.1
Iteration 5: norm_delta = 1.57e-01, step_size = 0.9124, log_lik = -268359.56028, newton_decrement = 1.56e+01, seconds_since_start = 32.6
Iteration 6: norm_delta = 6.28e-03, step_size = 1.0000, log_lik = -268343.54304, newton_decrement = 2.21e-02, seconds_since_start = 39.0
Iteration 7: norm_delta = 9.10e-06, step_size = 1.0000, log_lik = -268343.52096, newton_decrement = 4.52e-08, seconds_since_start = 45.5
Convergence success after 7

0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'event'
baseline estimation,breslow
number of observations,987984
number of events observed,20716
partial log-likelihood,-268343.52
time fit was run,2025-06-07 20:51:35 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
buy,-0.36,0.7,0.01,-0.38,-0.35,0.68,0.71,0.0,-42.25,<0.005,inf
cart,-0.07,0.93,0.0,-0.08,-0.07,0.92,0.93,0.0,-21.17,<0.005,328.02
fav,-0.03,0.97,0.0,-0.03,-0.02,0.97,0.98,0.0,-7.81,<0.005,47.29
pv,-0.04,0.96,0.0,-0.05,-0.04,0.96,0.96,0.0,-90.13,<0.005,inf

0,1
Concordance,0.84
Partial AIC,536695.04
log-likelihood ratio test,33423.32 on 4 df
-log2(p) of ll-ratio test,inf


模型已保存至 cox_model.pkl


In [10]:
# === 加载模型（用于预测阶段）===
cph = joblib.load(model_path)

# === 生存预测函数（含 tqdm）===
def predict_survival_with_tqdm(model, df, features, times=[1]):
    preds = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="生存函数预测"):
        row_df = pd.DataFrame([row[features].values], columns=features)
        surv = model.predict_survival_function(row_df, times=times)
        preds.append(surv.values.flatten())
    return pd.DataFrame(preds, columns=[f"day_{t}" for t in times], index=df.index).T

# === 开始预测 ===
val_surv = predict_survival_with_tqdm(cph, val, feature_cols, times=[1])
test_surv = predict_survival_with_tqdm(cph, test, feature_cols, times=[1])


生存函数预测: 100%|██████████| 987992/987992 [57:07<00:00, 288.29it/s] 
生存函数预测: 100%|██████████| 987994/987994 [57:29<00:00, 286.46it/s] 


In [12]:
# === 二分类评估函数 ===
def evaluate_and_export(surv_df, data, day=1, threshold=0.5, dataset_name="Val"):
    prob_churn = 1 - surv_df.loc[f"day_{day}"]
    y_pred = (prob_churn > threshold).astype(int)
    y_true = data["event"].values

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, prob_churn)

    print(f"\n{dataset_name} @ day={day} 的评估结果：")
    print(f"- Accuracy:  {acc:.4f}")
    print(f"- Precision: {prec:.4f}")
    print(f"- Recall:    {rec:.4f}")
    print(f"- F1 Score:  {f1:.4f}")
    print(f"- AUC:       {auc:.4f}")

    result = pd.DataFrame({
        "user_id": data["user_id"],
        "true_event": y_true,
        "predicted_prob": prob_churn.values,
        "predicted_event": y_pred.values
    })
    result.to_csv(f"cox_prediction_{dataset_name.lower()}_day{day}.csv", index=False)
    print(f"已保存预测结果至 cox_prediction_{dataset_name.lower()}_day{day}.csv")

# === 输出验证集与测试集评估结果 ===
evaluate_and_export(val_surv, val, day=1, threshold=0.05, dataset_name="Val")
evaluate_and_export(test_surv, test, day=1, threshold=0.05, dataset_name="Test")



Val @ day=1 的评估结果：
- Accuracy:  0.9585
- Precision: 0.0307
- Recall:    0.2663
- F1 Score:  0.0551
- AUC:       0.8124
已保存预测结果至 cox_prediction_val_day1.csv

Test @ day=1 的评估结果：
- Accuracy:  0.9786
- Precision: 0.0247
- Recall:    0.2282
- F1 Score:  0.0447
- AUC:       0.8202
已保存预测结果至 cox_prediction_test_day1.csv


COX（用户行为特征）：

In [22]:
def add_behavior_features(user_df, full_log_df, end_day):
    """
    user_df: 当前用于训练/验证/测试的样本（包含 user_id、event、duration）
    full_log_df: 原始行为日志（包含 user_id, behavior_type, timestamp）
    end_day: 样本构造的结束时间（如 "2017-12-01"）
    """
    # 转换时间
    full_log_df["time"] = pd.to_datetime(full_log_df["timestamp"], unit="s")
    full_log_df = full_log_df[full_log_df["time"] <= pd.to_datetime(end_day)]

    # == 基础行为数特征 ==
    agg = full_log_df.groupby(["user_id", "behavior_type"]).size().unstack(fill_value=0)
    for b in ["buy", "cart", "fav", "pv"]:
        if b not in agg.columns:
            agg[b] = 0
    agg = agg.reset_index()

    # == 行为比例特征 ==
    agg["total_act"] = agg[["buy", "cart", "fav", "pv"]].sum(axis=1) + 1e-5  # 避免除以0
    for col in ["buy", "cart", "fav", "pv"]:
        agg[f"ratio_{col}"] = agg[col] / agg["total_act"]

    # == 滑窗行为特征（近1天、3天）==
    def behavior_last_n_days(n):
        cutoff = pd.to_datetime(end_day) - pd.Timedelta(days=n)
        df_n = full_log_df[full_log_df["time"] >= cutoff]
        df_n = df_n.groupby(["user_id", "behavior_type"]).size().unstack(fill_value=0)
        df_n.columns = [f"{col}_last{n}d" for col in df_n.columns]
        return df_n.reset_index()

    last1d = behavior_last_n_days(1)
    last3d = behavior_last_n_days(3)

    # 合并所有特征
    feat_df = agg.merge(last1d, how="left", on="user_id").merge(last3d, how="left", on="user_id")
    feat_df = feat_df.fillna(0)

    # 合并到 user_df
    out_df = user_df.merge(feat_df, on="user_id", how="left").fillna(0)
    return out_df


随机森林：

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [14]:
# 2. 提取特征列和标签
drop_cols = ["user_id", "event", "duration"]
feature_cols = [col for col in train.columns if col not in drop_cols]

X_train, y_train = train[feature_cols], train["event"]
X_val, y_val = val[feature_cols], val["event"]
X_test, y_test = test[feature_cols], test["event"]

print("特征列:", feature_cols)

特征列: ['buy', 'cart', 'fav', 'pv']


In [15]:
# 3. 训练随机森林模型
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# 4. 定义评估函数
def evaluate_rf(model, X, y, dataset_name="Val"):
    y_prob = model.predict_proba(X)[:, 1]
    y_pred = (y_prob > 0.5).astype(int)

    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred, zero_division=0)
    rec = recall_score(y, y_pred, zero_division=0)
    f1 = f1_score(y, y_pred, zero_division=0)
    auc = roc_auc_score(y, y_prob)

    print(f"\n【{dataset_name} 评估结果】")
    print(f"- Accuracy:  {acc:.4f}")
    print(f"- Precision: {prec:.4f}")
    print(f"- Recall:    {rec:.4f}")
    print(f"- F1 Score:  {f1:.4f}")
    print(f"- AUC:       {auc:.4f}")
    
    return y_prob, y_pred


In [21]:
# 5. 在验证集和测试集上评估模型
val_prob, val_pred = evaluate_rf(rf_model, X_val, y_val, dataset_name="Val")
test_prob, test_pred = evaluate_rf(rf_model, X_test, y_test, dataset_name="Test")



【Val 评估结果】
- Accuracy:  0.8260
- Precision: 0.0160
- Recall:    0.6154
- F1 Score:  0.0312
- AUC:       0.8091

【Test 评估结果】
- Accuracy:  0.8752
- Precision: 0.0099
- Recall:    0.5625
- F1 Score:  0.0194
- AUC:       0.8155


In [18]:
# 6. 导出预测结果（可选）
val_result = pd.DataFrame({
    "user_id": val["user_id"],
    "true_event": y_val,
    "predicted_prob": val_prob,
    "predicted_event": val_pred
})
val_result.to_csv("rf_prediction_val.csv", index=False)

test_result = pd.DataFrame({
    "user_id": test["user_id"],
    "true_event": y_test,
    "predicted_prob": test_prob,
    "predicted_event": test_pred
})
test_result.to_csv("rf_prediction_test.csv", index=False)

print("已保存预测结果至 rf_prediction_val.csv 与 rf_prediction_test.csv")


已保存预测结果至 rf_prediction_val.csv 与 rf_prediction_test.csv
