In [1]:
from lifelines import CoxPHFitter
import joblib
import pandas as pd

# Load the previously fitted Cox model.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
cox_model = joblib.load("model/cox_model_f.pkl")

# Load the feature table (e.g. the train or test frame).
# NOTE(review): the file is named train_f.csv; if the Cox model was fitted on these same
# rows, the risk score may leak label information into any downstream model — confirm.
df = pd.read_csv("/root/lanyun-tmp/prediction/data/train_f.csv")

# Feature columns fed to the Cox model; they must stay identical to the ones used at
# training time. Every group below repeats the same four action types in the same order.
_actions = ("buy", "cart", "fav", "pv")
feature_cols = (
    [f"{a}_x" for a in _actions]
    + [f"{a}_y" for a in _actions]
    + ["total_act"]
    + [f"ratio_{a}" for a in _actions]
    + [f"{a}_last1d" for a in _actions]
    + [f"{a}_last3d" for a in _actions]
)

# Partial hazard predicted by the Cox model, stored as an extra column on df.
cox_inputs = df[feature_cols]
df["cox_risk_score"] = cox_model.predict_partial_hazard(cox_inputs)


In [2]:
# Add the Cox risk score to the XGBoost feature columns.
# The list is derived from feature_cols (the Cox model inputs defined in the first cell)
# instead of being retyped, so the two feature sets cannot drift apart.
xgb_feature_cols = [*feature_cols, "cox_risk_score"]

X = df[xgb_feature_cols]   # model inputs
y = df["event"]            # classification target
user_ids = df["user_id"]   # carried through the split so predictions can be tied back to users


In [4]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; stratify on the label so both splits keep the same class ratio.
X_train, X_test, y_train, y_test, train_ids, test_ids = train_test_split(
    X, y, user_ids, test_size=0.2, stratify=y, random_state=42
)

# Class-imbalance weight = negatives / positives, measured on the training split only so
# that no statistic of the hold-out rows reaches the model.
# Assumes the label is encoded as 0/1 (the sum is used as the positive count).
n_pos = int(y_train.sum())
n_neg = len(y_train) - n_pos
if n_pos == 0:
    raise ValueError(
        "y_train contains no positive samples (event == 1); cannot compute scale_pos_weight"
    )

xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=n_neg / n_pos,
    eval_metric="logloss",
    random_state=42
)
xgb_model.fit(X_train, y_train)
# Persist the fitted XGBoost model.
joblib.dump(xgb_model, "model/xgb_model_f.pkl")


['model/xgb_model_f.pkl']

In [5]:
# Probability of the positive class for every hold-out row, then a hard label at 0.5.
class_probs = xgb_model.predict_proba(X_test)
y_prob = class_probs[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

# One row per hold-out user: id, true label, predicted probability, predicted label.
result_df = pd.DataFrame({"user_id": test_ids}).assign(
    true_event=y_test.values,
    predicted_prob=y_prob,
    predicted_label=y_pred,
)
result_df.to_csv("xgb_with_cox_prediction.csv", index=False)


In [6]:

import pandas as pd

def evaluate_ranking_verbose(df, k_list=(0.01, 0.03, 0.05, 0.1), output_path="xgb_with_cox_hit.csv"):
    """Print and save Precision@K / Recall@K for the top-scored fraction of rows.

    Rows are ranked by ``predicted_prob`` (highest first). For each fraction ``k``
    the top ``floor(len(df) * k)`` rows are taken and the number of rows with
    ``true_event == 1`` among them is counted.

    Args:
        df: DataFrame with at least the columns ``predicted_prob`` and
            ``true_event``. A ``user_id`` column is optional and is only used in
            the printed preview.
        k_list: Iterable of fractions in (0, 1], e.g. ``0.05`` for the top 5%.
        output_path: CSV file the summary table is written to.

    Returns:
        None. If ``df`` is None/empty or a required column is missing, an error
        message is printed and nothing is written.
    """
    print("开始排名评估...")

    # === 0. Reject empty input ===
    if df is None or df.empty:
        print("错误：输入的 DataFrame 是空的！")
        return

    # === 1. Make sure the required columns are present ===
    required_cols = ["predicted_prob", "true_event"]
    for col in required_cols:
        if col not in df.columns:
            print(f"错误：缺失必要列 {col}")
            print(f"当前列为: {df.columns.tolist()}")
            return

    # === 2. Rank by score and print overall statistics ===
    df = df.sort_values("predicted_prob", ascending=False).reset_index(drop=True)
    # user_id is not a required column, so only preview it when it exists.
    preview_cols = [c for c in ("user_id", "true_event", "predicted_prob") if c in df.columns]
    print(f"排序后前5行：\n{df[preview_cols].head()}")

    total = len(df)
    total_event_1 = df["true_event"].sum()
    print(f"\n全部样本数: {total}")
    print(f"实际流失用户数 (event=1): {total_event_1}")

    # === 3. Top-K evaluation ===
    results = []

    for k in k_list:
        # Round before truncating so binary float error (0.29 * 100 == 28.999...) does not
        # drop a row from the cut-off or mislabel the percentage.
        top_n = int(round(total * k, 9))
        pct_label = f"{round(k * 100, 2):g}%"

        top_df = df.head(top_n)
        hit = top_df["true_event"].sum()

        precision_at_k = hit / top_n if top_n > 0 else 0
        recall_at_k = hit / total_event_1 if total_event_1 > 0 else 0

        print(f"\nTop {pct_label} ({top_n}人):")
        print(f"   ➤ 命中真实流失用户数: {hit}")
        print(f"   ➤ Precision@{pct_label}: {precision_at_k:.4f}")
        print(f"   ➤ Recall@{pct_label}:    {recall_at_k:.4f}")

        results.append({
            "Top %": pct_label,
            "Top N": top_n,
            "Hit (event=1)": hit,
            "Precision@K": precision_at_k,
            "Recall@K": recall_at_k
        })

    # === 4. Save and show the summary table ===
    result_df = pd.DataFrame(results)
    result_df.to_csv(output_path, index=False)
    print(f"\n排名评估结果保存至：{output_path}")
    print(result_df)

# Evaluate the ranking of the hold-out predictions and write the Top-K table to CSV.
evaluate_ranking_verbose(result_df, output_path="xgb_with_cox_hit.csv")

开始排名评估...
排序后前5行：
   user_id  true_event  predicted_prob
0    20353           1        0.999779
1   770445           1        0.999779
2   763252           1        0.999779
3   925522           1        0.999779
4   787738           1        0.999779

全部样本数: 197597
实际流失用户数 (event=1): 4143

Top 1% (1975人):
   ➤ 命中真实流失用户数: 1973
   ➤ Precision@1%: 0.9990
   ➤ Recall@1%:    0.4762

Top 3% (5927人):
   ➤ 命中真实流失用户数: 4143
   ➤ Precision@3%: 0.6990
   ➤ Recall@3%:    1.0000

Top 5% (9879人):
   ➤ 命中真实流失用户数: 4143
   ➤ Precision@5%: 0.4194
   ➤ Recall@5%:    1.0000

Top 10% (19759人):
   ➤ 命中真实流失用户数: 4143
   ➤ Precision@10%: 0.2097
   ➤ Recall@10%:    1.0000

排名评估结果保存至：xgb_with_cox_hit.csv
  Top %  Top N  Hit (event=1)  Precision@K  Recall@K
0    1%   1975           1973     0.998987  0.476225
1    3%   5927           4143     0.699005  1.000000
2    5%   9879           4143     0.419374  1.000000
3   10%  19759           4143     0.209677  1.000000
