In [1]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
# 2. Load the training data and work out which columns are model features.
train_df = pd.read_csv("data/train_f.csv")

# The identifier and the duration / event columns are never used as features.
drop_cols = ["user_id", "duration", "event"]
feature_cols = list(filter(lambda name: name not in drop_cols, train_df.columns))


In [11]:
# 3. Train the Cox model
print("开始训练 Cox 模型...")

# penalizer=0.01 is the regularisation strength; different L2 strengths can be tried.
# The frame handed to fit() holds the feature columns plus the duration and event columns.
cph = CoxPHFitter(penalizer=0.01).fit(
    train_df[[*feature_cols, "duration", "event"]],
    duration_col="duration",
    event_col="event",
    show_progress=True,
)

# Show the coefficient table, then persist the fitted model to disk.
cph.print_summary()
joblib.dump(cph, "cox_model_f.pkl")
print("模型已保存至 cox_model_f.pkl")


开始训练 Cox 模型...
Iteration 1: norm_delta = 4.46e-01, step_size = 0.9500, log_lik = -285055.18108, newton_decrement = 7.96e+03, seconds_since_start = 0.4
Iteration 2: norm_delta = 4.13e-01, step_size = 0.9500, log_lik = -275244.54418, newton_decrement = 3.77e+03, seconds_since_start = 0.7
Iteration 3: norm_delta = 2.08e-01, step_size = 0.9500, log_lik = -270857.13907, newton_decrement = 5.85e+02, seconds_since_start = 1.1
Iteration 4: norm_delta = 2.55e-02, step_size = 1.0000, log_lik = -270231.11714, newton_decrement = 7.64e+00, seconds_since_start = 1.5
Iteration 5: norm_delta = 3.09e-04, step_size = 1.0000, log_lik = -270223.41604, newton_decrement = 1.12e-03, seconds_since_start = 1.9
Iteration 6: norm_delta = 4.78e-08, step_size = 1.0000, log_lik = -270223.41492, newton_decrement = 2.67e-11, seconds_since_start = 2.3
Convergence success after 6 iterations.


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'event'
penalizer,0.01
l1 ratio,0.0
baseline estimation,breslow
number of observations,987984
number of events observed,20716
partial log-likelihood,-270223.41
time fit was run,2025-06-08 03:30:55 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
buy_x,-0.09,0.91,0.0,-0.1,-0.09,0.9,0.92,0.0,-25.07,<0.005,458.28
cart_x,-0.03,0.97,0.0,-0.03,-0.03,0.97,0.98,0.0,-18.94,<0.005,263.38
fav_x,-0.01,0.99,0.0,-0.01,-0.01,0.99,0.99,0.0,-7.83,<0.005,47.52
pv_x,-0.0,1.0,0.0,-0.01,-0.0,0.99,1.0,0.0,-34.45,<0.005,861.34
buy_y,-0.05,0.95,0.0,-0.06,-0.04,0.94,0.96,0.0,-12.39,<0.005,114.71
cart_y,-0.01,0.99,0.0,-0.01,-0.01,0.99,0.99,0.0,-5.62,<0.005,25.68
fav_y,-0.0,1.0,0.0,-0.01,0.0,0.99,1.0,0.0,-1.88,0.06,4.07
pv_y,-0.0,1.0,0.0,-0.0,-0.0,1.0,1.0,0.0,-17.36,<0.005,221.91
total_act,-0.0,1.0,0.0,-0.0,-0.0,1.0,1.0,0.0,-16.97,<0.005,212.07
ratio_buy,0.89,2.45,0.07,0.75,1.04,2.11,2.83,0.0,11.99,<0.005,107.6

0,1
Concordance,0.91
Partial AIC,540488.83
log-likelihood ratio test,29663.53 on 21 df
-log2(p) of ll-ratio test,inf


模型已保存至 cox_model_f.pkl


In [None]:
# 3. Load the saved model (the validation / test sets are read in the next cell).
# NOTE(review): joblib.load unpickles the file, and unpickling can run arbitrary code --
# only load a cox_model_f.pkl that was produced by a trusted run of this notebook.
cph = joblib.load("cox_model_f.pkl")


In [6]:

# Read the validation and test splits.
val_df = pd.read_csv("data/val_f.csv")
test_df = pd.read_csv("data/test_f.csv")

# Feature list = every validation-frame column except the identifier and the
# duration / event columns.
drop_cols = ["user_id", "duration", "event"]
feature_cols = list(filter(lambda name: name not in drop_cols, val_df.columns))



In [16]:
def evaluate_predictions(prob, true_event, user_ids, threshold=0.05, dataset_name="Val"):
    """Binarise predicted probabilities at ``threshold`` and report classification metrics.

    Prints accuracy, precision, recall, F1 and ROC AUC, then returns the
    per-sample predictions.

    Parameters
    ----------
    prob : array-like of float
        Predicted probability of the positive class (event = 1), one per sample.
    true_event : array-like
        Observed labels, aligned position-by-position with ``prob``.
    user_ids : array-like
        Identifier of each sample; only copied into the returned frame.
    threshold : float, default 0.05
        A sample is predicted positive when its probability is strictly
        greater than this value.
    dataset_name : str, default "Val"
        Label shown in the printed header.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``true_event``, ``predicted_prob`` and
        ``predicted_event``.
    """
    # Coerce to NumPy arrays so plain lists work with the comparison below and
    # pandas inputs are combined by position instead of by index label when the
    # result frame is built.
    prob = np.asarray(prob)
    true_event = np.asarray(true_event)
    user_ids = np.asarray(user_ids)

    pred = (prob > threshold).astype(int)

    acc = accuracy_score(true_event, pred)
    prec = precision_score(true_event, pred, zero_division=0)
    rec = recall_score(true_event, pred, zero_division=0)
    f1 = f1_score(true_event, pred, zero_division=0)

    # ROC AUC is undefined when the labels contain a single class. Report NaN in
    # that case so the remaining metrics are still printed.
    # NOTE(review): scikit-learn is understood to raise ValueError for
    # single-class labels in at least some releases -- confirm for the installed version.
    if np.unique(true_event).size < 2:
        auc = float("nan")
    else:
        auc = roc_auc_score(true_event, prob)

    print(f"\n【{dataset_name} 评估结果 @ threshold={threshold}】")
    print(f"- Accuracy:  {acc:.4f}")
    print(f"- Precision: {prec:.4f}")
    print(f"- Recall:    {rec:.4f}")
    print(f"- F1 Score:  {f1:.4f}")
    print(f"- AUC:       {auc:.4f}")

    return pd.DataFrame({
        "user_id": user_ids,
        "true_event": true_event,
        "predicted_prob": prob,
        "predicted_event": pred
    })


In [17]:
# 5. Predict the survival function, evaluated at day 1 only.
# Churn probability = 1 - survival probability.
val_surv = cph.predict_survival_function(val_df[feature_cols], times=[1])
val_prob = 1 - val_surv.loc[1].values

test_surv = cph.predict_survival_function(test_df[feature_cols], times=[1])
test_prob = 1 - test_surv.loc[1].values


In [19]:
# Sweep candidate decision thresholds on the validation set. Only the printed
# metrics matter here; the frame returned by each call is discarded.
for t in (0.005, 0.01, 0.015, 0.02, 0.03, 0.04, 0.05):
    evaluate_predictions(
        val_prob,
        val_df["event"].values,
        val_df["user_id"].values,
        threshold=t,
        dataset_name=f"Val@{t}",
    )



【Val@0.005 评估结果 @ threshold=0.005】
- Accuracy:  0.4058
- Precision: 0.0073
- Recall:    0.9591
- F1 Score:  0.0145
- AUC:       0.8171

【Val@0.01 评估结果 @ threshold=0.01】
- Accuracy:  0.5480
- Precision: 0.0090
- Recall:    0.9028
- F1 Score:  0.0178
- AUC:       0.8171

【Val@0.015 评估结果 @ threshold=0.015】
- Accuracy:  0.6519
- Precision: 0.0107
- Recall:    0.8254
- F1 Score:  0.0211
- AUC:       0.8171

【Val@0.02 评估结果 @ threshold=0.02】
- Accuracy:  0.7373
- Precision: 0.0126
- Recall:    0.7337
- F1 Score:  0.0248
- AUC:       0.8171

【Val@0.03 评估结果 @ threshold=0.03】
- Accuracy:  0.8728
- Precision: 0.0178
- Recall:    0.4983
- F1 Score:  0.0344
- AUC:       0.8171

【Val@0.04 评估结果 @ threshold=0.04】
- Accuracy:  0.9725
- Precision: 0.0399
- Recall:    0.2189
- F1 Score:  0.0675
- AUC:       0.8171

【Val@0.05 评估结果 @ threshold=0.05】
- Accuracy:  0.9906
- Precision: 0.0497
- Recall:    0.0585
- F1 Score:  0.0538
- AUC:       0.8171


In [22]:
# Score the validation set at threshold 0.04 and save the per-user predictions.
val_result = evaluate_predictions(
    prob=val_prob,
    true_event=val_df["event"].values,
    user_ids=val_df["user_id"].values,
    threshold=0.04,
    dataset_name="Val",
)
val_result.to_csv("cox_prediction_val_f0.04.csv", index=False)



【Val 评估结果 @ threshold=0.04】
- Accuracy:  0.9725
- Precision: 0.0399
- Recall:    0.2189
- F1 Score:  0.0675
- AUC:       0.8171


In [24]:
# Score the test set at the same 0.04 threshold and save the per-user predictions.
test_result = evaluate_predictions(
    prob=test_prob,
    true_event=test_df["event"].values,
    user_ids=test_df["user_id"].values,
    threshold=0.04,
    dataset_name="Test",
)
test_result.to_csv("cox_prediction_test_f0.04.csv", index=False)


【Test 评估结果 @ threshold=0.04】
- Accuracy:  0.9919
- Precision: 0.0636
- Recall:    0.1955
- F1 Score:  0.0959
- AUC:       0.8453


随机森林：

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)


In [17]:
# Input columns for the random forest, listed one per line.
# NOTE(review): the hold-out scores printed later in this notebook (recall 1.0000,
# AUC 0.9991) are unusually high -- check that none of these columns is derived
# from the label. TODO confirm.
feature_cols = [
    "buy_x",
    "cart_x",
    "fav_x",
    "pv_x",
    "buy_y",
    "cart_y",
    "fav_y",
    "pv_y",
    "total_act",
    "ratio_buy",
    "ratio_cart",
    "ratio_fav",
    "ratio_pv",
    "buy_last1d",
    "cart_last1d",
    "fav_last1d",
    "pv_last1d",
    "buy_last3d",
    "cart_last3d",
    "fav_last3d",
    "pv_last3d",
]
target_col = "event"

# Feature matrix, label vector and matching user ids, all taken from the training frame.
user_ids = train_df.loc[:, "user_id"]
y = train_df.loc[:, target_col]
X = train_df.loc[:, feature_cols]


In [18]:
# 80/20 hold-out split, stratified on the label and seeded so it is repeatable.
# The user ids are split together with X and y so each prediction can be traced
# back to its user.
X_train, X_test, y_train, y_test, train_ids, test_ids = train_test_split(
    X,
    y,
    user_ids,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [19]:
# Random forest: 100 trees capped at depth 10, fixed seed, all CPU cores.
# class_weight="balanced" is used because events are rare (the Cox summary
# earlier in this notebook reports 20716 events in 987984 observations).
rf_model = RandomForestClassifier(
    class_weight="balanced",
    max_depth=10,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)

# Kept as the cell's final expression so the fitted estimator is displayed.
rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
# Second column of predict_proba is taken as the event probability
# (presumably class 1 -- verify against rf_model.classes_); label positive at >= 0.5.
y_prob = rf_model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

# Per-user hold-out predictions, written to disk for the ranking evaluation.
result_df = pd.DataFrame(
    dict(
        user_id=test_ids,
        true_event=y_test.values,
        predicted_prob=y_prob,
        predicted_label=y_pred,
    )
)

result_df.to_csv("rf_prediction_with_prob.csv", index=False)


In [24]:
# Hold-out metrics for the random forest at the 0.5 cut-off.
# zero_division=0 is passed to precision, recall and F1 alike, matching
# evaluate_predictions, so a degenerate split yields 0 instead of a warning
# on only some of the metrics.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
# AUC is computed from the probabilities, not the thresholded labels.
auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC:       {auc:.4f}")


Accuracy:  0.9723
Precision: 0.4311
Recall:    1.0000
F1 Score:  0.6025
AUC:       0.9991


In [27]:
import pandas as pd

def evaluate_ranking_verbose(df, k_list=(0.01, 0.03, 0.05, 0.1), output_path="rf_hit.csv"):
    """Report precision and recall among the top-K fraction of highest-scored rows.

    Rows are ranked by ``predicted_prob`` in descending order. For every
    fraction ``k`` in ``k_list`` the first ``floor(len(df) * k)`` rows are
    inspected and the number of rows with ``true_event == 1`` among them is
    counted. Progress is printed and the summary table is written to
    ``output_path`` as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``predicted_prob`` and ``true_event``. A ``user_id``
        column is optional and is only used in the printed preview.
    k_list : iterable of float
        Fractions of the ranked rows to evaluate (0.01 means the top 1%).
    output_path : str
        Destination of the CSV summary.

    Returns
    -------
    None
        Also returns early, after printing an error, when ``df`` is None or
        empty or a required column is missing.
    """
    print("🚀 开始排名评估...")

    # === 0. Reject missing / empty input ===
    if df is None or df.empty:
        print("❌ 错误：输入的 DataFrame 是空的！")
        return

    # === 1. Make sure the required columns are present ===
    required_cols = ["predicted_prob", "true_event"]
    for col in required_cols:
        if col not in df.columns:
            print(f"❌ 错误：缺失必要列 {col}")
            print(f"当前列为: {df.columns.tolist()}")
            return

    # === 2. Rank rows and print overall statistics ===
    df = df.sort_values("predicted_prob", ascending=False).reset_index(drop=True)
    # user_id is not a required column, so only preview it when it exists
    # (indexing it unconditionally raised KeyError for frames without it).
    preview_cols = [c for c in ("user_id", "true_event", "predicted_prob") if c in df.columns]
    print(f"✅ 排序后前5行：\n{df[preview_cols].head()}")

    total = len(df)
    total_event_1 = df["true_event"].sum()
    print(f"\n📊 全部样本数: {total}")
    print(f"📊 实际流失用户数 (event=1): {total_event_1}")

    # === 3. Top-K evaluation ===
    results = []

    for k in k_list:
        # Binary floats make e.g. 100 * 0.29 == 28.999999999999996. The small
        # epsilon keeps the floor at the intended 29 without affecting genuinely
        # fractional products such as 1975.97.
        top_n = int(total * k + 1e-9)
        # ":g" formatting avoids the same truncation in the label
        # (int(0.29 * 100) == 28) and keeps sub-percent fractions such as 0.5%.
        pct_label = f"{k * 100:g}%"

        top_df = df.head(top_n)
        hit = top_df["true_event"].sum()

        precision_at_k = hit / top_n if top_n > 0 else 0
        recall_at_k = hit / total_event_1 if total_event_1 > 0 else 0

        print(f"\n🎯 Top {pct_label} ({top_n}人):")
        print(f"   ➤ 命中真实流失用户数: {hit}")
        print(f"   ➤ Precision@{pct_label}: {precision_at_k:.4f}")
        print(f"   ➤ Recall@{pct_label}:    {recall_at_k:.4f}")

        results.append({
            "Top %": pct_label,
            "Top N": top_n,
            "Hit (event=1)": hit,
            "Precision@K": precision_at_k,
            "Recall@K": recall_at_k
        })

    # === 4. Save and show the summary ===
    result_df = pd.DataFrame(results)
    result_df.to_csv(output_path, index=False)
    print(f"\n✅ 排名评估结果保存至：{output_path}")
    print(result_df)
    
# Reload the saved random-forest predictions and run the top-K ranking report on them.
df = pd.read_csv("rf_prediction_with_prob.csv")
evaluate_ranking_verbose(
    df,
    k_list=[0.01, 0.03, 0.05, 0.1],
    output_path="rf_hit.csv",
)



🚀 开始排名评估...
✅ 排序后前5行：
   user_id  true_event  predicted_prob
0    11780           1        0.973105
1   116687           1        0.973105
2   988274           1        0.972917
3   785895           1        0.972917
4   743686           1        0.972917

📊 全部样本数: 197597
📊 实际流失用户数 (event=1): 4143

🎯 Top 1% (1975人):
   ➤ 命中真实流失用户数: 1869
   ➤ Precision@1%: 0.9463
   ➤ Recall@1%:    0.4511

🎯 Top 3% (5927人):
   ➤ 命中真实流失用户数: 4130
   ➤ Precision@3%: 0.6968
   ➤ Recall@3%:    0.9969

🎯 Top 5% (9879人):
   ➤ 命中真实流失用户数: 4143
   ➤ Precision@5%: 0.4194
   ➤ Recall@5%:    1.0000

🎯 Top 10% (19759人):
   ➤ 命中真实流失用户数: 4143
   ➤ Precision@10%: 0.2097
   ➤ Recall@10%:    1.0000

✅ 排名评估结果保存至：rf_hit.csv
  Top %  Top N  Hit (event=1)  Precision@K  Recall@K
0    1%   1975           1869     0.946329  0.451122
1    3%   5927           4130     0.696811  0.996862
2    5%   9879           4143     0.419374  1.000000
3   10%  19759           4143     0.209677  1.000000
