In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# 1. Load datasets
df = pd.read_csv('C:/Users/q/Desktop/James/Schulich/Data Science II/assignment1/train.csv')

In [2]:
print(df.shape)
df.head()

(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
df['y'].value_counts(normalize=True)

y
no     0.883015
yes    0.116985
Name: proportion, dtype: float64

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [6]:
# 1) 去掉会导致数据泄露的列
df2 = df.drop(columns=['duration'])

# 2) 目标变量二值化
df2['y_bin'] = (df2['y'] == 'yes').astype(int)

# 3) 拆分特征/目标
X = df2.drop(columns=['y', 'y_bin'])
y = df2['y_bin']

X.shape, y.shape, y.mean()  # 看一下正例比例

((45211, 15), (45211,), 0.11698480458295547)

In [7]:
from sklearn.model_selection import train_test_split

# 先去掉泄露变量 duration，并建立 y 的 0/1
df2 = df.drop(columns=['duration']).copy()
df2['y_bin'] = (df2['y'] == 'yes').astype(int)

X = df2.drop(columns=['y', 'y_bin'])
y = df2['y_bin']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape, y_train.mean(), y_test.mean()


((36168, 15), (9043, 15), 0.11698186241981863, 0.11699657193409267)

In [8]:
num_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
cat_cols = [c for c in X_train.columns if c not in num_cols]

num_cols, cat_cols

(['age', 'balance', 'day', 'campaign', 'pdays', 'previous'],
 ['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month',
  'poutcome'])

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report

# 预处理：类别做 One-Hot，数值直接透传
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# 模型：Logistic，处理类不平衡
log_reg = LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs")

# 管道：预处理 -> 模型
pipe_log = Pipeline(steps=[
    ("prep", preprocess),
    ("model", log_reg)
])

# 训练
pipe_log.fit(X_train, y_train)

# 预测（概率）
proba_test = pipe_log.predict_proba(X_test)[:, 1]

# 指标
roc = roc_auc_score(y_test, proba_test)
prauc = average_precision_score(y_test, proba_test)  # PR-AUC

print(f"Logistic (balanced) | ROC-AUC = {roc:.4f} | PR-AUC = {prauc:.4f}")


Logistic (balanced) | ROC-AUC = 0.7701 | PR-AUC = 0.4014


In [10]:
from sklearn.metrics import classification_report

pred_test = (proba_test >= 0.5).astype(int)
print(classification_report(y_test, pred_test, digits=3))


              precision    recall  f1-score   support

           0      0.942     0.751     0.836      7985
           1      0.257     0.649     0.368      1058

    accuracy                          0.739      9043
   macro avg      0.599     0.700     0.602      9043
weighted avg      0.862     0.739     0.781      9043



In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# 预处理：数值列做标准化，类别列做 One-Hot
preprocess_knn = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

# 模型：kNN (k=5 默认)
knn = KNeighborsClassifier(n_neighbors=5)

pipe_knn = Pipeline(steps=[
    ("prep", preprocess_knn),
    ("model", knn)
])

# 训练
pipe_knn.fit(X_train, y_train)

# 预测概率
proba_test_knn = pipe_knn.predict_proba(X_test)[:, 1]

# 评估
roc_knn = roc_auc_score(y_test, proba_test_knn)
prauc_knn = average_precision_score(y_test, proba_test_knn)

print(f"kNN (k=5) | ROC-AUC = {roc_knn:.4f} | PR-AUC = {prauc_knn:.4f}")


kNN (k=5) | ROC-AUC = 0.7027 | PR-AUC = 0.2913


In [12]:
pred_test_knn = (proba_test_knn >= 0.5).astype(int)
print(classification_report(y_test, pred_test_knn, digits=3))

              precision    recall  f1-score   support

           0      0.902     0.979     0.939      7985
           1      0.553     0.198     0.291      1058

    accuracy                          0.887      9043
   macro avg      0.727     0.588     0.615      9043
weighted avg      0.861     0.887     0.863      9043



In [13]:
import numpy as np
from sklearn.metrics import precision_recall_curve

def threshold_tradeoff(y_true, proba, name="model"):
    prec, rec, thr = precision_recall_curve(y_true, proba)
    # 去掉最后一个点（sklearn会多给一个阈值位置）
    prec, rec, thr = prec[:-1], rec[:-1], thr

    f1 = 2 * prec * rec / (prec + rec + 1e-12)
    idx_f1 = np.argmax(f1)

    # 业务导向点：Precision >= 0.40 时，Recall 最大
    mask_p40 = prec >= 0.40
    idx_p40 = np.argmax(rec * mask_p40) if mask_p40.any() else None

    def row(i):
        return {
            "threshold": float(thr[i]),
            "precision": float(prec[i]),
            "recall": float(rec[i]),
            "f1": float(f1[i])
        }

    print(f"\n=== {name}: 阈值权衡 ===")
    print("F1 最大点：", row(idx_f1))
    if idx_p40 is not None:
        print("Precision≥0.40 且 Recall 最大：", row(idx_p40))
    else:
        print("Precision 无法达到 0.40（该模型在该数据上较难获得较高精度）。")

# Logistic
threshold_tradeoff(y_test.values, proba_test, name="Logistic (balanced)")

# kNN
threshold_tradeoff(y_test.values, proba_test_knn, name="kNN (k=5)")



=== Logistic (balanced): 阈值权衡 ===
F1 最大点： {'threshold': 0.669166868399666, 'precision': 0.437984496124031, 'recall': 0.42722117202268434, 'f1': 0.4325358851669643}
Precision≥0.40 且 Recall 最大： {'threshold': 0.6354857661852041, 'precision': 0.4001636661211129, 'recall': 0.4621928166351607, 'f1': 0.42894736842055525}

=== kNN (k=5): 阈值权衡 ===
F1 最大点： {'threshold': 0.4, 'precision': 0.401840490797546, 'recall': 0.3714555765595463, 'f1': 0.386051080549599}
Precision≥0.40 且 Recall 最大： {'threshold': 0.4, 'precision': 0.401840490797546, 'recall': 0.3714555765595463, 'f1': 0.386051080549599}


In [14]:
def topk_metrics(y_true, proba, ks=(200, 500, 1000), name="model"):
    y_true = np.asarray(y_true)
    order = np.argsort(-proba)  # 概率从高到低排序
    y_sorted = y_true[order]
    pos_total = y_true.sum()

    print(f"\n=== {name}: Top-k 评估 ===")
    for k in ks:
        k = min(k, len(y_true))
        tp_k = y_sorted[:k].sum()
        precision_k = tp_k / k
        recall_k = tp_k / pos_total if pos_total>0 else 0.0
        print(f"Top-{k}: Precision@k={precision_k:.3f}, Recall@k={recall_k:.3f}, 命中人数={int(tp_k)} / 正类总数={int(pos_total)}")

# 你可以按你的真实预算改 k 值
K_LIST = (200, 500, 1000)

topk_metrics(y_test.values, proba_test, ks=K_LIST, name="Logistic (balanced)")
topk_metrics(y_test.values, proba_test_knn, ks=K_LIST, name="kNN (k=5)")



=== Logistic (balanced): Top-k 评估 ===
Top-200: Precision@k=0.680, Recall@k=0.129, 命中人数=136 / 正类总数=1058
Top-500: Precision@k=0.578, Recall@k=0.273, 命中人数=289 / 正类总数=1058
Top-1000: Precision@k=0.443, Recall@k=0.419, 命中人数=443 / 正类总数=1058

=== kNN (k=5): Top-k 评估 ===
Top-200: Precision@k=0.610, Recall@k=0.115, 命中人数=122 / 正类总数=1058
Top-500: Precision@k=0.480, Recall@k=0.227, 命中人数=240 / 正类总数=1058
Top-1000: Precision@k=0.395, Recall@k=0.373, 命中人数=395 / 正类总数=1058


In [19]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold


# 预处理：类别One-Hot，数值透传（Logistic可不缩放，也可对比缩放版）
preprocess_log = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

log_base = LogisticRegression(
    solver="saga", class_weight="balanced", max_iter=5000, n_jobs=-1
)

pipe_log = Pipeline([("prep", preprocess_log), ("model", log_base)])

param_dist_log = {
    "model__penalty": ["l2"],
    "model__C": np.logspace(-2, 1, 20)  # 0.01~10, 20个候选
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

rs_log = RandomizedSearchCV(
    estimator=pipe_log,
    param_distributions=param_dist_log,
    n_iter=10,                 # 随机评估15组（可调大/小以平衡速度）
    scoring="average_precision",
    refit=True,                # 以 scoring 指标 refit
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

rs_log.fit(X_train, y_train)

# 测试集评估
proba_log_best = rs_log.predict_proba(X_test)[:, 1]
roc_log_best = roc_auc_score(y_test, proba_log_best)
prauc_log_best = average_precision_score(y_test, proba_log_best)

print("=== Logistic (RandomizedSearch, refit=PR-AUC) ===")
print("Best params:", rs_log.best_params_)
print(f"CV best PR-AUC: {rs_log.best_score_:.4f}")
print(f"TEST ROC-AUC: {roc_log_best:.4f} | TEST PR-AUC: {prauc_log_best:.4f}")

pred_log_best = (proba_log_best >= 0.5).astype(int)
print(classification_report(y_test, pred_log_best, digits=3))

=== Logistic (RandomizedSearch, refit=PR-AUC) ===
Best params: {'model__penalty': 'l2', 'model__C': 0.5455594781168517}
CV best PR-AUC: 0.1726
TEST ROC-AUC: 0.6390 | TEST PR-AUC: 0.1742
              precision    recall  f1-score   support

           0      0.913     0.771     0.836      7985
           1      0.204     0.443     0.280      1058

    accuracy                          0.733      9043
   macro avg      0.559     0.607     0.558      9043
weighted avg      0.830     0.733     0.771      9043





In [22]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
import numpy as np

# 预处理：类别 One-Hot，数值透传（先与之前baseline对齐）
preprocess_log_lbfgs = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

log_lbfgs = LogisticRegression(
    solver="lbfgs",
    penalty="l2",
    class_weight="balanced",
    max_iter=5000,
    n_jobs=-1
)

pipe_log_lbfgs = Pipeline([("prep", preprocess_log_lbfgs), ("model", log_lbfgs)])

param_dist_log_lbfgs = {
    "model__C": np.logspace(-2, 1, 20)  # 0.01~10
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

rs_log_lbfgs = RandomizedSearchCV(
    estimator=pipe_log_lbfgs,
    param_distributions=param_dist_log_lbfgs,
    n_iter=12,                 # 更快
    scoring="average_precision",
    refit=True,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

rs_log_lbfgs.fit(X_train, y_train)
proba_log_best = rs_log_lbfgs.predict_proba(X_test)[:, 1]
print("=== Logistic (lbfgs + L2, RS) ===")
print("Best params:", rs_log_lbfgs.best_params_)
print(f"TEST ROC-AUC: {roc_auc_score(y_test, proba_log_best):.4f} | TEST PR-AUC: {average_precision_score(y_test, proba_log_best):.4f}")
print(classification_report(y_test, (proba_log_best>=0.5).astype(int), digits=3))


=== Logistic (lbfgs + L2, RS) ===
Best params: {'model__C': 4.832930238571752}
TEST ROC-AUC: 0.7705 | TEST PR-AUC: 0.4052
              precision    recall  f1-score   support

           0      0.941     0.739     0.828      7985
           1      0.248     0.649     0.359      1058

    accuracy                          0.729      9043
   macro avg      0.595     0.694     0.594      9043
weighted avg      0.860     0.729     0.773      9043



In [23]:
from sklearn.preprocessing import StandardScaler

preprocess_log_lbfgs_scaled = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

pipe_log_lbfgs_scaled = Pipeline([("prep", preprocess_log_lbfgs_scaled), ("model", log_lbfgs)])

rs_log_lbfgs_scaled = RandomizedSearchCV(
    estimator=pipe_log_lbfgs_scaled,
    param_distributions=param_dist_log_lbfgs,
    n_iter=12,
    scoring="average_precision",
    refit=True,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

rs_log_lbfgs_scaled.fit(X_train, y_train)
proba_log_best2 = rs_log_lbfgs_scaled.predict_proba(X_test)[:, 1]
print("\n=== Logistic (lbfgs + L2, RS, num scaled) ===")
print("Best params:", rs_log_lbfgs_scaled.best_params_)
print(f"TEST ROC-AUC: {roc_auc_score(y_test, proba_log_best2):.4f} | TEST PR-AUC: {average_precision_score(y_test, proba_log_best2):.4f}")
print(classification_report(y_test, (proba_log_best2>=0.5).astype(int), digits=3))



=== Logistic (lbfgs + L2, RS, num scaled) ===
Best params: {'model__C': 0.06158482110660264}
TEST ROC-AUC: 0.7726 | TEST PR-AUC: 0.4089
              precision    recall  f1-score   support

           0      0.941     0.768     0.846      7985
           1      0.266     0.635     0.375      1058

    accuracy                          0.752      9043
   macro avg      0.603     0.701     0.610      9043
weighted avg      0.862     0.752     0.790      9043



In [20]:
from sklearn.neighbors import KNeighborsClassifier

preprocess_knn = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

knn_base = KNeighborsClassifier()

pipe_knn = Pipeline([("prep", preprocess_knn), ("model", knn_base)])

param_dist_knn = {
    "model__n_neighbors": [3, 5, 7, 9, 11, 13, 15],
    "model__weights": ["uniform", "distance"],
    "model__p": [1, 2]   # 曼哈顿/欧式
}

rs_knn = RandomizedSearchCV(
    estimator=pipe_knn,
    param_distributions=param_dist_knn,
    n_iter=16,                 # 比完整网格少很多
    scoring="average_precision",
    refit=True,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

rs_knn.fit(X_train, y_train)

proba_knn_best = rs_knn.predict_proba(X_test)[:, 1]
roc_knn_best = roc_auc_score(y_test, proba_knn_best)
prauc_knn_best = average_precision_score(y_test, proba_knn_best)

print("\n=== kNN (RandomizedSearch, refit=PR-AUC) ===")
print("Best params:", rs_knn.best_params_)
print(f"CV best PR-AUC: {rs_knn.best_score_:.4f}")
print(f"TEST ROC-AUC: {roc_knn_best:.4f} | TEST PR-AUC: {prauc_knn_best:.4f}")

pred_knn_best = (proba_knn_best >= 0.5).astype(int)
print(classification_report(y_test, pred_knn_best, digits=3))



=== kNN (RandomizedSearch, refit=PR-AUC) ===
Best params: {'model__weights': 'distance', 'model__p': 1, 'model__n_neighbors': 15}
CV best PR-AUC: 0.3737
TEST ROC-AUC: 0.7484 | TEST PR-AUC: 0.3855
              precision    recall  f1-score   support

           0      0.902     0.988     0.943      7985
           1      0.676     0.187     0.293      1058

    accuracy                          0.894      9043
   macro avg      0.789     0.588     0.618      9043
weighted avg      0.875     0.894     0.867      9043



In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score

# 预处理：与Logistic一致——类别One-Hot，数值透传
preprocess_rf = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

param_dist_rf = {
    "model__n_estimators": [200, 300, 400, 600],
    "model__max_depth": [None, 8, 12, 16, 20],
    "model__max_features": ["sqrt", 0.3, 0.5, None],
    "model__min_samples_leaf": [1, 2, 5]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe_rf = Pipeline([("prep", preprocess_rf), ("model", rf)])

rs_rf = RandomizedSearchCV(
    estimator=pipe_rf,
    param_distributions=param_dist_rf,
    n_iter=18,
    scoring="average_precision",
    refit=True, cv=cv, n_jobs=-1, random_state=42, verbose=0
)

rs_rf.fit(X_train, y_train)
proba_rf = rs_rf.predict_proba(X_test)[:, 1]
print("=== RandomForest ===")
print("Best params:", rs_rf.best_params_)
print(f"TEST ROC-AUC: {roc_auc_score(y_test, proba_rf):.4f} | TEST PR-AUC: {average_precision_score(y_test, proba_rf):.4f}")

threshold_tradeoff(y_test.values, proba_rf, name="RandomForest")
topk_metrics(y_test.values, proba_rf, ks=(200, 500, 1000), name="RandomForest")


=== RandomForest ===
Best params: {'model__n_estimators': 300, 'model__min_samples_leaf': 5, 'model__max_features': 'sqrt', 'model__max_depth': None}
TEST ROC-AUC: 0.8041 | TEST PR-AUC: 0.4523

=== RandomForest: 阈值权衡 ===
F1 最大点： {'threshold': 0.5271262614648102, 'precision': 0.46563573883161513, 'recall': 0.5122873345935728, 'f1': 0.487848784877989}
Precision≥0.40 且 Recall 最大： {'threshold': 0.44278359572892534, 'precision': 0.4, 'recall': 0.5879017013232514, 'f1': 0.4760811327970688}

=== RandomForest: Top-k 评估 ===
Top-200: Precision@k=0.715, Recall@k=0.135, 命中人数=143 / 正类总数=1058
Top-500: Precision@k=0.592, Recall@k=0.280, 命中人数=296 / 正类总数=1058
Top-1000: Precision@k=0.487, Recall@k=0.460, 命中人数=487 / 正类总数=1058


In [25]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

preprocess_ab = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# 基学习器：树桩/浅树（对不平衡用sample_weight间接处理，AdaBoost会传递）
base_stump = DecisionTreeClassifier(max_depth=1, random_state=42)

ab = AdaBoostClassifier(
    estimator=base_stump,  # sklearn>=1.2 用 estimator 参数；老版本用 base_estimator
    algorithm="SAMME.R",
    random_state=42
)

param_dist_ab = {
    "model__n_estimators": [100, 200, 400, 600],
    "model__learning_rate": [0.05, 0.1, 0.2, 0.5],
    # 也可试浅一点的基树
    # "model__estimator__max_depth": [1, 2, 3]
}

pipe_ab = Pipeline([("prep", preprocess_ab), ("model", ab)])

rs_ab = RandomizedSearchCV(
    estimator=pipe_ab,
    param_distributions=param_dist_ab,
    n_iter=12,
    scoring="average_precision",
    refit=True, cv=cv, n_jobs=-1, random_state=42, verbose=0
)

rs_ab.fit(X_train, y_train)
proba_ab = rs_ab.predict_proba(X_test)[:, 1]
print("\n=== AdaBoost ===")
print("Best params:", rs_ab.best_params_)
print(f"TEST ROC-AUC: {roc_auc_score(y_test, proba_ab):.4f} | TEST PR-AUC: {average_precision_score(y_test, proba_ab):.4f}")

threshold_tradeoff(y_test.values, proba_ab, name="AdaBoost")
topk_metrics(y_test.values, proba_ab, ks=(200, 500, 1000), name="AdaBoost")



=== AdaBoost ===
Best params: {'model__n_estimators': 400, 'model__learning_rate': 0.5}
TEST ROC-AUC: 0.7881 | TEST PR-AUC: 0.4241

=== AdaBoost: 阈值权衡 ===
F1 最大点： {'threshold': 0.49821084718211117, 'precision': 0.4649621212121212, 'recall': 0.46408317580340264, 'f1': 0.4645222327336533}
Precision≥0.40 且 Recall 最大： {'threshold': 0.4978968867434518, 'precision': 0.4, 'recall': 0.5349716446124764, 'f1': 0.45774363121665557}

=== AdaBoost: Top-k 评估 ===
Top-200: Precision@k=0.695, Recall@k=0.131, 命中人数=139 / 正类总数=1058
Top-500: Precision@k=0.562, Recall@k=0.266, 命中人数=281 / 正类总数=1058
Top-1000: Precision@k=0.470, Recall@k=0.444, 命中人数=470 / 正类总数=1058


In [26]:
from sklearn.ensemble import BaggingClassifier

preprocess_bag = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

base_tree = DecisionTreeClassifier(
    max_depth=None,  # 允许长一些的树，Bagging来降方差
    class_weight="balanced",
    random_state=42
)

bag = BaggingClassifier(
    estimator=base_tree,
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

param_dist_bag = {
    "model__n_estimators": [100, 200, 400],
    "model__max_samples": [0.5, 0.7, 1.0],
    "model__max_features": [0.5, 1.0]
}

pipe_bag = Pipeline([("prep", preprocess_bag), ("model", bag)])

rs_bag = RandomizedSearchCV(
    estimator=pipe_bag,
    param_distributions=param_dist_bag,
    n_iter=12,
    scoring="average_precision",
    refit=True, cv=cv, n_jobs=-1, random_state=42, verbose=0
)

rs_bag.fit(X_train, y_train)
proba_bag = rs_bag.predict_proba(X_test)[:, 1]
print("\n=== Bagging ===")
print("Best params:", rs_bag.best_params_)
print(f"TEST ROC-AUC: {roc_auc_score(y_test, proba_bag):.4f} | TEST PR-AUC: {average_precision_score(y_test, proba_bag):.4f}")

threshold_tradeoff(y_test.values, proba_bag, name="Bagging")
topk_metrics(y_test.values, proba_bag, ks=(200, 500, 1000), name="Bagging")



=== Bagging ===
Best params: {'model__n_estimators': 400, 'model__max_samples': 0.5, 'model__max_features': 0.5}
TEST ROC-AUC: 0.7965 | TEST PR-AUC: 0.4556

=== Bagging: 阈值权衡 ===
F1 最大点： {'threshold': 0.2676868929102808, 'precision': 0.4489112227805695, 'recall': 0.5066162570888468, 'f1': 0.4760213143867132}
Precision≥0.40 且 Recall 最大： {'threshold': 0.23664102989987065, 'precision': 0.4, 'recall': 0.5652173913043478, 'f1': 0.4684684684679831}

=== Bagging: Top-k 评估 ===
Top-200: Precision@k=0.765, Recall@k=0.145, 命中人数=153 / 正类总数=1058
Top-500: Precision@k=0.592, Recall@k=0.280, 命中人数=296 / 正类总数=1058
Top-1000: Precision@k=0.478, Recall@k=0.452, 命中人数=478 / 正类总数=1058


In [27]:
from sklearn.ensemble import VotingClassifier

# 确保你已经跑过：rs_log_lbfgs_scaled、rs_knn、rs_rf
voter = VotingClassifier(
    estimators=[
        ("log", rs_log_lbfgs_scaled.best_estimator_),  # Logistic 最优管道
        ("rf", rs_rf.best_estimator_),                 # RF 最优管道
        ("knn", rs_knn.best_estimator_)                # kNN 最优管道
    ],
    voting="soft",
    weights=None,  # 先均匀加权；如需可改成 [2,2,1] 略偏向log+rf
    n_jobs=-1
)

voter.fit(X_train, y_train)
proba_vote = voter.predict_proba(X_test)[:, 1]
print("\n=== Voting (soft) ===")
print(f"TEST ROC-AUC: {roc_auc_score(y_test, proba_vote):.4f} | TEST PR-AUC: {average_precision_score(y_test, proba_vote):.4f}")

threshold_tradeoff(y_test.values, proba_vote, name="Voting (soft)")
topk_metrics(y_test.values, proba_vote, ks=(200, 500, 1000), name="Voting (soft)")



=== Voting (soft) ===
TEST ROC-AUC: 0.7988 | TEST PR-AUC: 0.4410

=== Voting (soft): 阈值权衡 ===
F1 最大点： {'threshold': 0.4350783792493078, 'precision': 0.4532312925170068, 'recall': 0.503780718336484, 'f1': 0.4771709937327154}
Precision≥0.40 且 Recall 最大： {'threshold': 0.38850071418529647, 'precision': 0.4001344989912576, 'recall': 0.5623818525519849, 'f1': 0.4675834970525594}

=== Voting (soft): Top-k 评估 ===
Top-200: Precision@k=0.735, Recall@k=0.139, 命中人数=147 / 正类总数=1058
Top-500: Precision@k=0.588, Recall@k=0.278, 命中人数=294 / 正类总数=1058
Top-1000: Precision@k=0.482, Recall@k=0.456, 命中人数=482 / 正类总数=1058


In [28]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score

# === 请确认这些变量已在你前面代码里生成 ===
# Logistic(最优，数值列标准化后的那版)
proba_log = proba_log_best2   # 若你用的是未缩放A方案，就把这一行改成：proba_log = proba_log_best
# kNN(最优)
proba_knn = proba_knn_best
# RandomForest/AdaBoost/Bagging/Voting(最优)
proba_rf   = proba_rf
proba_ab   = proba_ab
proba_bag  = proba_bag
proba_vote = proba_vote

y_true = y_test.values  # 真实标签

def precision_at_k(y_true, proba, k):
    order = np.argsort(-proba)
    y_topk = y_true[order][:k]
    return y_topk.mean()

def recall_at_k(y_true, proba, k):
    order = np.argsort(-proba)
    y_topk = y_true[order][:k].sum()
    pos_total = y_true.sum()
    return y_topk / pos_total if pos_total > 0 else 0.0

models = {
    "Logistic (best)": proba_log,
    "kNN (best)": proba_knn,
    "RandomForest": proba_rf,
    "AdaBoost": proba_ab,
    "Bagging": proba_bag,
    "Voting (soft)": proba_vote,
}

rows = []
for name, proba in models.items():
    roc = roc_auc_score(y_true, proba)
    pr  = average_precision_score(y_true, proba)
    p200 = precision_at_k(y_true, proba, 200)
    p500 = precision_at_k(y_true, proba, 500)
    p1000= precision_at_k(y_true, proba, 1000)
    r200 = recall_at_k(y_true, proba, 200)
    r500 = recall_at_k(y_true, proba, 500)
    r1000= recall_at_k(y_true, proba, 1000)
    rows.append({
        "model": name,
        "ROC-AUC": round(roc, 4),
        "PR-AUC": round(pr, 4),
        "Precision@200": round(p200, 3),
        "Precision@500": round(p500, 3),
        "Precision@1000": round(p1000, 3),
        "Recall@200": round(r200, 3),
        "Recall@500": round(r500, 3),
        "Recall@1000": round(r1000, 3),
    })

df_cmp = pd.DataFrame(rows).sort_values(by=["PR-AUC","ROC-AUC"], ascending=False).reset_index(drop=True)
df_cmp

Unnamed: 0,model,ROC-AUC,PR-AUC,Precision@200,Precision@500,Precision@1000,Recall@200,Recall@500,Recall@1000
0,Bagging,0.7965,0.4556,0.765,0.592,0.478,0.145,0.28,0.452
1,RandomForest,0.8041,0.4523,0.715,0.592,0.487,0.135,0.28,0.46
2,Voting (soft),0.7988,0.441,0.735,0.588,0.482,0.139,0.278,0.456
3,AdaBoost,0.7881,0.4241,0.695,0.562,0.47,0.131,0.266,0.444
4,Logistic (best),0.7726,0.4089,0.695,0.576,0.461,0.131,0.272,0.436
5,kNN (best),0.7484,0.3855,0.7,0.57,0.436,0.132,0.269,0.412
