# Single DLE

# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB
from sklearn.neural_network import MLPClassifier
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from scipy.special import expit
import glob, os

# ================= Calibration functions =================
def platt_cv(p, y, n_splits=5):
    """Platt-scale raw scores out-of-fold with a one-feature logistic regression.

    For every stratified fold a logistic regression is fitted on the scores of
    the training part and used to predict the held-out part, so each returned
    value comes from a model that did not see that sample's label.

    Args:
        p: 1-D array-like of raw scores, one per sample.
        y: 1-D array-like of binary labels aligned with ``p``.
        n_splits: Number of stratified folds.

    Returns:
        1-D float ndarray, same length as ``p``, holding ``predict_proba``'s
        second column (the class that sorts last, i.e. 1 for 0/1 labels).
    """
    # Force float: zeros_like() on an integer score column would yield an
    # integer buffer and silently truncate every calibrated probability to 0.
    p = np.asarray(p, dtype=float)
    # Plain ndarray so that y[tr] is positional even if a Series was passed.
    y = np.asarray(y)
    out = np.zeros_like(p)
    skf = StratifiedKFold(n_splits, shuffle=True, random_state=42)
    for tr, te in skf.split(p, y):
        lr = LogisticRegression(solver='lbfgs', max_iter=1000)
        lr.fit(p[tr].reshape(-1, 1), y[tr])
        out[te] = lr.predict_proba(p[te].reshape(-1, 1))[:, 1]
    return out

def isotonic_cv(p, y, n_splits=5):
    """Calibrate raw scores out-of-fold with isotonic regression.

    For every stratified fold an isotonic regression is fitted on the training
    part and applied to the held-out part. Held-out scores outside the fitted
    range are clipped to it (``out_of_bounds='clip'``).

    Args:
        p: 1-D array-like of raw scores, one per sample.
        y: 1-D array-like of binary labels aligned with ``p``.
        n_splits: Number of stratified folds.

    Returns:
        1-D float ndarray of calibrated scores, same length as ``p``.
    """
    # Force float: zeros_like() on an integer score column would yield an
    # integer buffer and silently truncate every calibrated value.
    p = np.asarray(p, dtype=float)
    # Plain ndarray so that y[tr] is positional even if a Series was passed.
    y = np.asarray(y)
    out = np.zeros_like(p)
    skf = StratifiedKFold(n_splits, shuffle=True, random_state=42)
    for tr, te in skf.split(p, y):
        iso = IsotonicRegression(out_of_bounds='clip')
        iso.fit(p[tr], y[tr])
        out[te] = iso.transform(p[te])
    return out

# ================= Candidate stacking models =================
# Meta-learners tried at every stacking layer. Insertion order matters: it is
# the print order and the tie-break when two models reach the same AUC.
# Every estimator that draws random numbers is seeded with 6, the seed the
# forest and MLP entries already used, so the AUC-based selection built on top
# of these models is repeatable from run to run.
# NOTE(review): GBDT / AdaBoost / XGB use learning_rate=1e-5 with 500 rounds,
# which leaves the boosted ensemble very close to its initial prediction -
# confirm this is intentional.
candidate_models = {
    # --- linear models ---
    'LR': LogisticRegression(solver='lbfgs', max_iter=2000),
    'Ridge': RidgeClassifier(max_iter=2000),
    'SGD': SGDClassifier(max_iter=2000, tol=1e-5, random_state=6),
    'Perceptron': Perceptron(max_iter=2000, random_state=6),
    'PassiveAggressive': PassiveAggressiveClassifier(max_iter=2000, random_state=6),
    # --- tree ensembles ---
    'RF': RandomForestClassifier(n_estimators=500, max_depth=5, random_state=6),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=500, max_depth=5, random_state=6),
    'GBDT': GradientBoostingClassifier(n_estimators=500, learning_rate=1e-5,
                                       random_state=6),
    'AdaBoost': AdaBoostClassifier(n_estimators=500, learning_rate=1e-5,
                                   random_state=6),
    'Bagging': BaggingClassifier(n_estimators=500, random_state=6),
    'XGB': XGBClassifier(n_estimators=500, learning_rate=1e-5,
                          use_label_encoder=False, eval_metric='logloss'),
    # --- kernel machines (probability=True fits an internal calibration) ---
    'SVC_rbf': SVC(probability=True, kernel='rbf', random_state=6),
    'SVC_linear': SVC(probability=True, kernel='linear', random_state=6),
    'NuSVC': NuSVC(probability=True, random_state=6),
    # --- instance-based and naive Bayes ---
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'GaussianNB': GaussianNB(),
    'BernoulliNB': BernoulliNB(),
    'ComplementNB': ComplementNB(),
    # --- neural networks ---
    'MLP_small': MLPClassifier(hidden_layer_sizes=(32,16), max_iter=2000, random_state=6),
    'MLP_large': MLPClassifier(hidden_layer_sizes=(64,32), max_iter=2000, random_state=6)
}

# ================= Layer-wise Boosting models =================
# Boosted meta-learners evaluated at every layer in addition to
# candidate_models. The GBDT entry is seeded with 6, the seed used by the
# seeded entries of candidate_models, so repeated runs give the same result.
layer_boosters = {
    'Layer_GBDT': GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=3,
        random_state=6),
    'Layer_XGB': XGBClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=3,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric='logloss', use_label_encoder=False)
}

# ================= Multi-layer stacking + Boosting =================
def _calibrated_features(prob_vectors, y):
    """Stack Platt- and isotonic-calibrated copies of each score vector as columns."""
    platt = [platt_cv(p, y) for p in prob_vectors]
    iso = [isotonic_cv(p, y) for p in prob_vectors]
    # Column order: all Platt columns first, then all isotonic columns.
    return np.vstack(platt + iso).T


def _oof_positive_scores(model, X, y, splitter):
    """Refit ``model`` on every fold and return its out-of-fold scores."""
    scores = np.zeros_like(y, dtype=float)
    for tr, te in splitter.split(X, y):
        model.fit(X[tr], y[tr])
        if hasattr(model, "predict_proba"):
            scores[te] = model.predict_proba(X[te])[:, 1]
        elif hasattr(model, "decision_function"):
            # Margin-only models: squash the margin into (0, 1).
            scores[te] = expit(model.decision_function(X[te]))
        else:
            scores[te] = model.predict(X[te])
    return scores


def multi_layer_stacking(df, prob_cols, label_col='label',
                          max_layers=10, topN=5):
    """Greedy multi-layer stacking over calibrated base-model probabilities.

    Each layer fits every model in ``candidate_models`` and ``layer_boosters``
    with 5-fold stratified CV on the current feature matrix, blends each
    model's out-of-fold score 50/50 with the row-wise mean of the current
    features, and scores the blend by AUC. The best blend of the layer is
    stored in ``df``; the ``topN`` blends are recalibrated (Platt + isotonic)
    and become the next layer's features. The loop stops as soon as a layer's
    best AUC does not exceed the best AUC seen so far, or after ``max_layers``.

    Note: ``df`` is modified in place (one ``layer{k}_best_{model}`` column per
    accepted layer plus ``final_best_ensemble``) and the same object is
    returned. The reported AUC is computed on the same rows used for
    calibration and model selection.

    Args:
        df: DataFrame holding the probability columns and the label column.
        prob_cols: Names of the base-model probability columns.
        label_col: Name of the binary label column.
        max_layers: Maximum number of stacking layers to try.
        topN: Number of best blends forwarded to the next layer.

    Returns:
        Tuple ``(df, best_overall_auc)``. If no layer is run
        (``max_layers < 1``) the AUC is 0 and ``final_best_ensemble`` is None.

    Raises:
        ValueError: If ``prob_cols`` is empty.
    """
    prob_cols = list(prob_cols)
    if not prob_cols:
        raise ValueError("prob_cols must name at least one probability column")

    y = df[label_col].values

    # Initial calibration
    current_probs = _calibrated_features([df[c].values for c in prob_cols], y)

    skf = StratifiedKFold(5, shuffle=True, random_state=42)
    best_overall_auc = 0
    best_overall_prob = None

    # Candidates first, boosters second: this is the print order and the
    # tie-break order when several models reach the same AUC.
    layer_models = list(candidate_models.items()) + list(layer_boosters.items())

    for layer in range(1, max_layers + 1):
        print(f"\n=== Layer {layer} ===")
        layer_probs = {}
        layer_auc = {}

        # The mean of the current features is the same for every model in
        # this layer, so compute it once.
        orig_prob = current_probs.mean(axis=1)

        # ---------- stacking models + layer-wise boosters ----------
        for name, model in layer_models:
            oof_prob = _oof_positive_scores(model, current_probs, y, skf)
            ensemble_prob = 0.5 * oof_prob + 0.5 * orig_prob

            layer_probs[name] = ensemble_prob
            layer_auc[name] = roc_auc_score(y, ensemble_prob)
            print(f"{name}: AUC={layer_auc[name]:.4f}")

        # ---------- select best ----------
        best_model = max(layer_auc, key=lambda k: layer_auc[k])
        best_auc = layer_auc[best_model]
        print(f"✅ Layer {layer} best: {best_model} | AUC={best_auc:.4f}")

        if best_auc <= best_overall_auc:
            # The non-improving layer is discarded; earlier results stand.
            print("❌ AUC not improved. Stop.")
            break

        best_overall_auc = best_auc
        best_overall_prob = layer_probs[best_model]
        df[f'layer{layer}_best_{best_model}'] = best_overall_prob

        # ---------- top-N enter next layer ----------
        topN_models = sorted(layer_auc.items(),
                             key=lambda x: x[1], reverse=True)[:topN]
        current_probs = _calibrated_features(
            [layer_probs[name] for name, _ in topN_models], y)

    df['final_best_ensemble'] = best_overall_prob
    return df, best_overall_auc

# ================= Load datasets =================
# Each dataset is a table of base-model probabilities plus a 'label' column
# (and optionally a 'case' id). Every column other than 'case' / 'label' is
# treated as a probability column.

# data1
data1 = pd.read_csv('OPENSMILE_probabilities.csv')
prob_cols1 = [c for c in data1.columns if c not in ['case','label']]
df1, auc1 = multi_layer_stacking(data1, prob_cols1)
df1.to_csv('data1_multi_layer_ensemble_boosting.csv', index=False)

# data2
data2 = pd.read_csv('./result/Normal single models results/Normal_seven_results.csv')
prob_cols2 = [c for c in data2.columns if c not in ['case','label']]
df2, auc2 = multi_layer_stacking(data2, prob_cols2)
df2.to_csv('data2_multi_layer_ensemble_boosting.csv', index=False)

# data3 (multiple CSV, one per text model, each with 'prob' and 'label')
data3_folder = './result/text_model/'
# glob returns matches in arbitrary order; sort so the prob{i} numbering and
# the file that supplies the labels are the same on every machine and run.
files = sorted(glob.glob(os.path.join(data3_folder, '*.csv')))
if not files:
    raise FileNotFoundError(f"No CSV files found in {data3_folder!r}")

frames = [pd.read_csv(f) for f in files]
# The per-model files are joined column-wise by row position, so they must
# all have the same number of rows.
# NOTE(review): row order is assumed identical across files - confirm.
if len({len(frame) for frame in frames}) != 1:
    raise ValueError(
        f"CSV files in {data3_folder!r} have differing row counts: "
        f"{[len(frame) for frame in frames]}")

dfs = [frame[['prob']].rename(columns={'prob': f'prob{i}'})
       for i, frame in enumerate(frames)]

# Labels come from the first file (already loaded above).
labels = frames[0]['label'].values
df3 = pd.concat(dfs, axis=1)
df3['label'] = labels

df3, auc3 = multi_layer_stacking(df3, [c for c in df3.columns if c != 'label'])
df3.to_csv('data3_multi_layer_ensemble_boosting.csv', index=False)

# data4
data4 = pd.read_csv('lingustic_probabilities.csv')
prob_cols4 = [c for c in data4.columns if c not in ['case','label']]
df4, auc4 = multi_layer_stacking(data4, prob_cols4)
df4.to_csv('data4_multi_layer_ensemble_boosting.csv', index=False)

print("\n✅ Final AUCs:")
print(f"data1={auc1:.4f}, data2={auc2:.4f}, data3={auc3:.4f}, data4={auc4:.4f}")
print("✅ All boosting multi-layer ensemble results saved.")


# Multi DLE

# In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score,
    accuracy_score, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from scipy.stats import mannwhitneyu
from scipy.optimize import minimize

# ===================== Read the multi-layer stacking outputs =====================
# NOTE(review): the stacking cell in this file writes
# 'dataN_multi_layer_ensemble_boosting.csv', whereas this cell reads
# 'dataN_multi_layer_ensemble.csv' - confirm these are the intended inputs.
data1, data2, data3, data4 = (
    pd.read_csv(f'data{n}_multi_layer_ensemble.csv') for n in (1, 2, 3, 4)
)

# One row per case: the id and label come from data1, and each dataset
# contributes its final stacked probability as prob1..prob4.
# Column order is kept as case, prob1, label, prob2, prob3, prob4.
df = pd.DataFrame({
    'case': data1['case'],
    'prob1': data1['final_best_ensemble'],
    'label': data1['label'],
    **{
        f'prob{n}': frame['final_best_ensemble']
        for n, frame in ((2, data2), (3, data3), (4, data4))
    },
})

# ===================== Evaluation metric functions =====================
def calculate_metrics(y_true, y_pred_probs, threshold=0.5):
    """Compute ranking and thresholded classification metrics for binary scores.

    Args:
        y_true: Binary ground-truth labels (0/1).
        y_pred_probs: Predicted scores aligned with ``y_true``.
        threshold: Scores greater than or equal to this are predicted as 1.

    Returns:
        Dict of metrics. "PPV" duplicates "Precision" and "Sensitivity"
        duplicates "Recall". "AUC P-Value" is the two-sided Mann-Whitney U
        p-value comparing the scores of positives with those of negatives.
    """
    hard_labels = (y_pred_probs >= threshold).astype(int)

    auc = roc_auc_score(y_true, y_pred_probs)
    f1 = f1_score(y_true, hard_labels)
    precision = precision_score(y_true, hard_labels)
    recall = recall_score(y_true, hard_labels)
    accuracy = accuracy_score(y_true, hard_labels)

    # Rows of the confusion matrix are true classes, columns predicted ones.
    cm = confusion_matrix(y_true, hard_labels)
    true_neg, false_pos, false_neg = cm[0, 0], cm[0, 1], cm[1, 0]

    predicted_neg = true_neg + false_neg
    npv = true_neg / predicted_neg if predicted_neg > 0 else 0

    actual_neg = true_neg + false_pos
    specificity = true_neg / actual_neg if actual_neg > 0 else 0

    # Significance of the separation between the two score groups.
    pos_scores = y_pred_probs[y_true == 1]
    neg_scores = y_pred_probs[y_true == 0]
    auc_pvalue = mannwhitneyu(pos_scores, neg_scores, alternative='two-sided').pvalue

    metrics = {}
    metrics["AUC"] = auc
    metrics["F1-Score"] = f1
    metrics["Precision"] = precision
    metrics["Recall"] = recall
    metrics["Accuracy"] = accuracy
    metrics["AUC P-Value"] = auc_pvalue
    metrics["PPV"] = precision
    metrics["NPV"] = npv
    metrics["Sensitivity"] = recall
    metrics["Specificity"] = specificity
    return metrics

def find_best_f1_threshold(y_true, probs):
    """Return the cut-off in [0.1, 0.9) (step 0.01) with the highest F1.

    When several cut-offs tie, the smallest one wins. If no cut-off reaches an
    F1 above 0, the default 0.5 is returned.
    """
    grid = np.arange(0.1, 0.9, 0.01)
    scores = [f1_score(y_true, (probs >= cut).astype(int)) for cut in grid]
    # argmax reports the first maximum, i.e. the smallest tied cut-off.
    top = int(np.argmax(scores))
    if scores[top] > 0:
        return grid[top]
    return 0.5

# ===================== 1. Simple-average ensemble =====================
# Unweighted row-wise mean of the four stacked probabilities, scored at the
# F1-optimal threshold found on the same data.
df['ensemble_prob_mean'] = df.loc[:, ['prob1', 'prob2', 'prob3', 'prob4']].mean(axis=1)
mean_thresh = find_best_f1_threshold(df['label'], df['ensemble_prob_mean'])
metrics_mean = calculate_metrics(
    df['label'],
    df['ensemble_prob_mean'],
    threshold=mean_thresh,
)
print('\n【1. 简单平均集成】')
print(f'最佳F1阈值={mean_thresh:.2f}')
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics_mean.items()))

# ===================== 2. Automatically weighted average (optimised weights) =====================
def weighted_auc(weights):
    """Return the negative AUC of the weighted blend of prob1..prob4.

    ``weights`` is clipped to [0, 1] and rescaled to sum to 1 before use, so
    the optimiser may propose any non-negative vector. An all-zero vector
    cannot be rescaled; it is scored 0.0, the worst value this objective can
    take (the objective is -AUC, which lies in [-1, 0]).
    """
    weights = np.clip(weights, 0, 1)
    total = weights.sum()
    if total <= 0:
        return 0.0
    weights = weights / total
    ensemble_probs = (
        weights[0] * df['prob1'] +
        weights[1] * df['prob2'] +
        weights[2] * df['prob3'] +
        weights[3] * df['prob4']
    )
    return -roc_auc_score(df['label'], ensemble_probs)

init_weights = [0.25, 0.25, 0.25, 0.25]
# Kept for reference only: weighted_auc() already rescales the weights to sum
# to 1, and the Powell method used below does not accept constraints.
cons = {'type': 'eq', 'fun': lambda w: w.sum() - 1}
bounds = [(0, 1)] * 4

# AUC depends only on the ranking of the blended scores, so it is piecewise
# constant in the weights. A gradient-based solver with finite-difference
# gradients (the default when bounds and constraints are given) sees a flat
# surface and typically returns init_weights unchanged. Powell is
# derivative-free and honours the bounds.
# NOTE: the weights are tuned on the same labels the metrics are reported on.
res = minimize(weighted_auc, init_weights, method='Powell', bounds=bounds)

# Normalise exactly the way the objective does; fall back to equal weights if
# the solver ended on the all-zero vector.
opt_weights = np.clip(np.atleast_1d(res.x), 0, 1)
if opt_weights.sum() > 0:
    opt_weights = opt_weights / opt_weights.sum()
else:
    opt_weights = np.asarray(init_weights, dtype=float)

df['ensemble_prob_weighted'] = (
    opt_weights[0] * df['prob1'] +
    opt_weights[1] * df['prob2'] +
    opt_weights[2] * df['prob3'] +
    opt_weights[3] * df['prob4']
)
weighted_thresh = find_best_f1_threshold(df['label'], df['ensemble_prob_weighted'])
metrics_weighted = calculate_metrics(df['label'], df['ensemble_prob_weighted'], threshold=weighted_thresh)
print('\n【2. 自动加权集成】')
print('最优权重:', [f"{w:.3f}" for w in opt_weights])
print(f'最佳F1阈值={weighted_thresh:.2f}')
for k, v in metrics_weighted.items():
    print(f"{k}: {v:.4f}")

# ===================== 3. Stacking ensemble (logistic-regression meta-learner, cross-validated) =====================
X = df[['prob1', 'prob2', 'prob3', 'prob4']].to_numpy()
y = df['label'].to_numpy()

# Out-of-fold predictions: every row is scored by a meta-learner that was
# fitted on the other four folds.
meta_probs = np.zeros(y.shape, dtype=float)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fit_rows, held_out_rows in skf.split(X, y):
    clf = LogisticRegression(max_iter=1000).fit(X[fit_rows], y[fit_rows])
    held_out_proba = clf.predict_proba(X[held_out_rows])
    meta_probs[held_out_rows] = held_out_proba[:, 1]

df['ensemble_prob_stacking'] = meta_probs
stacking_thresh = find_best_f1_threshold(y, df['ensemble_prob_stacking'])
metrics_stacking = calculate_metrics(
    y,
    df['ensemble_prob_stacking'],
    threshold=stacking_thresh,
)
print('\n【3. Stacking集成（逻辑回归）】')
print(f'最佳F1阈值={stacking_thresh:.2f}')
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics_stacking.items()))

# ===================== 4. Multi-layer ensemble (second stacking over the single models and the three ensembles above) =====================
# NOTE(review): 'ensemble_prob_weighted' uses weights fitted on all labels and
# 'ensemble_prob_stacking' was produced with the same fold configuration, so
# the features of the training rows may carry information about held-out
# labels - the metrics of this layer are probably optimistic; confirm.
feature_cols_multi = ['prob1', 'prob2', 'prob3', 'prob4']
feature_cols_multi += ['ensemble_prob_stacking', 'ensemble_prob_mean', 'ensemble_prob_weighted']
X_multi = df[feature_cols_multi].to_numpy()
y_multi = df['label'].to_numpy()

# Out-of-fold predictions of a logistic-regression meta-learner.
meta_probs_multi = np.zeros(y_multi.shape, dtype=float)
skf_multi = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fit_rows, held_out_rows in skf_multi.split(X_multi, y_multi):
    clf_multi = LogisticRegression(max_iter=1000).fit(
        X_multi[fit_rows], y_multi[fit_rows])
    held_out_proba = clf_multi.predict_proba(X_multi[held_out_rows])
    meta_probs_multi[held_out_rows] = held_out_proba[:, 1]

df['ensemble_prob_final'] = meta_probs_multi
final_thresh = find_best_f1_threshold(y_multi, df['ensemble_prob_final'])
metrics_final = calculate_metrics(
    y_multi,
    df['ensemble_prob_final'],
    threshold=final_thresh,
)
print('\n【4. 多层集成：最终stacking】')
print(f'最佳F1阈值={final_thresh:.2f}')
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics_final.items()))

# ===================== Automatically select the best ensemble method =====================
key_metric = 'AUC'  # can be switched to another metric key, e.g. 'F1-Score'

# One entry per ensemble; each key matches the suffix of its
# 'ensemble_prob_<key>' column. The order is the tie-break order for max().
results = {
    key: {
        'name': title,
        'metrics': method_metrics,
        'probs': df[f'ensemble_prob_{key}'],
        'threshold': cutoff,
    }
    for key, title, method_metrics, cutoff in (
        ('stacking', 'Stacking集成（逻辑回归）', metrics_stacking, stacking_thresh),
        ('mean', '简单平均集成', metrics_mean, mean_thresh),
        ('weighted', '自动加权集成', metrics_weighted, weighted_thresh),
        ('final', '多层Stacking（最终）', metrics_final, final_thresh),
    )
}

best_key = max(results, key=lambda method: results[method]['metrics'][key_metric])
best_method = results[best_key]

print('\n【自动化选择最佳集成方法】')
print(f"最佳方法: {best_method['name']}")
print(f"{key_metric}: {best_method['metrics'][key_metric]:.4f}")
print("全部指标:")
print("\n".join(f"{k}: {v:.4f}" for k, v in best_method['metrics'].items()))
print(f"最佳阈值（用于分类）: {best_method['threshold']:.2f}")

# Hard labels of the winning method at its own F1-optimal threshold
y_pred_best = best_method['probs'].ge(best_method['threshold']).astype(int)
df['y_pred_best'] = y_pred_best

# P-values comparing the best ensemble's probabilities with each of the four
# single-model probabilities (two-sided Mann-Whitney U test).
# NOTE(review): both samples come from the same cases, i.e. they are paired,
# whereas Mann-Whitney U treats them as independent - confirm this is the
# intended comparison.
best_prob = best_method['probs']
p_values = {
    f'ensemble_vs_{col}': mannwhitneyu(best_prob, df[col], alternative='two-sided').pvalue
    for col in ['prob1', 'prob2', 'prob3', 'prob4']
}

print("\n【最优集成概率与各单模型概率的P-Value（Mann-Whitney U 检验）】")
print("\n".join(f"{k}: {v:.6f}" for k, v in p_values.items()))

# Save the combined table (this line is active; comment it out to skip saving)
df.to_csv('final_ensemble_result_singleensemble.csv', index=False)