In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy import stats

def compute_auc_pvalue(y_true, y_prob, n_bootstraps=1000, random_state=42):
    rng = np.random.RandomState(random_state)
    auc = roc_auc_score(y_true, y_prob)
    bootstrapped_scores = []
    for i in range(n_bootstraps):
        indices = rng.randint(0, len(y_true), len(y_true))
        if len(np.unique(y_true[indices])) < 2:
            continue
        score = roc_auc_score(y_true[indices], y_prob[indices])
        bootstrapped_scores.append(score)
    bootstrapped_scores = np.array(bootstrapped_scores)
    p_value = np.sum(bootstrapped_scores <= 0.5) / len(bootstrapped_scores)
    return auc, p_value

names = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
models = [
    'ckiplab/albert-base-chinese',
    'indiejoseph/bert-base-chinese',
    'zwzzz/Chinese-MentalBERT',
    'Geotrend/distilbert-base-zh-cased',
    'hfl/chinese-roberta-wwm-ext',
    'hfl/chinese-xlnet-base',
    'hfl/chinese-electra-base-discriminator'
]

all_model_metrics = []

for name in names:
    for model_name in models:
        model_short = model_name.split("/")[-1]
        data = pd.read_csv(f"./{name}_{model_short}_majority_voting_results.csv")
        y_true = data['label'].values
        y_prob = data['prob'].values
        y_pred = data['pred'].values

        auc, auc_pvalue = compute_auc_pvalue(y_true, y_prob)

        f1 = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
        accuracy = (tp + tn) / (tp + tn + fp + fn)

        metrics = {
            'Input': name,
            'Model': model_short,
            'AUC': round(auc, 2),
            'AUC_Pvalue': round(auc_pvalue, 2),
            'F1': round(f1, 2),
            'Accuracy': round(accuracy, 2),
            'Sensitivity': round(recall, 2),
            'Specificity': round(specificity, 2),
            'PPV': round(precision, 2),
            'NPV': round(npv, 2),
            'TP': tp,
            'TN': tn,
            'FP': fp,
            'FN': fn
        }
        all_model_metrics.append(metrics)

result_df = pd.DataFrame(all_model_metrics)
result_df.to_csv("all_model_metrics_with_auc_pvalue.csv", index=False)
print("所有模型AUC及其p-value已保存到 all_model_metrics_with_auc_pvalue.csv")


所有模型AUC及其p-value已保存到 all_model_metrics_with_auc_pvalue.csv


# Machine Learning ensemble

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

names = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
models = [
    'ckiplab/albert-base-chinese',
    'indiejoseph/bert-base-chinese',
    'zwzzz/Chinese-MentalBERT',
    'Geotrend/distilbert-base-zh-cased',
    'hfl/chinese-roberta-wwm-ext',
    'hfl/chinese-xlnet-base',
    'hfl/chinese-electra-base-discriminator'
]

def compute_metrics(y_true, y_prob, y_pred):
    auc = roc_auc_score(y_true, y_prob)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    return {
        "AUC": auc,
        "F1": f1,
        "Sensitivity": recall,
        "Specificity": specificity,
        "PPV": precision,
        "NPV": npv
    }

# 二级模型集合
ensemble_models = {
    'LogisticRegression': LogisticRegression(solver='liblinear'),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=100, random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_seed=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(16,), max_iter=300, random_state=42)
}

summary_rows = []

for name in names:
    model_probs = {}
    auc_list = []
    # 读取所有模型
    for model_name in models:
        model_short = model_name.split("/")[-1]
        data = pd.read_csv(f"./{name}_{model_short}_majority_voting_results.csv")
        model_probs[model_short] = data['prob'].values
        auc = roc_auc_score(data['label'], data['prob'])
        auc_list.append((model_short, auc))
    auc_list_sorted = sorted(auc_list, key=lambda x: x[1], reverse=True)
    sorted_model_names = [x[0] for x in auc_list_sorted]
    all_labels = data['label'].values
    case_ids = data['case'].values if 'case' in data.columns else np.arange(len(all_labels))

    prob_dict = {'case': case_ids, 'label': all_labels}

    for k in range(2, 7):  # Top 2~6
        top_models = sorted_model_names[:k]
        X = np.stack([model_probs[m] for m in top_models], axis=1)
        y = all_labels

        best_auc = -1
        best_prob = None
        best_method = None
        best_metrics = None

        for method_name, base_model in ensemble_models.items():
            oof_pred = np.zeros(len(y))
            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            for train_idx, val_idx in skf.split(X, y):
                model = base_model
                # 重置模型参数（避免sklearn模型交叉验证间污染）
                model = ensemble_models[method_name]
                model = model.__class__(**model.get_params())
                model.fit(X[train_idx], y[train_idx])
                oof_pred[val_idx] = model.predict_proba(X[val_idx])[:, 1]
            agg_prob = oof_pred
            agg_pred = (agg_prob >= 0.5).astype(int)
            metrics = compute_metrics(y, agg_prob, agg_pred)
            if metrics['AUC'] > best_auc:
                best_auc = metrics['AUC']
                best_prob = agg_prob
                best_method = method_name
                best_metrics = metrics

        # 保存最佳prob
        prob_dict[f'prob_{k}'] = best_prob

        # 保存summary信息
        summary_row = {
            'name': name,
            'top_k': k,
            'AUC': best_metrics['AUC'],
            'F1': best_metrics['F1'],
            'Sensitivity': best_metrics['Sensitivity'],
            'Specificity': best_metrics['Specificity'],
            'PPV': best_metrics['PPV'],
            'NPV': best_metrics['NPV'],
            'combined_model_name': ','.join(top_models),
            'ensemble_method': best_method
        }
        summary_rows.append(summary_row)

    prob_df = pd.DataFrame(prob_dict)
    prob_df.to_csv(f"./ensemble/{name}_other__prob.csv", index=False)
    print(f"Saved {name}_prob.csv")

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv("./ensemble/all_other_ensemble_summary.csv", index=False)
print("Saved all_ensemble_summary.csv")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 152, number of negative: 87
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 162
[LightGBM] [Info] Number of data points in the train set: 239, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.635983 -> initscore=0.557972
[LightGBM] [Info] Start training from score 0.557972
[LightGBM] [Info] Number of positive: 152, number of negative: 87
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 162
[LightGBM] [Info] Number of data points in the train set: 239, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.635983 -> initscore=0.557972
[LightGBM] [Info] Start training from score 0.557972
[LightGBM] [Info] Number of positi



KeyboardInterrupt: 

Ours Ensemble

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
names = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
abbreviation_map = {
    'distilbert': 'D',
    'roberta': 'R',
    'mentalbert': 'M',
    'bert': 'B',
    'electra': 'E',
    'xlnet': 'X',
    'albert': 'A'
}
names = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
models = [
    'ckiplab/albert-base-chinese',
    'indiejoseph/bert-base-chinese',
    'zwzzz/Chinese-MentalBERT',
    'Geotrend/distilbert-base-zh-cased',
    'hfl/chinese-roberta-wwm-ext',
    'hfl/chinese-xlnet-base',
    'hfl/chinese-electra-base-discriminator'
]
stat_methods = {
    'mean': np.mean,
    'max': np.max,
    'min': np.min,
    'std': np.std,
    'median': np.median,
    'var': np.var,
    'upper_quartile': lambda arr, axis: np.percentile(arr, 75, axis=axis),
    'lower_quartile': lambda arr, axis: np.percentile(arr, 25, axis=axis),
}

def compute_metrics(y_true, y_prob, y_pred):
    auc = roc_auc_score(y_true, y_prob)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    return {
        "AUC": auc,
        "F1": f1,
        "Sensitivity": recall,
        "Specificity": specificity,
        "PPV": precision,
        "NPV": npv
    }
summary_rows = []

for name in names:
    model_probs = {}
    auc_list = []
    first_model_short = None
    first_data = None

    # 读取所有模型的prob
    for model_name in models:
        model_short = model_name.split("/")[-1]
        data = pd.read_csv(f"./{name}_{model_short}_majority_voting_results.csv")
        if first_model_short is None:
            first_model_short = model_short
            first_data = data
        model_probs[model_short] = data['prob'].values
        auc = roc_auc_score(data['label'], data['prob'])
        auc_list.append((model_short, auc))
    auc_list_sorted = sorted(auc_list, key=lambda x: x[1], reverse=True)
    sorted_model_names = [x[0] for x in auc_list_sorted]
    all_labels = first_data['label'].values
    case_ids = first_data['case'].values if 'case' in first_data.columns else np.arange(len(all_labels))

    # 用于存储每个top k的最佳prob
    prob_dict = {'case': case_ids, 'label': all_labels}
    # 记录最佳summary
    for k in range(2, 7):  # Top 2~6
        top_models = sorted_model_names[:k]
        probs_stack = np.stack([model_probs[m] for m in top_models], axis=1)
        best_auc = -1
        best_prob = None
        best_method = None
        best_metrics = None
        for method_name, func in stat_methods.items():
            try:
                agg_prob = func(probs_stack, axis=1)
            except Exception as e:
                print(f"Error in method {method_name}: {e}")
                continue
            agg_pred = (agg_prob >= 0.5).astype(int)
            metrics = compute_metrics(all_labels, agg_prob, agg_pred)
            if metrics['F1'] > best_auc:
                best_auc = metrics['F1']
                best_prob = agg_prob
                best_method = method_name
                best_metrics = metrics
        # 保存最佳prob
        prob_dict[f'prob_{k}'] = best_prob

        # 生成模型名称缩写
        combined_model_name = '+'.join([abbreviation_map.get(model.split("-")[-1].lower(), model) for model in top_models])

        # 保存summary信息，保留两位有效数字
        summary_row = {
            'name': name,
            'top_k': k,
            'AUC': round(best_metrics['AUC'], 2),
            'F1': round(best_metrics['F1'], 2),
            'Sensitivity': round(best_metrics['Sensitivity'], 2),
            'Specificity': round(best_metrics['Specificity'], 2),
            'PPV': round(best_metrics['PPV'], 2),
            'NPV': round(best_metrics['NPV'], 2),
            'combined_model_name': combined_model_name,
            'ensemble_method': best_method
        }
        summary_rows.append(summary_row)
    # 保存prob表
    prob_df = pd.DataFrame(prob_dict)
    prob_df.to_csv(f"./ensemble/F1/{name}_F1__top_prob.csv", index=False)
    print(f"Saved {name}_prob.csv")

# 保存summary汇总表
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv("./ensemble/F1/all_ensemble_top_summary_F1.csv", index=False)
print("Saved all_ensemble_summary.csv")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved text_prob.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved interviewee_text_prob.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved new_text_prob.csv
Saved new_interviewee_text_prob.csv
Saved all_ensemble_summary.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

names = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
abbreviation_map = {
    'distilbert': 'D', 'roberta': 'R', 'mentalbert': 'M',
    'bert': 'B', 'electra': 'E', 'xlnet': 'X', 'albert': 'A'
}
models = [
    'ckiplab/albert-base-chinese',
    'indiejoseph/bert-base-chinese',
    'zwzzz/Chinese-MentalBERT',
    'Geotrend/distilbert-base-zh-cased',
    'hfl/chinese-roberta-wwm-ext',
    'hfl/chinese-xlnet-base',
    'hfl/chinese-electra-base-discriminator'
]
stat_methods = {
    'mean': np.mean,
    'max': np.max,
    'min': np.min,
    'std': np.std,
    'median': np.median,
    'var': np.var,
    'upper_quartile': lambda arr, axis: np.percentile(arr, 75, axis=axis),
    'lower_quartile': lambda arr, axis: np.percentile(arr, 25, axis=axis),
}

def compute_metrics(y_true, y_prob, y_pred):
    auc = roc_auc_score(y_true, y_prob)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    return {
        "AUC": auc,
        "F1": f1,
        "Sensitivity": recall,
        "Specificity": specificity,
        "PPV": precision,
        "NPV": npv
    }

summary_rows = []
final_summary_rows = []

for name in names:
    model_probs = {}
    auc_list = []
    first_model_short = None
    first_data = None

    # 读取所有模型的prob
    for model_name in models:
        model_short = model_name.split("/")[-1]
        data = pd.read_csv(f"./{name}_{model_short}_majority_voting_results.csv")
        if first_model_short is None:
            first_model_short = model_short
            first_data = data
        model_probs[model_short] = data['prob'].values
        auc = roc_auc_score(data['label'], data['prob'])
        auc_list.append((model_short, auc))
    auc_list_sorted = sorted(auc_list, key=lambda x: x[1], reverse=True)
    sorted_model_names = [x[0] for x in auc_list_sorted]
    all_labels = first_data['label'].values
    case_ids = first_data['case'].values if 'case' in first_data.columns else np.arange(len(all_labels))

    # 第一层：各种统计集成，保留每个top_k的F1最高的结果
    first_level_probs = []
    first_level_methods = []
    first_level_modelnames = []
    prob_dict = {'case': case_ids, 'label': all_labels}
    for k in range(2, 7):  # Top 2~6
        top_models = sorted_model_names[:k]
        probs_stack = np.stack([model_probs[m] for m in top_models], axis=1)
        best_f1 = -1
        best_prob = None
        best_method = None
        best_metrics = None
        for method_name, func in stat_methods.items():
            try:
                agg_prob = func(probs_stack, axis=1)
            except Exception as e:
                print(f"Error in method {method_name}: {e}")
                continue
            agg_pred = (agg_prob >= 0.5).astype(int)
            metrics = compute_metrics(all_labels, agg_prob, agg_pred)
            if metrics['F1'] > best_f1:
                best_f1 = metrics['F1']
                best_prob = agg_prob
                best_method = method_name
                best_metrics = metrics
        # 保存最佳prob
        prob_dict[f'prob_{k}'] = best_prob
        first_level_probs.append(best_prob)
        combined_model_name = '+'.join([abbreviation_map.get(model.split("-")[-1].lower(), model) for model in top_models])
        first_level_modelnames.append(combined_model_name)
        first_level_methods.append(best_method)
        summary_row = {
            'name': name,
            'top_k': k,
            'AUC': round(best_metrics['AUC'], 2),
            'F1': round(best_metrics['F1'], 2),
            'Sensitivity': round(best_metrics['Sensitivity'], 2),
            'Specificity': round(best_metrics['Specificity'], 2),
            'PPV': round(best_metrics['PPV'], 2),
            'NPV': round(best_metrics['NPV'], 2),
            'combined_model_name': combined_model_name,
            'ensemble_method': best_method,
            'level': 'first'
        }
        summary_rows.append(summary_row)
    # 保存第一层prob表
    prob_df = pd.DataFrame(prob_dict)
    prob_df.to_csv(f"./ensemble/F1/{name}_F1__top_prob.csv", index=False)
    print(f"Saved {name}_F1__top_prob.csv")

    # 第二层：对所有第一层输出再做集成，包括统计、stacking、加权、投票
    second_level_candidates = {}
    first_level_probs_stack = np.column_stack(first_level_probs)

    # 统计方法再集成
    for method_name, func in stat_methods.items():
        try:
            agg_prob = func(first_level_probs_stack, axis=1)
            second_level_candidates[f'stat_{method_name}'] = agg_prob
        except Exception as e:
            print(f"2nd layer Error in method {method_name}: {e}")

    # stacking
    try:
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        stacking_preds = np.zeros_like(all_labels, dtype=float)
        for train_idx, test_idx in skf.split(first_level_probs_stack, all_labels):
            stk = LogisticRegression(solver='liblinear')
            stk.fit(first_level_probs_stack[train_idx], all_labels[train_idx])
            stacking_preds[test_idx] = stk.predict_proba(first_level_probs_stack[test_idx])[:,1]
        second_level_candidates['stacking'] = stacking_preds
    except Exception as e:
        print(f"Error in stacking: {e}")

    # 加权平均（权重均分和简单加权）
    try:
        weights = np.ones(first_level_probs_stack.shape[1]) / first_level_probs_stack.shape[1]
        weighted_prob = np.sum(first_level_probs_stack * weights, axis=1)
        second_level_candidates['weighted_mean'] = weighted_prob

        # 另一个例子：按每个top k的F1做归一化加权
        f1_scores = [f1_score(all_labels, (p >= 0.5).astype(int)) for p in first_level_probs]
        norm_f1 = np.array(f1_scores) / np.sum(f1_scores)
        weighted_by_f1 = np.sum(first_level_probs_stack * norm_f1, axis=1)
        second_level_candidates['weighted_f1'] = weighted_by_f1
    except Exception as e:
        print(f"Error in weighted ensemble: {e}")

    # 多数投票（硬投票，取概率大于0.5的多数）
    try:
        hard_preds = (first_level_probs_stack >= 0.5).astype(int)
        voted = (np.sum(hard_preds, axis=1) >= (hard_preds.shape[1] / 2)).astype(int)
        voted_prob = voted  # 概率即为投票结果
        second_level_candidates['hard_vote'] = voted_prob
    except Exception as e:
        print(f"Error in hard vote: {e}")

    # 选F1最高的第二层集成
    best_f1_2nd = -1
    best_prob_2nd = None
    best_method_2nd = None
    best_metrics_2nd = None

    for method, agg_prob in second_level_candidates.items():
        if agg_prob.max() > 1 or agg_prob.min() < 0:
            # 防止某些方法输出非法概率
            agg_prob = np.clip(agg_prob, 0, 1)
        if method == 'hard_vote':
            agg_pred = agg_prob.astype(int)
            # 投票法本身就是预测标签，不是概率
            # 若想要评估prob，建议跳过或赋0.5?
            # 这里我们直接用它当概率（0/1）
        else:
            agg_pred = (agg_prob >= 0.5).astype(int)
        metrics = compute_metrics(all_labels, agg_prob, agg_pred)
        if metrics['F1'] > best_f1_2nd:
            best_f1_2nd = metrics['F1']
            best_prob_2nd = agg_prob
            best_method_2nd = method
            best_metrics_2nd = metrics
        # 保存所有second layer prob
        summary_row = {
            'name': name,
            'AUC': round(metrics['AUC'], 2),
            'F1': round(metrics['F1'], 2),
            'Sensitivity': round(metrics['Sensitivity'], 2),
            'Specificity': round(metrics['Specificity'], 2),
            'PPV': round(metrics['PPV'], 2),
            'NPV': round(metrics['NPV'], 2),
            'ensemble_method': method,
            'level': 'second'
        }
        summary_rows.append(summary_row)

    # 保存最终集成结果
    final_df = pd.DataFrame({'case': case_ids, 'label': all_labels, 'final_prob': best_prob_2nd})
    final_df.to_csv(f"./ensemble/F1/{name}_F1_final_ensemble.csv", index=False)
    print(f"Saved {name}_F1_final_ensemble.csv")
    final_summary_rows.append({
        'name': name,
        **{k: round(v, 2) for k, v in best_metrics_2nd.items()},
        'ensemble_method': best_method_2nd
    })

# 保存summary汇总表
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv("./ensemble/F1/all_ensemble_top_summary_F1.csv", index=False)
print("Saved all_ensemble_top_summary_F1.csv")
final_summary_df = pd.DataFrame(final_summary_rows)
final_summary_df.to_csv("./ensemble/F1/all_final_ensemble_summary_F1.csv", index=False)
print("Saved all_final_ensemble_summary_F1.csv")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved text_F1__top_prob.csv
Saved text_F1_final_ensemble.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved interviewee_text_F1__top_prob.csv
Saved interviewee_text_F1_final_ensemble.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved new_text_F1__top_prob.csv
Saved new_text_F1_final_ensemble.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved new_interviewee_text_F1__top_prob.csv
Saved new_interviewee_text_F1_final_ensemble.csv
Saved all_ensemble_top_summary_F1.csv
Saved all_final_ensemble_summary_F1.csv


F1 and AUC ensembling of individual models

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

names = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
abbreviation_map = {
    'distilbert': 'D', 'roberta': 'R', 'mentalbert': 'M',
    'bert': 'B', 'electra': 'E', 'xlnet': 'X', 'albert': 'A'
}
models = [
    'ckiplab/albert-base-chinese',
    'indiejoseph/bert-base-chinese',
    'zwzzz/Chinese-MentalBERT',
    'Geotrend/distilbert-base-zh-cased',
    'hfl/chinese-roberta-wwm-ext',
    'hfl/chinese-xlnet-base',
    'hfl/chinese-electra-base-discriminator'
]
stat_methods = {
    'mean': np.mean,
    'max': np.max,
    'min': np.min,
    'std': np.std,
    'median': np.median,
    'var': np.var,
    'upper_quartile': lambda arr, axis: np.percentile(arr, 75, axis=axis),
    'lower_quartile': lambda arr, axis: np.percentile(arr, 25, axis=axis),
}

def compute_metrics(y_true, y_prob, y_pred):
    auc = roc_auc_score(y_true, y_prob)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    return {
        "AUC": auc,
        "F1": f1,
        "Sensitivity": recall,
        "Specificity": specificity,
        "PPV": precision,
        "NPV": npv
    }

summary_rows = []
final_summary_rows = []

for name in names:
    model_probs = {}
    auc_list = []
    first_model_short = None
    first_data = None

    # 读取所有模型的prob
    for model_name in models:
        model_short = model_name.split("/")[-1]
        data = pd.read_csv(f"./{name}_{model_short}_majority_voting_results.csv")
        if first_model_short is None:
            first_model_short = model_short
            first_data = data
        model_probs[model_short] = data['prob'].values
        auc = roc_auc_score(data['label'], data['prob'])
        auc_list.append((model_short, auc))
    auc_list_sorted = sorted(auc_list, key=lambda x: x[1], reverse=True)
    sorted_model_names = [x[0] for x in auc_list_sorted]
    all_labels = first_data['label'].values
    case_ids = first_data['case'].values if 'case' in first_data.columns else np.arange(len(all_labels))

    # 第一层：每个top_k对所有集成方法，保留F1最高的输出
    topk_best_probs = []
    topk_best_labels = []
    topk_best_methods = []
    topk_best_modelnames = []
    prob_dict = {'case': case_ids, 'label': all_labels}

    for k in range(2, 7):  # Top 2~6
        top_models = sorted_model_names[:k]
        probs_stack = np.stack([model_probs[m] for m in top_models], axis=1)
        best_f1 = -1
        best_prob = None
        best_method = None
        best_metrics = None
        best_modelname = None

        # 所有集成方法
        candidate_probs = {}
        # 统计类
        for method_name, func in stat_methods.items():
            try:
                agg_prob = func(probs_stack, axis=1)
                candidate_probs[f'stat_{method_name}'] = agg_prob
            except Exception as e:
                print(f"Error in {method_name}: {e}")

        # stacking
        try:
            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            stacking_preds = np.zeros_like(all_labels, dtype=float)
            for train_idx, test_idx in skf.split(probs_stack, all_labels):
                stk = LogisticRegression(solver='liblinear')
                stk.fit(probs_stack[train_idx], all_labels[train_idx])
                stacking_preds[test_idx] = stk.predict_proba(probs_stack[test_idx])[:,1]
            candidate_probs['stacking'] = stacking_preds
        except Exception as e:
            print(f"Error in stacking: {e}")

        # 加权平均（权重均分）
        try:
            weights = np.ones(probs_stack.shape[1]) / probs_stack.shape[1]
            weighted_prob = np.sum(probs_stack * weights, axis=1)
            candidate_probs['weighted_mean'] = weighted_prob
        except Exception as e:
            print(f"Error in weighted_mean: {e}")

        # 多数投票（硬投票）
        try:
            hard_preds = (probs_stack >= 0.5).astype(int)
            voted = (np.sum(hard_preds, axis=1) >= (hard_preds.shape[1] / 2)).astype(int)
            candidate_probs['hard_vote'] = voted
        except Exception as e:
            print(f"Error in hard_vote: {e}")

        # 选F1最高的
        for method, agg_prob in candidate_probs.items():
            if agg_prob.max() > 1 or agg_prob.min() < 0:
                agg_prob = np.clip(agg_prob, 0, 1)
            if method == 'hard_vote':
                agg_pred = agg_prob.astype(int)
            else:
                agg_pred = (agg_prob >= 0.5).astype(int)
            metrics = compute_metrics(all_labels, agg_prob, agg_pred)
            if metrics['F1'] > best_f1:
                best_f1 = metrics['F1']
                best_prob = agg_prob
                best_method = method
                best_metrics = metrics

        # 保存最佳prob
        prob_dict[f'prob_{k}'] = best_prob
        topk_best_probs.append(best_prob)
        topk_best_methods.append(best_method)
        combined_model_name = '+'.join([abbreviation_map.get(model.split("-")[-1].lower(), model) for model in top_models])
        topk_best_modelnames.append(combined_model_name)
        summary_row = {
            'name': name,
            'top_k': k,
            'AUC': round(best_metrics['AUC'], 2),
            'F1': round(best_metrics['F1'], 2),
            'Sensitivity': round(best_metrics['Sensitivity'], 2),
            'Specificity': round(best_metrics['Specificity'], 2),
            'PPV': round(best_metrics['PPV'], 2),
            'NPV': round(best_metrics['NPV'], 2),
            'combined_model_name': combined_model_name,
            'ensemble_method': best_method,
            'level': 'first'
        }
        summary_rows.append(summary_row)
    # 保存第一层prob表
    prob_df = pd.DataFrame(prob_dict)
    prob_df.to_csv(f"./ensemble/F1/{name}_F1__top_prob.csv", index=False)
    print(f"Saved {name}_F1__top_prob.csv")

    # 第二层：用topk_best_probs做stacking和统计集成
    second_level_candidates = {}
    topk_best_probs_stack = np.column_stack(topk_best_probs)

    # 统计方法再集成
    for method_name, func in stat_methods.items():
        try:
            agg_prob = func(topk_best_probs_stack, axis=1)
            second_level_candidates[f'stat_{method_name}'] = agg_prob
        except Exception as e:
            print(f"2nd layer Error in method {method_name}: {e}")

    # stacking
    try:
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        stacking_preds = np.zeros_like(all_labels, dtype=float)
        for train_idx, test_idx in skf.split(topk_best_probs_stack, all_labels):
            stk = LogisticRegression(solver='liblinear')
            stk.fit(topk_best_probs_stack[train_idx], all_labels[train_idx])
            stacking_preds[test_idx] = stk.predict_proba(topk_best_probs_stack[test_idx])[:,1]
        second_level_candidates['stacking'] = stacking_preds
    except Exception as e:
        print(f"Error in 2nd layer stacking: {e}")

    # 加权平均（权重均分）
    try:
        weights = np.ones(topk_best_probs_stack.shape[1]) / topk_best_probs_stack.shape[1]
        weighted_prob = np.sum(topk_best_probs_stack * weights, axis=1)
        second_level_candidates['weighted_mean'] = weighted_prob
    except Exception as e:
        print(f"Error in 2nd layer weighted_mean: {e}")

    # 多数投票
    try:
        hard_preds = (topk_best_probs_stack >= 0.5).astype(int)
        voted = (np.sum(hard_preds, axis=1) >= (hard_preds.shape[1] / 2)).astype(int)
        second_level_candidates['hard_vote'] = voted
    except Exception as e:
        print(f"Error in 2nd layer hard_vote: {e}")

    # 选F1最高的第二层集成
    best_f1_2nd = -1
    best_prob_2nd = None
    best_method_2nd = None
    best_metrics_2nd = None

    for method, agg_prob in second_level_candidates.items():
        if agg_prob.max() > 1 or agg_prob.min() < 0:
            agg_prob = np.clip(agg_prob, 0, 1)
        if method == 'hard_vote':
            agg_pred = agg_prob.astype(int)
        else:
            agg_pred = (agg_prob >= 0.5).astype(int)
        metrics = compute_metrics(all_labels, agg_prob, agg_pred)
        if metrics['F1'] > best_f1_2nd:
            best_f1_2nd = metrics['F1']
            best_prob_2nd = agg_prob
            best_method_2nd = method
            best_metrics_2nd = metrics
        # 保存所有second layer prob
        summary_row = {
            'name': name,
            'AUC': round(metrics['AUC'], 2),
            'F1': round(metrics['F1'], 2),
            'Sensitivity': round(metrics['Sensitivity'], 2),
            'Specificity': round(metrics['Specificity'], 2),
            'PPV': round(metrics['PPV'], 2),
            'NPV': round(metrics['NPV'], 2),
            'ensemble_method': method,
            'level': 'second'
        }
        summary_rows.append(summary_row)

    # 保存最终集成结果
    final_df = pd.DataFrame({'case': case_ids, 'label': all_labels, 'final_prob': best_prob_2nd})
    final_df.to_csv(f"./ensemble/F1/{name}_F1_final_ensemble.csv", index=False)
    print(f"Saved {name}_F1_final_ensemble.csv")
    final_summary_rows.append({
        'name': name,
        **{k: round(v, 2) for k, v in best_metrics_2nd.items()},
        'ensemble_method': best_method_2nd
    })

# 保存summary汇总表
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv("./ensemble/F1/all_ensemble_top_summary_F1.csv", index=False)
print("Saved all_ensemble_top_summary_F1.csv")
final_summary_df = pd.DataFrame(final_summary_rows)
final_summary_df.to_csv("./ensemble/F1/all_final_ensemble_summary_F1.csv", index=False)
print("Saved all_final_ensemble_summary_F1.csv")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved text_F1__top_prob.csv
Saved text_F1_final_ensemble.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Saved interviewee_text_F1__top_prob.csv
Saved interviewee_text_F1_final_ensemble.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved new_text_F1__top_prob.csv
Saved new_text_F1_final_ensemble.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved new_interviewee_text_F1__top_prob.csv
Saved new_interviewee_text_F1_final_ensemble.csv
Saved all_ensemble_top_summary_F1.csv
Saved all_final_ensemble_summary_F1.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from itertools import combinations
import os

names = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
models = [
    'ckiplab/albert-base-chinese',
    'indiejoseph/bert-base-chinese',
    'zwzzz/Chinese-MentalBERT',
    'Geotrend/distilbert-base-zh-cased',
    'hfl/chinese-roberta-wwm-ext',
    'hfl/chinese-xlnet-base',
    'hfl/chinese-electra-base-discriminator'
]

stat_methods = {
    'mean': np.mean,
    'max': np.max,
    'min': np.min,
    'std': np.std,
    'median': np.median,
    'var': np.var,
    'upper_quartile': lambda arr, axis: np.percentile(arr, 75, axis=axis),
    'lower_quartile': lambda arr, axis: np.percentile(arr, 25, axis=axis),
}

def compute_metrics(y_true, y_prob, y_pred):
    auc = roc_auc_score(y_true, y_prob)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    return {
        "AUC": auc,
        "F1": f1,
        "Sensitivity": recall,
        "Specificity": specificity,
        "PPV": precision,
        "NPV": npv
    }

# Summary of metrics
summary_rows = []

for name in names:
    print(f"Processing {name}...")
    
    model_probs = {}
    first_data = None

    # Read model predictions
    for model_name in models:
        model_short = model_name.split("/")[-1]
        data = pd.read_csv(f"./{name}_{model_short}_majority_voting_results.csv")
        if first_data is None:
            first_data = data
        model_probs[model_short] = data['prob'].values

    all_labels = first_data['label'].values
    case_ids = first_data['case'].values if 'case' in first_data.columns else np.arange(len(all_labels))

    # Store the best probabilities for each k combination
    prob_result = {'case': case_ids, 'label': all_labels}

    model_short_names = list(model_probs.keys())

    for k in range(2, len(model_short_names)+1):  # k=2~7
        best_auc = -1
        best_prob = None
        best_combination = None
        best_method = None
        best_metrics = None

        # Generate all combinations of size k
        all_combos = combinations(model_short_names, k)

        for combo in all_combos:
            probs_stack = np.stack([model_probs[m] for m in combo], axis=1)

            for method_name, func in stat_methods.items():
                try:
                    agg_prob = func(probs_stack, axis=1)
                except Exception as e:
                    print(f"Error in method {method_name}: {e}")
                    continue

                # Calculate AUC
                auc = roc_auc_score(all_labels, agg_prob)

                # Predict and calculate other metrics
                agg_pred = (agg_prob >= 0.5).astype(int)
                metrics = compute_metrics(all_labels, agg_prob, agg_pred)

                if auc > best_auc:
                    best_auc = auc
                    best_prob = agg_prob
                    best_combination = combo
                    best_method = method_name
                    best_metrics = metrics

        # Store the best combination's probability
        prob_result[f'prob_{k}'] = best_prob

        # Save the summary information
        summary_row = {
            'name': name,
            'k': k,
            'AUC': f"{best_metrics['AUC']:.2f}",  # Save with 2 decimal places
            'F1': f"{best_metrics['F1']:.2f}",
            'Sensitivity': f"{best_metrics['Sensitivity']:.2f}",
            'Specificity': f"{best_metrics['Specificity']:.2f}",
            'PPV': f"{best_metrics['PPV']:.2f}",
            'NPV': f"{best_metrics['NPV']:.2f}",
            'combined_model_name': '+'.join(best_combination),  # Save only model names
            'ensemble_method': best_method
        }
        summary_rows.append(summary_row)

        print(f"{name} - Best combination for top {k}: {best_combination} using {best_method} (AUC={best_auc:.2f})")

    # Save the probability table for each name
    os.makedirs("./ensemble", exist_ok=True)
    prob_df = pd.DataFrame(prob_result)
    prob_df.to_csv(f"./ensemble/{name}_all_best_combination_prob.csv", index=False)
    print(f"Saved ./ensemble/{name}_all_best_combination_prob.csv")

# Save summary table
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv("./ensemble/all_ensemble_all_summary.csv", index=False)
print("Saved all_ensemble_top_summary.csv")


Processing text...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

text - Best combination for top 2: ('distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using max (AUC=0.82)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

text - Best combination for top 3: ('Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using max (AUC=0.83)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

text - Best combination for top 4: ('albert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using max (AUC=0.82)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

text - Best combination for top 5: ('albert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base') using mean (AUC=0.82)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


text - Best combination for top 6: ('albert-base-chinese', 'bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base') using mean (AUC=0.82)
text - Best combination for top 7: ('albert-base-chinese', 'bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using mean (AUC=0.82)
Saved ./ensemble/text_all_best_combination_prob.csv
Processing interviewee_text...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

interviewee_text - Best combination for top 2: ('distilbert-base-zh-cased', 'chinese-electra-base-discriminator') using max (AUC=0.79)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

interviewee_text - Best combination for top 3: ('distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-electra-base-discriminator') using max (AUC=0.80)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

interviewee_text - Best combination for top 4: ('Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-electra-base-discriminator') using median (AUC=0.78)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

interviewee_text - Best combination for top 5: ('bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using lower_quartile (AUC=0.78)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


interviewee_text - Best combination for top 6: ('albert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using mean (AUC=0.78)
interviewee_text - Best combination for top 7: ('albert-base-chinese', 'bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using lower_quartile (AUC=0.77)
Saved ./ensemble/interviewee_text_all_best_combination_prob.csv
Processing new_text...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

new_text - Best combination for top 2: ('distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using min (AUC=0.78)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

new_text - Best combination for top 3: ('Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using min (AUC=0.78)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

new_text - Best combination for top 4: ('bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using min (AUC=0.79)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

new_text - Best combination for top 5: ('albert-base-chinese', 'bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using min (AUC=0.79)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


new_text - Best combination for top 6: ('albert-base-chinese', 'bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-electra-base-discriminator') using min (AUC=0.79)
new_text - Best combination for top 7: ('albert-base-chinese', 'bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using min (AUC=0.78)
Saved ./ensemble/new_text_all_best_combination_prob.csv
Processing new_interviewee_text...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

new_interviewee_text - Best combination for top 2: ('distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using upper_quartile (AUC=0.72)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

new_interviewee_text - Best combination for top 3: ('bert-base-chinese', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext') using max (AUC=0.72)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

new_interviewee_text - Best combination for top 4: ('distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using mean (AUC=0.72)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

new_interviewee_text - Best combination for top 5: ('bert-base-chinese', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using mean (AUC=0.72)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


new_interviewee_text - Best combination for top 6: ('albert-base-chinese', 'bert-base-chinese', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using max (AUC=0.71)
new_interviewee_text - Best combination for top 7: ('albert-base-chinese', 'bert-base-chinese', 'Chinese-MentalBERT', 'distilbert-base-zh-cased', 'chinese-roberta-wwm-ext', 'chinese-xlnet-base', 'chinese-electra-base-discriminator') using max (AUC=0.71)
Saved ./ensemble/new_interviewee_text_all_best_combination_prob.csv
Saved all_ensemble_top_summary.csv


In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score
import os

names = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
models = [
    'ckiplab/albert-base-chinese',
    'indiejoseph/bert-base-cantonese',
    'zwzzz/Chinese-MentalBERT',
    'Geotrend/distilbert-base-zh-cased',
    'hfl/chinese-roberta-wwm-ext',
    'hfl/chinese-xlnet-base',
    'hfl/chinese-electra-base-discriminator'
]

# 1. Gather AUC scores
auc_results = []
for name in names:
    for model_name in models:
        model_short = model_name.split("/")[-1]
        csv_path = f"./{name}_{model_short}_majority_voting_results.csv"
        if not os.path.exists(csv_path):
            print(f"Warning: {csv_path} not found, skipping.")
            continue
        data = pd.read_csv(csv_path)
        auc = roc_auc_score(data['label'], data['prob'])
        auc_results.append({
            "dataset": name,
            "model": model_short,
            "AUC": auc
        })

df_auc = pd.DataFrame(auc_results)

# 2. Pairwise AUC comparisons within each model
comparison_pairs = [
    ("text", "new_text"),
    ("interviewee_text", "new_interviewee_text"),
    ("text", "interviewee_text"),
    ("new_text", "new_interviewee_text")
]

compare_rows = []
for model in df_auc['model'].unique():
    model_df = df_auc[df_auc['model'] == model].set_index('dataset')
    for (name1, name2) in comparison_pairs:
        if name1 in model_df.index and name2 in model_df.index:
            auc1 = model_df.loc[name1, "AUC"]
            auc2 = model_df.loc[name2, "AUC"]
            diff = auc1 - auc2
            compare_rows.append({
                'model': model,
                'comparison': f"{name1} vs {name2}",
                f'{name1}_AUC': auc1,
                f'{name2}_AUC': auc2,
                'AUC_diff': diff
            })

df_compare = pd.DataFrame(compare_rows)
df_compare.to_csv("pairwise_auc_comparisons_within_model.csv", index=False)

print(df_compare)

                                 model  \
0                  albert-base-chinese   
1                  albert-base-chinese   
2                  albert-base-chinese   
3                  albert-base-chinese   
4                  bert-base-cantonese   
5                  bert-base-cantonese   
6                  bert-base-cantonese   
7                  bert-base-cantonese   
8                   Chinese-MentalBERT   
9                   Chinese-MentalBERT   
10                  Chinese-MentalBERT   
11                  Chinese-MentalBERT   
12            distilbert-base-zh-cased   
13            distilbert-base-zh-cased   
14            distilbert-base-zh-cased   
15            distilbert-base-zh-cased   
16             chinese-roberta-wwm-ext   
17             chinese-roberta-wwm-ext   
18             chinese-roberta-wwm-ext   
19             chinese-roberta-wwm-ext   
20                  chinese-xlnet-base   
21                  chinese-xlnet-base   
22                  chinese-xlnet-