# In [None]:
# Model scrambling check: train the tuned AdaBoost classifier and validate it
# with y_scramble's Scrambler.
import pandas as pd  # imported here because pd.DataFrame is used further down in this block
from y_scramble import Scrambler  # y_scramble must be installed and importable
from sklearn.ensemble import AdaBoostClassifier

# Training and test features / labels (x_mor_* and y_mor_* are defined
# outside this cell).
X_train = x_mor_train
Y_train = y_mor_train
X_test = x_mor_test
Y_test = y_mor_test

# Instantiate the AdaBoost classifier with the previously selected
# hyper-parameters.
Ada = AdaBoostClassifier(random_state = 0,
                         algorithm = 'SAMME',
                         learning_rate = 1,
                         n_estimators = 479)

# Fit the model on the training data.
Ada.fit(X_train, Y_train)

# Flatten the labels into a 1-D numpy array; the original author notes this
# avoids a KeyError inside the scrambler.
# NOTE(review): assumes Y_train is a pandas object exposing `.values` — confirm.
Y_train_np = Y_train.values.ravel()

# Short aliases for the training data. They are not used in this cell but are
# kept as module-level names in case other cells rely on them.
X = X_train
Y = Y_train_np

# Create the Scrambler around the fitted model.
scrambler = Scrambler(model=Ada, iterations=100)

# Run the validation and ask for a DataFrame result.
results = scrambler.validate(
    X=X_train,
    Y=Y_train_np,
    # Original author's note: 'scoring' does not directly affect the result in
    # the current implementation because several metrics are returned.
    scoring="accuracy",
    cross_val_score_aggregator="mean",
    pvalue_threshold=0.01,
    as_df=True
)

# Persist and show the result when a DataFrame came back; otherwise unpack
# the nine returned sequences and print them one by one.
if isinstance(results, pd.DataFrame):
    results.to_csv("validation_results.csv", index=False)
    print("Results saved to CSV file.")
    print(results)
else:
    accuracy_scores, recall_scores, precision_scores, rou_auc_scores, matthews_corrcoef_scores, f1_scores, zscores, pvalues, significances = results
    print("Accuracy Scores:", accuracy_scores)
    print("Recall Scores:", recall_scores)
    print("Precision Scores:", precision_scores)
    print("Roc_auc Scores:", rou_auc_scores)
    print("Mcc Scores:", matthews_corrcoef_scores)
    print("F1 Scores:", f1_scores)
    print("Z-Scores:", zscores)
    print("P-Values:", pvalues)
    print("Significances:", significances)

import pandas as pd
import numpy as np

# Randomly pick 10 scrambled runs from the validation results and save them.
if len(results) > 10:
    # Row 0 is skipped (per the original comments it holds the unscrambled
    # model's scores). Positions, not index labels, are sampled because
    # `.iloc` is purely positional; sampling labels from `results.index`
    # only worked when the index happened to be a default RangeIndex.
    random_indices = np.random.choice(np.arange(1, len(results)), size=10, replace=False)
    # Select the sampled rows by position.
    selected_scrambled_metrics = results.iloc[random_indices]
    # Save the sampled scrambled-model metrics to CSV.
    selected_scrambled_metrics.to_csv("selected_scrambled_metrics.csv", index=False)
else:
    print("Not enough data to select from.")


# Take the first row of `results`. The original author assumes `results` is
# the DataFrame returned by validate() and that its first row holds the
# original (unscrambled) model's scores.
original_model_data = results.iloc[0]

# (display label, column name in `results`) pairs for the numeric metrics.
metric_columns = [
    ('Accuracy', 'accuracy'),
    ('Recall', 'recall'),
    ('Precision', 'precision'),
    ('ROC AUC', 'roc_auc'),
    ('MCC', 'mcc'),
    ('f1', 'f1'),
    ('Z-Score', 'zscore'),
    ('P-Value', 'pvalue'),
]
metric_labels = [label for label, _ in metric_columns]
metric_values = [original_model_data[column] for _, column in metric_columns]

# Significance is stored as a text label rather than the raw flag.
metric_labels.append('Significance')
metric_values.append('显著' if original_model_data['significancy'] else '不显著')

# Two-column table: one row per metric.
original_model_metrics_df = pd.DataFrame({
    'Metric': metric_labels,
    'Value': metric_values,
})

# Write the table to CSV.
original_model_metrics_df.to_csv("original_model_metrics.csv", index=False)

# Visualize the model-scrambling results
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the rows flagged as significant. This is not used by the plots
# below, but it stays available as a module-level name.
significance_mask = results['significancy']
significant_results = results[significance_mask]

figure_size = (10, 6)

# Box plot of accuracy, grouped by the significance flag.
plt.figure(figsize=figure_size)
box_axes = sns.boxplot(data=results, x='significancy', y='accuracy')
box_axes.set_title('Accuracy Scores by Significance')
plt.show()

# Scatter plot of accuracy against recall, coloured by the significance flag.
plt.figure(figsize=figure_size)
scatter_axes = sns.scatterplot(data=results, x='accuracy', y='recall', hue='significancy')
scatter_axes.set_title('Accuracy vs. Recall by Significance')
plt.show()