In [5]:
# Show the current working directory of the notebook kernel
pwd

'/home/md04/public/jupyter/xyt/模型验证'

In [4]:
# 文件y_scramble-0.0.8-py3-none-any.whl是一个Python Wheel包。
# Wheel是Python的一种分发格式，旨在替代传统的源代码分发（例如.tar.gz文件）和编译后分发（如.egg文件）。
# 这种格式的主要优点包括更快的安装速度和避免运行setup.py可能带来的安全风险。
pip install y_scramble-0.0.8-py3-none-any.whl

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Processing ./y_scramble-0.0.8-py3-none-any.whl
Installing collected packages: y-scramble
Successfully installed y-scramble-0.0.8
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Model perturbation (y-scrambling) validation
import numpy as np  # used below to coerce the labels into a flat array
from y_scramble import Scrambler  # requires the y_scramble wheel to be installed


# Features and labels of the training and test sets.
# Both spellings (X_train / train_X, ...) are bound to the same object.
X_train = train_X = x_mor_train
Y_train = train_Y = y_mor_train
X_test = test_X = x_mor_test
Y_test = test_Y = y_mor_test

# Build the XGBClassifier with its hyper-parameters
xgb = XGBClassifier(
    random_state=0,  # random seed, for reproducibility
    objective='binary:logistic',  # binary logistic regression objective
    colsample_bytree=0.8,  # column subsampling ratio when building each tree
    gamma=0.0,  # minimum loss reduction required to split a leaf node
    max_depth=9,  # maximum tree depth
    min_child_weight=1,  # minimum sum of instance weight needed in a child
    n_estimators=40,  # number of trees
    reg_alpha=0.1,  # L1 regularisation coefficient
    reg_lambda=0.1,  # L2 regularisation coefficient
    subsample=0.8  # row subsampling ratio when training each tree
)

# Fit the model; `estimator` is a second name for the same classifier object
estimator = xgb
estimator.fit(X_train, Y_train)

# Flatten the labels into a 1-D numpy array (done originally to avoid a KeyError).
# np.asarray accepts a DataFrame, a Series, a list or an existing ndarray, whereas
# `.values` only exists on pandas objects and would raise AttributeError otherwise.
Y_train_np = np.asarray(Y_train).ravel()

# X and Y are the training features and the flattened training labels
X = X_train
Y = Y_train_np

# Create the Scrambler around the classifier, asking for 100 iterations.
# NOTE(review): no seed is passed here, so if Scrambler shuffles the labels with
# an unseeded RNG the scrambled scores will differ between runs — confirm whether
# it exposes a seed option.
scrambler = Scrambler(model=xgb, iterations=100)

# Validate the model with the Scrambler
scores, zscores, pvalues, significances = scrambler.validate(
    X,
    Y,
    scoring="roc_auc",  # alternatives: 'accuracy', 'recall', 'precision'
    cross_val_score_aggregator="mean",
    pvalue_threshold=0.01,
)

In [8]:
import numpy as np  # numerical helpers for the summary statistics

# Report the score and significance of the base model (index 0 of each result)
print(f"基模型得分: {scores[0]}")
print(f"基模型显著性: {'显著' if significances[0] else '不显著'}")

# Summarise the scrambled models (index 1 onwards): mean score, best score and
# the share of runs flagged as significant
significant_scrambles = np.sum(significances[1:])
total_scrambles = len(significances[1:])

# Guard the aggregates: with no scrambled runs np.max raises on the empty slice
# and the percentage would divide by zero.
has_scrambles = total_scrambles > 0
average_scrambled_score = np.mean(scores[1:]) if has_scrambles else float("nan")
max_scrambled_score = np.max(scores[1:]) if has_scrambles else float("nan")
significant_pct = significant_scrambles / total_scrambles * 100 if has_scrambles else 0.0

print(f"打乱模型平均得分: {average_scrambled_score}")
print(f"打乱模型最高得分: {max_scrambled_score}")
print(f"显著的打乱次数/总打乱次数: {significant_scrambles}/{total_scrambles} ({significant_pct:.2f}%)")

# Optional: show the individual score, z-score and p-value per scrambled model.
# For brevity only the first 5 are listed — fewer if fewer runs exist, so the
# loop can never index past the end of the result sequences.
n_shown = min(5, total_scrambles)
print(f"\n前{n_shown}个打乱模型的得分、z分数和p值:")
for i in range(1, n_shown + 1):
    print(f"打乱模型 {i}: 得分={scores[i]}, z分数={zscores[i]}, p值={pvalues[i]}, 显著性={'显著' if significances[i] else '不显著'}")

基模型得分: 0.8947368421052632
基模型显著性: 显著
打乱模型平均得分: 0.33447368421052637
打乱模型最高得分: 0.5263157894736842
显著的打乱次数/总打乱次数: 0/100 (0.00%)

前5个打乱模型的得分、z分数和p值:
打乱模型 1: 得分=0.47368421052631576, z分数=1.3343819174016471, p值=0.18207872314500007, 显著性=不显著
打乱模型 2: 得分=0.2631578947368421, z分数=-0.7673346308644964, p值=0.4428825833733361, 显著性=不显著
打乱模型 3: 得分=0.3157894736842105, z分数=-0.24190549379796053, p值=0.8088533931102685, 显著性=不显著
打乱模型 4: 得分=0.3684210526315789, z分数=0.2835236432685753, p值=0.7767754601879973, 显著性=不显著
打乱模型 5: 得分=0.3157894736842105, z分数=-0.24190549379796053, p值=0.8088533931102685, 显著性=不显著


In [6]:
from sklearn.metrics import get_scorer, get_scorer_names

# Fetch and print the names of all available scorers
scorer_names = get_scorer_names()
print(scorer_names)

# Example: look up the scorer registered under the name "accuracy"
accuracy_scorer = get_scorer("accuracy")

# accuracy_scorer can now be used for model evaluation, for example:
# score = accuracy_scorer(model, X_test, y_test)

['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'neg_root_mean_squared_log_error', 'normalized_mutual_info_score', 'positive_likelihood_ratio', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc',