In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import (
    get_scorer, accuracy_score, recall_score, precision_score,
    roc_auc_score, matthews_corrcoef, average_precision_score
)
from tqdm import tqdm
import pandas as pd
import numpy as np
import scipy.stats
import copy

class Scrambler:
    """Y-scrambling (label permutation) check for a classifier.

    The wrapped model is scored once on the real labels and then
    ``iterations`` more times after being trained on randomly permuted
    labels.  In every returned score sequence, index 0 is the real model
    and indices 1..iterations are the scrambled runs.
    """

    def __init__(self, model, iterations=100):
        """Store the model to test and the number of scrambled runs.

        Args:
            model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
            iterations: Number of label permutations to evaluate.
        """
        self.base_model = model
        self.iterations = iterations
        # Not read by any method in this class; kept for compatibility.
        self.progress_bar = False

    def validate(
        self,
        X,
        Y,
        method="train_test_split",
        scoring="accuracy",
        cross_val_score_aggregator="mean",
        pvalue_threshold=0.05,
        cv_kfolds=5,
        as_df=False,
        validation_data=None,
        progress_bar=False,
    ):
        """Run the scrambling test and compute significance statistics.

        Z-scores are computed over the accuracy scores of all runs (real
        run included); p-values are the two-sided normal tail probability
        of those z-scores.

        Args:
            X: Feature matrix.
            Y: Labels. Recall and precision use ``average='binary'``.
            method: ``"train_test_split"`` or ``"cross_validation"``.
            scoring: Name handed to ``sklearn.metrics.get_scorer``. It is
                only validated; the statistics are always based on
                accuracy.
            cross_val_score_aggregator: Accepted for compatibility; unused.
            pvalue_threshold: A run is flagged significant when its
                p-value is less than or equal to this value.
            cv_kfolds: ``cv`` argument for ``cross_val_predict`` when
                ``method == "cross_validation"``.
            as_df: Return a ``pandas.DataFrame`` instead of a tuple.
            validation_data: Accepted for compatibility; unused.
            progress_bar: Wrap the scrambling loop in ``tqdm``.

        Returns:
            With ``as_df=True``, a DataFrame with columns ``accuracy``,
            ``recall``, ``precision``, ``roc_auc``, ``mcc``,
            ``avg_precision``, ``zscore``, ``pvalue`` and ``significancy``.
            Otherwise a 9-tuple holding the same sequences in that order.

        Raises:
            ValueError: If ``method`` is not a supported strategy or
                ``scoring`` does not resolve to a scorer.
        """
        # Reject an unsupported strategy up front; otherwise neither branch
        # below runs and the failure surfaces later as an unbound name.
        if method not in ("train_test_split", "cross_validation"):
            raise ValueError(
                f"Unknown validation method '{method}'; expected "
                "'train_test_split' or 'cross_validation'"
            )

        model_scorer = get_scorer(scoring)
        if model_scorer is None:
            raise ValueError(f"Scoring function '{scoring}' is not a valid sklearn scorer")

        if method == "train_test_split":
            scores = self.__validate_train_test_split(
                X, Y, model_scorer, progress_bar=progress_bar
            )
        else:
            scores = self.__validate_cross_validation(
                X, Y, model_scorer, cross_val_score_aggregator, cv_kfolds,
                progress_bar=progress_bar,
            )
        (accuracy_scores, recall_scores, precision_scores,
         roc_auc_scores, mcc_scores, avg_precision_scores) = scores

        # Statistics are derived from accuracy only.
        all_scores_zscores = scipy.stats.zscore(accuracy_scores)
        all_scores_pvalues = scipy.stats.norm.sf(abs(all_scores_zscores)) * 2
        all_scores_significances = all_scores_pvalues <= pvalue_threshold

        if as_df:
            return pd.DataFrame({
                "accuracy": accuracy_scores,
                "recall": recall_scores,
                "precision": precision_scores,
                "roc_auc": roc_auc_scores,
                "mcc": mcc_scores,
                "avg_precision": avg_precision_scores,
                "zscore": all_scores_zscores,
                "pvalue": all_scores_pvalues,
                "significancy": all_scores_significances,
            })

        return (
            accuracy_scores, recall_scores, precision_scores,
            roc_auc_scores, mcc_scores, avg_precision_scores,
            all_scores_zscores, all_scores_pvalues, all_scores_significances,
        )

    def __validate_train_test_split(self, X, Y, scorer, progress_bar=False):
        """Split the data with ``train_test_split`` defaults and evaluate."""
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
        return self.__evaluate_model(X_train, Y_train, X_test, Y_test, scorer, progress_bar)

    def __validate_cross_validation(self, X, Y, scorer, aggregation, cv_kfolds, progress_bar=False):
        """Evaluate with cross-validated predictions over the full data.

        ``aggregation`` is accepted for signature compatibility and unused.
        """
        return self.__evaluate_model(
            X, Y, X, Y, scorer, progress_bar, cross_val=True, cv_kfolds=cv_kfolds
        )

    @staticmethod
    def __score_predictions(y_true, y_pred):
        """Return (accuracy, recall, precision, roc_auc, mcc, avg_precision).

        ROC AUC and average precision are computed from the predicted
        labels, not from probabilities or decision scores.
        """
        return (
            accuracy_score(y_true, y_pred),
            recall_score(y_true, y_pred, average='binary'),
            precision_score(y_true, y_pred, average='binary'),
            roc_auc_score(y_true, y_pred),
            matthews_corrcoef(y_true, y_pred),
            average_precision_score(y_true, y_pred),
        )

    def __evaluate_model(self, X_train, Y_train, X_test, Y_test, scorer, progress_bar, cross_val=False, cv_kfolds=7):
        """Score the real model, then ``self.iterations`` scrambled models.

        Args:
            X_train, Y_train: Data used for fitting (and for
                ``cross_val_predict`` when ``cross_val`` is true).
            X_test, Y_test: Data used for prediction and as ground truth.
                In cross-validation mode the caller passes the same data
                as the training data.
            scorer: Unused; accepted for signature compatibility.
            progress_bar: Wrap the scrambling loop in ``tqdm``.
            cross_val: Use ``cross_val_predict`` instead of fit/predict.
            cv_kfolds: ``cv`` argument for ``cross_val_predict``.

        Returns:
            Six lists (accuracy, recall, precision, roc_auc, mcc,
            avg_precision), each of length ``self.iterations + 1``.
        """
        # Scrambled fits run on a separate copy, taken before the real fit,
        # so the caller's model is left trained on the real labels.
        # The copy is not needed in cross-validation mode because
        # cross_val_predict fits clones of the estimator.
        scrambled_model = None if cross_val else copy.deepcopy(self.base_model)

        self.base_model.fit(X_train, Y_train)
        if cross_val:
            Y_pred = cross_val_predict(self.base_model, X_train, Y_train, cv=cv_kfolds)
        else:
            Y_pred = self.base_model.predict(X_test)

        # Row 0 is the real model; one more row per scrambled run.
        metric_rows = [self.__score_predictions(Y_test, Y_pred)]

        scrambled_models_iterator = tqdm(range(self.iterations)) if progress_bar else range(self.iterations)
        for _ in scrambled_models_iterator:
            Y_train_scrambled = np.random.permutation(Y_train)
            if cross_val:
                Y_pred_scrambled = cross_val_predict(
                    self.base_model, X_train, Y_train_scrambled, cv=cv_kfolds
                )
            else:
                scrambled_model.fit(X_train, Y_train_scrambled)
                Y_pred_scrambled = scrambled_model.predict(X_test)
            # Predictions are always compared against the true labels.
            metric_rows.append(self.__score_predictions(Y_test, Y_pred_scrambled))

        # Transpose rows of per-run metrics into one list per metric.
        (accuracy_scores, recall_scores, precision_scores,
         roc_auc_scores, mcc_scores, avg_precision_scores) = (
            list(column) for column in zip(*metric_rows)
        )
        return (
            accuracy_scores, recall_scores, precision_scores,
            roc_auc_scores, mcc_scores, avg_precision_scores,
        )
