# In [None]:
# -*- coding: utf-8 -*-
"""
无数据增强的XGBoost分类严谨实验框架 (基线对比版本)
=====================================================
本脚本是为响应审稿人“消融研究”要求而创建的基线对比实验版本。
它不使用任何数据增强方法，直接在原始数据上进行训练和评估，
用于和WGAN-GP版本、SMOTE版本形成直接对比。

核心方法学与增强版本保持一致，以确保对比的公平性：
1.  **严格的数据隔离**: 独立的“开发集”和“最终测试集”。
2.  **无数据增强**: 所有训练和调优步骤均只使用原始数据。
3.  **正确的模型与评估**: 全程使用XGBoost分类器及分类指标。
"""

# --- 基础库导入 ---
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt
import os
import joblib
from pathlib import Path

# --- 0. Global configuration (kept identical to the augmented versions) ---

# --- Path configuration ---
# The project root is the parent of the current working directory, so the
# script has to be launched from a direct sub-folder of the project.
CURRENT_DIR = Path.cwd()
PROJECT_ROOT = CURRENT_DIR.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

# Input spreadsheets: development set and held-out final test set.
DEV_SET_FILE = DATA_DIR.joinpath("development_set.xlsx")
TEST_SET_FILE = DATA_DIR.joinpath("final_test_set.xlsx")

# The no-augmentation experiment writes into its own output folder:
# models go into a sub-folder, plots directly into the experiment folder.
NO_AUG_OUTPUT_DIR = OUTPUT_DIR.joinpath("no_augmentation_experiment")
MODEL_OUTPUT_PATH = NO_AUG_OUTPUT_DIR.joinpath("trained_models")
OUTPUT_PLOT_PATH = NO_AUG_OUTPUT_DIR

# Create the output folders (including missing parents); existing folders
# are left untouched.
for _output_folder in (MODEL_OUTPUT_PATH, OUTPUT_PLOT_PATH):
    _output_folder.mkdir(parents=True, exist_ok=True)


# --- Experiment parameters ---
TARGET_COLUMN = 'target'   # name of the label column in both spreadsheets
RANDOM_STATE = 42          # seed shared by the search, the CV split and the models
N_SPLITS_KFOLD = 5         # number of folds for HPO and for cross-validation

# --- Hyperparameter search space sampled by RandomizedSearchCV ---
# Every entry is a list of discrete candidate values for one XGBoost setting.
XGB_PARAM_GRID = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.8, 0.9, 1],
    colsample_bytree=[0.7, 0.8, 0.9, 1],
    gamma=[0, 0.1, 0.2],
    reg_alpha=[0, 0.01, 0.1],
    reg_lambda=[0.5, 1, 1.5],
    scale_pos_weight=[1, 5, 10, 20],
)
# Number of parameter combinations drawn from the space above.
N_ITER_RANDOMIZED_SEARCH = 30

# --- Device configuration ---
# torch is used only to probe for CUDA. Start from the CPU default and
# switch to 'cuda' when torch is importable and reports an available GPU.
device_type = 'cpu'
try:
    import torch
    if torch.cuda.is_available():
        device_type = 'cuda'
except ImportError:
    # torch is not installed: keep the CPU default.
    pass
print(f"XGBoost将使用设备: {device_type}")


# --- 1. Main pipeline: load the data ---
# Read the development set and the held-out final test set from Excel.
try:
    development_df_original = pd.read_excel(DEV_SET_FILE)
    final_test_df_original = pd.read_excel(TEST_SET_FILE)
except FileNotFoundError as e:
    print(f"错误: 数据文件未找到，请检查路径。 {e}")
    # Abort with a non-zero status so callers can detect the failure.
    # The bare `exit()` helper comes from the `site` module, is meant for
    # interactive sessions, and would report success (status 0) here.
    raise SystemExit(1) from e
else:
    print(f"成功加载数据: 开发集形状 {development_df_original.shape}, 最终测试集形状 {final_test_df_original.shape}")

# Split both tables into features (X: every column except the target)
# and labels (y: the target column).
X_dev_original = development_df_original.drop(columns=[TARGET_COLUMN])
y_dev_original = development_df_original[TARGET_COLUMN]
X_final_test = final_test_df_original.drop(columns=[TARGET_COLUMN])
y_final_test = final_test_df_original[TARGET_COLUMN]

# --- Step 2: hyperparameter optimisation (HPO) ---
print("\n--- [步骤 2] XGBoost 超参数调优 (无数据增强) ---")

# Settings of the base classifier that stay fixed during the search; the
# GPU tree method is selected only when CUDA was detected.
hpo_estimator_kwargs = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=RANDOM_STATE,
    use_label_encoder=False,
    tree_method='gpu_hist' if device_type == 'cuda' else 'auto',
)
xgb_classifier_for_hpo = xgb.XGBClassifier(**hpo_estimator_kwargs)

# Randomised search over XGB_PARAM_GRID, scored by ROC AUC.
# NOTE(review): per the scikit-learn docs an integer `cv` with a classifier
# uses stratified folds, whereas step 3 uses a shuffled plain KFold --
# confirm the difference is intended.
random_search_hpo = RandomizedSearchCV(
    estimator=xgb_classifier_for_hpo,
    param_distributions=XGB_PARAM_GRID,
    n_iter=N_ITER_RANDOMIZED_SEARCH,
    cv=N_SPLITS_KFOLD,
    scoring='roc_auc',
    verbose=1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

print(f"开始在 {X_dev_original.shape[0]} 个原始样本上进行XGBoost超参数搜索...")
# The search runs directly on the original (non-augmented) development set.
random_search_hpo.fit(X_dev_original, y_dev_original)

# Keep the winning parameter set; later steps reuse it for every model.
best_overall_xgboost_params = random_search_hpo.best_params_
print(f"\n找到的最佳XGBoost超参数 (无增强): {best_overall_xgboost_params}")
print(f"最佳HPO ROC AUC Score (无增强): {random_search_hpo.best_score_:.4f}")

# --- Step 3: K-fold cross-validation ---
print(f"\n--- [步骤 3] 在开发集上进行 {N_SPLITS_KFOLD}-折交叉验证 (无数据增强) ---")
kf = KFold(n_splits=N_SPLITS_KFOLD, shuffle=True, random_state=RANDOM_STATE)

# Per-fold accumulators: one metrics dict and one log-loss curve per fold,
# for the training part and for the validation part.
kfold_cv_val_metrics_list = []
kfold_cv_train_metrics_list = []
cv_train_loss_curves = []
cv_val_loss_curves = []


def _score_split(model, features, labels):
    """Return accuracy, F1 and AUC of *model* on one data split.

    Hard labels come from thresholding column 1 of ``predict_proba`` at
    strictly greater than 0.5; AUC is computed from the raw probabilities.
    """
    proba = model.predict_proba(features)[:, 1]
    hard_labels = (proba > 0.5).astype(int)
    return {
        'Accuracy': accuracy_score(labels, hard_labels),
        'F1 Score': f1_score(labels, hard_labels),
        'AUC': roc_auc_score(labels, proba),
    }


for fold_number, (train_indices, val_indices) in enumerate(kf.split(development_df_original), start=1):
    print(f"\n--- K-Fold: 第 {fold_number}/{N_SPLITS_KFOLD} 折 ---")

    # Slice the original data into this fold's training and validation parts.
    X_cv_train_fold, y_cv_train_fold = X_dev_original.iloc[train_indices], y_dev_original.iloc[train_indices]
    X_cv_val_fold, y_cv_val_fold = X_dev_original.iloc[val_indices], y_dev_original.iloc[val_indices]

    # Fresh model configured with the best parameters found in step 2.
    model_fold = xgb.XGBClassifier(
        **best_overall_xgboost_params,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=RANDOM_STATE,
        use_label_encoder=False,
        tree_method='gpu_hist' if device_type == 'cuda' else 'auto',
    )

    # Both parts are monitored during training; the order fixes the keys of
    # evals_result(): 'validation_0' is the training part, 'validation_1'
    # the validation part.
    # NOTE(review): the validation fold is also what early stopping watches,
    # and the final model in step 4 is fitted without early stopping --
    # confirm both are intended.
    model_fold.fit(
        X_cv_train_fold,
        y_cv_train_fold,
        eval_set=[(X_cv_train_fold, y_cv_train_fold), (X_cv_val_fold, y_cv_val_fold)],
        early_stopping_rounds=10,
        verbose=False,
    )

    # Store the log-loss curves of this fold.
    logloss_history = model_fold.evals_result()
    cv_train_loss_curves.append(logloss_history['validation_0']['logloss'])
    cv_val_loss_curves.append(logloss_history['validation_1']['logloss'])

    # Validation metrics first, then training metrics (overfitting monitor).
    val_scores = _score_split(model_fold, X_cv_val_fold, y_cv_val_fold)
    kfold_cv_val_metrics_list.append(val_scores)
    train_scores = _score_split(model_fold, X_cv_train_fold, y_cv_train_fold)
    kfold_cv_train_metrics_list.append(train_scores)

    print(f"Fold {fold_number} - Val AUC: {val_scores['AUC']:.4f} | Train AUC: {train_scores['AUC']:.4f}")

# --- Step 3.1: cross-validation summary and visualisation ---
# One row per fold, one column per metric (despite the "avg_" prefix these
# frames hold the per-fold values).
avg_kfold_cv_val_metrics_df = pd.DataFrame(kfold_cv_val_metrics_list)
avg_kfold_cv_train_metrics_df = pd.DataFrame(kfold_cv_train_metrics_list)

# Column-wise means over the folds, computed once and reused for both the
# printed report and the bar chart.
avg_val_metrics = avg_kfold_cv_val_metrics_df.mean()
avg_train_metrics = avg_kfold_cv_train_metrics_df.mean()

print("\n--- K-折交叉验证平均性能 (无数据增强) ---")
print("--- 平均验证集性能 ---")
print(avg_val_metrics)
print("\n--- 平均训练集性能 ---")
print(avg_train_metrics)

# Plot 1: grouped bar chart comparing mean training vs. validation metrics.
metrics_to_plot = ['Accuracy', 'F1 Score', 'AUC']
x_axis = np.arange(len(metrics_to_plot))

plt.figure(figsize=(10, 6))
plt.bar(x_axis - 0.2, avg_train_metrics[metrics_to_plot], width=0.4, label='CV Train Avg.', align='center')
plt.bar(x_axis + 0.2, avg_val_metrics[metrics_to_plot], width=0.4, label='CV Validation Avg.', align='center')
plt.xticks(x_axis, metrics_to_plot)
plt.ylabel('Score')
plt.title('Average K-Fold CV Train vs. Validation Metrics (No Augmentation)')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)

# Keep the original 0.8-1.0 window whenever every bar fits into it. A mean
# below 0.8 would otherwise fall outside the axis and be invisible, so in
# that case the lower limit is moved down to the next 0.1 step below the
# smallest bar (with a small margin), but never below 0.
lowest_bar = float(min(avg_train_metrics[metrics_to_plot].min(),
                       avg_val_metrics[metrics_to_plot].min()))
y_axis_floor = 0.8
if lowest_bar < y_axis_floor:
    y_axis_floor = max(0.0, float(np.floor((lowest_bar - 0.05) * 10) / 10))
plt.ylim(y_axis_floor, 1.0)

plt.savefig(OUTPUT_PLOT_PATH / "kfold_avg_eval_metrics_no_aug.png", dpi=300)
plt.show()

# Plot 2: training and validation log-loss curves of every fold.
plt.figure(figsize=(12, 7))
# The two curve lists are filled in lock-step (one entry per fold), so they
# are walked pairwise; training curves are dashed, validation curves solid.
for fold_number, (train_curve, val_curve) in enumerate(zip(cv_train_loss_curves, cv_val_loss_curves), start=1):
    plt.plot(train_curve, label=f'Train Fold {fold_number}', linestyle='--')
    plt.plot(val_curve, label=f'Validation Fold {fold_number}', linestyle='-')
plt.xlabel('Boosting Round')
plt.ylabel('Log Loss')
plt.title('Training and Validation Log Loss per Fold (No Augmentation)')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
logloss_plot_file = OUTPUT_PLOT_PATH / "kfold_logloss_per_fold_no_aug.png"
plt.savefig(logloss_plot_file, dpi=300)
plt.show()

# --- Step 4: train the final model ---
print("\n--- [步骤 4] 训练最终模型 (无数据增强) ---")
# The final model is fitted on the complete original development set.
X_train_final_model, y_train_final_model = X_dev_original, y_dev_original
print(f"用于训练最终模型的总数据形状: {X_train_final_model.shape}")

# Best parameters from step 2 plus the fixed settings used throughout the
# script (the searched keys and the fixed keys do not overlap).
final_model_kwargs = {
    **best_overall_xgboost_params,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': RANDOM_STATE,
    'use_label_encoder': False,
    'tree_method': 'gpu_hist' if device_type == 'cuda' else 'auto',
}
final_model = xgb.XGBClassifier(**final_model_kwargs)

print("开始训练最终模型...")
final_model.fit(X_train_final_model, y_train_final_model)
print("最终模型训练完成。")

# Persist the fitted model with joblib.
final_model_path = MODEL_OUTPUT_PATH / "final_xgboost_no_aug_model.joblib"
joblib.dump(final_model, final_model_path)
print(f"最终模型已保存到: {final_model_path}")

# Plot 3: feature importance of the final model (top 20 features).
# The axes are created here and handed to plot_importance explicitly.
# Without `ax`, plot_importance creates a new figure of its own, so a
# preceding plt.figure(figsize=...) would stay empty and its size would
# never reach the saved image.
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(
    final_model,
    ax=importance_ax,
    max_num_features=20,
    height=0.8,
    title="Feature Importance (Final Model, No Augmentation)",
)
importance_fig.tight_layout()
importance_fig.savefig(OUTPUT_PLOT_PATH / "final_model_feature_importances_no_aug.png", dpi=300)
plt.show()

# --- Step 5: evaluation on the held-out final test set ---
print("\n--- [步骤 5] 在最终测试集上进行无偏评估 (无增强模型) ---")
# Column 1 of predict_proba is thresholded into hard labels. The strict
# `> 0.5` rule matches the one used for the cross-validation metrics, so a
# probability of exactly 0.5 is labelled the same way in both places.
y_pred_proba_final = final_model.predict_proba(X_final_test)[:, 1]
y_pred_final = (y_pred_proba_final > 0.5).astype(int)

# Threshold-based metrics use the hard labels; AUC uses the probabilities.
print("--- 最终模型在最终测试集上的性能 (无增强) ---")
print(f"Accuracy : {accuracy_score(y_final_test, y_pred_final):.4f}")
print(f"Precision: {precision_score(y_final_test, y_pred_final):.4f}")
print(f"Recall   : {recall_score(y_final_test, y_pred_final):.4f}")
print(f"F1 Score : {f1_score(y_final_test, y_pred_final):.4f}")
print(f"AUC      : {roc_auc_score(y_final_test, y_pred_proba_final):.4f}")

# Plot 4: confusion matrix of the final model on the final test set.
fig, ax = plt.subplots(figsize=(8, 8))
ConfusionMatrixDisplay.from_predictions(
    y_final_test,
    y_pred_final,
    display_labels=['Absence', 'Presence'],
    cmap='Blues',
    ax=ax,
)
ax.set_title('Confusion Matrix on Final Test Set (No Augmentation Model)')
# `fig` is the figure the matrix was drawn on, so it is saved directly.
confusion_matrix_file = OUTPUT_PLOT_PATH / "final_model_confusion_matrix_no_aug.png"
fig.savefig(confusion_matrix_file, dpi=300)
plt.show()

print("\n--- 无增强版本整体流程执行完毕 ---")
