# In [None]:
# -*- coding: utf-8 -*-
"""
完整项目流程：工业设备故障预测
模型：XGBoost
特点：含详细日志输出、SHAP解释、三曲线阈值图、特征重要性图、混淆矩阵、ROC 曲线图、缺失值可视化
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, roc_curve
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier
import optuna
import shap
from tqdm import tqdm

# ===================== 1. Load data =====================
print("\n[阶段1] 正在读取数据...")
# Hard-coded absolute Windows path to the training CSV.
file_path = r'D:\桌面\2025年第四届“创新杯”（原钉钉杯）大学生大数据挑战赛初赛题目\2025年第四届“创新杯”（原钉钉杯）大学生大数据挑战赛初赛题目\A题\data\train_data.csv'
# Parse the whole file into a DataFrame with pandas' default settings.
df = pd.read_csv(filepath_or_buffer=file_path)

# ===================== 2. Missing-value handling =====================
print("\n[阶段2] 正在处理缺失值...")

# Every figure in this script uses Chinese titles and labels, and Matplotlib's
# default font has no CJK glyphs. Prepend common CJK-capable fonts (the first
# one that is installed wins) while keeping the existing fallbacks.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'Noto Sans CJK SC',
    *plt.rcParams['font.sans-serif'],
]
# Some CJK fonts lack the Unicode minus sign; use the ASCII hyphen instead.
plt.rcParams['axes.unicode_minus'] = False

# Fraction of missing values per column.
missing_ratio = df.isnull().mean()

# Visualise only the columns that actually contain missing values.
columns_with_missing = missing_ratio[missing_ratio > 0].sort_values()
if columns_with_missing.empty:
    # A bar chart cannot be drawn from an empty Series, so skip the figure
    # instead of crashing when the data set is complete.
    print("未发现缺失值，跳过缺失值可视化。")
else:
    plt.figure(figsize=(10, 5))
    columns_with_missing.plot(kind='barh', color='tomato')
    plt.title("各字段缺失值比例")
    plt.xlabel("缺失比例")
    plt.tight_layout()
    plt.show()

# Drop columns whose missing ratio exceeds 80%.
# NOTE(review): the previous comment said 20% while the code uses 0.8; the
# code's behaviour is kept here — confirm which cutoff is intended.
# NOTE(review): columns below the cutoff keep their NaNs; nothing in this
# stage imputes them — confirm the later stages accept NaN inputs.
df.drop(columns=missing_ratio[missing_ratio > 0.8].index, inplace=True)

# Drop columns that are not used as features (silently skipped if absent).
df.drop(columns=['Machine_ID', 'Remaining_Useful_Life_days', 'Installation_Year'], errors='ignore', inplace=True)

# ===================== 3. Outlier handling =====================
print("\n[阶段3] 正在处理异常值（缩尾法）...")
def winsorize_series(series, lower=0.01, upper=0.99):
    """Winsorize a pandas Series by clamping it to two of its own quantiles.

    Args:
        series: Series to clamp; the quantiles are computed from this same data.
        lower: Quantile (0-1) used as the floor.
        upper: Quantile (0-1) used as the ceiling.

    Returns:
        A Series in which values below the ``lower`` quantile are raised to it
        and values above the ``upper`` quantile are lowered to it.
    """
    floor = series.quantile(lower)
    ceiling = series.quantile(upper)
    return series.clip(lower=floor, upper=ceiling)

# Pull the target out of the frame first so it is not winsorized with the
# features.
y = df.pop('Failure_Within_7_Days')

# Winsorize every numeric feature column independently, writing the clamped
# column back into the frame.
# NOTE(review): the quantiles are computed on the full data set, before the
# train/test split performed later in the script.
numeric_cols = df.select_dtypes(include=['number']).columns
for column_name, column_values in df[numeric_cols].items():
    df[column_name] = winsorize_series(column_values)

# ===================== 4. Feature engineering =====================
print("\n[阶段4] 正在进行特征编码...")
# One-hot encode the machine type; drop_first removes the first level's
# indicator column.
df = df.pipe(pd.get_dummies, columns=['Machine_Type'], drop_first=True)
# X is an independent copy of the encoded frame used as the feature matrix.
X = df.copy(deep=True)

# ===================== 5. Train/test split =====================
print("\n[阶段5] 正在划分训练集与测试集...")
# 80/20 split, stratified on the target, with a fixed seed.
_split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

# ===================== 6. SMOTETomek resampling =====================
print("\n[阶段6] 正在进行 SMOTETomek 重采样平衡类别...")
# Only the training split is resampled; the test split is left untouched.
smt = SMOTETomek(random_state=42)
_resampled = smt.fit_resample(X_train, y_train)
X_train_res, y_train_res = _resampled

# ===================== 7. Optuna hyper-parameter tuning =====================
print("\n[阶段7] 正在使用 Optuna 进行超参数调优（XGBoost）...")
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated F1 of an XGBoost model.

    SMOTETomek is applied inside each cross-validation training fold through
    an imblearn ``Pipeline`` instead of cross-validating on the already
    resampled training set. Resampling before the folds are cut lets synthetic
    samples derived from a validation row end up in the training fold, which
    inflates the score; with the pipeline every validation fold contains only
    original, un-resampled rows.

    Args:
        trial: The Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean F1 score over the five stratified folds (to be maximised).
    """
    # Function-scope import: the file's top-level imports only pull in
    # imblearn.combine, and samplers require imblearn's own Pipeline class.
    from imblearn.pipeline import Pipeline

    params = {
        'max_depth': trial.suggest_int("max_depth", 3, 10),
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3),
        'n_estimators': trial.suggest_int("n_estimators", 100, 500),
        'subsample': trial.suggest_float("subsample", 0.6, 1.0),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.6, 1.0),
        'gamma': trial.suggest_float("gamma", 0, 5),
        'reg_alpha': trial.suggest_float("reg_alpha", 0, 5),
        'reg_lambda': trial.suggest_float("reg_lambda", 0, 5),
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'random_state': 42
    }
    # The sampler step runs only while fitting, i.e. on each training fold.
    pipeline = Pipeline([
        ('resample', SMOTETomek(random_state=42)),
        ('model', XGBClassifier(**params)),
    ])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Cross-validate on the original (un-resampled) training split.
    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='f1')
    return scores.mean()

# Seed the sampler so the search is reproducible, matching the fixed seed (42)
# used everywhere else in this script. TPESampler is Optuna's default sampler,
# so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)
# Best hyper-parameters found across the 50 trials.
best_params = study.best_params
print("\n[结果] 最佳参数：", best_params)

# ===================== 8. Train the model =====================
print("\n[阶段8] 正在训练 XGBoost 模型...")
# Build the final classifier from the tuned hyper-parameters plus the fixed
# settings, then fit it on the resampled training data (fit returns the
# estimator itself).
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    **best_params,
).fit(X_train_res, y_train_res)

# ===================== 9. Threshold tuning and three-curve plot =====================
print("\n[阶段9] 正在绘制阈值-指标曲线并选择最佳阈值...")
def plot_threshold_metrics(model, X_val, y_val):
    """Plot precision, recall and F1 against the decision threshold and pick one.

    The positive-class probabilities from ``model.predict_proba`` are
    thresholded at 81 evenly spaced values in [0.1, 0.9]. The returned
    threshold maximises ``F1 + recall``; on ties the lowest threshold wins
    because ``np.argmax`` returns the first maximum.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_val: Feature matrix to score.
        y_val: True binary labels for ``X_val``.

    Returns:
        The selected probability threshold.
    """
    y_proba = model.predict_proba(X_val)[:, 1]
    thresholds = np.linspace(0.1, 0.9, 81)
    precisions, recalls, f1s = [], [], []

    for t in thresholds:
        y_pred = (y_proba >= t).astype(int)
        # zero_division=0 yields the same value as the default (0) when a
        # threshold produces no positive predictions, but without emitting an
        # undefined-metric warning for each such threshold.
        precisions.append(precision_score(y_val, y_pred, zero_division=0))
        recalls.append(recall_score(y_val, y_pred, zero_division=0))
        f1s.append(f1_score(y_val, y_pred, zero_division=0))

    # Selection criterion: F1 plus recall, which weights recall twice.
    best_idx = np.argmax(np.array(f1s) + np.array(recalls))
    best_threshold = thresholds[best_idx]

    plt.figure(figsize=(8, 5))
    plt.plot(thresholds, precisions, label='精确率', color='blue')
    plt.plot(thresholds, recalls, label='召回率', color='orange')
    plt.plot(thresholds, f1s, label='F1 分数', color='green')
    # Mark the selected threshold on the chart.
    plt.axvline(best_threshold, linestyle='--', color='red', label=f'最优阈值 = {best_threshold:.3f}')
    plt.xlabel("分类概率阈值")
    plt.ylabel("指标值")
    plt.title("模型评估指标与阈值关系")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return best_threshold

# Select the decision threshold and draw the threshold/metric chart.
# NOTE(review): the threshold is chosen on X_test/y_test, the same split that
# the evaluation stage scores afterwards, so the reported metrics are tuned on
# the test set — consider selecting it on a separate validation split.
best_threshold = plot_threshold_metrics(model, X_test, y_test)

# ===================== 10. Model evaluation =====================
print("\n[阶段10] 正在评估模型性能...")
# Positive-class probabilities on the test split, binarised at the selected
# threshold.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= best_threshold).astype(int)

# Headline metrics, each printed rounded to six decimals.
_headline_metrics = (
    ("准确率:", (y_pred == y_test).mean()),
    ("召回率:", recall_score(y_test, y_pred)),
    ("F1 分数:", f1_score(y_test, y_pred)),
)
for _metric_label, _metric_value in _headline_metrics:
    print(_metric_label, round(_metric_value, 6))

print("分类报告:\n", classification_report(y_test, y_pred))

# ROC-AUC is computed from the probabilities, so it does not depend on the
# threshold.
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", round(roc_auc, 6))

# ROC curve figure
fpr, tpr, _ = roc_curve(y_test, y_proba)
_roc_fig, _roc_ax = plt.subplots(figsize=(6, 5))
_roc_ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}", color='blue')
# Diagonal reference line from (0, 0) to (1, 1).
_roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
_roc_ax.set_xlabel("假正率 (FPR)")
_roc_ax.set_ylabel("真正率 (TPR)")
_roc_ax.set_title("ROC 曲线")
_roc_ax.legend()
_roc_fig.tight_layout()
plt.show()

# ===================== 11. Feature-importance chart =====================
print("\n[阶段11] 正在绘制特征重要性图...")
# Pair the model's importance scores with the feature names.
importance = pd.Series(model.feature_importances_, index=X.columns)
# Five highest-scoring features, in descending order.
top5 = importance.sort_values(ascending=False).iloc[:5]
print("\n前五个重要特征:\n", top5)

_imp_fig, _imp_ax = plt.subplots(figsize=(8, 5))
top5.plot.barh(color='teal', ax=_imp_ax)
_imp_ax.set_title("特征重要性 (Top 5)")
_imp_ax.set_xlabel("重要性得分")
# Flip the y axis so the most important feature is drawn at the top.
_imp_ax.invert_yaxis()
_imp_fig.tight_layout()
plt.show()

# ===================== 12. Confusion matrix =====================
print("\n[阶段12] 正在绘制混淆矩阵...")
# Counts of true labels (rows) versus thresholded predictions (columns).
cm = confusion_matrix(y_test, y_pred)
_cm_fig, _cm_ax = plt.subplots(figsize=(6, 5))
# Annotate every cell with its integer count.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=_cm_ax)
_cm_ax.set_title("混淆矩阵")
_cm_ax.set_xlabel("预测标签")
_cm_ax.set_ylabel("真实标签")
_cm_fig.tight_layout()
plt.show()

# ===================== 13. SHAP model explanation =====================
print("\n[阶段13] 正在使用 SHAP 解释模型预测结果...")
# Explain at most 5000 test rows, sampled with a fixed seed.
X_sample = X_test.sample(n=min(5000, len(X_test)), random_state=42)
# Build an explainer for the fitted model and compute SHAP values for the sample.
explainer = shap.Explainer(model)
shap_values = explainer(X_sample)

# SHAP bar chart; show=False defers display so the title set below is applied
# before plt.show() is called.
shap.summary_plot(shap_values, X_sample, plot_type="bar", show=False)
plt.title("SHAP 特征重要性 (柱状图)", fontsize=14)
plt.tight_layout()
plt.show()

# SHAP distribution chart (summary_plot's default plot type), titled and shown
# the same way.
shap.summary_plot(shap_values, X_sample, show=False)
plt.title("SHAP 特征重要性 (分布图)", fontsize=14)
plt.tight_layout()
plt.show()
