# In [None]:
# 智慧工厂设备7天内故障预测：A题建模与可视化分析
# 2025“创新杯”大数据挑战赛

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_score, recall_score, roc_curve, auc, precision_recall_curve
)
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
import optuna
import shap
import warnings
warnings.filterwarnings('ignore')

# =========================== 1. Data loading and initial cleaning ===========================
file_path = r'D:/your_path/train_data.csv'  # change to the actual path
# Read the CSV data
df = pd.read_csv(file_path)

# =========================== 2. Exploratory plots ===========================
# Missing-value heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Value Heatmap')
plt.tight_layout()
plt.savefig("missing_values.png")
plt.close()

# Feature correlation heatmap.
# Only numeric/boolean columns are correlated: at this point the frame still
# holds raw string columns (e.g. Machine_Type is label-encoded later), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_input = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 10))
sns.heatmap(corr_input.corr(), annot=False, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig("correlation_heatmap.png")
plt.close()

# Category distributions: Machine_Type and the target
plt.figure(figsize=(6,4))
df['Machine_Type'].value_counts().plot(kind='bar')
plt.title('Machine Type Distribution')
plt.tight_layout()
plt.savefig('machine_type_dist.png')
plt.close()

# sort_index() fixes the bar order by class label, so the first colour
# (green) always belongs to the lower label and red to the higher one,
# regardless of which class happens to be more frequent.
plt.figure(figsize=(6,4))
df['Failure_Within_7_Days'].value_counts().sort_index().plot(kind='bar', color=['green','red'])
plt.title('Failure Within 7 Days Distribution')
plt.tight_layout()
plt.savefig('target_dist.png')
plt.close()

# =========================== 3. Feature cleaning and conversion ===========================
# Drop columns that are not used for modelling
unused_cols = ['Laser_Intensity','Hydraulic_Pressure_bar','Coolant_Flow_L_min','Heat_Index']
df.drop(columns=unused_cols, inplace=True)

# Rows without a label cannot be used for supervised training; imputing the
# target would fabricate labels, so such rows are removed instead.
df.dropna(subset=['Failure_Within_7_Days'], inplace=True)

# Missing-value imputation.
# This runs BEFORE the integer cast and the label encoding below, because both
# of those steps fail on NaN. Numeric columns use the median; every other
# column (strings, object-typed flags) uses its most frequent value, since a
# median is undefined for them.
for col in df.columns[df.isnull().any()]:
    is_plain_numeric = (
        pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
    )
    if is_plain_numeric:
        fill_value = df[col].median()
    else:
        modes = df[col].mode(dropna=True)
        if modes.empty:
            # Column is entirely missing: there is nothing to impute from.
            continue
        fill_value = modes.iloc[0]
    df[col] = df[col].fillna(fill_value)

# Type conversion: boolean flags -> 0/1 integers
for col in ['AI_Supervision','Failure_Within_7_Days']:
    df[col] = df[col].astype(int)

# Categorical encoding. The fitted encoder is kept so the integer codes can be
# mapped back to machine-type names later (inverse_transform).
machine_type_encoder = LabelEncoder()
df['Machine_Type'] = machine_type_encoder.fit_transform(df['Machine_Type'])
# =========================== 4. Outlier detection and handling ===========================
# IQR clipping helper
def clip_iqr(data, features):
    """Clip the given columns to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Args:
        data: DataFrame holding the columns to clip. It is not modified.
        features: Iterable of column names to clip.

    Returns:
        A copy of ``data`` in which every listed column has been clipped.
        Columns whose IQR is zero (or undefined) are returned unchanged.
    """
    clipped = data.copy()
    for f in features:
        q1 = clipped[f].quantile(0.25)
        q3 = clipped[f].quantile(0.75)
        iqr = q3 - q1
        # With a zero IQR both fences equal Q1, so clipping would overwrite
        # every value with a single constant (this happens for skewed binary
        # flags and near-constant columns). `not iqr > 0` also covers NaN.
        if not iqr > 0:
            continue
        clipped[f] = clipped[f].clip(q1 - 1.5*iqr, q3 + 1.5*iqr)
    return clipped

# Example boxplot before outlier handling
plt.figure(figsize=(8,4))
sns.boxplot(x=df['Temperature_C'])
plt.title('Temp Before IQR')
plt.tight_layout()
plt.savefig('temp_before_iqr.png')
plt.close()

# Apply IQR clipping to the continuous features only.
# Excluded on purpose:
#   - Failure_Within_7_Days / Remaining_Useful_Life_days: target-related columns
#   - Machine_Type: label-encoded category; its integer codes have no order
#   - AI_Supervision: binary flag; IQR clipping can collapse it to a constant
#   - Machine_ID: identifier, dropped from the feature matrix later
# errors='ignore' keeps this working when a listed column is non-numeric and
# therefore not part of the numeric column index in the first place.
non_continuous_cols = [
    'Failure_Within_7_Days', 'Remaining_Useful_Life_days',
    'Machine_Type', 'AI_Supervision', 'Machine_ID',
]
num_feats = df.select_dtypes(include=np.number).columns.drop(non_continuous_cols, errors='ignore')
df = clip_iqr(df, num_feats)

# Boxplot after outlier handling, for comparison
plt.figure(figsize=(8,4))
sns.boxplot(x=df['Temperature_C'])
plt.title('Temp After IQR')
plt.tight_layout()
plt.savefig('temp_after_iqr.png')
plt.close()

# =========================== 5. Feature engineering ===========================
# Derived ratio features. Each denominator is shifted by +1 so that a count of
# zero does not produce a division by zero.
hours = df['Operational_Hours']
maint_count = df['Maintenance_History_Count']

df['Avg_Maintenance_Interval'] = hours.div(maint_count.add(1))
df['Historical_Failure_Rate'] = df['Failure_History_Count'].div(hours.add(1))
df['Error_Code_Frequency'] = df['Error_Codes_Last_30_Days'].div(30)
df['Maintenance_Frequency'] = maint_count.div(df['Last_Maintenance_Days_Ago'].add(1))

# =========================== 6. Dataset split and standardisation ===========================
# Separate the label from the feature matrix
y = df['Failure_Within_7_Days']
X = df.drop(columns=['Machine_ID','Remaining_Useful_Life_days','Failure_Within_7_Days'])

# Two stratified splits: first hold out 20% as the test set, then take 20% of
# the remainder as the validation set (64% / 16% / 20% overall).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42
)

# Standardisation: the scaler statistics come from the training split only and
# are then applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train[num_feats])
for part in (X_train, X_val, X_test):
    part[num_feats] = scaler.transform(part[num_feats])

# =========================== 7. Feature selection ===========================
# Fit a LightGBM model on the training split and wrap it (already fitted) in
# SelectFromModel with the median importance as the selection threshold.
sel = LGBMClassifier(random_state=42).fit(X_train, y_train)
selector = SelectFromModel(sel, threshold='median', prefit=True)
X_train_sel, X_val_sel, X_test_sel = (
    selector.transform(part) for part in (X_train, X_val, X_test)
)

# Feature-importance bar chart (all features, in column order)
imp = sel.feature_importances_
feat_names = X_train.columns
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(x=imp, y=feat_names, ax=ax)
ax.set_title('Feature Importance (LightGBM)')
fig.tight_layout()
fig.savefig('feature_importance.png')
plt.close(fig)

# =========================== 8. Class imbalance handling ===========================
# Label distribution before resampling.
# Bars are ordered by class label (sort_index) rather than by count, so the
# colour-to-class mapping is the same in the "before" and "after" figures.
y_counts = y_train.value_counts()
plt.figure(figsize=(6,4))
y_counts.sort_index().plot(kind='bar', color=['green','red'])
plt.title('Train Label Distribution Before SMOTE')
plt.tight_layout()
plt.savefig('label_before_smote.png')
plt.close()

# Oversample the minority class on the training split only; the validation and
# test splits are left untouched.
smote = SMOTE(random_state=42)
X_train_res,y_train_res = smote.fit_resample(X_train_sel, y_train)

# Label distribution after resampling. The classes are balanced here, so the
# count-based order of value_counts() would be arbitrary; sorting by label
# keeps each class on a fixed colour.
plt.figure(figsize=(6,4))
pd.Series(y_train_res).value_counts().sort_index().plot(kind='bar',color=['green','red'])
plt.title('Train Label Distribution After SMOTE')
plt.tight_layout()
plt.savefig('label_after_smote.png')
plt.close()

# =========================== 9. Hyper-parameter tuning and training ===========================
# Number of Optuna trials per model (worth stating in the write-up)
n_trials = 50

def tune_model(model_cls, space_fn, Xd, yd, cv=3, scoring='roc_auc', seed=42):
    """Search hyper-parameters for ``model_cls`` with Optuna and cross-validation.

    Args:
        model_cls: Estimator class; instantiated as ``model_cls(**params)``.
        space_fn: Callable taking an Optuna trial and returning the keyword
            arguments for ``model_cls``.
        Xd: Training features.
        yd: Training labels.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        scoring: Scoring name forwarded to ``cross_val_score``; the study
            maximises its mean across folds.
        seed: Seed for the Optuna sampler so repeated runs explore the same
            trials, matching the fixed ``random_state`` used elsewhere in this
            script. Pass ``None`` for an unseeded search.

    Returns:
        ``study.best_params`` of the finished study.
    """
    # NOTE(review): the call sites pass SMOTE-resampled data, so synthetic
    # samples can end up in the validation folds and make the CV score
    # optimistic — consider resampling inside each fold instead.
    def objective(trial):
        estimator = model_cls(**space_fn(trial))
        return cross_val_score(estimator, Xd, yd, cv=cv, scoring=scoring).mean()

    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

# Search spaces.
# The original script referenced xgb_space / lgb_space / cat_space without
# defining them, which raised NameError on the first tuning call. The ranges
# below are common starting points — NOTE(review): adjust them to the ranges
# reported in the paper.
#
# Fixed (non-tuned) settings are kept in separate dicts because
# study.best_params only contains the values produced by trial.suggest_*;
# they must be passed again when the final model is built.
xgb_fixed = {'random_state': 42}
lgb_fixed = {'random_state': 42, 'verbose': -1}
cat_fixed = {'random_seed': 42, 'verbose': 0, 'allow_writing_files': False}


def xgb_space(trial):
    """Return XGBClassifier keyword arguments sampled for one Optuna trial."""
    return {
        **xgb_fixed,
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }


def lgb_space(trial):
    """Return LGBMClassifier keyword arguments sampled for one Optuna trial."""
    return {
        **lgb_fixed,
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'num_leaves': trial.suggest_int('num_leaves', 15, 127),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }


def cat_space(trial):
    """Return CatBoostClassifier keyword arguments sampled for one Optuna trial."""
    return {
        **cat_fixed,
        'iterations': trial.suggest_int('iterations', 100, 600),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0, log=True),
    }


# Tune each model, then refit it on the full resampled training set with the
# best parameters plus the fixed settings.
xgb_best = tune_model(XGBClassifier, xgb_space, X_train_res, y_train_res)
model_xgb = XGBClassifier(**xgb_fixed, **xgb_best).fit(X_train_res, y_train_res)

lgb_best = tune_model(LGBMClassifier, lgb_space, X_train_res, y_train_res)
model_lgb = LGBMClassifier(**lgb_fixed, **lgb_best).fit(X_train_res, y_train_res)

cat_best = tune_model(CatBoostClassifier, cat_space, X_train_res, y_train_res)
model_cat = CatBoostClassifier(**cat_fixed, **cat_best).fit(X_train_res, y_train_res)

# =========================== 10. Ensemble strategies ===========================
# Soft voting over the three tuned models, fitted on the resampled training set
voting = VotingClassifier(
    estimators=[('xgb', model_xgb), ('lgb', model_lgb), ('cat', model_cat)],
    voting='soft',
).fit(X_train_res, y_train_res)

# Blending (equal weights by default)
def blend_proba(Xd, models=None, weights=None):
    """Average the positive-class probabilities of several fitted models.

    Args:
        Xd: Feature matrix passed to each model's ``predict_proba``.
        models: Fitted classifiers to blend. Defaults to the three tuned
            models ``model_xgb``, ``model_lgb`` and ``model_cat``.
        weights: Optional per-model weights. ``None`` gives every model the
            same weight (a plain mean).

    Returns:
        1-D array with the blended probability of class 1 for each row.
    """
    if models is None:
        # Looked up at call time so the module-level models only need to exist
        # when the default is actually used.
        models = [model_xgb, model_lgb, model_cat]
    member_probs = [m.predict_proba(Xd)[:, 1] for m in models]
    return np.average(member_probs, axis=0, weights=weights)

# Stacking: the three tuned models are the base learners and a LightGBM model
# is the final estimator; passthrough=True is set so the original features are
# forwarded to the final estimator alongside the base predictions.
base_learners = [('xgb', model_xgb), ('lgb', model_lgb), ('cat', model_cat)]
meta_learner = LGBMClassifier(n_estimators=100, random_state=42)
stack = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=3,
    passthrough=True,
)
stack.fit(X_train_res, y_train_res)

# =========================== 11. Model evaluation and visualisation ===========================
def evaluate(name, model, Xv, yv, threshold=0.5):
    """Print metrics for one model and save its ROC and PR curves as PNG files.

    Args:
        name: Display name; also used as the prefix of the saved files
            (``{name}_roc.png`` and ``{name}_pr.png``). The special name
            ``'Blending'`` scores the data with ``blend_proba`` and ignores
            ``model``.
        model: Fitted classifier exposing ``predict_proba`` (unused for
            ``'Blending'``).
        Xv: Evaluation features.
        yv: True labels.
        threshold: Probability cut-off used to turn scores into class labels.

    Returns:
        dict with keys ``'roc_auc'``, ``'precision'`` and ``'recall'`` so the
        models can be compared programmatically as well as from the printout.
    """
    # Scores and hard predictions
    if name=='Blending':
        probs = blend_proba(Xv)
    else:
        probs = model.predict_proba(Xv)[:,1]
    preds = (probs>=threshold).astype(int)

    # ROC curve and area under it
    fpr, tpr, _ = roc_curve(yv, probs)
    roc_auc = auc(fpr, tpr)
    # Precision-recall curve
    prec, rec, _ = precision_recall_curve(yv, probs)

    # Console report
    print(f"{name} AUC: {roc_auc:.4f}")
    print(classification_report(yv,preds))

    # ROC figure
    plt.figure()
    plt.plot(fpr, tpr, label=f'AUC={roc_auc:.2f}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{name} ROC')
    plt.legend()
    plt.savefig(f'{name}_roc.png')
    plt.close()

    # PR figure; legend() is required for the curve label to be drawn
    plt.figure()
    plt.plot(rec, prec, label='PR')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{name} PR Curve')
    plt.legend()
    plt.savefig(f'{name}_pr.png')
    plt.close()

    return {
        'roc_auc': roc_auc,
        'precision': precision_score(yv, preds, zero_division=0),
        'recall': recall_score(yv, preds, zero_division=0),
    }

# Evaluate every model and every ensemble strategy on the validation set.
# 'Blending' has no estimator object: evaluate() recognises the name and
# scores the data with blend_proba(), so None is passed as the model.
for nm,mdl in [('XGB',model_xgb),('LGB',model_lgb),('Cat',model_cat),
               ('Voting',voting),('Blending',None),('Stacking',stack)]:
    evaluate(nm, mdl, X_val_sel, y_val)

# =========================== 12. SHAP interpretability ===========================
# Build the inputs seen by the stacking final estimator and explain that model
X_meta = stack.transform(X_test_sel)
meta = stack.final_estimator_
expl = shap.TreeExplainer(meta)
shap_vals = expl.shap_values(X_meta)

# Depending on the shap version, a binary classifier yields either one
# (n_samples, n_features) array, a per-class list, or a 3-D array with a
# trailing class axis. Keep the positive class so summary_plot always draws
# the per-sample beeswarm rather than a multi-class bar chart.
if isinstance(shap_vals, list):
    shap_vals = shap_vals[-1]
elif np.ndim(shap_vals) == 3:
    shap_vals = shap_vals[:, :, -1]

# Column names for the plot: one probability column per base estimator,
# followed by the passthrough (selected) features.
# NOTE(review): this assumes a binary problem, where each base estimator
# contributes a single column; the names are dropped if the count is off.
selected_names = list(X_train.columns[selector.get_support()])
meta_names = [f'{est_name}_proba' for est_name, _ in stack.estimators] + selected_names
if len(meta_names) != X_meta.shape[1]:
    meta_names = None

# SHAP summary plot
shap.summary_plot(shap_vals, X_meta, feature_names=meta_names, show=False)
# bbox_inches='tight' prevents long feature labels from being cut off
plt.savefig('shap_summary.png', bbox_inches='tight')
plt.close()

# 结束:所有图已保存，可直接插入论文
