# In [None]:
# NOTE(review): looks like a leftover debug/progress marker printed before the
# imports run — confirm it is still wanted.
print("111111111111111111111111111111111111111111111111111111111111111111111")
import time
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
# ---------------------------------------------------------------------------
# Data loading, feature engineering and train/test split
# ---------------------------------------------------------------------------
df = pd.read_csv("train_data.csv")
TARGET = "Remaining_Useful_Life_days"
base_features = [
    'Operational_Hours', 'Temperature_C', 'Vibration_mms',
    'Oil_Level_pct', 'Coolant_Level_pct',
    'Maintenance_History_Count', 'Failure_History_Count'
]
optional_features = [
    'Sound_dB', 'Power_Consumption_kW',
    'Last_Maintenance_Days_Ago', 'Machine_Type',
    'AI_Supervision', 'Installation_Year'
]
# Keep only the candidate columns that are actually present in the CSV.
all_features = [c for c in base_features + optional_features if c in df.columns]

# Rows without a target value cannot be used for supervised training.
df = df[df[TARGET].notna()].copy()

# Integer-encode the categorical columns. Values are stringified first, so a
# missing value becomes its own category ("nan") instead of raising.
for c in ['Machine_Type', 'AI_Supervision']:
    if c in all_features:
        df[c] = LabelEncoder().fit_transform(df[c].astype(str))

# Row-wise ratio features. The "+ 1" keeps a zero count / zero hours from
# producing a division by zero.
if 'Operational_Hours' in df and 'Maintenance_History_Count' in df:
    df['Hours_per_Maintenance'] = df['Operational_Hours'] / (df['Maintenance_History_Count'] + 1)
    all_features.append('Hours_per_Maintenance')
if 'Failure_History_Count' in df and 'Operational_Hours' in df:
    df['Failure_Rate'] = df['Failure_History_Count'] / (df['Operational_Hours'] + 1)
    all_features.append('Failure_Rate')

X = df[all_features]
y = df[TARGET].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Work on independent copies so the column assignments below neither write
# back into `X` nor trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Clip extreme sensor readings to the 1st-99th percentile range. The bounds
# are estimated on the training split only and then applied to both splits;
# computing them on the full data set (before splitting) would let test rows
# influence the preprocessing, i.e. leak information into the evaluation.
for c in ['Temperature_C', 'Vibration_mms', 'Sound_dB']:
    if c in all_features:
        lo, hi = X_train[c].quantile(0.01), X_train[c].quantile(0.99)
        X_train[c] = X_train[c].clip(lo, hi)
        X_test[c] = X_test[c].clip(lo, hi)

print(f"训练集：{X_train.shape}, 测试集：{X_test.shape}, 特征数：{len(all_features)}")
def evaluate(name, model, X_tr, X_te, y_tr, y_te, cv=True, *,
             n_splits=5, n_jobs=8, random_state=42):
    """Fit ``model``, score it on the hold-out split and optionally cross-validate.

    One summary line (MSE, RMSE, R2, optional CV R2, elapsed seconds) is
    printed per call.

    Args:
        name: Label used in the printed summary and in the returned record.
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place on ``X_tr`` / ``y_tr``.
        X_tr: Training features.
        X_te: Hold-out features used for the reported MSE / RMSE / R2.
        y_tr: Training targets.
        y_te: Hold-out targets.
        cv: When truthy, additionally run shuffled K-fold cross-validation on
            the training split and report the mean R2 across folds.
        n_splits: Number of folds for the cross-validation.
        n_jobs: Number of parallel workers passed to ``cross_val_score``.
        random_state: Seed for the K-fold shuffling.

    Returns:
        dict with keys ``"model"``, ``"MSE"``, ``"RMSE"``, ``"R2"`` and
        ``"CV_R2"`` (``None`` when ``cv`` is falsy).
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()

    model.fit(X_tr, y_tr)
    pred = model.predict(X_te)
    mse = mean_squared_error(y_te, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_te, pred)

    cv_r2 = None
    if cv:
        kf = KFold(n_splits, shuffle=True, random_state=random_state)
        cv_r2 = cross_val_score(
            model, X_tr, y_tr, cv=kf, scoring='r2', n_jobs=n_jobs
        ).mean()

    # The elapsed time covers fit + predict + (optional) cross-validation.
    elapsed = time.perf_counter() - started
    cv_part = f"  (CV_R2={cv_r2:.4f})" if cv_r2 is not None else ""
    print(f"[{name}]  MSE={mse:.1f}  RMSE={rmse:.3f}  R2={r2:.4f}"
          f"{cv_part}  [{elapsed:.1f}s]")
    return {"model": name, "MSE": mse, "RMSE": rmse, "R2": r2, "CV_R2": cv_r2}

# ---------------------------------------------------------------------------
# Candidate models
# ---------------------------------------------------------------------------
# Linear baselines: median imputation and standardisation precede the regressor.
lin = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("lr", LinearRegression()),
])
ridge = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
    ("rg", RidgeCV(alphas=[0.1, 1, 10])),
])

# Tree ensembles, all seeded with 42 and limited to 8 threads.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=8,
)
xgb = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    n_jobs=8,
    random_state=42,
)
lgbm = LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=8,
    random_state=42,
    verbose=-1,
)
cat = CatBoostRegressor(
    iterations=400,
    learning_rate=0.05,
    depth=6,
    verbose=False,
    thread_count=8,
    random_seed=42,
)

# Stacked ensemble: the four tree models feed a RidgeCV meta-learner, which
# also receives the original features (passthrough=True).
base_models = [
    ('rf', rf),
    ('xgb', xgb),
    ('lgbm', lgbm),
    ('cat', cat),
]
meta = RidgeCV(alphas=[0.1, 1, 10])
stack = StackingRegressor(
    estimators=base_models,
    final_estimator=meta,
    cv=5,
    passthrough=True,
    n_jobs=8,
)

# ---------------------------------------------------------------------------
# Benchmark every candidate in a fixed order and rank by hold-out RMSE
# ---------------------------------------------------------------------------
# A plain loop (rather than a comprehension) keeps the records collected so
# far in `results` if one of the evaluations fails part-way through.
results = []
for label, estimator in [
    ("Linear", lin),
    ("RidgeCV", ridge),
    ("RandomForest", rf),
    ("XGB", xgb),
    ("LightGBM", lgbm),
    ("CatBoost", cat),
    ("Stacking", stack),
]:
    results.append(evaluate(label, estimator, X_train, X_test, y_train, y_test))

df_res = pd.DataFrame(results).sort_values("RMSE")
print("\n====== 最终结果 排名======")
print(df_res.to_string(index=False))
import matplotlib.pyplot as plt
import numpy as np
# ---------------------------------------------------------------------------
# Diagnostic plots for the stacked model (plus XGBoost feature importances)
# ---------------------------------------------------------------------------
# All titles and axis labels below are Chinese. Matplotlib's default font has
# no CJK glyphs, so without this the text is drawn as empty boxes. Common
# CJK-capable families are put in front of whatever is already configured;
# families that are not installed are simply skipped by the font lookup.
cjk_fonts = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei',
]
plt.rcParams['font.sans-serif'] = cjk_fonts + [
    f for f in plt.rcParams['font.sans-serif'] if f not in cjk_fonts
]
# Many CJK fonts lack the Unicode minus sign (U+2212); fall back to the ASCII
# hyphen so negative tick labels (e.g. residuals) still render.
plt.rcParams['axes.unicode_minus'] = False

y_pred = stack.predict(X_test)
residuals = y_test - y_pred

# 1. Predicted vs. actual values; the dashed red diagonal marks a perfect fit.
diag = [y_test.min(), y_test.max()]
plt.figure(figsize=(6,6))
plt.scatter(y_test, y_pred, alpha=0.4)
plt.plot(diag, diag, 'r--')
plt.xlabel("实际值")
plt.ylabel("预测值")
plt.title("预测值 vs 实际值")
plt.grid(True)
plt.tight_layout()
plt.show()

# 2. Residuals against predictions; the dashed red line marks zero error.
plt.figure(figsize=(8,4))
plt.scatter(y_pred, residuals, alpha=0.3)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel("预测值")
plt.ylabel("残差")
plt.title("残差图")
plt.grid(True)
plt.tight_layout()
plt.show()

# 3. Histogram of the residuals.
plt.figure(figsize=(6,4))
plt.hist(residuals, bins=30, color='skyblue', edgecolor='black')
plt.title("残差分布直方图")
plt.xlabel("残差")
plt.ylabel("频数")
plt.grid(True)
plt.tight_layout()
plt.show()

# 4. Feature importances, using the standalone XGBoost model as the example.
importances = xgb.feature_importances_
feat_names = X_train.columns
sorted_idx = np.argsort(importances)[::-1]

# Up to 15 most important features, reversed so that the most important one
# ends up at the top of the horizontal bar chart.
top_asc = sorted_idx[:15][::-1]
positions = range(len(top_asc))

plt.figure(figsize=(8,6))
plt.barh(positions, importances[top_asc])
plt.yticks(positions, feat_names[top_asc])
plt.xlabel("重要性")
plt.title("XGBoost特征重要性")
plt.tight_layout()
plt.show()
