In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 数据预处理与建模工具
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    accuracy_score, precision_score, recall_score, f1_score, roc_curve
)

# 机器学习模型
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 数据不平衡处理
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 显示设置
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Replace with the path to your own copy of the training data.
file_path = r'D:\桌面\2025年第四届“创新杯”（原钉钉杯）大学生大数据挑战赛初赛题目\2025年第四届“创新杯”（原钉钉杯）大学生大数据挑战赛初赛题目\A题\data\train_data.csv'
df = pd.read_csv(file_path)

# Initial sanity checks: shape, schema, missing values per column, duplicate rows.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
print(df.isnull().sum())
print(df.duplicated().sum())


(400000, 22)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 22 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Machine_ID                  400000 non-null  object 
 1   Machine_Type                400000 non-null  object 
 2   Installation_Year           400000 non-null  int64  
 3   Operational_Hours           400000 non-null  int64  
 4   Temperature_C               400000 non-null  float64
 5   Vibration_mms               400000 non-null  float64
 6   Sound_dB                    400000 non-null  float64
 7   Oil_Level_pct               400000 non-null  float64
 8   Coolant_Level_pct           400000 non-null  float64
 9   Power_Consumption_kW        400000 non-null  float64
 10  Last_Maintenance_Days_Ago   400000 non-null  int64  
 11  Maintenance_History_Count   400000 non-null  int64  
 12  Failure_History_Count       400000 non-null  int64  
 13  A

In [3]:
# Drop columns in which more than 90% of the values are missing.
high_na_cols = df.columns[df.isnull().mean() > 0.9]
df = df.drop(columns=high_na_cols)

# Drop columns that are not used as predictors.
# NOTE(review): 'Remaining_Useful_Life_days' looks like it would leak the
# target rather than merely lack predictive value — confirm against the
# task description.
# errors='ignore' keeps this cell re-runnable: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Machine_ID', 'Remaining_Useful_Life_days'], errors='ignore')

# Cast the target column to integer.
df['Failure_Within_7_Days'] = df['Failure_Within_7_Days'].astype(int)

In [4]:
from datetime import datetime

# 1. Machine age in years.
# The reference year is pinned instead of read from the wall clock so that
# Machine_Age does not silently change when the notebook is re-run in a
# later year.
# NOTE(review): 2025 is taken from the competition year in the data path —
# adjust if the data snapshot is from a different year.
REFERENCE_YEAR = 2025
# Guarded so the cell can be re-run after Installation_Year has been dropped.
if 'Installation_Year' in df.columns:
    df['Machine_Age'] = REFERENCE_YEAR - df['Installation_Year']
    df = df.drop(columns='Installation_Year')

# 2. Interaction features.
# "+ 1" in the denominators avoids division by zero for machines with no
# recorded maintenance; the 1e-6 epsilon does the same for zero vibration.
maintenance_events = df['Maintenance_History_Count'] + 1
df['Hours_per_Maintenance'] = df['Operational_Hours'] / maintenance_events
df['Failure_Rate_per_Maintenance'] = df['Failure_History_Count'] / maintenance_events
df['Power_Vibration_Ratio'] = df['Power_Consumption_kW'] / (df['Vibration_mms'] + 1e-6)


In [5]:
# One-hot encode Machine_Type (drop_first=True omits the first category's
# indicator column), then cast AI_Supervision to int (the original note
# describes this column as a boolean flag).
df = (
    pd.get_dummies(df, columns=['Machine_Type'], drop_first=True)
      .astype({'AI_Supervision': int})
)


In [6]:
# Numeric feature columns (the target is excluded).
numeric_cols = df.select_dtypes(include=[np.number]).drop(columns=['Failure_Within_7_Days']).columns

# Winsorize each continuous feature to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# NOTE(review): the quantiles are computed on the full frame, i.e. before the
# train/test split that happens later in the notebook.
for col in numeric_cols:
    # Indicator-style columns (at most two distinct values) are not
    # continuous measurements; IQR clipping can flatten them to a single
    # constant, so leave them untouched.
    if df[col].nunique(dropna=True) <= 2:
        continue

    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    # With IQR == 0 both fences coincide and clipping would overwrite every
    # value in the column with the same number.
    if IQR == 0:
        continue

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower, upper=upper)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek

# 1. Stratified 80/20 train/test split.
X = df.drop('Failure_Within_7_Days', axis=1)
y = df['Failure_Within_7_Days']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 2. Standardize, fitting the scaler on the training split only.
# This runs BEFORE resampling: SMOTE and Tomek links both rely on
# nearest-neighbour distances, which would otherwise be dominated by the
# features with the largest raw magnitudes.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Over-sampling + under-sampling of the training split, in scaled space.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt;
# SMOTETomek on a training split of this size may be very slow — consider
# plain SMOTE or resampling a subsample if it does not finish.
smote_tomek = SMOTETomek(random_state=42)
X_train_res_scaled, y_train_res = smote_tomek.fit_resample(X_train_scaled, y_train)

# Keep X_train_res available in the original feature units for any later
# cell that expects it (standardization is affine, so it inverts exactly up
# to floating-point rounding).
X_train_res = pd.DataFrame(
    scaler.inverse_transform(X_train_res_scaled), columns=X_train.columns
)


KeyboardInterrupt: 