# Predictive Maintenance – ICE-Komponenten (verbesserte Version)

In diesem Notebook wird ein verbesserter, synthetischer Datensatz für Predictive Maintenance von ICE-Zugkomponenten verwendet.
Die Failure-Rate liegt bei ca. 10 %, und das Label ist physikalisch sinnvoll von Sensorwerten abhängig.

Wir führen eine explorative Datenanalyse (EDA) und die Datenaufbereitung durch, trainieren mehrere ML-Modelle (Logistische Regression, Random Forest, Gradient Boosting) und vergleichen diese anschließend.


## 1. Bibliotheken importieren

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots created afterwards.
sns.set_style('whitegrid')


## 2. Daten laden und Überblick

In [None]:
# The CSV has to be created with the data generator before running this cell.
# NOTE(review): the generator is said to write 'predictive_maintenance_exam_v2.csv',
# while this cell loads the name without '_v2' -- confirm which file is intended.
csv_path = 'predictive_maintenance_exam.csv'
df = pd.read_csv(csv_path)

# Show the first rows as the cell output.
df.head()

In [None]:
# Print a concise summary of the frame: columns, dtypes, non-null counts, memory usage.
df.info()

In [None]:
# Descriptive statistics with describe()'s default settings (no column filter passed).
df.describe()

In [None]:
# Target distribution (failure rate): relative frequency of each label value.
target = df['failure_within_30d']
target.value_counts(normalize=True)

## 3. Explorative Datenanalyse (EDA)

### 3.1 Histogramme numerischer Variablen

In [None]:
# Columns plotted as numeric variables in the exploratory analysis.
numeric_cols = [
    'ambient_temp', 'humidity', 'speed_kmh',
    'distance_km_since_depot', 'days_since_maintenance', 'axle_load_tons',
    'brake_usage_pct', 'vibration_rms', 'bearing_temp',
    'error_count_24h', 'curve_ratio', 'temperature_diff',
    'stress_index',
]

# One histogram per column (30 bins each) on a single 16x12 inch figure.
hist_options = {'figsize': (16, 12), 'bins': 30, 'color': '#ff9999'}
df.loc[:, numeric_cols].hist(**hist_options)
plt.tight_layout()
plt.show()

### 3.2 Korrelationsmatrix (helle Farben)

In [None]:
# Pairwise correlations (DataFrame.corr defaults) of the numeric columns plus the target.
heatmap_cols = [*numeric_cols, 'failure_within_30d']
corr = df[heatmap_cols].corr()

heatmap_options = {
    'cmap': 'Reds',                 # light, easily readable colour palette
    'annot': True,                  # write the coefficient into every cell
    'fmt': '.2f',                   # two decimals per annotation
    'square': True,
    'cbar_kws': {'shrink': 0.8},
}

plt.figure(figsize=(14, 10))
sns.heatmap(corr, **heatmap_options)
plt.title('Correlation Heatmap (helle Darstellung)')
# Rotate the x tick labels so the long column names do not overlap.
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### 3.3 Scatter-/Pairplot ausgewählter Features (mit Failure-Farbe)

In [None]:
# Feature subset for the pair plot; the target column is included because it is used as hue.
scatter_cols = [
    'bearing_temp', 'vibration_rms', 'days_since_maintenance',
    'stress_index', 'failure_within_30d',
]

pair_frame = df[scatter_cols]
pair_options = {'hue': 'failure_within_30d', 'diag_kind': 'hist', 'palette': 'Set1'}
sns.pairplot(pair_frame, **pair_options)
plt.show()

### 3.4 Jointplot: bearing_temp vs. vibration_rms

In [None]:
# Joint scatter plot of bearing temperature against vibration, coloured by the target label.
joint_options = dict(
    x='bearing_temp',
    y='vibration_rms',
    hue='failure_within_30d',
    kind='scatter',
    palette='Set1',
)
sns.jointplot(data=df, **joint_options)
plt.show()

## 4. Datenvorbereitung & Train/Test-Split

In [None]:
# Separate the target label from the feature matrix.
target_col = 'failure_within_30d'
y = df[target_col]
X = df.drop(columns=target_col)

# The listed columns are one-hot encoded; every other feature column is standardised.
categorical = ['train_line', 'shift', 'environment_mode']
numeric = [name for name in X.columns if name not in categorical]

# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()
preprocess = ColumnTransformer(
    transformers=[
        ('cat', encoder, categorical),
        ('num', scaler, numeric),
    ]
)

# 80/20 split, stratified on y, with a fixed seed for reproducibility.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# Cell output: shapes of the training and test feature matrices.
X_train.shape, X_test.shape

## 5. Logistische Regression (Baseline-Modell)

In [None]:
# Baseline model: shared preprocessing followed by a class-weighted logistic regression.
lr_estimator = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg = Pipeline(steps=[('prep', preprocess), ('clf', lr_estimator)])
log_reg.fit(X_train, y_train)

# Column 1 of predict_proba is the second class in classes_
# (assumed to be the failure label 1 -- TODO confirm the label encoding).
y_proba_lr = log_reg.predict_proba(X_test)[:, 1]
above_half_lr = y_proba_lr > 0.5
y_pred_lr = above_half_lr.astype(int)

report_lr = classification_report(y_test, y_pred_lr, zero_division=0)
auc_lr = roc_auc_score(y_test, y_proba_lr)

print('Klassifikationsbericht – Logistische Regression (Threshold 0.5)')
print(report_lr)
print('ROC-AUC (Logistische Regression):', auc_lr)

### 5.1 Confusion-Matrix – Logistische Regression (Threshold 0.5)

In [None]:
# Confusion matrix at threshold 0.5: rows are true classes, columns are predicted classes.
cm_lr = confusion_matrix(y_test, y_pred_lr)

fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Reds', cbar=False, ax=ax)
ax.set_title('Confusion Matrix – LogReg (0.5)')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
fig.tight_layout()
plt.show()

### 5.2 ROC-Kurve – Logistische Regression

In [None]:
# ROC points and their thresholds for the logistic regression, computed on the test split.
fpr_lr, tpr_lr, thr_lr = roc_curve(y_test, y_proba_lr)

fig, ax = plt.subplots()
ax.plot(fpr_lr, tpr_lr, label='LogReg')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC – Logistische Regression')
ax.legend()
fig.tight_layout()
plt.show()

### 5.3 Optimaler Threshold – Logistische Regression

In [None]:
# Pick the ROC operating point that maximises Youden's J statistic (TPR - FPR).
# NOTE(review): the ROC curve was computed on the test split, so this threshold
# is tuned on the same data it is evaluated on and the report below is
# optimistic; ideally select it on a separate validation split.
youden_j_lr = tpr_lr - fpr_lr
best_idx_lr = np.argmax(youden_j_lr)
best_thr_lr = thr_lr[best_idx_lr]
print('Optimaler Threshold (LogReg):', best_thr_lr)

# roc_curve evaluates every threshold as `score >= threshold`. The comparison
# therefore has to be inclusive: a strict `>` drops the sample(s) lying exactly
# on the threshold and yields a different operating point than the one selected.
y_pred_lr_opt = (y_proba_lr >= best_thr_lr).astype(int)
print('Klassifikationsbericht – LogReg (optimaler Threshold)')
print(classification_report(y_test, y_pred_lr_opt, zero_division=0))

# Confusion matrix at the selected threshold (rows: true class, columns: predicted class).
cm_lr_opt = confusion_matrix(y_test, y_pred_lr_opt)
plt.figure(figsize=(4, 3))
sns.heatmap(cm_lr_opt, annot=True, fmt='d', cmap='Reds', cbar=False)
plt.title('Confusion Matrix – LogReg (optimal)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()

## 6. Random Forest

In [None]:
# Random forest with 400 trees; class weights are set to 'balanced_subsample'.
rf_estimator = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    class_weight='balanced_subsample',
)
rf = Pipeline(steps=[('prep', preprocess), ('clf', rf_estimator)])
rf.fit(X_train, y_train)

# Column 1 of predict_proba is the second class in classes_
# (assumed to be the failure label 1 -- TODO confirm the label encoding).
y_proba_rf = rf.predict_proba(X_test)[:, 1]
above_half_rf = y_proba_rf > 0.5
y_pred_rf = above_half_rf.astype(int)

report_rf = classification_report(y_test, y_pred_rf, zero_division=0)
auc_rf = roc_auc_score(y_test, y_proba_rf)

print('Klassifikationsbericht – Random Forest (Threshold 0.5)')
print(report_rf)
print('ROC-AUC (Random Forest):', auc_rf)

### 6.1 Confusion-Matrix – Random Forest (Threshold 0.5)

In [None]:
# Confusion matrix of the random forest at threshold 0.5 (rows: true, columns: predicted).
cm_rf = confusion_matrix(y_test, y_pred_rf)

fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Reds', cbar=False, ax=ax)
ax.set_title('Confusion Matrix – RF (0.5)')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
fig.tight_layout()
plt.show()

### 6.2 ROC-Kurve – Random Forest

In [None]:
# ROC points and their thresholds for the random forest, computed on the test split.
fpr_rf, tpr_rf, thr_rf = roc_curve(y_test, y_proba_rf)

fig, ax = plt.subplots()
ax.plot(fpr_rf, tpr_rf, label='RF')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC – Random Forest')
ax.legend()
fig.tight_layout()
plt.show()

### 6.3 Optimaler Threshold – Random Forest

In [None]:
# Pick the ROC operating point that maximises Youden's J statistic (TPR - FPR).
# NOTE(review): the ROC curve was computed on the test split, so this threshold
# is tuned on the same data it is evaluated on and the report below is
# optimistic; ideally select it on a separate validation split.
youden_j_rf = tpr_rf - fpr_rf
best_idx_rf = np.argmax(youden_j_rf)
best_thr_rf = thr_rf[best_idx_rf]
print('Optimaler Threshold (RF):', best_thr_rf)

# roc_curve evaluates every threshold as `score >= threshold`. Forest
# probabilities are averages over a finite number of trees, so many samples
# can share exactly the threshold value; a strict `>` would drop all of them
# and move away from the selected operating point. Use the inclusive form.
y_pred_rf_opt = (y_proba_rf >= best_thr_rf).astype(int)
print('Klassifikationsbericht – RF (optimaler Threshold)')
print(classification_report(y_test, y_pred_rf_opt, zero_division=0))

# Confusion matrix at the selected threshold (rows: true class, columns: predicted class).
cm_rf_opt = confusion_matrix(y_test, y_pred_rf_opt)
plt.figure(figsize=(4, 3))
sns.heatmap(cm_rf_opt, annot=True, fmt='d', cmap='Reds', cbar=False)
plt.title('Confusion Matrix – RF (optimal)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()

## 7. Gradient Boosting

In [None]:
# Gradient boosting with library defaults apart from the fixed random seed.
gb_estimator = GradientBoostingClassifier(random_state=42)
gb = Pipeline(steps=[('prep', preprocess), ('clf', gb_estimator)])
gb.fit(X_train, y_train)

# Column 1 of predict_proba is the second class in classes_
# (assumed to be the failure label 1 -- TODO confirm the label encoding).
y_proba_gb = gb.predict_proba(X_test)[:, 1]
above_half_gb = y_proba_gb > 0.5
y_pred_gb = above_half_gb.astype(int)

report_gb = classification_report(y_test, y_pred_gb, zero_division=0)
auc_gb = roc_auc_score(y_test, y_proba_gb)

print('Klassifikationsbericht – Gradient Boosting (Threshold 0.5)')
print(report_gb)
print('ROC-AUC (Gradient Boosting):', auc_gb)

### 7.1 Confusion-Matrix – Gradient Boosting (Threshold 0.5)

In [None]:
# Confusion matrix of gradient boosting at threshold 0.5 (rows: true, columns: predicted).
cm_gb = confusion_matrix(y_test, y_pred_gb)

fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm_gb, annot=True, fmt='d', cmap='Reds', cbar=False, ax=ax)
ax.set_title('Confusion Matrix – GB (0.5)')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
fig.tight_layout()
plt.show()

### 7.2 ROC-Kurve – Gradient Boosting

In [None]:
# ROC points and their thresholds for gradient boosting, computed on the test split.
fpr_gb, tpr_gb, thr_gb = roc_curve(y_test, y_proba_gb)

fig, ax = plt.subplots()
ax.plot(fpr_gb, tpr_gb, label='GB')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC – Gradient Boosting')
ax.legend()
fig.tight_layout()
plt.show()

## 8. ROC-Vergleich der Modelle (LogReg, RF, GB)

In [None]:
# Overlay the three ROC curves computed earlier in one figure for a direct comparison.
roc_curves = [
    ('LogReg', fpr_lr, tpr_lr),
    ('RF', fpr_rf, tpr_rf),
    ('GB', fpr_gb, tpr_gb),
]

plt.figure()
for model_name, fpr, tpr in roc_curves:
    plt.plot(fpr, tpr, label=model_name)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC – Modellvergleich')
plt.legend()
plt.tight_layout()
plt.show()

## 9. Abschluss

In [None]:
# Prints a fixed completion message; it does not itself verify that the earlier cells succeeded.
print('Alle Schritte wurden erfolgreich ausgeführt – gut gemacht!')