# Predictive Maintenance – ICE-Komponenten (Prüfungsprojekt)

In diesem Notebook wird ein konsistenter, synthetischer Datensatz für Predictive Maintenance von ICE-Zugkomponenten verwendet.
Der Datensatz wurde mit einem festen Seed generiert und enthält neben klassischen Sensordaten (z. B. `bearing_temp`, `vibration_rms`, `days_since_maintenance`) auch zusätzliche Features wie `humidity`, `axle_load_tons` und `environment_mode`.

Es werden eine EDA und die Datenaufbereitung durchgeführt sowie mehrere ML-Modelle (Logistische Regression, Random Forest, optional Gradient Boosting) trainiert und bewertet.
Am Ende erscheint die Meldung **"Alle Schritte wurden erfolgreich ausgeführt – gut gemacht!"**, wenn alle Zellen ohne Fehler durchlaufen.

## 1. Bibliotheken importieren

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

%matplotlib inline


## 2. Daten laden und Überblick

In [None]:
# Load the data set from a CSV file given by a relative path and show the
# first rows as the cell output.
df = pd.read_csv('predictive_maintenance_exam.csv')
df.head()

In [None]:
# Print a structural overview of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show descriptive summary statistics of the data set as the cell output.
df.describe()

In [None]:
# Relative frequency of each target value; reveals how imbalanced the label is.
df['failure_within_30d'].value_counts(normalize=True)

## 3. Explorative Datenanalyse (EDA)

### 3.1 Histogramme numerischer Variablen

In [None]:
# Columns plotted as histograms; the list is reused by the correlation cell.
numeric_cols = [
    'ambient_temp', 'humidity', 'speed_kmh',
    'distance_km_since_depot', 'days_since_maintenance',
    'axle_load_tons', 'brake_usage_pct',
    'vibration_rms', 'bearing_temp',
    'error_count_24h', 'curve_ratio',
]

# One 30-bin histogram per listed column, arranged in a single figure.
df.hist(column=numeric_cols, bins=30, figsize=(14, 10))
plt.tight_layout()
plt.show()

### 3.2 Korrelationsmatrix (besser lesbare Heatmap mit Seaborn)

In [None]:
# Pairwise correlations between the numeric features and the target column.
corr = df[[*numeric_cols, 'failure_within_30d']].corr()

# Diverging palette centred on 0 so that positive and negative correlations
# are easy to tell apart; each cell is annotated with its coefficient.
heatmap_options = {
    "cmap": "coolwarm",
    "center": 0,
    "annot": True,
    "fmt": ".2f",
    "square": True,
    "cbar_kws": {"shrink": 0.8},
}

plt.figure(figsize=(12, 9))
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation Heatmap (besser lesbar)")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### 3.3 Scatter-/Pairplot ausgewählter Features

In [None]:
# Pairwise scatter plots for a small selection of features.
scatter_cols = [
    'bearing_temp',
    'vibration_rms',
    'days_since_maintenance',
    'humidity',
]
scatter_matrix(frame=df[scatter_cols], figsize=(10, 10))
plt.suptitle("Scatter Matrix", y=0.9)
plt.show()

### 3.4 Jointplot-ähnliche Darstellung: Lagertemperatur vs. Vibration

In [None]:
from matplotlib import gridspec

# 4x4 grid: the scatter plot fills the lower-left 3x3 cells, while the
# marginal histograms occupy the top row and the right-hand column.
fig = plt.figure(figsize=(8, 8))
gs = gridspec.GridSpec(nrows=4, ncols=4)

# The marginal axes share their value axis with the main plot so that the
# histograms line up with the scatter points.
ax_main = fig.add_subplot(gs[1:, :3])
ax_x = fig.add_subplot(gs[0, :3], sharex=ax_main)
ax_y = fig.add_subplot(gs[1:, 3], sharey=ax_main)

ax_main.scatter(df['bearing_temp'], df['vibration_rms'], s=10)
ax_main.set(xlabel="bearing_temp", ylabel="vibration_rms")

ax_x.hist(df['bearing_temp'], bins=30)
ax_y.hist(df['vibration_rms'], bins=30, orientation='horizontal')

plt.tight_layout()
plt.show()

## 4. Datenvorbereitung & Train/Test-Split

In [None]:
# Separate the target column from the feature matrix.
y = df['failure_within_30d']
X = df.drop(columns='failure_within_30d')

# Categorical features (including environment_mode) get one-hot encoded.
categorical = ['train_line', 'shift', 'environment_mode']

# Every remaining column is treated as numeric and standardised.
# NOTE(review): assumes the CSV holds no ID/timestamp/text columns besides
# the three categorical ones above — confirm against the data.
numeric = [name for name in X.columns if name not in categorical]

preprocess = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numeric),
])

# 80/20 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

X_train.shape, X_test.shape

## 5. Logistische Regression

In [None]:
# Preprocessing and classifier are fitted together as one pipeline on the
# training split; `fit` returns the pipeline itself, so it can be chained.
log_reg = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000)),
]).fit(X_train, y_train)

# Hard labels from `predict`; column 1 of `predict_proba` is used as the
# score of the positive class (assumes the labels are 0/1 — TODO confirm).
y_pred_lr = log_reg.predict(X_test)
y_proba_lr = log_reg.predict_proba(X_test)[:, 1]

print("Klassifikationsbericht – Logistische Regression (Threshold 0.5)")
print(classification_report(y_true=y_test, y_pred=y_pred_lr, zero_division=0))
print("ROC-AUC (Logistische Regression):", roc_auc_score(y_true=y_test, y_score=y_proba_lr))

### 5.1 Confusion-Matrix – Logistische Regression

In [None]:
# Confusion matrix of the logistic-regression labels produced by `predict`.
cm_lr = confusion_matrix(y_test, y_pred_lr)

plt.figure(figsize=(4, 3))
plt.imshow(cm_lr, interpolation='nearest')
plt.title("Confusion Matrix – LogReg (0.5)")
plt.xlabel("Predicted")
plt.ylabel("True")
# Write every count into the centre of its cell; imshow maps the column
# index to x and the row index to y.
for (row, col), count in np.ndenumerate(cm_lr):
    plt.text(col, row, count, ha='center', va='center')
plt.tight_layout()
plt.show()

### 5.2 ROC-Kurve – Logistische Regression

In [None]:
# ROC points of the logistic regression; the thresholds are kept because the
# threshold-selection cell reads them.
fpr_lr, tpr_lr, thr_lr = roc_curve(y_test, y_proba_lr)

roc_fig_lr, roc_ax_lr = plt.subplots()
roc_ax_lr.plot(fpr_lr, tpr_lr, label="LogReg")
# Dashed diagonal as the chance-level reference.
roc_ax_lr.plot([0, 1], [0, 1], '--')
roc_ax_lr.set_xlabel("False Positive Rate")
roc_ax_lr.set_ylabel("True Positive Rate")
roc_ax_lr.set_title("ROC – Logistische Regression")
roc_ax_lr.legend()
roc_fig_lr.tight_layout()
plt.show()

### 5.3 Optimaler Threshold – Logistische Regression

In [None]:
# Pick the ROC operating point that maximises Youden's J statistic
# (J = TPR - FPR), i.e. the point farthest above the chance diagonal.
# NOTE: the threshold is tuned on the test split itself, so the report below
# is optimistic; tune on a separate validation split for an unbiased figure.
best_idx_lr = np.argmax(tpr_lr - fpr_lr)
best_thr_lr = thr_lr[best_idx_lr]
print("Optimaler Threshold (LogReg):", best_thr_lr)

# roc_curve computes each (fpr, tpr) pair for predictions `score >= threshold`,
# and every finite threshold equals an observed score. The comparison must
# therefore be inclusive; a strict `>` would drop the sample(s) sitting exactly
# on the threshold and yield a different operating point than the one selected.
y_pred_lr_opt = (y_proba_lr >= best_thr_lr).astype(int)
print("Klassifikationsbericht – LogReg (optimaler Threshold)")
print(classification_report(y_test, y_pred_lr_opt, zero_division=0))

cm_lr_opt = confusion_matrix(y_test, y_pred_lr_opt)
plt.figure(figsize=(4,3))
plt.imshow(cm_lr_opt, interpolation='nearest')
plt.title("Confusion Matrix – LogReg (optimal)")
plt.xlabel("Predicted")
plt.ylabel("True")
# Annotate each cell with its count (x = column index, y = row index).
for i in range(cm_lr_opt.shape[0]):
    for j in range(cm_lr_opt.shape[1]):
        plt.text(j, i, cm_lr_opt[i,j], ha='center', va='center')
plt.tight_layout()
plt.show()

## 6. Random Forest

In [None]:
# Random forest with 300 trees, balanced class weights and a fixed seed,
# fitted behind the shared preprocessing step.
forest_clf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=300,
    random_state=42,
)
rf = Pipeline(steps=[('prep', preprocess), ('clf', forest_clf)]).fit(X_train, y_train)

# Hard labels from `predict`; column 1 of `predict_proba` is used as the
# score of the positive class (assumes the labels are 0/1 — TODO confirm).
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("Klassifikationsbericht – Random Forest (Threshold 0.5)")
print(classification_report(y_true=y_test, y_pred=y_pred_rf, zero_division=0))
print("ROC-AUC (Random Forest):", roc_auc_score(y_true=y_test, y_score=y_proba_rf))

### 6.1 Confusion-Matrix – Random Forest

In [None]:
# Confusion matrix of the random-forest labels produced by `predict`.
cm_rf = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(4, 3))
plt.imshow(cm_rf, interpolation='nearest')
plt.title("Confusion Matrix – RF (0.5)")
plt.xlabel("Predicted")
plt.ylabel("True")
# Write every count into the centre of its cell; imshow maps the column
# index to x and the row index to y.
for (row, col), count in np.ndenumerate(cm_rf):
    plt.text(col, row, count, ha='center', va='center')
plt.tight_layout()
plt.show()

### 6.2 ROC-Kurve – Random Forest

In [None]:
# ROC points of the random forest; the thresholds are kept because the
# threshold-selection cell reads them.
fpr_rf, tpr_rf, thr_rf = roc_curve(y_test, y_proba_rf)

roc_fig_rf, roc_ax_rf = plt.subplots()
roc_ax_rf.plot(fpr_rf, tpr_rf, label="RF")
# Dashed diagonal as the chance-level reference.
roc_ax_rf.plot([0, 1], [0, 1], '--')
roc_ax_rf.set_xlabel("False Positive Rate")
roc_ax_rf.set_ylabel("True Positive Rate")
roc_ax_rf.set_title("ROC – Random Forest")
roc_ax_rf.legend()
roc_fig_rf.tight_layout()
plt.show()

### 6.3 Optimaler Threshold – Random Forest

In [None]:
# Pick the ROC operating point that maximises Youden's J statistic
# (J = TPR - FPR), i.e. the point farthest above the chance diagonal.
# NOTE: the threshold is tuned on the test split itself, so the report below
# is optimistic; tune on a separate validation split for an unbiased figure.
best_idx_rf = np.argmax(tpr_rf - fpr_rf)
best_thr_rf = thr_rf[best_idx_rf]
print("Optimaler Threshold (RF):", best_thr_rf)

# roc_curve computes each (fpr, tpr) pair for predictions `score >= threshold`,
# and every finite threshold equals an observed score. The comparison must
# therefore be inclusive; a strict `>` would drop the sample(s) sitting exactly
# on the threshold and yield a different operating point than the one selected.
y_pred_rf_opt = (y_proba_rf >= best_thr_rf).astype(int)
print("Klassifikationsbericht – RF (optimaler Threshold)")
print(classification_report(y_test, y_pred_rf_opt, zero_division=0))

cm_rf_opt = confusion_matrix(y_test, y_pred_rf_opt)
plt.figure(figsize=(4,3))
plt.imshow(cm_rf_opt, interpolation='nearest')
plt.title("Confusion Matrix – RF (optimal)")
plt.xlabel("Predicted")
plt.ylabel("True")
# Annotate each cell with its count (x = column index, y = row index).
for i in range(cm_rf_opt.shape[0]):
    for j in range(cm_rf_opt.shape[1]):
        plt.text(j, i, cm_rf_opt[i,j], ha='center', va='center')
plt.tight_layout()
plt.show()

## 7. Gradient Boosting (optional)

In [None]:
# Gradient boosting with library defaults apart from the fixed seed, fitted
# behind the shared preprocessing step. No class weighting is configured for
# this model.
gb = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', GradientBoostingClassifier(random_state=42)),
]).fit(X_train, y_train)

# Hard labels from `predict`; column 1 of `predict_proba` is used as the
# score of the positive class (assumes the labels are 0/1 — TODO confirm).
y_pred_gb = gb.predict(X_test)
y_proba_gb = gb.predict_proba(X_test)[:, 1]

print("Klassifikationsbericht – Gradient Boosting (Threshold 0.5)")
print(classification_report(y_true=y_test, y_pred=y_pred_gb, zero_division=0))
print("ROC-AUC (Gradient Boosting):", roc_auc_score(y_true=y_test, y_score=y_proba_gb))

## 8. ROC-Vergleich der Modelle

In [None]:
# ROC points for gradient boosting; the LogReg and RF curves are reused from
# the cells that computed them.
fpr_gb, tpr_gb, thr_gb = roc_curve(y_test, y_proba_gb)

plt.figure()
# Draw all three model curves into the same axes, in a fixed order.
for curve_fpr, curve_tpr, curve_label in (
    (fpr_lr, tpr_lr, "LogReg"),
    (fpr_rf, tpr_rf, "RF"),
    (fpr_gb, tpr_gb, "GB"),
):
    plt.plot(curve_fpr, curve_tpr, label=curve_label)
# Dashed diagonal as the chance-level reference.
plt.plot([0, 1], [0, 1], '--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC – Modellvergleich")
plt.legend()
plt.tight_layout()
plt.show()

## 9. Abschluss

In [None]:
# Completion marker: when the notebook is run top to bottom, this line is only
# reached if no earlier cell raised an error.
print("Alle Schritte wurden erfolgreich ausgeführt – gut gemacht!")