# Predictive Maintenance – Realistischer Datensatz

In diesem Notebook wird ein realistischer synthetischer Datensatz für Predictive Maintenance von Zugkomponenten analysiert. Zunächst wird eine explorative Datenanalyse (EDA) durchgeführt; anschließend werden zwei Machine-Learning-Modelle (logistische Regression und Random Forest) trainiert und bewertet.

## 1. Bibliotheken importieren

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

%matplotlib inline


## 2. Daten laden und erste Übersicht

In [None]:
# Load the dataset. The path is relative, so the CSV file has to sit in the
# same folder the notebook is run from.
DATA_FILE = "realistic_predictive_maintenance.csv"
df = pd.read_csv(DATA_FILE)

# First look at the data: show the leading rows.
df.head()


In [None]:
# Print pandas' summary of the DataFrame (`DataFrame.info`).
df.info()

In [None]:
# Show pandas' descriptive statistics for the DataFrame (`DataFrame.describe`).
df.describe()

In [None]:
# Distribution of the target variable (failure within 30 days), expressed as
# relative frequencies per class rather than absolute counts.
target = df['failure_within_30d']
target.value_counts(normalize=True)


## 3. Explorative Datenanalyse (EDA)

### 3.1 Histogramme wichtiger Merkmale

In [None]:
# 2x2 grid of 30-bin histograms, one panel per selected feature.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

# (column name, panel title) in the order the panels are filled.
hist_panels = [
    ('temperature', "Temperature"),
    ('vibration_level', "Vibration Level"),
    ('operating_hours', "Operating Hours"),
    ('days_since_last_maintenance', "Days Since Last Maintenance"),
]

# `axs.flat` walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, (column, title) in zip(axs.flat, hist_panels):
    ax.hist(df[column], bins=30)
    ax.set_title(title)

plt.tight_layout()
plt.show()


### 3.2 Korrelationsmatrix

In [None]:
# Pairwise correlations of the numeric columns, drawn as a heatmap.
corr = df.corr(numeric_only=True)

# The matrix is square, so the same labels and positions serve both axes.
corr_labels = corr.columns
tick_positions = range(len(corr_labels))

plt.figure(figsize=(8, 6))
plt.imshow(corr, interpolation='nearest')
plt.title("Correlation Heatmap")
plt.xticks(tick_positions, corr_labels, rotation=90)
plt.yticks(tick_positions, corr_labels)
plt.colorbar()
plt.tight_layout()
plt.show()


### 3.3 Scatter-Matrix ausgewählter Merkmale

In [None]:
# Pairwise scatter plots (with per-feature histograms on the diagonal) for a
# handful of selected features.
scatter_cols = ['temperature', 'vibration_level', 'operating_hours', 'days_since_last_maintenance']

# scatter_matrix opens its own figure (sized via `figsize`) when no `ax` is
# passed. Calling plt.figure() beforehand would only leave an additional empty
# figure in the notebook output, so no figure is created manually here.
scatter_matrix(df[scatter_cols], figsize=(10, 10))

# The scatter-matrix figure is the current figure, so the title lands on it.
plt.suptitle("Scatter Matrix")
plt.show()


### 3.4 Zusammenhang Temperatur vs. Vibration

In [None]:
from matplotlib import gridspec

# Scatter plot of temperature against vibration level with a marginal
# histogram for each of the two variables, laid out on a 4x4 grid.
temperature = df['temperature']
vibration = df['vibration_level']

fig = plt.figure(figsize=(8, 8))
gs = gridspec.GridSpec(4, 4)

# Main panel: lower-left 3x3 cells. The top strip shares its x-axis and the
# right strip shares its y-axis with the main panel.
ax_main = fig.add_subplot(gs[1:, :3])
ax_xhist = fig.add_subplot(gs[0, :3], sharex=ax_main)
ax_yhist = fig.add_subplot(gs[1:, 3], sharey=ax_main)

ax_main.scatter(temperature, vibration, s=10)
ax_main.set(xlabel="temperature", ylabel="vibration_level")

# Marginal distributions; the vibration histogram is drawn sideways so that
# its bins line up with the y-axis of the scatter plot.
ax_xhist.hist(temperature, bins=30)
ax_yhist.hist(vibration, bins=30, orientation='horizontal')

plt.tight_layout()
plt.show()


## 4. Datenvorbereitung & Train/Test-Split

In [None]:
# Split the frame into the target column and the remaining feature columns.
target_col = 'failure_within_30d'
y = df[target_col]
X = df.drop(columns=[target_col])

# 'weekday' is treated as categorical; every other feature column is handled
# as numeric.
categorical = ['weekday']
numeric = [name for name in X.columns if name not in categorical]

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and pass the numeric columns through unchanged.
weekday_encoder = OneHotEncoder(handle_unknown='ignore')
preprocess = ColumnTransformer(
    transformers=[
        ('cat', weekday_encoder, categorical),
        ('num', 'passthrough', numeric),
    ]
)

# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# the same in both parts.
split_parts = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

(X_train.shape, X_test.shape)


## 5. Logistische Regression

In [None]:
# Logistic regression baseline.
#
# LogisticRegression is regularised by default, and both the penalty and the
# iterative solver are sensitive to the scale of the inputs. The shared
# `preprocess` transformer passes numeric columns through unscaled, so this
# pipeline uses its own transformer: identical one-hot encoding for the
# categorical columns, plus standardisation (zero mean, unit variance) of the
# numeric columns. The scaler is fitted on the training split only because it
# lives inside the pipeline.
from sklearn.preprocessing import StandardScaler

lr_preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', StandardScaler(), numeric),
    ]
)

log_reg = Pipeline(steps=[
    ('preprocess', lr_preprocess),
    ('lr', LogisticRegression(max_iter=500, class_weight='balanced'))
])

log_reg.fit(X_train, y_train)

# Hard class predictions and the probability of the class listed second in
# `classes_` (the positive class, assuming a binary 0/1 target).
y_pred_lr = log_reg.predict(X_test)
y_proba_lr = log_reg.predict_proba(X_test)[:, 1]

print("### Klassifikationsbericht – Logistische Regression")
print(classification_report(y_test, y_pred_lr))
print("ROC-AUC (Logistische Regression):", roc_auc_score(y_test, y_proba_lr))


### 5.1 Confusion-Matrix (Logistische Regression)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split,
# drawn as an image with the count written into each cell.
cm_lr = confusion_matrix(y_test, y_pred_lr)

plt.figure(figsize=(4, 3))
plt.imshow(cm_lr, interpolation='nearest')
plt.title("Confusion Matrix – Logistische Regression")
plt.xlabel("Predicted")
plt.ylabel("True")

# imshow maps matrix rows to the y-axis and columns to the x-axis, hence the
# (col, row) order when placing the text.
for (row, col), count in np.ndenumerate(cm_lr):
    plt.text(col, row, count, ha="center", va="center")

plt.tight_layout()
plt.show()


### 5.2 ROC-Kurve (Logistische Regression)

In [None]:
# ROC curve of the logistic regression on the test split; the third return
# value of roc_curve (the thresholds) is not needed for the plot.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr_lr, tpr_lr, label="Logistische Regression")
# Dashed diagonal as the reference line of a random classifier.
roc_ax.plot([0, 1], [0, 1], linestyle="--")
roc_ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC-Kurve – Logistische Regression",
)
roc_ax.legend()
roc_fig.tight_layout()
plt.show()


## 6. Random-Forest-Modell

In [None]:
# Random forest on top of the shared preprocessing step: 200 trees, a fixed
# seed for reproducibility, and balanced class weights.
forest = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced',
)
rf = Pipeline([
    ('preprocess', preprocess),
    ('rf', forest),
])

rf.fit(X_train, y_train)

# Hard class predictions and the probability of the class listed second in
# `classes_` (the positive class, assuming a binary 0/1 target).
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("### Klassifikationsbericht – Random Forest")
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC (Random Forest):", roc_auc_score(y_test, y_proba_rf))


### 6.1 Confusion-Matrix (Random Forest)

In [None]:
# Confusion matrix of the random-forest predictions on the test split, drawn
# as an image with the count written into each cell.
cm_rf = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(4, 3))
plt.imshow(cm_rf, interpolation='nearest')
plt.title("Confusion Matrix – Random Forest")
plt.xlabel("Predicted")
plt.ylabel("True")

# imshow maps matrix rows to the y-axis and columns to the x-axis, hence the
# (col, row) order when placing the text.
for (row, col), count in np.ndenumerate(cm_rf):
    plt.text(col, row, count, ha="center", va="center")

plt.tight_layout()
plt.show()


### 6.2 ROC-Kurve (Random Forest)

In [None]:
# ROC curve of the random forest on the test split; the third return value of
# roc_curve (the thresholds) is not needed for the plot.
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr_rf, tpr_rf, label="Random Forest")
# Dashed diagonal as the reference line of a random classifier.
roc_ax.plot([0, 1], [0, 1], linestyle="--")
roc_ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC-Kurve – Random Forest",
)
roc_ax.legend()
roc_fig.tight_layout()
plt.show()


## 7. Modellvergleich über ROC-Kurve

In [None]:
# Both ROC curves in one figure for a direct model comparison. The
# false/true positive rates come from the earlier roc_curve calls.
roc_curves = [
    (fpr_lr, tpr_lr, "Logistische Regression"),
    (fpr_rf, tpr_rf, "Random Forest"),
]

plt.figure()
for fpr, tpr, model_name in roc_curves:
    plt.plot(fpr, tpr, label=model_name)

# Dashed diagonal as the reference line of a random classifier.
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-Kurven – Modellvergleich")
plt.legend()
plt.tight_layout()
plt.show()
