# Predictive Maintenance – Realistischer Datensatz (Verbesserte Version)
Dieses Notebook enthält:
- StandardScaler für numerische Features
- Optimalen ROC-basierten Threshold für LR & RF
- EDA & ML mit realistischer Performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

%matplotlib inline


## Daten laden

In [None]:
# Read the predictive-maintenance dataset from the CSV file in the working
# directory and preview the first rows as the cell output.
DATA_PATH = 'realistic_predictive_maintenance.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Display pandas' descriptive summary of the loaded frame as the cell output.
df.describe()

## EDA – Histogramme

In [None]:
# Draw one 30-bin histogram per selected column on a 2x2 grid.
# Panels are listed in row-major order: top-left, top-right, bottom-left,
# bottom-right.
hist_panels = [
    ('temperature', "Temperature"),
    ('vibration_level', "Vibration Level"),
    ('operating_hours', "Operating Hours"),
    ('days_since_last_maintenance', "Days Since Last Maintenance"),
]

fig, axs = plt.subplots(2, 2, figsize=(12, 8))
# axs.flat walks the axes grid row by row, matching the order of hist_panels.
for ax, (column, title) in zip(axs.flat, hist_panels):
    ax.hist(df[column], bins=30)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## EDA – Korrelationsmatrix

In [None]:
# Pairwise correlation of the numeric columns, rendered as a heatmap whose
# ticks are labelled with the column names on both axes.
corr = df.corr(numeric_only=True)
corr_labels = corr.columns
tick_positions = range(len(corr_labels))

plt.figure(figsize=(8, 6))
plt.imshow(corr, interpolation='nearest')
plt.xticks(tick_positions, corr_labels, rotation=90)  # rotated so long names do not overlap
plt.yticks(tick_positions, corr_labels)
plt.colorbar()
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

## EDA – Scatter-Matrix

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for
# the four sensor/usage columns.
scatter_cols = [
    'temperature',
    'vibration_level',
    'operating_hours',
    'days_since_last_maintenance',
]
scatter_frame = df[scatter_cols]
scatter_matrix(scatter_frame, figsize=(10, 10))
plt.show()

## Datenvorbereitung mit StandardScaler (Verbessert)

In [None]:
# Separate the target column from the feature columns.
TARGET = 'failure_within_30d'
y = df[TARGET]
X = df.drop(columns=TARGET)

# 'weekday' is one-hot encoded; every other feature column is standardised.
categorical = ['weekday']
numeric = [name for name in X if name not in categorical]

# Categories not seen during fitting are ignored (encoded as all zeros) rather
# than raising at prediction time.
encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()
preprocess = ColumnTransformer(
    transformers=[
        ('cat', encoder, categorical),
        ('num', scaler, numeric),
    ]
)

# 80/20 split, stratified on the target so both parts keep the same class
# ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Cell output: the shapes of the two feature frames.
(X_train.shape, X_test.shape)

## Logistische Regression

In [None]:
# Logistic regression on the preprocessed features. class_weight='balanced'
# reweights the classes inversely to their frequency in the training data.
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg = Pipeline(steps=[
    ('prep', preprocess),
    ('lr', lr_model),
]).fit(X_train, y_train)

# Hard labels (default decision rule) and the score of the second class
# column, used as the positive-class probability.
# NOTE(review): assumes the positive class is the second entry of classes_ — confirm.
y_pred_lr = log_reg.predict(X_test)
proba_matrix_lr = log_reg.predict_proba(X_test)
y_proba_lr = proba_matrix_lr[:, 1]

print(classification_report(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, y_proba_lr))

### ROC-Kurve LR

In [None]:
# ROC curve of the logistic-regression scores on the test split, with the
# diagonal drawn as the chance-level reference.
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_proba_lr)

plt.plot(fpr_lr, tpr_lr)
plt.plot([0, 1], [0, 1], '--')
plt.title("ROC – LR")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()

### Optimaler Threshold LR (Youden-Index: max. TPR − FPR)

In [None]:
# Pick the ROC operating point that maximises Youden's J statistic
# (TPR - FPR) and report the score threshold that produces it.
best_idx_lr = np.argmax(tpr_lr - fpr_lr)
best_th_lr = thresholds_lr[best_idx_lr]
print("Optimaler LR Threshold:", best_th_lr)

# roc_curve derives each (FPR, TPR) pair by labelling scores >= threshold as
# positive, so the comparison must be inclusive. A strict ">" would move the
# samples scoring exactly at the threshold into the negative class and report
# a different operating point from the one selected above.
# NOTE: the ROC arrays come from the test split, so this threshold is tuned on
# the same data it is evaluated on; the report below is therefore optimistic.
y_pred_lr_opt = (y_proba_lr >= best_th_lr).astype(int)
print(classification_report(y_test, y_pred_lr_opt))

## Random Forest

In [None]:
# Random forest (200 trees, fixed seed) on the same preprocessed features.
# class_weight='balanced' reweights the classes inversely to their frequency.
# NOTE(review): `preprocess` is the same transformer object that the
# logistic-regression pipeline holds; fitting here refits it on X_train —
# confirm that sharing one instance is intended.
forest = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
)
rf = Pipeline(steps=[
    ('prep', preprocess),
    ('rf', forest),
]).fit(X_train, y_train)

# Hard labels (default decision rule) and the score of the second class
# column, used as the positive-class probability.
# NOTE(review): assumes the positive class is the second entry of classes_ — confirm.
y_pred_rf = rf.predict(X_test)
proba_matrix_rf = rf.predict_proba(X_test)
y_proba_rf = proba_matrix_rf[:, 1]

print(classification_report(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf))

### ROC-Kurve RF

In [None]:
# ROC curve of the random-forest scores on the test split, with the diagonal
# drawn as the chance-level reference.
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_proba_rf)

plt.plot(fpr_rf, tpr_rf)
plt.plot([0, 1], [0, 1], '--')
plt.title("ROC – RF")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()

### Optimaler Threshold RF (Youden-Index: max. TPR − FPR)

In [None]:
# Pick the ROC operating point that maximises Youden's J statistic
# (TPR - FPR) and report the score threshold that produces it.
best_idx_rf = np.argmax(tpr_rf - fpr_rf)
best_th_rf = thresholds_rf[best_idx_rf]
print("Optimaler RF Threshold:", best_th_rf)

# roc_curve derives each (FPR, TPR) pair by labelling scores >= threshold as
# positive, so the comparison must be inclusive. Forest probabilities are
# averaged tree votes and frequently tie, so a strict ">" would drop every
# sample scoring exactly at the threshold into the negative class and report
# a different operating point from the one selected above.
# NOTE: the ROC arrays come from the test split, so this threshold is tuned on
# the same data it is evaluated on; the report below is therefore optimistic.
y_pred_rf_opt = (y_proba_rf >= best_th_rf).astype(int)
print(classification_report(y_test, y_pred_rf_opt))