# Predictive Maintenance – FULL Improved Notebook (New Realistic Dataset)
Dieses Notebook enthält:
- Vollständige EDA
- Realistischen Datensatz v2
- StandardScaler + OneHotEncoder
- Logistische Regression & Random Forest
- Optimale Thresholds
- ROC-Vergleich
- Confusion Matrices
- Abschließendes 'gut gemacht' bei Erfolg

## 1. Bibliotheken importieren

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

%matplotlib inline


## 2. Realistischen Datensatz laden

In [None]:
# Load the dataset from a CSV file expected in the notebook's working directory.
df = pd.read_csv('realistic_predictive_maintenance_v2.csv')
# Trailing expression: the notebook renders the first five rows as a sanity check.
df.head()

In [None]:
# Print the column names, non-null counts and dtypes to spot missing values
# and columns that were parsed with an unexpected type.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Relative frequency of each value of the target column; shows how imbalanced
# the classes are before any modelling is done.
df['failure_within_30d'].value_counts(normalize=True)

## 3. Explorative Datenanalyse (EDA)

### 3.1 Histogramme

In [None]:
# Columns to inspect, one histogram panel per column.
numeric_cols = [
    'temperature',
    'vibration_rms',
    'acoustic_db',
    'pressure_bar',
    'humidity',
    'load_percent',
    'operating_hours',
    'speed_rpm',
    'distance_km',
    'days_since_last_maintenance',
]

# 30 bins per panel, laid out on a single 14x10 inch figure.
hist_options = dict(figsize=(14, 10), bins=30)
df[numeric_cols].hist(**hist_options)

# Avoid overlapping titles and tick labels between the panels.
plt.tight_layout()
plt.show()

### 3.2 Korrelationsmatrix

In [None]:
# Pairwise correlations between all numeric columns of the dataset
# (non-numeric columns are skipped via numeric_only=True).
corr = df.corr(numeric_only=True)

plt.figure(figsize=(10,8))
# Pin the colour scale to the full correlation range [-1, 1]. 'coolwarm' is a
# diverging map, so its neutral midpoint must sit at exactly 0; without
# vmin/vmax matplotlib autoscales to the observed min/max and the same colour
# would mean different correlation strengths on the positive and negative side.
plt.imshow(corr, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)

# Label both axes with the column names; x labels rotated so they do not overlap.
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns)

plt.colorbar()
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

### 3.3 Scatter-Matrix

In [None]:
# Subset of features to compare pairwise; keeping it to four columns keeps the
# grid of panels readable.
scatter_cols = [
    'temperature',
    'vibration_rms',
    'operating_hours',
    'load_percent',
]

scatter_frame = df[scatter_cols]
scatter_matrix(scatter_frame, figsize=(10, 10))
plt.show()

## 4. Datenvorbereitung

In [None]:
# Split the frame into the target column and the feature matrix.
target_col = 'failure_within_30d'
y = df[target_col]
X = df.drop(columns=target_col)

# Columns that get one-hot encoded; every remaining feature column is scaled.
categorical = ['weekday', 'month', 'maintenance_type']
numeric = [name for name in X.columns if name not in categorical]

# handle_unknown='ignore' lets the encoder accept categories at transform time
# that were not seen during fit instead of raising.
encode_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
scale_step = ('num', StandardScaler(), numeric)
preprocess = ColumnTransformer(transformers=[encode_step, scale_step])

# 80/20 hold-out split; stratify=y keeps the class ratio equal in both parts
# and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Trailing expression: the notebook displays both shapes.
(X_train.shape, X_test.shape)

## 5. Logistische Regression

In [None]:
# Logistic regression behind the preprocessing step. class_weight='balanced'
# reweights the classes inversely to their frequency in the training data.
logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_model = Pipeline(steps=[('prep', preprocess), ('lr', logreg)])
lr_model.fit(X_train, y_train)

# Hard labels at the default decision rule, plus the score of the second class
# (column 1 of predict_proba) for threshold-independent metrics.
y_pred_lr = lr_model.predict(X_test)
y_proba_lr = lr_model.predict_proba(X_test)[:, 1]

lr_report = classification_report(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, y_proba_lr)
print(lr_report)
print("ROC AUC:", lr_auc)

### 5.1 Confusion Matrix

In [None]:
# Confusion matrix of the default-threshold predictions:
# rows are the true classes, columns the predicted classes.
cm_lr = confusion_matrix(y_test, y_pred_lr)

plt.imshow(cm_lr, cmap='Blues')
plt.title("Confusion Matrix – LR")
plt.ylabel("True")
plt.xlabel("Predicted")

# Write the count into every cell; text x is the column index, y the row index.
for (i, j), count in np.ndenumerate(cm_lr):
    plt.text(j, i, count, ha='center')
plt.show()

### 5.2 ROC-Kurve

In [None]:
# ROC points for the logistic regression scores; th_lr holds the score
# threshold that produced each (FPR, TPR) pair.
fpr_lr, tpr_lr, th_lr = roc_curve(y_test, y_proba_lr)

plt.plot(fpr_lr, tpr_lr, label='LR')

# Dashed diagonal as the reference line of a random classifier.
diagonal = [0, 1]
plt.plot(diagonal, diagonal, '--')

plt.title("ROC – LR")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.show()

### 5.3 Optimaler Threshold

In [None]:
# Pick the ROC operating point that maximises Youden's J statistic (TPR - FPR).
# NOTE(review): the threshold is tuned on the same test split that is used for
# the report below, so these metrics are optimistically biased. For an unbiased
# estimate, tune on a validation split or on out-of-fold predictions.
best_idx_lr = np.argmax(tpr_lr - fpr_lr)
best_th_lr = th_lr[best_idx_lr]
print("Optimaler Threshold LR:", best_th_lr)

# roc_curve treats a sample as positive when score >= threshold, so the
# comparison has to be inclusive to reproduce the selected ROC point. A strict
# '>' would classify the sample(s) lying exactly on the threshold as negative.
y_pred_lr_opt = (y_proba_lr >= best_th_lr).astype(int)
print(classification_report(y_test, y_pred_lr_opt))

# Confusion matrix at the tuned threshold: rows = true class, columns = predicted.
cm_lr_opt = confusion_matrix(y_test, y_pred_lr_opt)
plt.imshow(cm_lr_opt, cmap='Greens')
plt.title("Confusion Matrix – LR (Optimal)")
plt.xlabel("Predicted")
plt.ylabel("True")
# Annotate every cell with its count (text x is the column, y is the row).
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm_lr_opt[i,j], ha='center')
plt.show()

## 6. Random Forest

In [None]:
# Random forest with 300 trees behind the same preprocessing step.
# random_state fixes the forest's randomness; class_weight='balanced'
# reweights the classes inversely to their frequency in the training data.
forest = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight='balanced',
)
rf_model = Pipeline(steps=[('prep', preprocess), ('rf', forest)])
rf_model.fit(X_train, y_train)

# Hard labels at the default decision rule, plus the score of the second class
# (column 1 of predict_proba) for threshold-independent metrics.
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

rf_report = classification_report(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_proba_rf)
print(rf_report)
print("ROC AUC:", rf_auc)

### 6.1 Confusion Matrix

In [None]:
# Confusion matrix of the default-threshold predictions:
# rows are the true classes, columns the predicted classes.
cm_rf = confusion_matrix(y_test, y_pred_rf)

plt.imshow(cm_rf, cmap='Oranges')
plt.title("Confusion Matrix – RF")
plt.ylabel("True")
plt.xlabel("Predicted")

# Write the count into every cell; text x is the column index, y the row index.
for (i, j), count in np.ndenumerate(cm_rf):
    plt.text(j, i, count, ha='center')
plt.show()

### 6.2 ROC-Kurve

In [None]:
# ROC points for the random forest scores; th_rf holds the score threshold
# that produced each (FPR, TPR) pair.
fpr_rf, tpr_rf, th_rf = roc_curve(y_test, y_proba_rf)

plt.plot(fpr_rf, tpr_rf, label='RF')

# Dashed diagonal as the reference line of a random classifier.
diagonal = [0, 1]
plt.plot(diagonal, diagonal, '--')

plt.title("ROC – RF")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.show()

### 6.3 Optimaler Threshold

In [None]:
# Pick the ROC operating point that maximises Youden's J statistic (TPR - FPR).
# NOTE(review): the threshold is tuned on the same test split that is used for
# the report below, so these metrics are optimistically biased. For an unbiased
# estimate, tune on a validation split or on out-of-fold predictions.
best_idx_rf = np.argmax(tpr_rf - fpr_rf)
best_th_rf = th_rf[best_idx_rf]
print("Optimaler Threshold RF:", best_th_rf)

# roc_curve treats a sample as positive when score >= threshold, so the
# comparison has to be inclusive to reproduce the selected ROC point. A strict
# '>' would classify the sample(s) lying exactly on the threshold as negative;
# forest scores are averages over trees and often repeat, so ties are common.
y_pred_rf_opt = (y_proba_rf >= best_th_rf).astype(int)
print(classification_report(y_test, y_pred_rf_opt))

# Confusion matrix at the tuned threshold: rows = true class, columns = predicted.
cm_rf_opt = confusion_matrix(y_test, y_pred_rf_opt)
plt.imshow(cm_rf_opt, cmap='Purples')
plt.title("Confusion Matrix – RF (Optimal)")
plt.xlabel("Predicted")
plt.ylabel("True")
# Annotate every cell with its count (text x is the column, y is the row).
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm_rf_opt[i,j], ha='center')
plt.show()

## 7. ROC-Vergleich

In [None]:
# Overlay both ROC curves on one axis; plotting order (LR, then RF, then the
# diagonal) determines which colour of the default cycle each line gets.
roc_curves = (
    ('LR', fpr_lr, tpr_lr),
    ('RF', fpr_rf, tpr_rf),
)
for curve_label, curve_fpr, curve_tpr in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label=curve_label)

# Dashed diagonal as the reference line of a random classifier.
diagonal = [0, 1]
plt.plot(diagonal, diagonal, '--')

plt.title("ROC-Vergleich")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.show()

## 8. Abschluss

In [None]:
# Final cell: only reached on "Run All" if every earlier cell ran without
# raising, so the message doubles as a completion marker.
print("Alle Schritte wurden erfolgreich ausgeführt – gut gemacht!")