In [None]:
# Importieren der benötigten Bibliotheken

# Datenmanipulation
import pandas as pd
import os

# Visualisierung
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score


# Suppress selected warning categories for the whole notebook session
import warnings
from sklearn.exceptions import DataConversionWarning
# Hide scikit-learn's DataConversionWarning
warnings.filterwarnings(action="ignore", category=DataConversionWarning)
# Hide every FutureWarning.
# NOTE(review): no `module` argument is given, so this filter is not limited to
# a single library and also hides deprecation notices raised by any other
# package used below — confirm that this is intended.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Baseline-Modellierung


In [None]:
# Load the pre-split training and test sets from the processed-data folder
destination_path = "../data/processed"

features_train, features_test = (
    pd.read_csv(f"{destination_path}/features_train.csv"),
    pd.read_csv(f"{destination_path}/features_test.csv"),
)
target_train, target_test = (
    pd.read_csv(f"{destination_path}/target_train.csv"),
    pd.read_csv(f"{destination_path}/target_test.csv"),
)

# Organise the targets by name. The column position in the target files selects
# the target: column 0 is stored under "compressor", column 1 under "turbine".
targets = {
    target_label: {
        "train": target_train.iloc[:, column_idx],
        "test": target_test.iloc[:, column_idx],
    }
    for column_idx, target_label in enumerate(("compressor", "turbine"))
}

In [None]:
# Baseline models, keyed by the display name used in the results table
def _scaled_pca_pipeline(estimator):
    """Return a Pipeline that standardizes, applies a 14-component PCA, then fits ``estimator``."""
    return Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("pca", PCA(n_components=14)),
            ("model", estimator),
        ]
    )


models = {
    # Linear regression and k-NN get scaling + PCA in front of the estimator
    "Linear Regression": _scaled_pca_pipeline(LinearRegression()),
    "K-Neighbors": _scaled_pca_pipeline(KNeighborsRegressor(n_neighbors=5)),
    # The tree-based models are fitted on the features as they are
    "Random Forest": Pipeline(steps=[("model", RandomForestRegressor(random_state=42))]),
    "Decision Tree": Pipeline(steps=[("model", DecisionTreeRegressor(random_state=42))]),
}

In [None]:
# Evaluate every baseline model on the full feature set, once per target
all_results = []

# Outer loop: one pass per entry in `targets` (compressor and turbine)
for target_name, target_data in targets.items():
    # Pick the train and test series of the current target
    target_train_data = target_data["train"]
    target_test_data = target_data["test"]

    # Inner loop: fit and score each baseline model
    for model_name, model_pipeline in models.items():
        # Work on a clone so the templates in `models` are never fitted and
        # every (target, model) combination starts from an unfitted estimator
        current_model = clone(model_pipeline)
        current_model.fit(features_train, target_train_data)
        target_pred = current_model.predict(features_test)

        # Metrics on the test set
        r2 = r2_score(target_test_data, target_pred)
        mae = mean_absolute_error(target_test_data, target_pred)
        rmse = root_mean_squared_error(target_test_data, target_pred)

        # One result row per (target, model) combination
        results = {"Target": target_name, "Model": model_name, "R2": r2, "MAE": mae, "RMSE": rmse}
        all_results.append(results)

# Persist the results as CSV. The output folder is created first: this is the
# first write into "../data/results", and to_csv does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
results_df = pd.DataFrame(all_results)
os.makedirs("../data/results", exist_ok=True)
results_df.to_csv("../data/results/baseline_results.csv", index=False)

print("Baseline Modellergebnisse")
print(results_df.round(4))

# Model Interpretation (Feature Importance)


In [None]:
# Fit one random forest per target on all features and keep the resulting
# feature importances, ranked from most to least important
feature_names = features_train.columns
importance_dfs = {}

# One pass per entry in `targets` (compressor and turbine)
for target_name, target_data in targets.items():
    # fit() returns the fitted estimator, so construction and training chain
    rf_model = RandomForestRegressor(
        n_estimators=100, random_state=42, n_jobs=-1
    ).fit(features_train, target_data["train"])

    # Pair every feature name with its importance and sort descending
    importance_dfs[target_name] = pd.DataFrame(
        {"Feature": feature_names, "Importance": rf_model.feature_importances_}
    ).sort_values(by="Importance", ascending=False)

In [None]:
# Visualise the ten most important features per target, side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# First bar (highest importance) is highlighted, the remaining nine share one colour
colors = ["#009292"] + ["#074650"] * 9

# `palette` is combined with `hue="Feature"` and `legend=False`: passing
# `palette` without `hue` is deprecated in seaborn 0.13 and announced for
# removal in 0.14. Mapping hue to the y variable keeps one colour per bar.
# NOTE(review): the `legend` keyword of barplot needs seaborn >= 0.13 — confirm
# the environment's seaborn version.

# Plot for the compressor target
comp_data = importance_dfs["compressor"].head(10).copy()
comp_colors = colors[:len(comp_data)]  # trim in case fewer than 10 features exist
sns.barplot(x="Importance", y="Feature", hue="Feature", data=comp_data, ax=axes[0],
            palette=comp_colors, legend=False)
axes[0].set_title("Top 10 Feature-Importance - Kompressor")

# Plot for the turbine target
turb_data = importance_dfs["turbine"].head(10).copy()
turb_colors = colors[:len(turb_data)]  # trim in case fewer than 10 features exist
sns.barplot(x="Importance", y="Feature", hue="Feature", data=turb_data, ax=axes[1],
            palette=turb_colors, legend=False)
axes[1].set_title("Top 10 Feature-Importance - Turbine")

plt.tight_layout()
plt.show()

# Make sure the results directory exists before writing into it
os.makedirs("../data/results", exist_ok=True)

# Save the top 10 feature importances, one CSV file per target
for target_name, importance_df in importance_dfs.items():
    importance_df.head(10).to_csv(f"../data/results/top_10_feature_importances_{target_name}.csv", index=False)