In [None]:
# Importieren der benötigten Bibliotheken

# Datenmanipulation
import pandas as pd

# Machine Learning
from sklearn.model_selection import train_test_split

# Split


**Train-Test-Split**


In [None]:
# Load the cleaned dataset from the processed-data directory.
# `destination_path` is reused further down as the output directory.
destination_path = "../data/processed"
cleaned_data_file = f"{destination_path}/cleaned_data.csv"
df = pd.read_csv(cleaned_data_file)

In [None]:
# Separate the two decay-coefficient columns (the prediction targets) from
# the remaining columns, which are used as features.
target_cols = [
    "GT_Compressor_decay_state_coefficient",
    "GT_Turbine_decay_state_coefficient",
]
targets = df[target_cols]
features = df.drop(columns=target_cols)

# Hold out 30 % of the rows as test data; the fixed random_state keeps the
# partition the same on every run.
split_parts = train_test_split(features, targets, test_size=0.3, random_state=42)
features_train, features_test, target_train, target_test = split_parts

# Data Preparation


**Datatype Transformation**


In [None]:
# No dtype transformation is applied because every column is expected to be
# numeric already. The assertion enforces that expectation instead of relying
# on a visual check of the summary printed below, and names the offending
# columns if the cleaned data ever changes.
non_numeric_cols = df.select_dtypes(exclude="number").columns.tolist()
assert not non_numeric_cols, f"Non-numeric columns need conversion: {non_numeric_cols}"
df.info()

**Data Imputation**


In [None]:
# No imputation is performed because the cleaned data is expected to contain
# no missing values. Fail early if that stops being true: NaN compares False
# against both IQR bounds, so rows with missing values would pass the outlier
# filter unnoticed and be written to the output files.
missing_per_column = df.isnull().sum()
assert missing_per_column.sum() == 0, (
    f"Missing values found:\n{missing_per_column[missing_per_column > 0]}"
)
# Keep the per-column counts as the displayed cell output.
missing_per_column

**Deal with Outliers**


In [None]:
# Outlier removal with the IQR rule. The quartiles are computed from the
# training features only; the test data is left untouched.
Q1 = features_train.quantile(q=0.25)
Q3 = features_train.quantile(q=0.75)
IQR = Q3 - Q1

# Per-feature fences at 1.5 * IQR beyond the quartiles.
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

# A row is kept only if none of its feature values lies outside the fences.
below_fence = features_train < lower_bound
above_fence = features_train > upper_bound
train_mask = ~(below_fence | above_fence).any(axis=1)

# Apply the same row mask to features and targets so they stay aligned.
features_train = features_train.loc[train_mask]
target_train = target_train.loc[train_mask]

In [None]:
# Write the training and test partitions to the processed-data directory.
# The row index is not written to the files.
csv_outputs = [
    (features_train, f"{destination_path}/features_train.csv"),
    (features_test, f"{destination_path}/features_test.csv"),
    (target_train, f"{destination_path}/target_train.csv"),
    (target_test, f"{destination_path}/target_test.csv"),
]
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)