In [1]:
# Importieren der benötigten Bibliotheken

# Datenmanipulation
import pandas as pd

# Machine Learning
from sklearn.model_selection import train_test_split

# Split


**Train-Test-Split**


In [2]:
# Load the cleaned dataset from the processed-data directory.
# destination_path stays a plain string because the save cell appends file names to it.
destination_path = "../data/processed"
cleaned_data_file = destination_path + "/cleaned_data.csv"
df = pd.read_csv(cleaned_data_file)

In [3]:
# Separate the two decay-state coefficient columns (targets) from all remaining columns (features)
target_cols = ['gt_compressor_decay_state_coefficient', 'gt_turbine_decay_state_coefficient']
targets = df[target_cols]
features = df.drop(target_cols, axis="columns")

# Train-test split: 30 % of the rows go to the test set; the seed is fixed so the split is repeatable
split_parts = train_test_split(features, targets, test_size=0.3, random_state=42)
features_train, features_test, target_train, target_test = split_parts

# Data Preparation


**Datatype Transformation**


In [4]:
# No dtype transformation is applied: per the original author's note, all columns are numeric.
# info() prints each column's dtype and non-null count so this can be checked in the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11934 entries, 0 to 11933
Data columns (total 16 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   lever_position                                    11934 non-null  float64
 1   ship_speed_v                                      11934 non-null  int64  
 2   gas_turbine_gt_shaft_torque_gtt_kn_m              11934 non-null  float64
 3   gt_rate_of_revolutions_gtn_rpm                    11934 non-null  float64
 4   gas_generator_rate_of_revolutions_ggn_rpm         11934 non-null  float64
 5   starboard_propeller_torque_ts_kn                  11934 non-null  float64
 6   port_propeller_torque_tp_kn                       11934 non-null  float64
 7   hight_pressure_hp_turbine_exit_temperature_t48_c  11934 non-null  float64
 8   gt_compressor_outlet_air_temperature_t2_c         11934 non-null  float64
 9   hp_turbine_exit_p

**Data Imputation**


In [5]:
# No imputation is applied: the per-column count of missing values below is expected to be zero.
df.isna().sum()

lever_position                                      0
ship_speed_v                                        0
gas_turbine_gt_shaft_torque_gtt_kn_m                0
gt_rate_of_revolutions_gtn_rpm                      0
gas_generator_rate_of_revolutions_ggn_rpm           0
starboard_propeller_torque_ts_kn                    0
port_propeller_torque_tp_kn                         0
hight_pressure_hp_turbine_exit_temperature_t48_c    0
gt_compressor_outlet_air_temperature_t2_c           0
hp_turbine_exit_pressure_p48_bar                    0
gt_compressor_outlet_air_pressure_p2_bar            0
gt_exhaust_gas_pressure_pexh_bar                    0
turbine_injecton_control_tic                        0
fuel_flow_mf_kgs                                    0
gt_compressor_decay_state_coefficient               0
gt_turbine_decay_state_coefficient                  0
dtype: int64

**Deal with Outliers**


In [6]:
# Outlier handling with the IQR rule. The quartiles are taken from the
# training features only, and the resulting bounds are reused for the test set.
def _rows_without_outliers(frame, lower, upper):
    """Return a boolean row mask: True where no column of `frame` is below `lower` or above `upper`."""
    outside = (frame < lower) | (frame > upper)
    return ~outside.any(axis=1)


Q1, Q3 = features_train.quantile(0.25), features_train.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only the training rows inside the bounds; features and targets are filtered with the same mask
train_mask = _rows_without_outliers(features_train, lower_bound, upper_bound)
features_train, target_train = features_train[train_mask], target_train[train_mask]

# Filter the test rows with the same training-derived bounds
test_mask = _rows_without_outliers(features_test, lower_bound, upper_bound)
features_test, target_test = features_test[test_mask], target_test[test_mask]

In [7]:
# Write the filtered train/test features and targets as CSV files (without the index column)
# into the same directory the cleaned data was read from.
outputs = [
    (features_train, "/features_train.csv"),
    (features_test, "/features_test.csv"),
    (target_train, "/target_train.csv"),
    (target_test, "/target_test.csv"),
]
for frame, file_suffix in outputs:
    frame.to_csv(destination_path + file_suffix, index=False)