# Predictive Maintenance mit SCANIA-Daten – Common Functions to load

**Projekt:** Bachelorarbeit Data Science  
**Thema:** 
**Datengrundlage:** SCANIA Component X Dataset  
**Autor:** Justin Stange-Heiduk  
**Betreuung:** Dr. Martin Prause  
**Ziel:** Erstellen und Testen der Datenvorbereitungsfunktionen  

---

**Erstellt:** 2025-08-19   
**Letzte Änderung:** 2025-07-25


---

In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
from sksurv.util import Surv
import xgboost as xgb


In [2]:
def save_df(df: pd.DataFrame, ordner: str, name: str) -> None:
    """Write a DataFrame to the Parquet file ``../data/{ordner}/{name}.parquet``.

    The DataFrame index is not written to the file (``index=False``).

    Args:
        df (pd.DataFrame): The DataFrame to store.
        ordner (str): Sub-folder below ``../data`` that receives the file.
        name (str): File name without the ``.parquet`` extension.
    """
    target_path = f"../data/{ordner}/{name}.parquet"
    df.to_parquet(target_path, index=False)

In [3]:
def load_df(ordner: str, name: str) -> pd.DataFrame:
    """Read the Parquet file ``../data/{ordner}/{name}.parquet`` with pandas.

    Args:
        ordner (str): Sub-folder below ``../data`` that contains the file.
        name (str): File name without the ``.parquet`` extension.

    Returns:
        pd.DataFrame: The content of the Parquet file.
    """
    source_path = f"../data/{ordner}/{name}.parquet"
    frame = pd.read_parquet(source_path)
    return frame

In [4]:
def load_df_dd(ordner: str, name: str) -> dd.DataFrame:
    """Read the Parquet file ``../data/{ordner}/{name}.parquet`` with Dask.

    Same path convention as the pandas-based loader, but the file is opened
    through ``dask.dataframe.read_parquet``.

    Args:
        ordner (str): Sub-folder below ``../data`` that contains the file.
        name (str): File name without the ``.parquet`` extension.

    Returns:
        dd.DataFrame: The Dask DataFrame backed by the Parquet file.
    """
    source_path = f"../data/{ordner}/{name}.parquet"
    dask_frame = dd.read_parquet(source_path)
    return dask_frame

In [5]:
def load_all_raw_data() -> dict:
    """Read every raw CSV file of the test, train and validation splits.

    All files are read from ``../data/01_raw`` with ``pd.read_csv``.

    Returns:
        dict: Nested mapping ``split -> part -> DataFrame``. The splits are
        ``"test"``, ``"train"`` and ``"validation"``. Every split holds the
        keys ``"readouts"`` and ``"spec"``; the label table is stored under
        ``"labels"`` for test and validation and under ``"tte"`` for train.
    """
    raw_dir = "../data/01_raw"

    # split name -> (dictionary key of the label table, file stem of that table)
    label_sources = {
        "test": ("labels", "test_labels"),
        "train": ("tte", "train_tte"),
        "validation": ("labels", "validation_labels"),
    }

    raw_data = {}
    for split, (label_key, label_stem) in label_sources.items():
        raw_data[split] = {
            label_key: pd.read_csv(f"{raw_dir}/{label_stem}.csv"),
            "readouts": pd.read_csv(f"{raw_dir}/{split}_operational_readouts.csv"),
            "spec": pd.read_csv(f"{raw_dir}/{split}_specifications.csv"),
        }
    return raw_data

In [6]:
def load_specific_raw_data(name: str) -> pd.DataFrame:
    """Load a single raw CSV file from ``../data/01_raw``.

    Args:
        name (str): File name without the ``.csv`` extension. Options are:
            test_labels, test_operational_readouts, test_specifications,
            train_tte, train_operational_readouts, train_specifications,
            validation_labels, validation_operational_readouts,
            validation_specifications.

    Returns:
        pd.DataFrame: The content of ``../data/01_raw/{name}.csv`` as returned
        by ``pd.read_csv``.
    """
    return pd.read_csv(f"../data/01_raw/{name}.csv")

In [7]:
def prepare_rsf_model_input(df: pd.DataFrame, columns_to_drop: list, frag: float, class_column: str, sampling: bool) -> tuple[pd.DataFrame, np.ndarray]:
    """Build the feature frame and survival target for a Random Survival Forest.

    When ``sampling`` is true, the rows of every distinct value in
    ``class_column`` are sub-sampled separately with fraction ``frag`` (fixed
    ``random_state=42``) and concatenated before the model input is built.

    Args:
        df (pd.DataFrame): Input data; must contain the columns ``"event"``
            and ``"duration"``.
        columns_to_drop (list): Columns removed from ``df`` to form the
            features. Nothing is dropped implicitly, so ``"event"`` and
            ``"duration"`` stay in the features unless they are listed here.
        frag (float): Fraction of rows to draw from each class.
        class_column (str): Name of the column holding the class labels.
        sampling (bool): Whether to apply the per-class sampling.

    Returns:
        tuple[pd.DataFrame, np.ndarray]: The feature DataFrame and the
        survival target created by ``Surv.from_arrays``.
    """
    if sampling:
        # Draw the same fraction from every class so the class ratio is kept.
        per_class_samples = [
            df[df[class_column] == label].sample(frac=frag, random_state=42)
            for label in df[class_column].unique()
        ]
        df = pd.concat(per_class_samples)

    events = df["event"].astype(bool)
    durations = df["duration"].astype(float)
    survival_target = Surv.from_arrays(event=events, time=durations)

    features = df.drop(columns=columns_to_drop)
    return features, survival_target



In [None]:
def prepare_aft_model_input(df: pd.DataFrame, columns_to_drop: list) -> "xgb.DMatrix":
    """Build the XGBoost ``DMatrix`` for a model trained with the AFT objective.

    The label interval of every row is taken from ``df``: the column
    ``"duration"`` is used as lower bound and the column ``"upper_bound"`` as
    upper bound, both cast to float.

    Args:
        df (pd.DataFrame): Input data; must contain the columns ``"duration"``
            and ``"upper_bound"``.
        columns_to_drop (list): Columns removed from ``df`` to form the
            features. Nothing is dropped implicitly, so the two bound columns
            stay in the features unless they are listed here.

    Returns:
        xgb.DMatrix: Feature matrix with ``label_lower_bound`` and
        ``label_upper_bound`` set.
    """
    lower_bound = df["duration"].astype(float)
    upper_bound = df["upper_bound"].astype(float)

    features = df.drop(columns=columns_to_drop)

    return xgb.DMatrix(
        data=features,
        label_lower_bound=lower_bound,
        label_upper_bound=upper_bound,
    )


In [9]:
def get_cost_and_taus()-> tuple[np.ndarray, np.ndarray]:
    """Return the cost matrix and the class boundaries (taus) for RUL classification.

    Returns:
        tuple[np.ndarray, np.ndarray]: The 5x5 float cost matrix and the float
        array holding the four class boundaries.
    """
    # Cost matrix: rows = actual class n, columns = predicted class m.
    cost_rows = [
        [0,   7,   8,   9,   10],
        [200, 0,   7,   8,    9],
        [300, 200, 0,   7,    8],
        [400, 300, 200, 0,    7],
        [500, 400, 300, 200,  0],
    ]
    cost_matrix = np.array(cost_rows, dtype=float)

    # Class boundaries for RUL in time units, consistent with the labels 0..4.
    # Example: 4: [0,6), 3: [6,12), 2: [12,24), 1: [24,48), 0: [48, inf)
    class_boundaries = np.array([6.0, 12.0, 24.0, 48.0], dtype=float)

    return cost_matrix, class_boundaries

In [None]:
def ensure_dir(path: str) -> None:
    """Create the target directory if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory does not raise an error (``exist_ok=True``).

    Args:
        path (str): Path of the directory to create.
    """
    # Imported locally: the visible import cell of this file does not import
    # ``os``, so the call below would otherwise fail with a NameError.
    import os

    os.makedirs(path, exist_ok=True)