# Unser ML-Workflow

## 1. Daten vorbereiten

### 1.1 Daten laden

In [1]:
import pandas as pd

df = pd.read_csv("data/car-sales-extended-missing-data.csv")
df

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


### 1.2 Fehlende Werte prüfen/behandeln

In [2]:
df.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [3]:
df.dropna(subset=["Price"], inplace=True)
df.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [4]:
X = df.drop("Price", axis=1)
y = df["Price"]

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.impute import SimpleImputer

categorical_features = ["Make", "Colour"]
door_feature = ["Doors"]
numerical_features = ["Odometer (KM)"]

categorical_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
numerical_imputer = SimpleImputer(strategy="mean")

In [7]:
from sklearn.compose import ColumnTransformer

imputer = ColumnTransformer([
    ("cat_imputer", categorical_imputer, categorical_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", numerical_imputer, numerical_features)
])

In [8]:
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

In [9]:
filled_columns = categorical_features + door_feature + numerical_features
X_train_df = pd.DataFrame(filled_X_train, columns=filled_columns)
X_test_df = pd.DataFrame(filled_X_test, columns=filled_columns)
X_train_df

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,71934.0
1,Toyota,Red,4.0,162665.0
2,Honda,White,4.0,42844.0
3,Honda,White,4.0,195829.0
4,Honda,Blue,4.0,219217.0
...,...,...,...,...
755,Toyota,missing,4.0,218803.0
756,BMW,Blue,5.0,245427.0
757,Toyota,White,4.0,196225.0
758,Honda,Blue,4.0,133117.0


In [10]:
X_train_df.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [11]:
X_test_df.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

### 1.3 Encoding der kategorischen Daten

In [12]:
from sklearn.preprocessing import OneHotEncoder

# Erneute Definition der kategorialen Spalten:
categorical_features = ["Make", "Colour", "Doors"]

# Initialisierung des OneHotEncoders:
one_hot = OneHotEncoder()

transformer = ColumnTransformer([
    ("one_hot", one_hot, categorical_features)],
    remainder="passthrough")

X_train_transformed = transformer.fit_transform(X_train_df)
X_test_transformed= transformer.transform(X_test_df)

In [13]:
X_train_transformed

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3040 stored elements and shape (760, 15)>

In [14]:
X_train_array = X_train_transformed.toarray()
X_test_array = X_test_transformed.toarray()

# Neue Spaltennamen aus dem OneHotEncoder:
ohe_columns = transformer.named_transformers_["one_hot"].get_feature_names_out(categorical_features)

# Die Reihenfolge stimmt: one_hot-Spalten + passtrough-Spalten:
all_feature_names = list(ohe_columns) + numerical_features

# DataFrame erzeugen:
X_train_df_final = pd.DataFrame(X_train_array, columns=all_feature_names)
X_test_df_final = pd.DataFrame(X_test_array, columns=all_feature_names)

X_train_df_final

Unnamed: 0,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Make_missing,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,Colour_missing,Doors_3.0,Doors_4.0,Doors_5.0,Odometer (KM)
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,71934.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,162665.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,42844.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,195829.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,219217.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,218803.0
756,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,245427.0
757,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,196225.0
758,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,133117.0


### 1.4 Ausreißerprüfung (outlier detection/treatment)

In [15]:
num_cols = X_train_df_final.select_dtypes(include="number").columns
num_cols

Index(['Make_BMW', 'Make_Honda', 'Make_Nissan', 'Make_Toyota', 'Make_missing',
       'Colour_Black', 'Colour_Blue', 'Colour_Green', 'Colour_Red',
       'Colour_White', 'Colour_missing', 'Doors_3.0', 'Doors_4.0', 'Doors_5.0',
       'Odometer (KM)'],
      dtype='object')

In [16]:
# Numerische Spalten:
num_cols = X_train_df_final.select_dtypes(include="number").columns.tolist()
num_cols

['Make_BMW',
 'Make_Honda',
 'Make_Nissan',
 'Make_Toyota',
 'Make_missing',
 'Colour_Black',
 'Colour_Blue',
 'Colour_Green',
 'Colour_Red',
 'Colour_White',
 'Colour_missing',
 'Doors_3.0',
 'Doors_4.0',
 'Doors_5.0',
 'Odometer (KM)']

In [17]:
# Binäre (One-Hot) Spalten erkennen:
binary_cols = []
for c in num_cols:
    u = pd.unique(X_train_df_final[c].dropna())
    if len(u) <= 2 and set(u).issubset({0, 1, 0.0, 1.0}):
        binary_cols.append(c)
        
binary_cols

['Make_BMW',
 'Make_Honda',
 'Make_Nissan',
 'Make_Toyota',
 'Make_missing',
 'Colour_Black',
 'Colour_Blue',
 'Colour_Green',
 'Colour_Red',
 'Colour_White',
 'Colour_missing',
 'Doors_3.0',
 'Doors_4.0',
 'Doors_5.0']

In [18]:
# Nur kontinuierliche Spalten:
continous_cols = [c for c in num_cols if c not in binary_cols]
print("Kontinuierliche Features:", continous_cols)
print("Binäre (One-Hot) Features:", binary_cols)

Kontinuierliche Features: ['Odometer (KM)']
Binäre (One-Hot) Features: ['Make_BMW', 'Make_Honda', 'Make_Nissan', 'Make_Toyota', 'Make_missing', 'Colour_Black', 'Colour_Blue', 'Colour_Green', 'Colour_Red', 'Colour_White', 'Colour_missing', 'Doors_3.0', 'Doors_4.0', 'Doors_5.0']


In [19]:
import plotly.express as px

for col in continous_cols:
    Q1 = X_train_df_final[col].quantile(0.25)
    Q3 = X_train_df_final[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # Outlier-Flag erstellen
    df_plot = X_train_df_final[[col]].copy()
    df_plot["is_outlier"] = (
        (df_plot[col] < lower) | (df_plot[col] > upper)
    )

    print(f"{col}: {df_plot['is_outlier'].sum()} Ausreißer")

    # Plotly Boxplot mit farblichen Punkten für Ausreißer
    fig = px.box(
        df_plot,
        y=col,
        points="all",  # alle Punkte anzeigen
        color="is_outlier",  # Ausreißer farbig markieren
        title=f"Boxplot - {col} (Outlier farbig)"
    )
    fig.show()

Odometer (KM): 0 Ausreißer


### 1.5 Verteilung der Daten prüfen

In [20]:
continous_cols

['Odometer (KM)']

In [21]:
import plotly.graph_objects as go

skews = {}
for c in continous_cols:
    s = X_train_df_final[c].dropna().astype(float)
    skew_val = s.skew()
    skews[c] = skew_val

    fig = go.Figure()
    fig.add_trace(go.Histogram(x=s, nbinsx=30))
    fig.update_layout(
        title=f"{c} | Skew={skew_val:.4f} | n={len(s)}",
        xaxis_title=c,
        yaxis_title="Häufigkeit",
        bargap=0.05
    )

fig

In [22]:
skew_series = pd.Series(skews).sort_values(ascending=False)
print("Skewness (absteigend):")
display(skew_series)

Skewness (absteigend):


Odometer (KM)    0.000578
dtype: float64

### 1.6 Transformation
Da unsere Skewness für alle kontinuierlichen Features ungefähr 0 ist, haben wir eine symmetrische Verteilung. Eine log-Transformation oder andere Transformationsmethoden zur Normalisierung entfallen an dieser Stelle.

### 1.7 Skalierung / Standardisierung

In [23]:
# Standardabweichung und Mittelwert vor Standardisierung prüfen:
for col in continous_cols:
    mean_val = X_train_df_final[col].mean()
    std_val = X_train_df_final[col].std(ddof=0)
    print(f"{col}\nMittelwert = {mean_val:.4f}\nStandardabweichung = {std_val:.4f}")

Odometer (KM)
Mittelwert = 130319.0331
Standardabweichung = 67130.4786


In [24]:
from sklearn.preprocessing import StandardScaler

# StandardScaler fitten (nur auf Train):
scaler = StandardScaler()
scaler.fit(X_train_df_final[continous_cols].astype(float))

# StandardScaler anwenden:
X_train_scaled = X_train_df_final.copy()
X_test_scaled = X_test_df_final.copy()
X_train_scaled[continous_cols] = scaler.transform(X_train_df_final[continous_cols].astype(float))
X_test_scaled[continous_cols]  = scaler.transform(X_test_df_final[continous_cols].astype(float))

In [25]:
# Standardabweichung und Mittelwert vor Standardisierung prüfen:
for col in continous_cols:
    mean_val = X_train_scaled[col].mean()
    std_val = X_train_scaled[col].std(ddof=0)
    print(f"{col}\nMittelwert = {mean_val:.4f}\nStandardabweichung = {std_val:.4f}")

Odometer (KM)
Mittelwert = -0.0000
Standardabweichung = 1.0000


### 2. Das richtige Modell auswählen

Dabei entscheiden wir uns für die Ridge Regression. Es ist ein Verfahren, das wie eine normale lineare Regression arbeitet, aber zusätzlich verhindert, dass einzelne Merkmale (Features) zu viel Einfluss bekommen. Sie bestraft im Training große Zahlen bei den Modell-Gewichten (Koeffizienten). Dadurch werden sehr Große, extreme Gewichte kleiner gemacht, wodurch das Modell "ruhiger" und stabiler wird. Ridge Regression ist wie eine normale Regression, aber mit einem "Dämpfer" auf den Gewichten, damit sie nicht zu verrückt spielt.

### 3. Modell trainieren

In [26]:
from sklearn.linear_model import Ridge

ridge_basic = Ridge(alpha=1.0, random_state=42)
ridge_basic.fit(X_train_scaled, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


### 4. Vorhersagen machen

In [27]:
import numpy as np

y_pred_basic = ridge_basic.predict(X_test_scaled)

# Kurzer Überblick einiger Kennzahlen:
pd.DataFrame({
    "Tatsächlicher Preis": y_test.values[:10],
    "Vorhergesagter Preis": y_pred_basic[:10],
    "Abweichung (Residuum)": y_test.values[:10] - y_pred_basic[:10],
    "Abweichung (%)": ((y_test.values[:10] - y_pred_basic[:10]) / y_test.values[:10]) * 100,
    "Absoluter Fehler": abs(y_test.values[:10] - y_pred_basic[:10]),
    "Absoluter Fehler (%)": (abs(y_test.values[:10] - y_pred_basic[:10]) / y_test.values[:10]) * 100,
    "Quadratischer Fehler": (y_test.values[:10] - y_pred_basic[:10]) ** 2,
    "Wurzel-Fehler": np.sqrt((y_test.values[:10] - y_pred_basic[:10]) ** 2)
})

Unnamed: 0,Tatsächlicher Preis,Vorhergesagter Preis,Abweichung (Residuum),Abweichung (%),Absoluter Fehler,Absoluter Fehler (%),Quadratischer Fehler,Wurzel-Fehler
0,10547.0,18525.013597,-7978.013597,-75.642492,7978.013597,75.642492,63648700.0,7978.013597
1,17940.0,22183.754241,-4243.754241,-23.655263,4243.754241,23.655263,18009450.0,4243.754241
2,12950.0,11059.03759,1890.96241,14.602026,1890.96241,14.602026,3575739.0,1890.96241
3,5905.0,6992.444072,-1087.444072,-18.415649,1087.444072,18.415649,1182535.0,1087.444072
4,9826.0,8779.306438,1046.693562,10.652285,1046.693562,10.652285,1095567.0,1046.693562
5,11162.0,10942.217885,219.782115,1.969021,219.782115,1.969021,48304.18,219.782115
6,13650.0,15281.658783,-1631.658783,-11.953544,1631.658783,11.953544,2662310.0,1631.658783
7,14345.0,13940.110827,404.889173,2.822511,404.889173,2.822511,163935.2,404.889173
8,12024.0,20281.993013,-8257.993013,-68.67925,8257.993013,68.67925,68194450.0,8257.993013
9,10076.0,14371.139125,-4295.139125,-42.627423,4295.139125,42.627423,18448220.0,4295.139125


### 5. Modell evaluieren
Die gängigen und aussagekräftigen Kennzahlen für ein Regressionsmodell sind:

MAE (Mean Absolute Error): Durchschnittlicher absoluter Fehler
RMSE (Root Mean Squared Error): Mittlerer quadratischer Fehler, große Ausreißer stärker gewichtet
R² (Bestimmtheitsmaß): Erklärt den Anteil der Varianz, den das Modell erfasst

In [28]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred_basic)
rmse = root_mean_squared_error(y_test, y_pred_basic)
r2 = r2_score(y_test, y_pred_basic)

print(f"Mean Absolute Error (MAE): {mae:,.2f} €")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f} €")
print(f"Bestimmtheitsmaß (R²): {r2:.3f}")

Mean Absolute Error (MAE): 5,722.18 €
Root Mean Squared Error (RMSE): 7,072.77 €
Bestimmtheitsmaß (R²): 0.253


In [29]:
from plotly.subplots import make_subplots

residuum  = (y_test - y_pred_basic).to_numpy() 
abs_resid = np.abs(residuum)
vmax = np.quantile(abs_resid, 0.95)
sigma_residuum = residuum.std()

# Plotly-Subplots erstellen (2 Reihen, 2 Spalten):
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Tatsächlich vs. Vorhersage",
        "Residuen vs. Vorhersage",
        "Histogramm der Residuen",
        "Modellmetriken"
    )
)

# Tatsächlich vs. Vorhersage
fig.add_trace(
    go.Scatter(
        x=y_test, y=y_pred_basic,
        mode="markers",
        name="Vorhersagen",
        marker=dict(
            size=8,
            opacity=0.75,
            color=residuum,     
            colorscale="RdBu",
            cmin=-vmax,
            cmax=vmax,
            colorbar=dict(
                title=dict(
                    text="Residuum: <br>(Tatsächlich - Vorhersage)"
                )
            )
        ),
        hovertemplate=(
            "Tatsächlich: %{x:.0f} €<br>"
            "Vorhersage: %{y:.0f} €<br>"
            "Residuum: %{marker.color:.0f} €<extra></extra>"
        )
    ),
    row=1, col=1
)
# Ideal-Linie:
lims = [min(y_test.min(), y_pred_basic.min()), max(y_test.max(), y_pred_basic.max())]
fig.add_trace(
    go.Scatter(
        x=lims, y=lims,
        mode="lines",
        name="Ideal",
        line=dict(dash="dash", color="red")
    ),
    row=1, col=1
)

# Residuen vs. Vorhersage:
fig.add_trace(
    go.Scatter(
        x=y_pred_basic, y=residuum,
        mode="markers",
        name="Residuen",
        marker=dict(size=6, opacity=0.6)
    ),
    row=1, col=2
)
fig.add_hline(y=0, line_dash="dash", row=1, col=2)
fig.add_hrect(y0=-sigma_residuum, y1=sigma_residuum, fillcolor="lightgreen", opacity=0.2, row=1, col=2)

# Histogramm der Residuen:
fig.add_trace(
    go.Histogram(
        x=residuum,
        nbinsx=30,
        name="Residuen",
    ),
    row=2, col=1
)
fig.add_vline(x=0, line_dash="dash", row=2, col=1)

# Metriken Balkendiagramm:
metrics = ["MAE", "RMSE"]
values = [mae, rmse]
fig.add_trace(
    go.Bar(
        x=metrics,
        y=values,
        text=[f"{v:,.0f}" for v in values],
        textposition="auto",
        name="Fehler"
    ),
    row=2, col=2
)
# R² als Annotation hinzufügen:
fig.add_annotation(
    text=f"R² = {r2:.3f}",
    xref="x domain", yref="y domain",
    x=0, y=1, showarrow=False,
    font=dict(size=14, color="black"),
    bgcolor="lightgray",
    row=2, col=2
)

# Layout anpassen
fig.update_layout(
    height=800, width=900,
    showlegend=False,
    title_text="Modell-Evaluation - Ridge Regression",
    bargap=0.05
)

fig

In [30]:
from sklearn.model_selection import cross_val_score, KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = -cross_val_score(ridge_basic, X_train_scaled, y_train, cv=cv, scoring="neg_mean_absolute_error")
rmse_scores = np.sqrt(-cross_val_score(ridge_basic, X_train_scaled, y_train, cv=cv, scoring="neg_mean_squared_error"))
r2_scores = cross_val_score(ridge_basic, X_train_scaled, y_train, cv=cv, scoring="r2")

print(f"MAE: {mae_scores.mean():.0f} ± {mae_scores.std():.0f}")
print(f"RMSE: {rmse_scores.mean():.0f} ± {rmse_scores.std():.0f}")
print(f"R²: {r2_scores.mean():.3f} ± {r2_scores.std():.3f}")

MAE: 5804 ± 64
RMSE: 7105 ± 62
R²: 0.318 ± 0.055


### 6. Modell verbessern
### 6.1 Feature-Engineering

In [31]:
X["Odometer_Sq"] = X["Odometer (KM)"] ** 2
X["Doors_Sq"] = X["Doors"] ** 2

In [32]:
X["Doors_x_Odometer"] = X["Doors"] * X["Odometer (KM)"]

In [33]:
X["Odometer_bin"] = pd.cut(X["Odometer (KM)"], bins=[0, 50000, 100000, 150000, 200000, float("inf")], labels=False)
X

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Odometer_Sq,Doors_Sq,Doors_x_Odometer,Odometer_bin
0,Honda,White,35431.0,4.0,1.255356e+09,16.0,141724.0,0.0
1,BMW,Blue,192714.0,5.0,3.713869e+10,25.0,963570.0,3.0
2,Honda,White,84714.0,4.0,7.176462e+09,16.0,338856.0,1.0
3,Toyota,White,154365.0,4.0,2.382855e+10,16.0,617460.0,3.0
4,Nissan,Blue,181577.0,3.0,3.297021e+10,9.0,544731.0,3.0
...,...,...,...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,1.283072e+09,16.0,143280.0,0.0
996,,White,155144.0,3.0,2.406966e+10,9.0,465432.0,3.0
997,Nissan,Blue,66604.0,4.0,4.436093e+09,16.0,266416.0,1.0
998,Honda,White,215883.0,4.0,4.660547e+10,16.0,863532.0,4.0


In [34]:
# Aufteilen der Daten in Trainings- und Testdaten:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
X_train.isna().sum()

Make                35
Colour              38
Odometer (KM)       36
Doors               38
Odometer_Sq         36
Doors_Sq            38
Doors_x_Odometer    73
Odometer_bin        36
dtype: int64

In [36]:
categorical_features = ["Make", "Colour"]
door_feature = ["Doors"]
door_feature_sq = ["Doors_Sq"]
numerical_features = ["Odometer (KM)", "Odometer_Sq", "Doors_x_Odometer"]
odometer_bin_feature = ["Odometer_bin"]


categorical_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
door_imputer_sq = SimpleImputer(strategy="constant", fill_value=16)
numerical_imputer = SimpleImputer(strategy="mean")
odometer_bin_imputer = SimpleImputer(strategy="most_frequent")

imputer = ColumnTransformer([
    ("cat_imputer", categorical_imputer, categorical_features),
    ("door_imputer", door_imputer, door_feature),
    ("door_imputer_sq", door_imputer_sq, door_feature_sq),
    ("num_imputer", numerical_imputer, numerical_features),
    ("odo_bin_imputer", odometer_bin_imputer, odometer_bin_feature),
])

In [37]:
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

In [38]:
filled_columns = categorical_features + door_feature + door_feature_sq + numerical_features + odometer_bin_feature
X_train_df = pd.DataFrame(filled_X_train, columns=filled_columns)
X_test_df = pd.DataFrame(filled_X_test, columns=filled_columns)
X_train_df

Unnamed: 0,Make,Colour,Doors,Doors_Sq,Odometer (KM),Odometer_Sq,Doors_x_Odometer,Odometer_bin
0,Honda,White,4.0,16.0,71934.0,5174500356.0,287736.0,1.0
1,Toyota,Red,4.0,16.0,162665.0,26459902225.0,650660.0,3.0
2,Honda,White,4.0,16.0,42844.0,1835608336.0,171376.0,0.0
3,Honda,White,4.0,16.0,195829.0,38348997241.0,783316.0,3.0
4,Honda,Blue,4.0,16.0,219217.0,48056093089.0,876868.0,4.0
...,...,...,...,...,...,...,...,...
755,Toyota,missing,4.0,16.0,218803.0,47874752809.0,875212.0,4.0
756,BMW,Blue,5.0,25.0,245427.0,60234412329.0,1227135.0,4.0
757,Toyota,White,4.0,16.0,196225.0,38504250625.0,784900.0,3.0
758,Honda,Blue,4.0,16.0,133117.0,17720135689.0,515473.164483,2.0


In [39]:
X_train_df.isna().sum()

Make                0
Colour              0
Doors               0
Doors_Sq            0
Odometer (KM)       0
Odometer_Sq         0
Doors_x_Odometer    0
Odometer_bin        0
dtype: int64

In [40]:
X_test_df.isna().sum()

Make                0
Colour              0
Doors               0
Doors_Sq            0
Odometer (KM)       0
Odometer_Sq         0
Doors_x_Odometer    0
Odometer_bin        0
dtype: int64

In [41]:
from sklearn.preprocessing import OneHotEncoder

# Erneute Definition der kategorialen Spalten:
categorical_features = ["Make", "Colour", "Doors", "Doors_Sq", "Odometer_bin"]

# Initialisierung des OneHotEncoders:
one_hot = OneHotEncoder()

transformer = ColumnTransformer([
    ("one_hot", one_hot, categorical_features)],
    remainder="passthrough")

X_train_transformed = transformer.fit_transform(X_train_df)
X_test_transformed = transformer.transform(X_test_df)

In [42]:
# Hole die Namen der kategorischen Spalten nach dem One-Hot-Encoding:
one_hot_feature_names = transformer.named_transformers_["one_hot"].get_feature_names_out(categorical_features)
# Alle Spaltennamen holen (one-hot + "remainder" passthrough):
all_feature_names = list(one_hot_feature_names) + list(X_train_df.drop(columns=categorical_features).columns)

In [43]:
# Jetzt die DataFrames:
X_train_df_transformed = pd.DataFrame(X_train_transformed, columns=all_feature_names)
X_test_df_transformed = pd.DataFrame(X_test_transformed, columns=all_feature_names)

X_test_transformed

array([[0.0, 0.0, 0.0, ..., 99761.0, 9952257121.0, 399044.0],
       [0.0, 0.0, 0.0, ..., 17975.0, 323100625.0, 71900.0],
       [0.0, 1.0, 0.0, ..., 197664.0, 39071056896.0, 790656.0],
       ...,
       [0.0, 0.0, 0.0, ..., 108569.0, 11787227761.0, 515473.16448326054],
       [1.0, 0.0, 0.0, ..., 201190.0, 40477416100.0, 603570.0],
       [0.0, 1.0, 0.0, ..., 61163.0, 3740912569.0, 244652.0]],
      shape=(190, 25), dtype=object)

In [44]:
X_train_df_transformed

Unnamed: 0,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Make_missing,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,...,Doors_Sq_16.0,Doors_Sq_25.0,Odometer_bin_0.0,Odometer_bin_1.0,Odometer_bin_2.0,Odometer_bin_3.0,Odometer_bin_4.0,Odometer (KM),Odometer_Sq,Doors_x_Odometer
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,71934.0,5174500356.0,287736.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,162665.0,26459902225.0,650660.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,42844.0,1835608336.0,171376.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,195829.0,38348997241.0,783316.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,219217.0,48056093089.0,876868.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,218803.0,47874752809.0,875212.0
756,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,245427.0,60234412329.0,1227135.0
757,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,196225.0,38504250625.0,784900.0
758,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,133117.0,17720135689.0,515473.164483


In [45]:
# Mittelwert nur aus Trainingsdaten berechnen:
odom_mean = X_train_df_transformed["Odometer (KM)"].mean()

# Neues Feature für Trainingsdaten:
X_train_df_transformed["Odometer_Centered"] = X_train_df_transformed["Odometer (KM)"] - odom_mean

# Neues Feature für Testdaten – gleicher Mittelwert!
X_test_df_transformed["Odometer_Centered"] = X_test_df_transformed["Odometer (KM)"] - odom_mean

X_train_df_transformed

Unnamed: 0,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Make_missing,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,...,Doors_Sq_25.0,Odometer_bin_0.0,Odometer_bin_1.0,Odometer_bin_2.0,Odometer_bin_3.0,Odometer_bin_4.0,Odometer (KM),Odometer_Sq,Doors_x_Odometer,Odometer_Centered
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,71934.0,5174500356.0,287736.0,-58385.033149
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,162665.0,26459902225.0,650660.0,32345.966851
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,42844.0,1835608336.0,171376.0,-87475.033149
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,195829.0,38348997241.0,783316.0,65509.966851
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,219217.0,48056093089.0,876868.0,88897.966851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,218803.0,47874752809.0,875212.0,88483.966851
756,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,245427.0,60234412329.0,1227135.0,115107.966851
757,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,196225.0,38504250625.0,784900.0,65905.966851
758,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,133117.0,17720135689.0,515473.164483,2797.966851


In [46]:
X_test_transformed

array([[0.0, 0.0, 0.0, ..., 99761.0, 9952257121.0, 399044.0],
       [0.0, 0.0, 0.0, ..., 17975.0, 323100625.0, 71900.0],
       [0.0, 1.0, 0.0, ..., 197664.0, 39071056896.0, 790656.0],
       ...,
       [0.0, 0.0, 0.0, ..., 108569.0, 11787227761.0, 515473.16448326054],
       [1.0, 0.0, 0.0, ..., 201190.0, 40477416100.0, 603570.0],
       [0.0, 1.0, 0.0, ..., 61163.0, 3740912569.0, 244652.0]],
      shape=(190, 25), dtype=object)

In [47]:
# Datentypen prüfen:
X_train_df_transformed.dtypes

Make_BMW             object
Make_Honda           object
Make_Nissan          object
Make_Toyota          object
Make_missing         object
Colour_Black         object
Colour_Blue          object
Colour_Green         object
Colour_Red           object
Colour_White         object
Colour_missing       object
Doors_3.0            object
Doors_4.0            object
Doors_5.0            object
Doors_Sq_9.0         object
Doors_Sq_16.0        object
Doors_Sq_25.0        object
Odometer_bin_0.0     object
Odometer_bin_1.0     object
Odometer_bin_2.0     object
Odometer_bin_3.0     object
Odometer_bin_4.0     object
Odometer (KM)        object
Odometer_Sq          object
Doors_x_Odometer     object
Odometer_Centered    object
dtype: object

In [48]:
# Datentypen anpassen:
X_train_df_transformed = X_train_df_transformed.apply(pd.to_numeric)
X_test_df_transformed = X_test_df_transformed.apply(pd.to_numeric)

# Kontrolle der Datentypen:
X_train_df_transformed.dtypes

Make_BMW             float64
Make_Honda           float64
Make_Nissan          float64
Make_Toyota          float64
Make_missing         float64
Colour_Black         float64
Colour_Blue          float64
Colour_Green         float64
Colour_Red           float64
Colour_White         float64
Colour_missing       float64
Doors_3.0            float64
Doors_4.0            float64
Doors_5.0            float64
Doors_Sq_9.0         float64
Doors_Sq_16.0        float64
Doors_Sq_25.0        float64
Odometer_bin_0.0     float64
Odometer_bin_1.0     float64
Odometer_bin_2.0     float64
Odometer_bin_3.0     float64
Odometer_bin_4.0     float64
Odometer (KM)        float64
Odometer_Sq          float64
Doors_x_Odometer     float64
Odometer_Centered    float64
dtype: object

In [49]:
import pandas as pd
import numpy as np
import plotly.express as px

def plot_outliers_boxplots(df: pd.DataFrame, dataset_name: str = "Dataset"):
    """
    Sucht kontinuierliche numerische Features (exkl. binär/One-Hot),
    erstellt für jedes einen Boxplot mit Outlier-Highlighting und
    gibt alle Figure-Objekte sowie eine Outlier-Summary zurück.

    Returns
    -------
    figs : list[plotly.graph_objs._figure.Figure]
        Alle erzeugten Boxplot-Figuren.
    summary : pd.DataFrame
        Tabelle mit #Ausreißer je Feature und IQR-Grenzen.
    """
    # 0) Sicherstellen, dass numerische Spalten numerisch sind
    df_num = df.apply(pd.to_numeric)
    
    # 1) Numerische (und bool) Spalten erfassen
    num_cols = df_num.select_dtypes(include=["number", "bool"]).columns.tolist()

    # 2) Binäre (One-Hot) Spalten erkennen (0/1 oder 0.0/1.0/False/True)
    binary_cols = []
    for c in num_cols:
        u = pd.unique(df_num[c].dropna())
        # auf 0/1 normalisieren (True/False -> 1/0)
        norm = set()
        for v in u:
            if isinstance(v, (bool, np.bool_)):
                norm.add(int(v))
            else:
                norm.add(v)
        if len(norm) <= 2 and set(norm).issubset({0, 1, 0.0, 1.0}):
            binary_cols.append(c)

    # 3) Kontinuierliche Spalten
    continuous_cols = [c for c in num_cols if c not in binary_cols]

    figs = []
    rows = []

    for col in continuous_cols:
        s = df_num[col].dropna()
        if s.empty:
            continue

        Q1 = s.quantile(0.25)
        Q3 = s.quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        # Outlier-Flag
        df_plot = pd.DataFrame({col: df_num[col]})
        df_plot["is_outlier"] = (df_plot[col] < lower) | (df_plot[col] > upper)

        # Summary-Zeile
        rows.append({
            "dataset": dataset_name,
            "feature": col,
            "n_total": df_plot[col].notna().sum(),
            "n_outliers": int(df_plot["is_outlier"].sum()),
            "lower_bound": lower,
            "upper_bound": upper,
            "Q1": Q1,
            "Q3": Q3,
            "IQR": IQR
        })

        fig = px.box(
            df_plot,
            y=col,
            points="all",
            color="is_outlier",
            title=f"{dataset_name} · {col} — Boxplot mit Outlier-Highlight",
        )
        figs.append(fig)

    summary = pd.DataFrame(rows).sort_values(["n_outliers", "feature"], ascending=[False, True])
    return figs, summary

In [50]:
# Trainingsdaten:
train_figs, train_summary = plot_outliers_boxplots(X_train_df_transformed, dataset_name="Train")
for f in train_figs:
    f.show()
display(train_summary)

Unnamed: 0,dataset,feature,n_total,n_outliers,lower_bound,upper_bound,Q1,Q3,IQR
2,Train,Doors_x_Odometer,760,0,-376059.0,1414701.0,295476.0,743166.0,447690.0
0,Train,Odometer (KM),760,0,-103599.8,364876.2,72078.75,189197.8,117119.0
3,Train,Odometer_Centered,760,0,-233918.8,234557.2,-58240.28,58878.72,117119.0
1,Train,Odometer_Sq,760,0,-40705300000.0,81696440000.0,5195352000.0,35795790000.0,30600440000.0


In [51]:
# Testdaten:
test_figs, test_summary = plot_outliers_boxplots(X_test_df_transformed, dataset_name="Test")
for f in test_figs:
    f.show()
display(test_summary)

Unnamed: 0,dataset,feature,n_total,n_outliers,lower_bound,upper_bound,Q1,Q3,IQR
2,Test,Doors_x_Odometer,190,0,-290376.0,1389640.0,339630.0,759634.0,420004.0
0,Test,Odometer (KM),190,0,-83240.38,357214.6,81930.25,192044.0,110113.8
3,Test,Odometer_Centered,190,0,-213559.4,226895.6,-48388.78,61724.97,110113.8
1,Test,Odometer_Sq,190,0,-38541400000.0,82136070000.0,6712649000.0,36882020000.0,30169370000.0


In [52]:
# Alle numerischen Spalten holen:
num_cols = X_train_df_transformed.select_dtypes(include=["number", "bool"]).columns.tolist()

# One-Hot-Spalten (binär) erkennen:
binary_cols = []
for c in num_cols:
    unique_vals = pd.unique(X_train_df_transformed[c].dropna())
    # Prüfen, ob nur 0/1 (oder 0.0/1.0, True/False) vorkommen
    if len(unique_vals) <= 2 and set(unique_vals).issubset({0, 1, 0.0, 1.0, True, False}):
        binary_cols.append(c)

# Kontinuierliche Spalten sind alle numerischen ohne binäre:
continous_cols = [c for c in num_cols if c not in binary_cols]

print("Kontinuierliche Features:", continous_cols)
print("Binäre (One-Hot) Features:", binary_cols)

print("Vor der Standardisierung:")
for col in continous_cols:
    mean_val = X_train_df_transformed[col].mean()
    std_val = X_train_df_transformed[col].std(ddof=0)
    print(f"{col}\n  Mittelwert = {mean_val:.4f}\n  Std-Abw.  = {std_val:.4f}")

Kontinuierliche Features: ['Odometer (KM)', 'Odometer_Sq', 'Doors_x_Odometer', 'Odometer_Centered']
Binäre (One-Hot) Features: ['Make_BMW', 'Make_Honda', 'Make_Nissan', 'Make_Toyota', 'Make_missing', 'Colour_Black', 'Colour_Blue', 'Colour_Green', 'Colour_Red', 'Colour_White', 'Colour_missing', 'Doors_3.0', 'Doors_4.0', 'Doors_5.0', 'Doors_Sq_9.0', 'Doors_Sq_16.0', 'Doors_Sq_25.0', 'Odometer_bin_0.0', 'Odometer_bin_1.0', 'Odometer_bin_2.0', 'Odometer_bin_3.0', 'Odometer_bin_4.0']
Vor der Standardisierung:
Odometer (KM)
  Mittelwert = 130319.0331
  Std-Abw.  = 67130.4786
Odometer_Sq
  Mittelwert = 21713631731.4530
  Std-Abw.  = 17972505231.7079
Doors_x_Odometer
  Mittelwert = 515473.1645
  Std-Abw.  = 267762.7651
Odometer_Centered
  Mittelwert = 0.0000
  Std-Abw.  = 67130.4786


In [53]:
# StandardScaler fitten (nur auf Trainingsdaten):
scaler = StandardScaler()
scaler.fit(X_train_df_transformed[continous_cols].astype(float))

# Kopien erstellen & Transformation anwenden:
X_train_df_transformed_scaled = X_train_df_transformed.copy()
X_test_df_transformed_scaled = X_test_df_transformed.copy()

X_train_df_transformed_scaled[continous_cols] = scaler.transform(X_train_df_transformed[continous_cols].astype(float))
X_test_df_transformed_scaled[continous_cols]  = scaler.transform(X_test_df_transformed[continous_cols].astype(float))

# Mittelwert & Standardabweichung nach der Standardisierung prüfen:
print("\nNach der Standardisierung (Trainingsdaten):")
for col in continous_cols:
    mean_val = X_train_df_transformed_scaled[col].mean()
    std_val = X_test_df_transformed_scaled[col].std(ddof=0)
    print(f"{col}\n  Mittelwert = {mean_val:.4f}\n  Std-Abw.  = {std_val:.4f}")


Nach der Standardisierung (Trainingsdaten):
Odometer (KM)
  Mittelwert = -0.0000
  Std-Abw.  = 0.9827
Odometer_Sq
  Mittelwert = -0.0000
  Std-Abw.  = 0.9799
Doors_x_Odometer
  Mittelwert = 0.0000
  Std-Abw.  = 0.9917
Odometer_Centered
  Mittelwert = -0.0000
  Std-Abw.  = 0.9827


In [54]:
X_train_df_transformed_scaled

Unnamed: 0,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Make_missing,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,...,Doors_Sq_25.0,Odometer_bin_0.0,Odometer_bin_1.0,Odometer_bin_2.0,Odometer_bin_3.0,Odometer_bin_4.0,Odometer (KM),Odometer_Sq,Doors_x_Odometer,Odometer_Centered
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.869725,-0.920246,-0.850518,-0.869725
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.481837,0.264085,0.504875,0.481837
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-1.303060,-1.106024,-1.285082,-1.303060
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.975860,0.925601,1.000299,0.975860
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.324256,1.465709,1.349683,1.324256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.318089,1.455619,1.343498,1.318089
756,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.714690,2.143317,2.657807,1.714690
757,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.981759,0.934239,1.006215,0.981759
758,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.041680,-0.222200,0.000000,0.041680


In [55]:
ridge_basic = Ridge(alpha=1.0, random_state=42)
ridge_basic.fit(X_train_df_transformed_scaled, y_train)
y_pred_basic = ridge_basic.predict(X_test_df_transformed_scaled)

# Kurzer Überblick einiger Kennzahlen:
pd.DataFrame({
    "Tatsächlicher Preis": y_test.values[:10],
    "Vorhergesagter Preis": y_pred_basic[:10],
    "Abweichung (Residuum)": y_test.values[:10] - y_pred_basic[:10],
    "Abweichung (%)": ((y_test.values[:10] - y_pred_basic[:10]) / y_test.values[:10]) * 100,
    "Absoluter Fehler": abs(y_test.values[:10] - y_pred_basic[:10]),
    "Absoluter Fehler (%)": (abs(y_test.values[:10] - y_pred_basic[:10]) / y_test.values[:10]) * 100,
    "Quadratischer Fehler": (y_test.values[:10] - y_pred_basic[:10]) ** 2,
    "Wurzel-Fehler": np.sqrt((y_test.values[:10] - y_pred_basic[:10]) ** 2)
})

Unnamed: 0,Tatsächlicher Preis,Vorhergesagter Preis,Abweichung (Residuum),Abweichung (%),Absoluter Fehler,Absoluter Fehler (%),Quadratischer Fehler,Wurzel-Fehler
0,10547.0,20850.273535,-10303.273535,-97.689139,10303.273535,97.689139,106157400.0,10303.273535
1,17940.0,20866.145434,-2926.145434,-16.310733,2926.145434,16.310733,8562327.0,2926.145434
2,12950.0,11187.567751,1762.432249,13.609515,1762.432249,13.609515,3106167.0,1762.432249
3,5905.0,7267.230803,-1362.230803,-23.069108,1362.230803,23.069108,1855673.0,1362.230803
4,9826.0,9076.59771,749.40229,7.626728,749.40229,7.626728,561603.8,749.40229
5,11162.0,11260.559728,-98.559728,-0.882993,98.559728,0.882993,9714.02,98.559728
6,13650.0,14054.349072,-404.349072,-2.962264,404.349072,2.962264,163498.2,404.349072
7,14345.0,13027.088844,1317.911156,9.187251,1317.911156,9.187251,1736890.0,1317.911156
8,12024.0,19395.421369,-7371.421369,-61.3059,7371.421369,61.3059,54337850.0,7371.421369
9,10076.0,12862.099488,-2786.099488,-27.650848,2786.099488,27.650848,7762350.0,2786.099488


In [56]:
mae = mean_absolute_error(y_test, y_pred_basic)
rmse = root_mean_squared_error(y_test, y_pred_basic)
r2 = r2_score(y_test, y_pred_basic)

print(f"Mean Absolute Error (MAE): {mae:,.2f} €")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f} €")
print(f"Bestimmtheitsmaß (R²): {r2:.3f}")

Mean Absolute Error (MAE): 5,595.11 €
Root Mean Squared Error (RMSE): 6,883.90 €
Bestimmtheitsmaß (R²): 0.292


### 6.2 Modellvergleich
Empfohlener Ablauf, für einen sinnvollen Modellvergleich:

1. Einheitliche Datenbasis schaffen

Bevor wir unterschiedliche Modelle miteinander vergleichen, benötigen wir immer die selben Ausgangsbedingungen. Wir benötigne also z.B. gleiche Features, Imputationen, Encoding also alle gleichen Datenvorbereitungsschritte. Genau das hatten wir bereits gemacht.

2. Modelle mit Standardparametern testen

In [58]:
N = len(df)
n_train, n_valid, n_test = len(X_train), len(X_valid), len(X_test)
p_train = n_train / N
p_valid = n_valid / N
p_test = n_test / N

print("Größen (absolut):")
print(f"Training: {n_train}")
print(f"Validierung: {n_valid}")
print(f"Test: {n_test}")
print(f"Summe: {n_train + n_valid + n_test} (soll = {N})")

print("\nAnteile (relativ am Gesamt):")
print(f"Training: {p_train:.4f} ({p_train * 100:.2f}%)")
print(f"Validierung: {p_valid:.4f} ({p_valid * 100:.2f}%)")
print(f"Test: {p_test:.4f} ({p_test * 100:.2f}%)")

NameError: name 'X_valid' is not defined