# Bewertung von Regressionsmodellen

In [1]:
import pandas as pd


housing_df = pd.read_csv("data/housing 1.csv")
housing_df.drop("Unnamed: 0", axis=1, inplace=True)
housing_df



Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X = housing_df.drop("MedHouseVal", axis=1)
y = housing_df["MedHouseVal"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## R^2 - Bestimmtheitsmaß

In [3]:
from sklearn.metrics import r2_score
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R^2: {r2}")

R^2: 0.8051230593157366


In [4]:
print(f"R^2: {model.score(X_test, y_test)}")

R^2: 0.8051230593157366


$$R^2=1-\frac{SS_{res}}{SS_{tot}}=1-\frac{\text{Residuenquadratsumme}}{\text{Gesamtquadratsumme}}$$
 

**Residuenquadratsumme (Fehler des Modells):**
$$SS_{res}=\sum_{i=1}^n(y_i-\hat{y_i})^2$$
 

In [5]:
import numpy as np

# Fehler des Modells bzw. Residuenquadratsumme:
ss_res = np.sum((y_test - y_pred) ** 2)
print(f"ss_res= {ss_res}")

# Fehler des Mittelwerts bzw. Gesamtquadratsumme:
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
print(f"ss_tot= {ss_tot}")

# Berechnung von R^2:
r2_manuel = 1 - (ss_res / ss_tot)
print(f"R^2 manuell berechnen: {r2_manuel}")

ss_res= 1054.1611379678839
ss_tot= 5409.368262178434
R^2 manuell berechnen: 0.8051230593157366


In [6]:
import plotly.graph_objects as go

fig1 = go.Figure()

info_text = (
    f"R^2 = 1- (ss_res / ss_tot)\n"
    f"ss_res = {ss_res:.2f}\n"
)

fig1.add_trace(go.Scatter(
    x=y_test,
    y=y_pred,
    mode="markers",
    name="Vorhersagen",
    marker=dict(color="rgba(0, 150, 255, 0.6)", size=5)
))

# Ideale Linie:
fig1.add_trace(go.Scatter(
    x=[min(y_test), max(y_test)],
    y=[min(y_test), max(y_test)],
    mode="lines+markers",
    name="Ideale Linie",
    marker=dict(color="red", size=8, symbol="circle"),
    line=dict(dash="dash")
))

fig1.update_layout(
    title=f"Vorhersagen vs. Realität (R^2 = {r2:.3f})",
    annotations=[
        dict(
            text=info_text,
            align="left",
            xref="paper",
            yref="paper",
            x=0.99,

        )
    ]
    
)

fig1

## 2. Mean Absolute Error (MAE)

$$MAE=\frac{1}{n} \sum_{i=1}^n \vert y_i - \hat{y_i} \vert$$

Wie viel liegt das Model im Durchschnitt daneben?

In [7]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"Modell liegt im Durchschnitt${mae * 10**5:.2f} daneben!")

MAE: 0.32754256845930246
Modell liegt im Durchschnitt$32754.26 daneben!


In [8]:
mean_price = np.mean(y_test)
relativer_error = mae / mean_price
print(f"Relativer Fehler: {relativer_error * 100:.2f}%")

Relativer Fehler: 15.94%


In [9]:
fig2 = go.Figure()

n = 100
indices = np.arange(n)
y_true = y_test[:n]
y_hat = y_pred[:n]
errors = np.abs(y_true - y_hat)

# Tatsächlcihe Werte:
fig2.add_trace(go.Scatter(
    x=indices,
    y=y_true,
    mode="markers",
    name="Tatsächlicher Wert",
    marker=dict(color="blue", size=6)
))

# Vorhergesagte Werte:
fig2.add_trace(go.Scatter(
    x=indices,
    y=y_hat,
    mode="markers",
    name="Vorhergesagter Wert",
    marker=dict(color="red", size=6, symbol="x")
))

# Fehlerlinien:
for i in range(n):
    fig2.add_trace(go.Scatter(
        x=[indices[i], indices[i]],
        y=[y_true[i], y_hat[i]],
        mode="lines",
        showlegend=(i == 0),
        name="Fehler (|y-ŷ|)",
        line=dict(color="orange", width=1, dash="dot")
    ))
fig2

KeyError: 0

# 3. Der Mittlere Quadratische fehler

| Tatsächlciher Wert | Vorhergesagter Wert | Fehler | Fehler² |
| ------------------ | ------------------- | ------ | ------- |
|100                 |102                     |2        |4         |
|100                    |96                     |4        |16         |
|100                    |110                     |10        |100         |
 

In [11]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")

Mean Squared Error (MSE): 0.2554


In [12]:
fehler_quadrat = (y_test - y_pred) ** 2

fig3 = go.Figure()

# Tatsächlichen werte vs. vorhergesagtewerte
fig3.add_trace(go.Scatter(
    x=y_test,
    y=y_pred,
    mode="markers",
    marker=dict(
        size=6,
        color=fehler_quadrat,
        colorscale="Reds",
        colorbar=dict(title="Quadratische Fehler")
    )
))

fig3.add_trace(go.Scatter(
    x=[min(y_test), max(y_test)],
    y=[min(y_test), max(y_test)],
    mode="lines",
    line=dict(color="green", dash="dash")
))

fig3.update_layout(
    title=f"Vorhersage vs Tatsächlicher wert<br>(MSE = {mse:.4f})",
    width=800,
    height=600,
    showlegend=False
)

fig3

# RMSE (Wurzel des mittleren quadratischen Fehlers)

In [13]:
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Root Mean Squared Error (RMSE): 0.5053


In [15]:
from sklearn.metrics import root_mean_squared_error

rmse = root_mean_squared_error(y_test, y_pred)
rmse

0.5053399773665033

In [18]:
fig4 = go.Figure()

error_square = np.sqrt((y_test - y_pred) ** 2)

fig4.add_trace(go.Scatter(
    x=y_test,
    y=y_pred,
    mode="markers",
    showlegend=True,
    marker=dict(
        color=error_square,
        colorscale="Viridis",
        colorbar=dict(title="Wurzel aus MSE"),
        size=6
    )
))

fig4.add_trace(go.Scatter(
    x=[min(y_test), max(y_test)],
    y=[min(y_test), max(y_test)],
    mode="lines"
))

## Zusammenfassung
 
| Metrik | Wert  |
| ------ | ----- |
| R²     | 0,805 |
| MAE    | 0,327 |
| MSE    | 0,255 |
| RMSE   | 0,505 |

| $\frac{RMSE}{MAE}$ | Bedeutung                                |
| ------------------ | ---------------------------------------- |
| 1,0 - 1,2          | Stabil, kaum Ausreißer, gute Vorhersagen |
| 1,3 - 1,6          | Gute Vorhersagen mit größeren Fehlern    |
| > 2                | Viele oder sehr große Ausreißer          |
 
 
 