In [1]:
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

  from IPython.core.display import display, HTML


# Daten Import

In [7]:
from pathlib import Path
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = load_housing_data()
# housing.head()

# Trainings- und Testdatensatz erstellen

In [8]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# stratifizierte Testdaten
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3., 4.5, 6., np.inf], labels=[1, 2, 3, 4, 5])
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

# Spalte income_cat löschen
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

# Processing

In [9]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector, make_column_transformer

num_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    StandardScaler()
)

default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
    remainder=default_num_pipeline
)

In [11]:
housing_prepared = preprocessing.fit_transform(housing)

In [12]:
df_housing_prepared = pd.DataFrame(
    housing_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=housing.index,
)

In [13]:
# df_housing_prepared

# Modell

## LinearRegression

In [16]:
from sklearn.linear_model import LinearRegression

lin_reg = make_pipeline(
    preprocessing,
    LinearRegression()
)

lin_reg.fit(housing, housing_labels)

In [19]:
housing_predictions = lin_reg.predict(housing)

In [18]:
housing_predictions[:5].round(-2)

array([270800., 334900., 119900., 109300., 305700.])

In [20]:
housing_labels[:5].values

array([458300., 483800., 101700.,  96100., 361800.])

In [24]:
from sklearn.metrics import mean_squared_error

lin_rmse = mean_squared_error(housing_labels, housing_predictions, squared=False)
lin_rmse



68237.48513154627

## DecisionTreeRegressor

In [25]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(
    preprocessing,
    DecisionTreeRegressor(random_state=42)
)

tree_reg.fit(housing, housing_labels)

In [26]:
housing_predictions = tree_reg.predict(housing)

In [28]:
tree_rmse = mean_squared_error(housing_labels, housing_predictions, squared=False)
tree_rmse



0.0

## Kreuzvalidierung

In [29]:
from sklearn.model_selection import cross_val_score

tree_rmses = -cross_val_score(tree_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)

In [31]:
pd.Series(tree_rmses).describe()

count       10.000000
mean     68434.621476
std       1681.846658
min      66427.249904
25%      67171.371671
50%      68172.454902
75%      69470.763466
max      71610.204991
dtype: float64

## RandomForestRegressor

In [34]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(
    preprocessing,
    RandomForestRegressor(random_state=42)
)

forest_rmses = -cross_val_score(forest_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)

In [35]:
pd.Series(forest_rmses).describe()

count       10.000000
mean     48932.060404
std        713.402263
min      47497.604370
25%      48510.497796
50%      48990.044237
75%      49503.382135
max      49880.994566
dtype: float64

# Model Optimierung

## Gittersuche

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

full_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('random_forest', RandomForestRegressor(random_state=42))
])

param_grid = [
    {'random_forest__max_features': [4, 6, 8]},
    {'random_forest__max_features': [6, 8, 10]},
]

grid_search = GridSearchCV(
    full_pipeline, 
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error'
)

grid_search.fit(housing, housing_labels)

In [39]:
grid_search.best_params_

{'random_forest__max_features': 8}

In [42]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_forest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,4.799036,0.030701,0.102446,0.000351,8,{'random_forest__max_features': 8},-49397.852708,-49362.24021,-49502.132966,-49420.741961,59.360112,1
4,4.766983,0.051878,0.103938,0.001528,8,{'random_forest__max_features': 8},-49397.852708,-49362.24021,-49502.132966,-49420.741961,59.360112,1
1,3.695896,0.013871,0.104059,0.001494,6,{'random_forest__max_features': 6},-49222.558192,-49440.953757,-49724.562845,-49462.691598,205.518156,3
3,3.706979,0.003773,0.103278,0.001369,6,{'random_forest__max_features': 6},-49222.558192,-49440.953757,-49724.562845,-49462.691598,205.518156,3
5,5.850362,0.06938,0.102693,0.000246,10,{'random_forest__max_features': 10},-49469.031136,-49451.712401,-49781.467323,-49567.40362,151.530935,5
0,2.721703,0.038809,0.108834,0.004722,4,{'random_forest__max_features': 4},-49639.71099,-49680.66038,-50108.535095,-49809.635488,212.014063,6


## Zufällige Suche

In [45]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'random_forest__max_features': randint(low=2, high=20)
}

rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs,
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42
)

rnd_search.fit(housing, housing_labels)

In [None]:
rnd_search.