In [1]:
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import root_mean_squared_error

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

housing = pd.read_csv('housing.csv')
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [2]:
# fehlende Daten von total_bedrooms auffüllen
total_bedrooms_median = housing['total_bedrooms'].median()
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(total_bedrooms_median)
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,536.838857,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,419.391878,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,297.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,643.25,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [3]:
# One-Hot-Encoding für ocean_proximity
housing['near_ocean'] = housing['ocean_proximity'] == "NEAR OCEAN"
housing['near_bay'] = housing['ocean_proximity'] == "NEAR BAY"
housing['inland'] = housing['ocean_proximity'] == "INLAND"
housing['one_h_ocean'] = housing['ocean_proximity'] == "<1H OCEAN"
housing['island'] = housing['ocean_proximity'] == "ISLAND"
housing = housing.drop(columns=['ocean_proximity'], axis=1)

housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,near_ocean,near_bay,inland,one_h_ocean,island
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,True,False,False,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,True,False,False,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,True,False,False,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,True,False,False,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,False,False,True,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,False,False,True,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,False,False,True,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,False,False,True,False,False


In [4]:
# aufteilen der Daten in Eingabe und erwartetes Ergebnis
X = housing.drop(columns=['median_house_value'])
y = housing['median_house_value']

In [5]:
# Test und Trainingsdaten splitten
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
regr = MLPRegressor(random_state=1, max_iter=20000, verbose = 15000).fit(X_Train, Y_Train)

Iteration 1, loss = 27619375336.69178772
Iteration 2, loss = 25883080183.98838425
Iteration 3, loss = 22685623621.30637360
Iteration 4, loss = 19000448444.71283340
Iteration 5, loss = 16007502559.79300880
Iteration 6, loss = 14264162267.74315071
Iteration 7, loss = 13583722728.84272194
Iteration 8, loss = 13398598738.32676506
Iteration 9, loss = 13320071778.69352913
Iteration 10, loss = 13264492005.45116615
Iteration 11, loss = 13210787714.20687675
Iteration 12, loss = 13148408749.26823997
Iteration 13, loss = 13082789118.58345985
Iteration 14, loss = 13007699294.21118546
Iteration 15, loss = 12927393702.40033340
Iteration 16, loss = 12838795818.65440178
Iteration 17, loss = 12741729439.45842361
Iteration 18, loss = 12635632081.63023949
Iteration 19, loss = 12517925013.21915627
Iteration 20, loss = 12396457873.83596039
Iteration 21, loss = 12266822902.05696106
Iteration 22, loss = 12131994287.31629562
Iteration 23, loss = 11987447081.61844444
Iteration 24, loss = 11847279164.05921936
I

In [8]:
Y_pred = regr.predict(X_Test)
 
mse = root_mean_squared_error(Y_Test, Y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 64994.987487610226
