# Challenge - Boston House Prices

## 1. Data Preparation

In [43]:
# Import and load the dataset
import keras
from keras.models import Sequential
from keras.layers import Dense, Input
from sklearn.datasets import load_diabetes
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [14]:
diabetes = load_diabetes(as_frame=True)
target = diabetes.target

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144., 168.,  68.,  49., 245.,
       184., 202., 137.,  85., 131., 283., 129.,  59., 341.,  87.,  65.,
       102., 265., 276., 252.,  90., 100.,  55.,  61.,  92., 259.,  53.,
       190., 142., 155., 225., 104., 182., 128.,  52.,  37., 170.,  71.,
       163., 150., 160., 178.,  48., 270., 111.,  42., 200., 113., 143.,
        51., 210., 134.,  98., 164.,  96., 162., 279.,  83., 302., 198.,
        95., 232.,  81., 246., 297., 258., 229., 275., 281., 173., 180.,
        84., 121., 161.,  99., 109., 115., 268., 274., 158., 107., 103.,
       272., 280., 336., 317., 235.,  60., 174., 126., 288.,  88., 292.,
       197., 186.,  25., 195., 217., 172., 214.,  70., 220., 152.,  47.,
        74., 295., 127., 237.,  64.,  79.,  91., 116.,  86., 122.,  72.,
        39., 196., 222., 277.,  77., 191.,  73., 263., 248., 296.,  78.,
        93., 208., 108., 154., 124.,  67., 257., 26

In [11]:
data = diabetes.data
data.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [12]:
# Have a look at the data
data.head()


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [13]:
# replace float values in sex by 0/1 binary
data.sex = data.sex.apply(lambda x: 1 if x>0 else 0)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.sex = data.sex.apply(lambda x: 1 if x>0 else 0)


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,1,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,0,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,1,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,0,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,0,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [33]:
# Let's split and rescale
features = []

X = data
y = target.to_numpy()
X.shape

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## 2. Modelling

Now we want to build a Neural Network to perform this regression task. We will build a Neural Networks with 5 hidden layers of 100 units each.

In [34]:
# Create the Sequential object
def regression_mlp(input_dim: tuple[int, ...]) -> Sequential:
    # We create a so called Sequential model
    model = Sequential()

    # Specify the input dimension via an `Input` layer
    model.add(Input(input_dim))

    # Add the first "Dense" layer of 100 units (neurons)
    model.add(Dense(100, activation="sigmoid"))
    model.add(Dense(100, activation="sigmoid"))
    model.add(Dense(100, activation="sigmoid"))
    model.add(Dense(100, activation="sigmoid"))
    model.add(Dense(100, activation="sigmoid")) 

    # Add finally the output layer with one unit: the prediction
    model.add(Dense(1, activation="sigmoid"))

    # return the created model
    return model

mlp = regression_mlp(input_dim=(data.shape[1],))
mlp.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_33 (Dense)            (None, 100)               1100      
                                                                 
 dense_34 (Dense)            (None, 100)               10100     
                                                                 
 dense_35 (Dense)            (None, 100)               10100     
                                                                 
 dense_36 (Dense)            (None, 100)               10100     
                                                                 
 dense_37 (Dense)            (None, 100)               10100     
                                                                 
 dense_38 (Dense)            (None, 1)                 101       
                                                                 
Total params: 41,601
Trainable params: 41,601
Non-trai

Now let's compile the model and then fit it

In [35]:
# Compile the model with mean squared error (for regression)
mlp.compile(optimizer="SGD", loss="mean_squared_error", metrics=["accuracy"])

In [36]:
# Now fit the model on 500 epoches with a batch size of 64
# You can add the test/validation set into the fit: it will give insights on this dataset too
mlp.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe2600e9900>

Finally we can compute the results of our model

In [42]:
# Evaluation
y_pred_mlp = mlp.predict(X_test)
# MSE
mse_test = mean_squared_error(y_test, y_pred_mlp)
print(f" accurancy sur test: {mse_test}")

 accurancy sur test: 28605.74157303371


## 3. Benchmarking

We can compare that result to a good old linear regression

In [44]:
# Linear Regression
lr = LinearRegression()

lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

# MSE linear reg
mse_test2 = mean_squared_error(y_test, y_pred_lr)
print(f" accurancy sur test: {mse_test2}")


 accurancy sur test: 3424.259334298692


Of course to compare properly the two models, one would make hyperparameter optimization first, this is just for the example.