This is a personal modification of a tutorial found at

https://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/

The problems with the above tutorial are manifold, but the most glaring one is that the data is not properly split into training and hold-out sets before the model is built and trained.  This is a consistent issue I've seen in such tutorials, and it seriously distorts the results one should expect from a given technique.  Additionally, I explore some slightly more aggressive NN topologies than the author does.

In [16]:
#Usual imports to set up the full background
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error as mse

In [17]:
#Pin NumPy's global random state so that anything drawing from it repeats between runs.
#The same `seed` is also handed to train_test_split and KFold further down.
#NOTE(review): this seeds NumPy only; whether the Keras backend's weight initialisation
#follows it depends on the backend in use -- confirm before relying on exact numbers.
seed = 0
np.random.seed(seed=seed)

In [15]:
#The raw file is whitespace-delimited and has no header row, so the column names
#have to be supplied by hand.
col_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
#NOTE(review): machine-specific absolute path -- point this at your local copy of housing.csv.
data_path = r"J:\CASHDESK\Malus\Housing DL\housing.csv"
#sep=r"\s+" splits on runs of whitespace. It replaces delim_whitespace=True, which
#current pandas releases deprecate in favour of exactly this spelling.
df = pd.read_csv(data_path, sep=r"\s+", header=None, names=col_names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [19]:
#The first 13 columns are the predictors; column 13 (MEDV) is the regression target.
X, y = df.iloc[:, :13], df.iloc[:, 13]
#Set 20% of the rows aside as a hold-out test set; the split is made repeatable via `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

In [43]:
#Define a baseline model, this will be the same as the initial model used by the tutorial.  But in order to better make
#use of the pipeline and transformer below, I'm going to structure it differently.

def basic_model():
    """Build and compile the baseline regression network.

    Topology: 13 inputs -> Dense(13, relu) -> Dense(1, no activation given).
    Both layers use the 'normal' kernel initializer. The model is compiled
    with mean-squared-error loss and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [39]:
#A key point that the tutorial misses is data leakage: information from data the model is
#later judged on must not shape how the data is prepared or how the model is trained.
#The scalers are therefore fitted on the training split only and merely applied to the
#hold-out split.

X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#StandardScaler works on 2-D input, hence the reshape of the 1-D target into one column.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

#NOTE(review): the scalers above have already seen every training row, so inside the
#cross-validation below each validation fold still contributes to the scaling statistics.
#Putting a StandardScaler step into the pipeline and passing the unscaled X_train would
#remove that; it is left as is because the later cells reuse the *_scaled arrays.
estimators = [
    ('mlp', KerasRegressor(build_fn=basic_model, epochs=50, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)

#shuffle=True is what makes random_state meaningful. With shuffle left at its default of
#False, older scikit-learn ignores the seed and newer releases raise a ValueError.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X_train_scaled, y_train_scaled, cv=kfold)

#Per the original author's note, the fold scores come back as negated MSE (to fit the
#"greater is better" convention of cross_val_score), so the absolute value of the mean
#is what gets reported.
print("Standardized: %.2f (%.2f) MSE" % (np.abs(results.mean()), results.std()))

Standardized: 0.15 (0.09) MSE


In [47]:
#Refit on the whole scaled training split, then score the hold-out split, which played no
#part in fitting. The MSE is in standardized target units.
prediction = pipeline.fit(X_train_scaled, y_train_scaled).predict(X_test_scaled)
mse(y_true=y_test_scaled, y_pred=prediction)

0.2803054942089024

In [48]:
#While it is worth it to explore deeper networks, a word of caution: given how little data we have, we must be
#very wary of overfitting.

def expanded_model():
    """Build and compile a deeper variant of the baseline network.

    Topology: 13 inputs -> Dense(13, relu) -> Dense(6, relu) -> Dense(1, no
    activation given). All layers use the 'normal' kernel initializer. The
    model is compiled with mean-squared-error loss and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    stack = (
        Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'),
        Dense(6, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    )
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net

In [51]:
#Same procedure as for the baseline, now cross-validating expanded_model on the
#scaled training split.
estimators = [
    ('mlp', KerasRegressor(build_fn=expanded_model, epochs=50, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
#shuffle=True is what makes random_state meaningful. With shuffle left at its default of
#False, older scikit-learn ignores the seed and newer releases raise a ValueError.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X_train_scaled, y_train_scaled, cv=kfold)
#Mean and spread of the fold scores; abs() undoes the sign flip applied to the MSE.
print("Standardized: %.2f (%.2f) MSE" % (np.abs(results.mean()), results.std()))

Standardized: 0.16 (0.12) MSE


In [50]:
#In the recorded output the cross-validation score is essentially unchanged from the
#baseline, while the hold-out MSE computed below comes out lower than the baseline's.
prediction = pipeline.fit(X_train_scaled, y_train_scaled).predict(X_test_scaled)
mse(y_true=y_test_scaled, y_pred=prediction)

0.2420705111345385

In [52]:
#A wider network is another possibility
def wide_model():
    """Build and compile a wider single-hidden-layer network.

    Topology: 13 inputs -> Dense(20, relu) -> Dense(1, no activation given).
    Both layers use the 'normal' kernel initializer. The model is compiled
    with mean-squared-error loss and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    hidden = Dense(20, input_dim=13, kernel_initializer='normal', activation='relu')
    output = Dense(1, kernel_initializer='normal')
    net = Sequential([hidden, output])
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net

In [53]:
#Same procedure as for the baseline, now cross-validating wide_model on the
#scaled training split.
estimators = [
    ('mlp', KerasRegressor(build_fn=wide_model, epochs=50, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
#shuffle=True is what makes random_state meaningful. With shuffle left at its default of
#False, older scikit-learn ignores the seed and newer releases raise a ValueError.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X_train_scaled, y_train_scaled, cv=kfold)
#Mean and spread of the fold scores; abs() undoes the sign flip applied to the MSE.
print("Standardized: %.2f (%.2f) MSE" % (np.abs(results.mean()), results.std()))

Standardized: 0.15 (0.13) MSE


In [54]:
#In the recorded output the cross-validation score matches the baseline's, and the hold-out
#MSE computed below is lower than the baseline's but higher than the deeper network's.
prediction = pipeline.fit(X_train_scaled, y_train_scaled).predict(X_test_scaled)
mse(y_true=y_test_scaled, y_pred=prediction)

0.26079632272756487

In [55]:
#The final option would be to combine both varieties of model to create a deep and wide model as follows
def deep_wide_model():
    """Build and compile a network that is both wider and deeper than the baseline.

    Topology: 13 inputs -> Dense(20, relu) -> Dense(10, relu) -> Dense(10, relu)
    -> Dense(1, no activation given). All layers use the 'normal' kernel
    initializer. The model is compiled with mean-squared-error loss and the
    Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    net = Sequential()
    net.add(Dense(20, input_dim=13, kernel_initializer='normal', activation='relu'))
    #Two identical 10-unit hidden layers.
    for _ in range(2):
        net.add(Dense(10, kernel_initializer='normal', activation='relu'))
    net.add(Dense(1, kernel_initializer='normal'))
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net

In [57]:
#Same procedure as for the baseline, now cross-validating deep_wide_model on the
#scaled training split.
estimators = [
    ('mlp', KerasRegressor(build_fn=deep_wide_model, epochs=50, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
#shuffle=True is what makes random_state meaningful. With shuffle left at its default of
#False, older scikit-learn ignores the seed and newer releases raise a ValueError.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X_train_scaled, y_train_scaled, cv=kfold)
#Mean and spread of the fold scores; abs() undoes the sign flip applied to the MSE.
print("Standardized: %.2f (%.2f) MSE" % (np.abs(results.mean()), results.std()))

Standardized: 0.14 (0.12) MSE


In [58]:
#In the recorded output the cross-validation score improves slightly, and the hold-out MSE
#computed below is the lowest of the four architectures tried.
#NOTE(review): choosing between architectures by their hold-out MSE turns the test set into
#a selection set, so the winning figure is likely somewhat optimistic.
prediction = pipeline.fit(X_train_scaled, y_train_scaled).predict(X_test_scaled)
mse(y_true=y_test_scaled, y_pred=prediction)

0.21501132988032934