![alt text](https://static.wixstatic.com/media/f11c7c_69283cb47c30496d953e1d8d4c6b5b18~mv2.png/v1/fill/w_75,h_75,al_c,usm_0.66_1.00_0.01/Logo%20new.png)


# Regression model with Keras

This notebook holds my assignment for the Introduction to Deep Learning & Neural Networks with Keras.

---

In [5]:
#setup the environment
import pandas as pd
import numpy as np
import sklearnas sk
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import sys

In [6]:
#add the tensorflow and keras environment if necessary
#remove the comment marks on next two lines respectively
#!pip install tensorflow
#!pip install keras

In [7]:
#import the packages from the Keras library we need
import keras
from keras.models import Sequential
from keras.layers import Dense

In [10]:
#get the data and have a first glance at it
concrete_data = pd.read_csv('https://cocl.us/concrete_data')
print(concrete_data.shape)
concrete_data.head()

(1030, 9)


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [15]:
#borrowed from the lab notebook: the function to define the lineair regression
#check the data to see if we can work with it right out of the box
#remove missing values if any
if max(concrete_data.isnull().sum()) > 0:
    concrete_data.dropna(inplace=True)
print(concrete_data.isnull().sum())
print("The data has no missing values")    
concrete_data.describe()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64
The data has no missing values


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [16]:
#borrowed from the lab notebook: the function to define the lineair regression
#split data into predictors and target
concrete_data_columns = concrete_data.columns

predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
target = concrete_data['Strength'] # Strength column

In [17]:
#borrowed from the lab notebook: the function to define the lineair regression
#normalize the predictors
predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [18]:
#borrowed from the lab notebook: the function to define the lineair regression
#Let's save the number of predictors to _n_cols_ since we will need this number when building our network
n_cols = predictors_norm.shape[1] # number of predictors

In [32]:
#borrowed from the lab notebook: the function to define the lineair regression
def regression_model(nodes = 10):
    # create model
    model = Sequential()
    model.add(Dense(nodes, activation='relu', input_shape=(n_cols,))) # 1 hidden layer, 10 nodes and ReLU as activation
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error') # 
    return model

Let's do a single run to check if everything is working. If so, we'll then perform the 50 times loop

In [53]:
#split the set into train and test using sklearn
#one split for the non normalized predictors (step A)
predictors_reg_train, predictors_reg_test, target_reg_train, \
    target_reg_test = train_test_split(predictors, target, test_size=0.3)

#one split for the normalized predictors (step B)
predictors_norm_train, predictors_norm_test, target_norm_train, \
    target_norm_test = train_test_split(predictors_norm, target, test_size=0.3)

In [54]:
#do some sanity checks on the different dataframes we'll use
print("Regular sets")
print("Predictors training dataframe shape: {}".format(predictors_reg_train.shape))
print("Predictors test dataframe shape: {}".format(predictors_reg_test.shape))

print("Normalized sets")
print("Predictors training dataframe shape: {}".format(predictors_norm_train.shape))
print("Predictors test dataframe shape: {}".format(predictors_norm_test.shape))

Regular sets
Predictors training dataframe shape: (824, 8)
Predictors test dataframe shape: (206, 8)
Normalized sets
Predictors training dataframe shape: (824, 8)
Predictors test dataframe shape: (206, 8)


In [55]:
# build the model
model = regression_model(10)

In [56]:
# fit / train the model
model.fit(predictors_reg_train, target_reg_train, \
          validation_data=(predictors_reg_test, target_reg_test), epochs=50, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7f9636b468e0>

In [57]:
predicted_strengths = model.predict(predictors_reg_test)
mean_squared_error(target_reg_test, predicted_strengths)

206.1276574647648

Looks good to go, now let's program the loop doing step 1 to 3 50 times, store the MSE values in a list and
calc the mean and the standard deviation of the mean squared errors.

## A. Build a baseline model

In [71]:
# build the model with a hidden layer of 10 nodes
model = regression_model(10)

#init the list with results
mse_results = []

#loop 50 times and store the results
for i in range(50):
    sys.stdout.write('\rIteration: ' + str(i+1) + ' of ' + str(50))
    sys.stdout.flush()
    #print("Iteration: {}".format(i))
    #step 1: Randomly split the data into a training and test sets by holding 30% of the data for testing.
    predictors_reg_train, predictors_reg_test, target_reg_train, \
        target_reg_test = train_test_split(predictors, target, test_size=0.3)
    
    #step 2: Train the model on the training data using 50 epochs
    #remember: we don't have to build the model 50 times, we just need to feed a different train set
    model.fit(predictors_reg_train, target_reg_train, \
          validation_data=(predictors_reg_test, target_reg_test), epochs=50, verbose=0)
    
    #step 3: Evaluate the model on the test data and compute the mean squared error between the 
    #predicted concrete strength and the actual concrete strength.
    predicted_strengths = model.predict(predictors_reg_test)
    
    #step 4: add the result to the list
    mse_results.append(mean_squared_error(target_reg_test, predicted_strengths))

print("")
print("Iterations done.")
#step 5: report the mean and standard deviation of the mean squared errors
print("The Mean Squared Errors "\
      "\nhave a mean of {} \nand a standard deviation of {}".format(np.mean(mse_results), \
                                                                    np.std(mse_results)))

Iteration: 50 of 50
Iterations done.
The Mean Squared Errors have a mean of 110.34882283698649 
and a standard deviation of 14.331467864717725


## B. Normalize the data

In [74]:
#we use the model generated in step A so no need to call the constructor of the model again.

#normalize the data
predictors_norm = (predictors - predictors.mean()) / predictors.std()

#init the list with results
mse_norm_results = []

#loop 50 times and store the results
for i in range(50):
    sys.stdout.write('\rIteration: ' + str(i+1) + ' of ' + str(50))
    sys.stdout.flush()
    #print("Iteration: {}".format(i))
    #step 1: Randomly split the data into a training and test sets by holding 30% of the data for testing.
    #IMPORTANT: we have to take the normalized set here
    predictors_norm_train, predictors_norm_test, target_norm_train, \
        target_norm_test = train_test_split(predictors_norm, target, test_size=0.3)
    
    #step 2: Train the model on the training data using 50 epochs
    #remember: we don't have to build the model 50 times, we just need to feed a different train set
    #IMPORTANT: we have to take the normalized set here
    model.fit(predictors_norm_train, target_norm_train, \
          validation_data=(predictors_norm_test, target_norm_test), epochs=50, verbose=0)
    
    #step 3: Evaluate the model on the test data and compute the mean squared error between the 
    #predicted concrete strength and the actual concrete strength.
    #IMPORTANT: we have to take the normalized set here
    predicted_norm_strengths = model.predict(predictors_norm_test)
    
    #step 4: add the result to the list
    #IMPORTANT: we have to take the normalized set here
    mse_norm_results.append(mean_squared_error(target_norm_test, predicted_norm_strengths))

print("")
print("Iterations done.")
#step 5: report the mean and standard deviation of the mean squared errors
print("The Mean Squared Errors with the normalized data "\
      "\nhave a mean of {} \nand a standard deviation of {}".format(np.mean(mse_norm_results), \
                                                           np.std(mse_norm_results)))

Iteration: 50 of 50
Iterations done.
The Mean Squared Errors with the normalized data 
have a mean of 14.204298771816202 
and a standard deviation of 1.4875688711290145


## C. Increate the number of epochs

In [75]:
#we use the model generated in step A so no need to call the constructor of the model again.
#we use the normalized data generated in step B

#init the list with results
mse_norm_results2 = []

#loop 50 times and store the results
for i in range(50):
    sys.stdout.write('\rIteration: ' + str(i+1) + ' of ' + str(50))
    sys.stdout.flush()
    #print("Iteration: {}".format(i))
    #step 1: Randomly split the data into a training and test sets by holding 30% of the data for testing.
    #IMPORTANT: we have to take the normalized set here
    predictors_norm_train, predictors_norm_test, target_norm_train, \
        target_norm_test = train_test_split(predictors_norm, target, test_size=0.3)
    
    #step 2: Train the model on the training data using 50 epochs
    #remember: we don't have to build the model 50 times, we just need to feed a different train set
    #IMPORTANT: we havbe to take the normalized set here
    #IMPORTANT: we raised the number of epochs from 50 -> 100
    model.fit(predictors_norm_train, target_norm_train, \
          validation_data=(predictors_norm_test, target_norm_test), epochs=100, verbose=0)
    
    #step 3: Evaluate the model on the test data and compute the mean squared error between the 
    #predicted concrete strength and the actual concrete strength.
    #IMPORTANT: we have to take the normalized set here
    predicted_norm_strengths = model.predict(predictors_norm_test)
    
    #step 4: add the result to the list
    #IMPORTANT: we have to take the normalized set here
    mse_norm_results2.append(mean_squared_error(target_norm_test, predicted_norm_strengths))

print("")
print("Iterations done.")
#step 5: report the mean and standard deviation of the mean squared errors
print("The Mean Squared Errors with normalized data and 100 epochs "\
      "\nhave a mean of {} \nand a standard deviation of {}".format(np.mean(mse_norm_results2), \
                                                                np.std(mse_norm_results2)))

Iteration: 50 of 50
Iterations done.
The Mean Squared Errors with normalized data and 100 epochs 
have a mean of 13.958282286624854 
and a standard deviation of 1.461215383484202


## D. Increase the number of hidden layers

In [76]:
#we have to define the new model, now with 3 hidden layers
def regression_model2(nodes = 10):
    # create model
    model = Sequential()
    model.add(Dense(nodes, activation='relu', input_shape=(n_cols,))) # 3 hidden layers, 10 nodes and ReLU as activation
    model.add(Dense(nodes, activation='relu'))
    model.add(Dense(nodes, activation='relu'))            
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error') # 
    return model

In [77]:
# build the model with a hidden layer of 10 nodes
model2 = regression_model2(10)

#init the list with results
mse_norm_results2 = []

#loop 50 times and store the results
for i in range(50):
    sys.stdout.write('\rIteration: ' + str(i+1) + ' of ' + str(50))
    sys.stdout.flush()
    #print("Iteration: {}".format(i))
    #step 1: Randomly split the data into a training and test sets by holding 30% of the data for testing.
    #IMPORTANT: we have to take the normalized set here
    predictors_norm_train, predictors_norm_test, target_norm_train, \
        target_norm_test = train_test_split(predictors_norm, target, test_size=0.3)
    
    #step 2: Train the model on the training data using 50 epochs
    #remember: we don't have to build the model 50 times, we just need to feed a different train set
    #IMPORTANT: we have to take the normalized set here
    model2.fit(predictors_norm_train, target_norm_train, \
          validation_data=(predictors_norm_test, target_norm_test), epochs=50, verbose=0)
    
    #step 3: Evaluate the model on the test data and compute the mean squared error between the 
    #predicted concrete strength and the actual concrete strength.
    #IMPORTANT: we have to take the normalized set here
    predicted_norm_strengths2 = model2.predict(predictors_norm_test)
    
    #step 4: add the result to the list
    #IMPORTANT: we have to take the normalized set here
    mse_norm_results2.append(mean_squared_error(target_norm_test, predicted_norm_strengths2))

print("")
print("Iterations done.")
#step 5: report the mean and standard deviation of the mean squared errors
print("The Mean Squared Errors with the normalized data and new model " \
      "\nhave a mean of {} \nand a standard deviation of {}".format(np.mean(mse_norm_results2), \
                                                                np.std(mse_norm_results2)))

Iteration: 50 of 50
Iterations done.
The Mean Squared Errors with the normalized data and new model 
have a mean of 31.42006026453305 
and a standard deviation of 20.165493781357558
