## Loading and Cleaning the Dataset
Let's start by importing the pandas, os, ibm_boto3 libraries.

In [1]:

import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_3ac1f00a48a141178e622b5fd5d48dd4 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='gpFEgEygpV5FTETUMEGMjUrXeo1CJv7RHwE2gmrbUzIn',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.private.us.cloud-object-storage.appdomain.cloud')

body = client_3ac1f00a48a141178e622b5fd5d48dd4.get_object(Bucket='buildaregressionmodelinkeras-donotdelete-pr-gxlp3ywxwmjhow',Key='concrete_data.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

concrete_data  = pd.read_csv(body)
# looking at the first 5 rows of the data
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


### Let's check how many data points we have.


In [2]:
concrete_data.shape

(1030, 9)

### Let's check the dataset for any missing values.

In [3]:
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [4]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

The data looks very clean and is ready to be used to build our model.

## Split data into predictors and target
The target variable in this problem is the concrete sample strength. Therefore, our predictors will be all the other columns.

In [5]:
concrete_data_columns = concrete_data.columns

predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
target = concrete_data['Strength'] # Strength column

Let's do a quick sanity check of the predictors and the target dataframes.

In [6]:
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [7]:
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

## Train Test Split
Train/Test Split involves splitting the dataset into training and testing sets respectively, which are mutually exclusive. After which, you train with the training set and test with the testing set.

This will provide a more accurate evaluation on out-of-sample accuracy because the testing dataset is not part of the dataset that has been used to train the model. It is more realistic for the real world problems.

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( predictors, target, test_size= 0.3, random_state=42)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (721, 8) (721,)
Test set: (309, 8) (309,)


Import the rest of the packages from the Keras library that we will need to build our regressoin model

In [9]:
import keras
from keras.models import Sequential
from keras.layers import Dense

Let's save the number of predictors to n_cols since we will need this number when building our network.

In [10]:
n_cols = X_train.shape[1] # number of predictors
n_cols

8

# A. Build a baseline model

## Building a Neural Network

In [11]:
# define regression model
def regression_model():
    
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

### Train and Test the Network
Let us build the model, fit the model in training data and evaluated it using test data

In [14]:
# building the model
model = regression_model()

In [17]:
# fitting the model
model.fit(X_train, y_train, epochs=50, verbose=2)

Epoch 1/50
23/23 - 0s - loss: 103.7073 - 38ms/epoch - 2ms/step
Epoch 2/50
23/23 - 0s - loss: 100.6752 - 34ms/epoch - 1ms/step
Epoch 3/50
23/23 - 0s - loss: 99.8403 - 34ms/epoch - 1ms/step
Epoch 4/50
23/23 - 0s - loss: 99.0865 - 33ms/epoch - 1ms/step
Epoch 5/50
23/23 - 0s - loss: 97.8687 - 34ms/epoch - 1ms/step
Epoch 6/50
23/23 - 0s - loss: 96.9783 - 34ms/epoch - 1ms/step
Epoch 7/50
23/23 - 0s - loss: 96.2151 - 35ms/epoch - 2ms/step
Epoch 8/50
23/23 - 0s - loss: 96.1324 - 38ms/epoch - 2ms/step
Epoch 9/50
23/23 - 0s - loss: 95.3707 - 46ms/epoch - 2ms/step
Epoch 10/50
23/23 - 0s - loss: 94.3086 - 36ms/epoch - 2ms/step
Epoch 11/50
23/23 - 0s - loss: 92.9856 - 41ms/epoch - 2ms/step
Epoch 12/50
23/23 - 0s - loss: 92.9381 - 35ms/epoch - 2ms/step
Epoch 13/50
23/23 - 0s - loss: 91.4945 - 35ms/epoch - 2ms/step
Epoch 14/50
23/23 - 0s - loss: 91.0185 - 34ms/epoch - 1ms/step
Epoch 15/50
23/23 - 0s - loss: 90.5399 - 35ms/epoch - 2ms/step
Epoch 16/50
23/23 - 0s - loss: 90.3754 - 34ms/epoch - 1ms/step

<keras.callbacks.History at 0x7f5ca47d54c0>

In [18]:
# evaluating the model
scores = model.evaluate(X_test, y_test)
scores



82.98860168457031

Computing the mean squared error between the predicted concrete strength and the actual concrete strength.

In [19]:
from sklearn.metrics import mean_squared_error
# Predicting the output using the the test data
yhat = model.predict(X_test)
#compute the mean squared error between the predicted concrete strength and the actual concrete strength.
mean_squared_error(yhat, y_test)

82.98860134420103

Creating a list of 50 mean squared errors and Reporting the mean and the standard deviation of the mean squared errors.

In [20]:
import numpy as np
MSEs = []
# creating a for lopp in range 0 to 50
for i in range(0, 50):
    X_train, X_test, y_train, y_test = train_test_split( predictors, target, test_size=0.3, random_state=42)
    model.fit(X_train, y_train, epochs=50, verbose=0)
    yhat = model.predict(X_test)
    MSE = mean_squared_error(yhat, y_test)
    print("Mean_Squared_Error "+str(i)+": "+str(MSE))
    MSEs.append(MSE)
    
MSEs = np.array(MSEs)
mean = np.mean(MSEs)
STDV = np.std(MSEs)

print('\n')
print("Report the mean and the standard deviation of the mean squared errors is indicated below as having values")
print("Mean: ",(mean))
print("Standard Deviation: ",(STDV))

Mean_Squared_Error 0: 72.2748989185133
Mean_Squared_Error 1: 68.16269143806839
Mean_Squared_Error 2: 56.75376115999655
Mean_Squared_Error 3: 52.120970035382506
Mean_Squared_Error 4: 51.58000306819858
Mean_Squared_Error 5: 51.25409004620329
Mean_Squared_Error 6: 61.51221593505675
Mean_Squared_Error 7: 48.8183693159458
Mean_Squared_Error 8: 50.54280963245217
Mean_Squared_Error 9: 49.15823451903464
Mean_Squared_Error 10: 54.935755700587016
Mean_Squared_Error 11: 48.557074823293945
Mean_Squared_Error 12: 50.083872899183234
Mean_Squared_Error 13: 50.70672002540613
Mean_Squared_Error 14: 62.854400846010435
Mean_Squared_Error 15: 51.655639603281514
Mean_Squared_Error 16: 49.391280048187824
Mean_Squared_Error 17: 62.072657609761656
Mean_Squared_Error 18: 48.58279990353341
Mean_Squared_Error 19: 48.51324553438091
Mean_Squared_Error 20: 52.28405553996671
Mean_Squared_Error 21: 48.62868500305548
Mean_Squared_Error 22: 48.55015636942144
Mean_Squared_Error 23: 51.98977136755898
Mean_Squared_Error 2

# B. Normalize the data

Normalizing the data by substracting the mean and dividing by the standard deviation.

In [31]:
#Normalising the data
predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [32]:
#Spliting the data into train and testing data
X_train2, X_test2, y_train2, y_test2 = train_test_split( predictors_norm, target, test_size= 0.3, random_state=42)
print ('Train set:', X_train2.shape,  y_train2.shape)
print ('Test set:', X_test2.shape,  y_test2.shape)

Train set: (721, 8) (721,)
Test set: (309, 8) (309,)


In [35]:
# fitting the model
model.fit(X_train2, y_train2, epochs=50, verbose=2)
# evaluating the model
scores = model.evaluate(X_test2, y_test2)
scores

Epoch 1/50
23/23 - 0s - loss: 82.3887 - 47ms/epoch - 2ms/step
Epoch 2/50
23/23 - 0s - loss: 82.1157 - 40ms/epoch - 2ms/step
Epoch 3/50
23/23 - 0s - loss: 81.9280 - 37ms/epoch - 2ms/step
Epoch 4/50
23/23 - 0s - loss: 81.6107 - 35ms/epoch - 2ms/step
Epoch 5/50
23/23 - 0s - loss: 81.4684 - 34ms/epoch - 1ms/step
Epoch 6/50
23/23 - 0s - loss: 81.1863 - 34ms/epoch - 1ms/step
Epoch 7/50
23/23 - 0s - loss: 80.9220 - 35ms/epoch - 2ms/step
Epoch 8/50
23/23 - 0s - loss: 80.7255 - 33ms/epoch - 1ms/step
Epoch 9/50
23/23 - 0s - loss: 80.5294 - 33ms/epoch - 1ms/step
Epoch 10/50
23/23 - 0s - loss: 80.2137 - 34ms/epoch - 1ms/step
Epoch 11/50
23/23 - 0s - loss: 79.9950 - 37ms/epoch - 2ms/step
Epoch 12/50
23/23 - 0s - loss: 79.7323 - 44ms/epoch - 2ms/step
Epoch 13/50
23/23 - 0s - loss: 79.4734 - 35ms/epoch - 2ms/step
Epoch 14/50
23/23 - 0s - loss: 79.2804 - 37ms/epoch - 2ms/step
Epoch 15/50
23/23 - 0s - loss: 79.0284 - 36ms/epoch - 2ms/step
Epoch 16/50
23/23 - 0s - loss: 78.8497 - 33ms/epoch - 1ms/step
E

75.6015853881836

Creating a list of 50 mean squared errors and Reporting the mean and the standard deviation of the mean squared errors for normalised data.

In [36]:
import numpy as np
MSEs2 = []
# creating a for lopp in range 0 to 50
for i in range(0, 50):
    X_train2, X_test2, y_train2, y_test2 = train_test_split( predictors_norm, target, test_size=0.3, random_state=42)
    model.fit(X_train2, y_train2, epochs=50, verbose=0)
    yhat2 = model.predict(X_test2)
    MSE2 = mean_squared_error(yhat2, y_test2)
    print("Mean_Squared_Error "+str(i)+": "+str(MSE2))
    MSEs2.append(MSE2)
    
MSEs2 = np.array(MSEs2)
mean = np.mean(MSEs2)
STDV = np.std(MSEs2)

print('\n')
print("Report the mean and the standard deviation of the mean squared errors for normalised data is indicated below as having values")
print("Mean: ",(mean))
print("Standard Deviation: ",(STDV))

Mean_Squared_Error 0: 67.05371614320511
Mean_Squared_Error 1: 60.79063684769401
Mean_Squared_Error 2: 56.79632650780439
Mean_Squared_Error 3: 53.41997214977755
Mean_Squared_Error 4: 52.22371748410707
Mean_Squared_Error 5: 51.06807090451896
Mean_Squared_Error 6: 50.22484861207815
Mean_Squared_Error 7: 49.51325010858649
Mean_Squared_Error 8: 48.82389411081277
Mean_Squared_Error 9: 48.64982110506499
Mean_Squared_Error 10: 47.814590539875496
Mean_Squared_Error 11: 47.41173180836115
Mean_Squared_Error 12: 47.03473197012444
Mean_Squared_Error 13: 46.40415468684569
Mean_Squared_Error 14: 46.17503729657847
Mean_Squared_Error 15: 44.81521350712925
Mean_Squared_Error 16: 44.606753666547675
Mean_Squared_Error 17: 43.93784937312732
Mean_Squared_Error 18: 43.75296630914067
Mean_Squared_Error 19: 43.69435522690286
Mean_Squared_Error 20: 43.6090737195253
Mean_Squared_Error 21: 42.99605296452318
Mean_Squared_Error 22: 42.90356756167183
Mean_Squared_Error 23: 42.45730156755461
Mean_Squared_Error 24: 42

##### How does the mean of the mean squared errors compare to that from Step A?

There is an improvement in the mean from 53.0610 to 44.6942

# C. Increate the number of epochs
Repeat Part B but use 100 epochs this time for training.



In [37]:
# fitting the model 
model.fit(X_train2, y_train2, epochs=100, verbose=2)# number of epoachs increased to 100
# evaluating the model
scores = model.evaluate(X_test2, y_test2)
scores

Epoch 1/100
23/23 - 0s - loss: 35.3943 - 33ms/epoch - 1ms/step
Epoch 2/100
23/23 - 0s - loss: 35.4037 - 32ms/epoch - 1ms/step
Epoch 3/100
23/23 - 0s - loss: 35.3494 - 34ms/epoch - 1ms/step
Epoch 4/100
23/23 - 0s - loss: 35.3554 - 32ms/epoch - 1ms/step
Epoch 5/100
23/23 - 0s - loss: 35.3661 - 33ms/epoch - 1ms/step
Epoch 6/100
23/23 - 0s - loss: 35.4570 - 34ms/epoch - 1ms/step
Epoch 7/100
23/23 - 0s - loss: 35.4132 - 32ms/epoch - 1ms/step
Epoch 8/100
23/23 - 0s - loss: 35.3200 - 30ms/epoch - 1ms/step
Epoch 9/100
23/23 - 0s - loss: 35.3203 - 31ms/epoch - 1ms/step
Epoch 10/100
23/23 - 0s - loss: 35.3633 - 32ms/epoch - 1ms/step
Epoch 11/100
23/23 - 0s - loss: 35.3555 - 38ms/epoch - 2ms/step
Epoch 12/100
23/23 - 0s - loss: 35.4012 - 32ms/epoch - 1ms/step
Epoch 13/100
23/23 - 0s - loss: 35.3805 - 32ms/epoch - 1ms/step
Epoch 14/100
23/23 - 0s - loss: 35.3341 - 32ms/epoch - 1ms/step
Epoch 15/100
23/23 - 0s - loss: 35.3749 - 33ms/epoch - 1ms/step
Epoch 16/100
23/23 - 0s - loss: 35.3433 - 33ms/ep

40.17315673828125

In [38]:
# Predicting the output using the the test data
yhat2 = model.predict(X_test2)
#compute the mean squared error between the predicted concrete strength and the actual concrete strength.
mean_squared_error(yhat2, y_test2)

40.173160616188795

In [39]:
import numpy as np
MSEs2 = []
# creating a for lopp in range 0 to 50
for i in range(0, 50):
    X_train2, X_test2, y_train2, y_test2 = train_test_split( predictors_norm, target, test_size=0.3, random_state=42)
    model.fit(X_train2, y_train2, epochs=100, verbose=0) #number of epoachs increased to 100
    yhat2 = model.predict(X_test2)
    MSE2 = mean_squared_error(yhat2, y_test2)
    #print("Mean_Squared_Error "+str(i)+": "+str(MSE2))
    MSEs2.append(MSE2)
    
MSEs2 = np.array(MSEs2)
mean = np.mean(MSEs2)
STDV = np.std(MSEs2)

print('\n')
print("Report the mean and the standard deviation of the mean squared errors for normalised data is indicated below as having values")
print("Mean: ",(mean))
print("Standard Deviation: ",(STDV))



Report the mean and the standard deviation of the mean squared errors for normalised data is indicated below as having values
Mean:  38.56783607597417
Standard Deviation:  0.5922653543475711


##### How does the mean of the mean squared errors compare to that from Step B?

There is an improvement in the mean from 44.6942 to 38.5678

# D. Increase the number of hidden layers

Repeating part B but use a neural network with the following instead: - Three hidden layers, each of 10 nodes and ReLU activation function.

### Building a Neural Network

In [41]:
# define regression model
def regression_model():
    
    # create model
    model = Sequential()
    
    #Three hidden layers, each of 10 nodes and ReLU activation function
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [42]:
# building the model
model = regression_model()

In [46]:
# fitting the model
model.fit(X_train2, y_train2, epochs=50, verbose=2)
# evaluating the model
scores = model.evaluate(X_test2, y_test2)
scores

Epoch 1/50
23/23 - 0s - loss: 18.0073 - 51ms/epoch - 2ms/step
Epoch 2/50
23/23 - 0s - loss: 17.8428 - 39ms/epoch - 2ms/step
Epoch 3/50
23/23 - 0s - loss: 18.0821 - 42ms/epoch - 2ms/step
Epoch 4/50
23/23 - 0s - loss: 18.7195 - 41ms/epoch - 2ms/step
Epoch 5/50
23/23 - 0s - loss: 17.8871 - 44ms/epoch - 2ms/step
Epoch 6/50
23/23 - 0s - loss: 17.6206 - 45ms/epoch - 2ms/step
Epoch 7/50
23/23 - 0s - loss: 17.6418 - 43ms/epoch - 2ms/step
Epoch 8/50
23/23 - 0s - loss: 17.7967 - 42ms/epoch - 2ms/step
Epoch 9/50
23/23 - 0s - loss: 17.7924 - 44ms/epoch - 2ms/step
Epoch 10/50
23/23 - 0s - loss: 18.1402 - 42ms/epoch - 2ms/step
Epoch 11/50
23/23 - 0s - loss: 17.8981 - 41ms/epoch - 2ms/step
Epoch 12/50
23/23 - 0s - loss: 17.6813 - 39ms/epoch - 2ms/step
Epoch 13/50
23/23 - 0s - loss: 17.7302 - 41ms/epoch - 2ms/step
Epoch 14/50
23/23 - 0s - loss: 17.7065 - 41ms/epoch - 2ms/step
Epoch 15/50
23/23 - 0s - loss: 18.1555 - 43ms/epoch - 2ms/step
Epoch 16/50
23/23 - 0s - loss: 17.8765 - 43ms/epoch - 2ms/step
E

34.8339729309082

In [47]:
# Predicting the output using the the test data
yhat2 = model.predict(X_test2)
#compute the mean squared error between the predicted concrete strength and the actual concrete strength.
mean_squared_error(yhat2, y_test2)

34.833973007126076

In [None]:
import numpy as np
MSEs2 = []
# creating a for lopp in range 0 to 50
for i in range(0, 50):
    X_train2, X_test2, y_train2, y_test2 = train_test_split( predictors_norm, target, test_size=0.3, random_state=42)
    model.fit(X_train2, y_train2, epochs=50, verbose=0)
    yhat2 = model.predict(X_test2)
    MSE2 = mean_squared_error(yhat2, y_test2)
    #print("Mean_Squared_Error "+str(i)+": "+str(MSE2))
    MSEs2.append(MSE2)
    
MSEs2 = np.array(MSEs2)
mean = np.mean(MSEs2)
STDV = np.std(MSEs2)

print('\n')
print("Report the mean and the standard deviation of the mean squared errors for normalised data is indicated below as having values")
print("Mean: ",(mean))
print("Standard Deviation: ",(STDV))

#### How does the mean of the mean squared errors compare to that from Step B?
There is an improvement in the mean from 44.6942 to 37.3903