In [72]:
import pandas as pd
import numpy as np
#imported 2 libraries called the numpy and the pandas

This is the data being used for the peer graded assessment.
The dataset revolves around the compressive strength of various concrete samples, depending on the quantities of diverse components employed in their creation. These components encompass:

1) Cement

2) Blast Furnace Slag

3) Fly Ash

4) Water

5) Superplasticizer

6) Coarse Aggregate

7) Fine Aggregate

In [73]:
#Reading the data through the data dataframe 
concrete_data = pd.read_csv('concrete_data.csv')
concrete_data.head()  #printing the first five rows of the dataset.

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


So here we know that the first concrete sample has 540 cubic meter of cement, zero blast furnace slag, zero fly ash, 162 cubic meter of water,2.5 cubic meter of superplatsicizer, 1040 cubic meter of coarse aggregate, 676 cubic meter of fine aggregate, 28 is the age and 79.99 is the strength.

In [74]:
#Checking the total number of data points 
concrete_data.shape

(1030, 9)

In [75]:
#Checking data for any missing values 
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [76]:
# Removing all null data from the dataset 
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

As the data has no null values so now we can use this data to train our model.

For creating a Regression or a classification model we always need to divide the columns into two called the predictors and the target.

In [77]:
# Our target variable is Strength and others columns are the predictors
concrete_data_columns = concrete_data.columns
predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # Every columns except Strength
target = concrete_data['Strength']  # Strength column

Printing the predictors and the targets just to check the output

In [78]:
target.head()  #As we have only 1 target variable so one column gets printed

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

In [79]:
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


After printing the predictors and the target lets normalize the version of the data by subtracting the mean and diving by the standard deviation

In [80]:
predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [81]:
#Checking the number of predictors 
num_cols = predictors_norm.shape[1]
num_cols

8

Moving forward we import keras and start building our regression model

In [82]:
import keras     #Keras is always imported from the backend tensorflow

In [83]:
from keras.models import Sequential    #We are importing the sequential layer from keras which helps us to create a model layer by layer in a linear stack
from keras.layers import Dense    #A dense layer is imported from layers to form a fully connected neural network layer

Creating the regression model

In [84]:
def regression_model():    #Created a function to create a regression model.
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(num_cols,)))   #This contains 10 nodes with the relu activation function(1st hidden layer)
    model.add(Dense(1))  #Ouput layer

    model.compile(optimizer='adam', loss='mean_squared_error')  #Compiling the model with the optimizer adam and keeping the loss as MSE.
    return model

Now we want to split the data so we will have to import sklearn to train and test the data

In [85]:
from sklearn.model_selection import train_test_split

Splitting the data into 70 % training and 30% testing data

In [86]:
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=42)

Now we have to build our model so we call the regression function here that we created above

In [87]:
#Calling our model here
model = regression_model()


In [88]:
#Training our model with epochs=100
epochs = 50
model.fit(X_train, y_train, epochs=epochs, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f55847d8050>

Now we evaluate the metrics for the model formed 

In [89]:
loss_val = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
loss_val



325.6260548810743

As per the next step we need to compute the mean squared error

In [90]:
from sklearn.metrics import mean_squared_error

In [91]:
#Printing the mean and standard deviation of the test data 
mean_square_error = mean_squared_error(y_test, y_pred)
mean = np.mean(mean_square_error)
standard_deviation = np.std(mean_square_error)
print(mean, standard_deviation)

325.6260754375171 0.0


Creating a list of 50 mean squared errors.

In the code below it iterates through a specified number of training and evaluation cycles (total_mean_squared_errors). In each cycle, the dataset is split into training and testing sets, and a neural network model is trained for a fixed number of epochs. The mean squared error (MSE) is then calculated for the model's predictions on the test set, and both the MSE and the mean squared error between true and predicted values are recorded. After the specified number of cycles, the code calculates and prints the mean and standard deviation of the collected mean squared errors, providing insights into the model's overall performance and its variability across multiple runs.

In [92]:
total_mean_squared_errors = 50
#The total number of times the model will be trained and evaluated.
epochs = 100
mean_squared_errors = []
#An empty list to store the mean squared errors for each run.


for i in range(0, total_mean_squared_errors):
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=i)
    model.fit(X_train, y_train, epochs=epochs, verbose=0)
    MSE = model.evaluate(X_test, y_test, verbose=0)
    print("MSE "+str(i+1)+": "+str(MSE))
    y_pred = model.predict(X_test)
    mean_square_error = mean_squared_error(y_test, y_pred)
    mean_squared_errors.append(mean_square_error)

#Convert the list of mean squared errors to a NumPy array for easier calculation.
#Calculate the mean and standard deviation of the mean squared errors.
mean_squared_errors = np.array(mean_squared_errors)
mean = np.mean(mean_squared_errors)
standard_deviation = np.std(mean_squared_errors)

print('\n')
print("Printing the  mean and standard deviation of " +str(total_mean_squared_errors) + " mean squared errors with normalized data. Total number of epochs for each training is: " +str(epochs) + "\n")
print("Mean: "+str(mean))
print("Standard Deviation: "+str(standard_deviation))

MSE 1: 123.22822390559422
MSE 2: 100.73636392636593
MSE 3: 53.51167719572493
MSE 4: 46.899983168419894
MSE 5: 43.411688054649574
MSE 6: 46.3051046846754
MSE 7: 46.7011473325464
MSE 8: 34.448978967265404
MSE 9: 38.41333351011801
MSE 10: 37.64939184096253
MSE 11: 39.810041322677264
MSE 12: 35.32049270432358
MSE 13: 43.93169580848472
MSE 14: 45.50917029766589
MSE 15: 37.28622358748056
MSE 16: 32.93888642024068
MSE 17: 38.06004263127892
MSE 18: 37.455807614866586
MSE 19: 37.560519591890106
MSE 20: 39.25749264565872
MSE 21: 34.17748471602653
MSE 22: 36.10166991644307
MSE 23: 31.457313105512206
MSE 24: 36.32793720331778
MSE 25: 36.58001251591062
MSE 26: 39.72632737144298
MSE 27: 32.8816910530757
MSE 28: 32.59385999043783
MSE 29: 40.089712149116984
MSE 30: 38.835823701037555
MSE 31: 36.41255120089139
MSE 32: 34.294126701972246
MSE 33: 33.33003637320015
MSE 34: 37.70834408608841
MSE 35: 36.97102990505379
MSE 36: 40.98607935488803
MSE 37: 33.717584730352016
MSE 38: 38.421951664304274
MSE 39: 34

With 100 epochs, the mean MSE has increased slightly from 39.44 to 41.03. This means that the additional training epochs might be causing the model to fit the training data more closely, potentially leading to a slightly higher average error on the test data.
The standard deviation of MSE has decreased from 25.95 to 15.30. A lower standard deviation indicates that the model's performance is more consistent across different runs or subsets of the data. In this case, the increased number of epochs seems to have contributed to a more stable performance.






