# A. Build a baseline model (5 marks) 

In this project, you will build a regression model using the Keras library to model the same data about concrete compressive strength that we used in the labs.

Use the Keras library to build a neural network with the following:

- One hidden layer of 10 nodes, and a ReLU activation function
- Use the adam optimizer and the mean squared error as the loss function.

In [1]:
#!pip install pandas==2.2.2
#!pip install scikit-learn
#!pip install tensorflow_cpu==2.18.0
#!pip install numpy==2.0.2  #alternative to statistics 

In [2]:
import pandas as pd # data
import sklearn # helper functions
import keras # model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Input
import statistics #processing manually final mean square errors list
import numpy  #statistics alternative

## Load Data

In [3]:
# Load the concrete compressive strength dataset (CSV) into a DataFrame.
concrete_data = pd.read_csv('https://cocl.us/concrete_data')

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


## Split Data to training and testing data

Let's make a function to select the input and result values (predictors and target, i.e. x and y).

In [5]:
def split_x_y(data):
    """Separate a data frame into predictors (x) and target (y).

    Parameters:
        data: DataFrame containing a 'Strength' column.

    Returns:
        (x, y) where x holds every column except 'Strength' and 'Age',
        and y is the 'Strength' column (the value to be predicted).
    """
    # NOTE(review): 'Age' is left out of the predictors on purpose here;
    # confirm against the assignment's predictor list.
    excluded = ('Strength', 'Age')
    predictor_names = [name for name in data.columns if name not in excluded]
    x = data[predictor_names]
    y = data['Strength']
    return x, y

Main split of the supplied data into training and testing sets

In [6]:
def split_train_test(data, test_size):
    """Randomly split *data* into training and testing predictors/targets.

    Parameters:
        data: DataFrame to split; must be accepted by split_x_y.
        test_size: forwarded to train_test_split as its test_size argument
            (e.g. 0.3 holds out 30 percent of the rows for testing).

    Returns:
        (train_x, train_y, test_x, test_y)
    """
    # Import the submodule explicitly: a bare `import sklearn` is not
    # guaranteed to expose `sklearn.model_selection` on every version.
    from sklearn.model_selection import train_test_split

    # Split the frame that was passed in. Previously the `data` argument was
    # ignored and the global `concrete_data` was always used.
    train_data, test_data = train_test_split(data, test_size=test_size)
    # x, y on train
    train_x, train_y = split_x_y(train_data)
    # x, y on test
    test_x, test_y = split_x_y(test_data)

    return train_x, train_y, test_x, test_y

## Define and Create Model

Use the Keras library to build a neural network with the following:

One hidden layer of 10 nodes, and a ReLU activation function
Use the adam optimizer and the mean squared error as the loss function.

In [7]:
def regression_model(n_cols):
    """Build and compile the baseline regression network.

    Architecture: an input of n_cols features, one hidden Dense layer of
    10 units with ReLU activation, and a single-unit linear output layer.
    Compiled with the adam optimizer and mean squared error loss.

    Parameters:
        n_cols: number of predictor columns (input features).

    Returns:
        The compiled Sequential model.
    """
    layers = [
        Input(shape=(n_cols,)),
        Dense(10, activation='relu'),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

## Train and evaluate model

In [8]:
def fix(data, test_size=0.3, epochs=100):
    """Run one split / train / evaluate cycle and return the test-set MSE.

    Encapsulates the steps that the assignment asks to repeat N times:
    split the data, train a fresh baseline model, and evaluate it on the
    held-out part.

    Parameters:
        data: DataFrame passed on to split_train_test.
        test_size: share of rows held out for testing (default 0.3).
        epochs: number of training epochs (default 100).

    Returns:
        Mean squared error between the test targets and the predictions.
    """
    # Import the submodule explicitly: a bare `import sklearn` is not
    # guaranteed to expose `sklearn.metrics` on every version.
    from sklearn.metrics import mean_squared_error

    # No random seed is fixed, so every call gets a different random split
    # (and different initial weights); that is what makes repeated runs vary.
    train_x, train_y, test_x, test_y = split_train_test(data, test_size)
    n_cols = train_x.shape[1]
    model = regression_model(n_cols)
    # train
    model.fit(train_x, train_y, epochs=epochs, verbose=0)
    # predict on test data; verbose=0 keeps repeated runs from flooding the
    # output with one progress bar per call
    predicted = model.predict(test_x, verbose=0)
    # manual evaluation
    return mean_squared_error(test_y, predicted)

Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.

In [9]:
# One test-set MSE per independent split/train/evaluate run, 50 runs in total.
mean_square_errors = [fix(concrete_data) for _ in range(50)]

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━

In [11]:
# Show the raw per-run test MSE values.
print(mean_square_errors)

[165.6856440632942, 173.50736194512416, 172.67415617002916, 150.45457055938382, 159.88788779351012, 153.3155034381236, 181.29366018223, 182.78850702387697, 167.81392377727423, 161.7828742051304, 147.05096728385072, 160.3970449642211, 176.06743629878036, 189.48145600571854, 206.72223009010253, 169.3235190126035, 169.18856201155057, 177.83662876419078, 1419.7245340988613, 184.86786730457223, 140.99341838567472, 288.8496930196192, 233.3221718979634, 151.71833788034917, 150.38143786835323, 159.95007224602912, 158.48858528216704, 193.8253011076078, 158.26099065839105, 304.77996271484074, 163.14199508744477, 192.51260547671325, 158.1059203057855, 262.250254284971, 165.29426936783386, 213.17522921313486, 308.1938916446744, 163.44590501526332, 155.4579625217095, 163.41846252627053, 242.33522367974447, 149.315910102813, 158.271893124252, 160.54002149584588, 164.0026705905539, 181.1922223974338, 163.58225192931928, 155.58062312554407, 169.4214366378348, 158.76120417900637]


# A - Report the mean and the standard deviation of the mean squared errors.

In [12]:
# Arithmetic mean of the collected MSE values.
statistics.mean(mean_square_errors)

205.16868517519146

In [13]:
# Sample standard deviation (n - 1 in the denominator) of the MSE values.
statistics.stdev(mean_square_errors)

179.5409263018831

Or using numpy

In [14]:
# Same mean computed with numpy, as a cross-check.
numpy.mean(mean_square_errors)

np.float64(205.16868517519146)

In [15]:
# numpy.std defaults to ddof=0 (population standard deviation), so the result
# is slightly smaller than statistics.stdev.
numpy.std(mean_square_errors)

np.float64(177.73644908400598)

In [16]:
# ddof=1 gives the sample standard deviation, matching statistics.stdev.
numpy.std(mean_square_errors,ddof=1)

np.float64(179.54092630188308)

# A - Interpretation
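- The mean of mean_square_errors is quite large (about 205, i.e. an RMSE of roughly 14, comparable to the standard deviation of Strength itself, about 16.7), so the baseline model predicts poorly.
- The standard deviation of mean_square_errors is also quite large (about 180, almost as large as the mean): results vary strongly from run to run, and a single outlier run (MSE of about 1420) inflates both the mean and the standard deviation.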