In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Data Cleaning

I will start with loading and cleaning the data

In [2]:
df = pd.read_csv('concrete_data.csv')

In [3]:
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


I want to find if there any null or duplicated values

In [4]:
df.isna().sum()

Unnamed: 0,0
Cement,0
Blast Furnace Slag,0
Fly Ash,0
Water,0
Superplasticizer,0
Coarse Aggregate,0
Fine Aggregate,0
Age,0
Strength,0


In [5]:
df.duplicated().sum()

25

So there are 25 duplicated values. Let's drop them

In [6]:
df = df.drop_duplicates()

Let's get some info for our data

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1005 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1005 non-null   float64
 1   Blast Furnace Slag  1005 non-null   float64
 2   Fly Ash             1005 non-null   float64
 3   Water               1005 non-null   float64
 4   Superplasticizer    1005 non-null   float64
 5   Coarse Aggregate    1005 non-null   float64
 6   Fine Aggregate      1005 non-null   float64
 7   Age                 1005 non-null   int64  
 8   Strength            1005 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 78.5 KB


There are 1005 non-null values and the data types are correct

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cement,1005.0,278.631343,104.344261,102.0,190.7,265.0,349.0,540.0
Blast Furnace Slag,1005.0,72.043483,86.170807,0.0,0.0,20.0,142.5,359.4
Fly Ash,1005.0,55.536318,64.207969,0.0,0.0,0.0,118.3,200.1
Water,1005.0,182.075323,21.339334,121.8,166.6,185.7,192.9,247.0
Superplasticizer,1005.0,6.033234,5.919967,0.0,0.0,6.1,10.0,32.2
Coarse Aggregate,1005.0,974.376816,77.579667,801.0,932.0,968.0,1031.0,1145.0
Fine Aggregate,1005.0,772.688259,80.340435,594.0,724.3,780.0,822.2,992.6
Age,1005.0,45.856716,63.734692,1.0,7.0,28.0,56.0,365.0
Strength,1005.0,35.250378,16.284815,2.33,23.52,33.8,44.87,82.6


## Part A

In [9]:
# split to features and target
predictors = df.drop(columns=['Strength']).values
target = df['Strength'].values

# split to 70% train and 30% test
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3)

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Input

n_cols = X_train.shape[1]

def regression_model():
    # Create Sequential model
    model = Sequential([
        Input(shape=(n_cols,)),
        Dense(10, activation='relu'),  # Hidden layer with 10 nodes
        Dense(1)  # Output layer
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    return model


In [11]:
model = regression_model()

In [12]:
# fit the model
model.fit(X_train, y_train, epochs=50)

Epoch 1/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - loss: 26512.8672
Epoch 2/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4252.2383
Epoch 3/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3473.9541
Epoch 4/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3084.7170
Epoch 5/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2868.1985
Epoch 6/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2501.6489
Epoch 7/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2369.1455
Epoch 8/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1770.0132
Epoch 9/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1482.8287
Epoch 10/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/

<keras.src.callbacks.history.History at 0x7e2d22db1150>

In [13]:
y_pred = model.predict(X_test).flatten()
mse = mean_squared_error(y_test, y_pred)
print(f"The mean squared error is {mse:.2f}")

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
The mean squared error is 119.34


Repeat the above steps 50 times

In [14]:
mse_list = []
for _ in range(50):
  # split to 70% train and 30% test
  X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3)
  model = regression_model()

  model.fit(X_train, y_train, epochs=50, verbose=0)
  y_pred = model.predict(X_test).flatten()
  mse_list.append(mean_squared_error(y_test, y_pred))


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━

In [15]:
print(f'Mean of MSE is {np.mean(mse_list):.2f}')
print(f'Standard deviation of MSE is {np.std(mse_list):.2f}')

Mean of MSE is 307.83
Standard deviation of MSE is 383.57


## Part B

In [27]:
# Z-normalize the predictors
predictors_norm = (predictors - predictors.mean(axis=0))/predictors.std(axis=0)
# axis=0 must be used because we need the mean and std for each column.
target_norm = (target-target.mean())/target.std()

In [31]:
mse_list_norm = []

for _ in range(50):
  # split to 70% train and 30% test and use normalized predictors
  X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target_norm, test_size=0.3)
  model = regression_model()

  model.fit(X_train, y_train, epochs=50, verbose=0)
  y_pred = model.predict(X_test).flatten()
  mse_list_norm.append(mean_squared_error(y_test, y_pred))

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━

In [32]:
print(f'Mean of MSE is {np.mean(mse_list_norm):.2f}')
print(f'Standard deviation of MSE is {np.std(mse_list_norm):.2f}')

Mean of MSE is 0.27
Standard deviation of MSE is 0.05


Both the error and the standard deviation were reduced.

## Part C

Part B with 100 epochs

In [33]:
mse_list_norm_100 = []

for _ in range(50):
  # split to 70% train and 30% test
  X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target_norm, test_size=0.3)
  model = regression_model()

  model.fit(X_train, y_train, epochs=100, verbose=0) # The same experiment for 100 epochs
  y_pred = model.predict(X_test).flatten()
  mse_list_norm_100.append(mean_squared_error(y_test, y_pred))

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━

In [34]:
print(f'Mean of MSE is {np.mean(mse_list_norm_100):.2f}')
print(f'Standard deviation of MSE is {np.std(mse_list_norm_100):.2f}')

Mean of MSE is 0.20
Standard deviation of MSE is 0.03


Even though we doubled the number of epochs, the MSE and its standard deviation haven't changed significantly.

## Part D

In [35]:
def regression_model_3():
    # Create Sequential model
    model = Sequential([
        Input(shape=(n_cols,)),
        Dense(10, activation='relu'),  # Hidden layer
        Dense(10, activation='relu'),  # Hidden layer
        Dense(10, activation='relu'),  # Hidden layer
        Dense(1)  # Output layer
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    return model

In [36]:
mse_list = []

for _ in range(50):
  # split to 70% train and 30% test
  X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target_norm, test_size=0.3)
  model = regression_model_3()

  model.fit(X_train, y_train, epochs=50, verbose=0)
  y_pred = model.predict(X_test).flatten()
  mse_list.append(mean_squared_error(y_test, y_pred))

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━

In [38]:
print(f'Mean of MSE is {np.mean(mse_list):.2f}')
print(f'Standard deviation of MSE is {np.std(mse_list):.2f}')

Mean of MSE is 0.20
Standard deviation of MSE is 0.03


Even though we made a more complex model, we had similar MSE and standard deviation.