In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Part A. Build a baseline model (5 marks)

Use the Keras library to build a neural network with the following:

- One hidden layer of 10 nodes, and a ReLU activation function

- Use the adam optimizer and the mean squared error  as the loss function.

1. Randomly split the data into a training and test sets by holding 30% of the data for testing. You can use the
train_test_split
helper function from Scikit-learn.

2. Train the model on the training data using 50 epochs.

3. Evaluate the model on the test data and compute the mean squared error between the predicted concrete strength and the actual concrete strength. You can use the mean_squared_error function from Scikit-learn.

4. Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.

5. Report the mean and the standard deviation of the mean squared errors.

In [6]:
# URL of the CSV file
url = "https://cocl.us/concrete_data"

# Load the data into a pandas DataFrame
data = pd.read_csv(url)

# Preview the data
print(data.head())


   Cement  Blast Furnace Slag  Fly Ash  Water  Superplasticizer  \
0   540.0                 0.0      0.0  162.0               2.5   
1   540.0                 0.0      0.0  162.0               2.5   
2   332.5               142.5      0.0  228.0               0.0   
3   332.5               142.5      0.0  228.0               0.0   
4   198.6               132.4      0.0  192.0               0.0   

   Coarse Aggregate  Fine Aggregate  Age  Strength  
0            1040.0           676.0   28     79.99  
1            1055.0           676.0   28     61.89  
2             932.0           594.0  270     40.27  
3             932.0           594.0  365     41.05  
4             978.4           825.5  360     44.30  


In [7]:
# seperate the features and target
X = data.drop('Strength', axis=1)
y = data['Strength']



In [None]:
mse_list = []

for _ in range(50):
  # 70/30 split for training and testing as per the assignment requirement
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

  # Build the Keras model
  model = Sequential()

  # Add a Dense layer with a single neuron (linear regression)
  model.add(Dense(10, input_dim=X_train.shape[1], activation='relu'))
  model.add(Dense(1, activation='linear'))  # Output layer for strength


  # Compile the model
  model.compile(optimizer='adam', loss='mean_squared_error')

  # Training the model for 50 epochs
  model.fit(X_train, y_train, epochs=50, verbose=0)

  #evaluation
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  mse_list.append(mse)

mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)

In [11]:
print(f"Mean Squared Error: {mean_mse}")
print(f"Standard Deviation of MSE: {std_mse}")

# pretty high mse considering the sd of strength in the data

Mean Squared Error: 351.3305720319265
Standard Deviation of MSE: 384.8557303611925


# Part B. Normalize the data (5 marks)

Repeat Part A but use a normalized version of the data. Recall that one way to normalize the data is by subtracting the mean from the individual predictors and dividing by the standard deviation.

How does the mean of the mean squared errors compare to that from Step A?



In [None]:
# Im going to just use the standard scaled from sklearn
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

mse_list = []

for _ in range(50):
  # 70/30 split
  # using x_scaled instead of unscaled data
  X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

  # Build the Keras model
  model = Sequential()

  # Add a Dense layer with a single neuron (linear regression)
  model.add(Dense(10, input_dim=X_train.shape[1], activation='relu'))
  model.add(Dense(1, activation='linear'))  # Output layer for strength


  # Compile the model
  model.compile(optimizer='adam', loss='mean_squared_error')

  # Training the model for 50 epochs
  model.fit(X_train, y_train, epochs=50, verbose=0)

  #evaluation
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  mse_list.append(mse)

mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)

In [14]:
print(f"Mean of the Mean Squared Error: {mean_mse}")

# Still a high mean squared error
# it is similar to the mean of the list from the models that trained on nonscaled data

Mean of the Mean Squared Error: 357.6950670120966


# C. Increate the number of epochs (5 marks)

Repeat Part B but use 100 epochs this time for training.

How does the mean of the mean squared errors compare to that from Step B?

In [None]:
mse_list = []

for _ in range(50):
  # 70/30 split
  # using x_scaled instead of unscaled data
  X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

  # Build the Keras model
  model = Sequential()

  # Add a Dense layer with a single neuron (linear regression)
  model.add(Dense(10, input_dim=X_train.shape[1], activation='relu'))
  model.add(Dense(1, activation='linear'))  # Output layer for strength


  # Compile the model
  model.compile(optimizer='adam', loss='mean_squared_error')

  # Training the model for 100 epochs
  model.fit(X_train, y_train, epochs=100, verbose=0)

  #evaluation
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  mse_list.append(mse)

mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)

In [16]:
print(f"Mean of the Mean Squared Error: {mean_mse}")
# much lower mse than part b
# might have to train a model for more epochs to see a lower error

Mean of the Mean Squared Error: 158.07195907918435


# D. Increase the number of hidden layers (5 marks)

Repeat part B but use a neural network with the following instead:

- Three hidden layers, each of 10 nodes and ReLU activation function.

How does the mean of the mean squared errors compare to that from Step B?

In [None]:
mse_list = []

for _ in range(50):
  # 70/30 split
  # using x_scaled instead of unscaled data
  X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

  # Build the Keras model
  model = Sequential()

  # three hidden layers with 10 nodes
  model.add(Dense(10, input_dim=X_train.shape[1], activation='relu'))
  model.add(Dense(10, activation='relu'))
  model.add(Dense(10, activation='relu'))
  model.add(Dense(1, activation='linear'))  # Output layer for strength


  # Compile the model
  model.compile(optimizer='adam', loss='mean_squared_error')

  # Training the model for 50 epochs
  model.fit(X_train, y_train, epochs=50, verbose=0)

  #evaluation
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  mse_list.append(mse)

mean_mse = np.mean(mse_list)

In [18]:
print(f"Mean of the Mean Squared Error: {mean_mse}")

# The mean is lower than both b and c
# The data is complex and more epochs and more neurons will be neccessary for a model to predict better

Mean of the Mean Squared Error: 124.0995156655636
