## Import Required Libraries

In [1]:
import requests
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense

## Import Data
Dowload data from url if file is not present on machine

In [2]:
cwd = Path.cwd()
filename = 'concrete_data.csv'
files = list()

# Get list of all files in working directory
for pth in cwd.iterdir():
    if pth.is_file():
        files += [pth.name]

# Download file if not found in list
if filename not in files:
    csv_file = open(filename, 'wb')
    file_data = requests.get('https://cocl.us/concrete_data').content
    csv_file.write(file_data)
    csv_file.close()

## Load Data
Load into pandas and check for any issues or inconsistencies

In [3]:
concrete_data = pd.read_csv(filename)
print(f"Shape of df: {concrete_data.shape[0]}r x {concrete_data.shape[1]}c\n")
print("Null Value Check")
print("================")
print(concrete_data.isnull().sum())
print()
concrete_data.describe()

Shape of df: 1030r x 9c

Null Value Check
Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64



Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


***
# Part A:
Build a baseline model

## Data Preprocessing

Separate predictors from target(s) and then split 30% of data for testing.

Normalize the training predictors after splitting and use that fit to normalize test predictors.

In [4]:
# Separate predictors from target columns 
columns = concrete_data.columns
X = concrete_data[columns[columns != 'Strength']]
y = concrete_data['Strength']

# Preprocess pipeline, training set is randomized on each function call
def preprocess(X, y, normalize=False):

    if normalize == True:
        """ The Normalizer class returns a signficantly greater MSE value when it's used"""
        # X = Normalizer().fit_transform(X)
        
        # Normalization works as expected when calculating the values
        X = (X - X.mean()) / X.std()
    
    # Retain 30% as test data with no random seed specified
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3)

    proc_data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    return proc_data

proc_data = preprocess(X,y, normalize=False)

## Define NN Architecture

In [5]:
# Define baseline model
def baseline_model():
    # Create model
    model = Sequential()

    # Add hidden layer of 10 nodes
    model.add(Dense(10, activation='relu', input_shape=(X.shape[1],)))
    model.add(Dense(1))
    
    # Compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

## Model Training

In [6]:
model = baseline_model()
model.fit(
    proc_data['X_train'],
    proc_data['y_train'],
    epochs=50,
    verbose=0)

<keras.callbacks.History at 0x1a77c1fe1c0>

## Model Evaluation

In [7]:
# Evaluate model using test set and get MSE (loss) of model predictions
loss = model.evaluate(
    proc_data['X_test'],
    proc_data['y_test'],
    verbose=0)

# Generate predictions from model using training set
yhat = model.predict(proc_data['X_test'], verbose=0)
# Calculate MSE of true and predicted values
mse = mean_squared_error(proc_data['y_test'], yhat)
print(f"loss: {loss:.8f}")
print(f" MSE: {mse:.8f}")

loss: 273.46676636
 MSE: 273.46678944


## Report MSE Statistics For Iterative Model Generation (n=50)

In [8]:
def get_model_stats(model, n, normalize=False, epochs=50):
    X = concrete_data[columns[columns != 'Strength']]
    y = concrete_data['Strength']
    mse_list = list()

    for i in range(n):
        # Generate random test train split and normalize inputs
        proc_data = preprocess(X,y, normalize=normalize)

        # Fit model to data set
        model.fit(
            proc_data['X_train'],
            proc_data['y_train'],
            epochs=epochs,
            verbose=0)

        # Generate predictions from model using training set
        """
        Can also use the loss value returned from model.evaluate function
        but the return value is *slightly* different!
        """
        yhat = model.predict(proc_data['X_test'], verbose=0)
        # Calculate MSE of true and predicted values
        mse = mean_squared_error(proc_data['y_test'], yhat)

        mse_list.append(mse)

    # Convert to numpy array for mean and stdev calcs
    mse_list = np.array(mse_list)

    print("Baseline Model Prediction MSE Statistics")
    print(f"            Mean value: {mse_list.mean():.2f}")
    print(f"    Standard deviation: {mse_list.std():.2f}")
    print(f"          sample size of n={mse_list.size}")

In [12]:
model = baseline_model()
get_model_stats(model, n=50)

Baseline Model Prediction MSE Statistics
            Mean value: 82.13
    Standard deviation: 89.22
          sample size of n=50


***
# Part B:
Normalize the data

In [13]:
model = baseline_model()
get_model_stats(model, n=50, normalize=True)

Baseline Model Prediction MSE Statistics
            Mean value: 45.61
    Standard deviation: 46.30
          sample size of n=50


**How does the mean of the mean squared errors compare to that from Step A?**

**A:** The mean MSE value decreased considerably when the data set was normalized

***
# Part C:
Increase the number of epochs

In [14]:
model = baseline_model()
get_model_stats(model, n=50, normalize=True, epochs=100)

Baseline Model Prediction MSE Statistics
            Mean value: 42.34
    Standard deviation: 23.08
          sample size of n=50


**How does the mean of the mean squared errors compare to that from Step B?**

**A:** The mean did not change significantly

***
# Part D:
Increase the number of hidden layers

In [15]:
def multilayer_model(n_features, layers=1):
    # Create model
    model = Sequential()

    # Add hidden layer of 10 nodes
    model.add(Dense(10, activation='relu', input_shape=(n_features,)))
    for i in range(layers-1):
        model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
    # Compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [16]:
model = multilayer_model(X.shape[1], layers=3)
get_model_stats(model, n=50, normalize=True)

Baseline Model Prediction MSE Statistics
            Mean value: 34.22
    Standard deviation: 20.11
          sample size of n=50


**How does the mean of the mean squared errors compare to that from Step B?**

**A:** The mean MSE value was lower than Part B without significantly increasing the training time of the model