In [18]:
import numpy as np
import pandas as pd
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Dropout
from matplotlib import pyplot as plt
from numpy import sqrt

## How training a neural network with stochastic gradient descent works.

Stochastic Gradient Descent is a way to reduce the burden of computing and accelerate the process. 

Considering we have a complex logistic regression that uses 3,000 genes to predict if the person is sick or not. Then we will have 3,000 derivatives to plug into the gradient descent equation. If we have 1,000,000 data entries, then we need to calculate 1,000,000 terms for each of 3,000 derivatives, whichi means we have to calculate 3,000,000,000 for each step, and normally, it's common to calculate at least 1,000 steps. In this way, the computation requirement is demanding.

Stochstic Gradient Descent will randomly pick one sample for each step, and just use that one sample to calculate the derivatives. So it largely reduce the number of derivatives we need to calculate. This leads to a drunken walk towards the minima but is much quicker to compute.

## 1. Data Pre-processing

In [3]:
df = pd.read_csv("Concrete_Data.csv")
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Cement (component 1)(kg in a m^3 mixture)              1030 non-null   float64
 1   Blast Furnace Slag (component 2)(kg in a m^3 mixture)  1030 non-null   float64
 2   Fly Ash (component 3)(kg in a m^3 mixture)             1030 non-null   float64
 3   Water  (component 4)(kg in a m^3 mixture)              1030 non-null   float64
 4   Superplasticizer (component 5)(kg in a m^3 mixture)    1030 non-null   float64
 5   Coarse Aggregate  (component 6)(kg in a m^3 mixture)   1030 non-null   float64
 6   Fine Aggregate (component 7)(kg in a m^3 mixture)      1030 non-null   float64
 7   Age (day)                                              1030 non-null   int64  
 8   Concrete compressive strength(MPa, megapascals)  

There are no Null values, Dtypes are correct.

Now let's make the column names more readable.

In [9]:
df.columns = [i.split('(')[0].strip() for i  in df.columns]

Index(['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age',
       'Concrete compressive strength'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


## 2. Split and standarize the data

• The target variable is Concrete compressive strength.

• The reason to scaling the data is to stabilize the Gradient Descent. 

A target variable with a large spread of values, may result in large error gradient values causing weight values to change dramatically, making the learning process unstable.



In [13]:
X = df.drop(columns=['Concrete compressive strength'])
y = df[['Concrete compressive strength']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
scaler_x = preprocessing.StandardScaler().fit(X_train)  
X_train = scaler_x.transform(X_train)
X_test = scaler_x.transform(X_test)

scaler_y = preprocessing.StandardScaler().fit(y_train)  
y_train = scaler_y.transform(y_train)
y_test = scaler_y.transform(y_test)

To accelerate training process, normalize the data.

To avoid the data leakage, normalize the data after splitting them.

In [None]:
X_train_norm = preprocessing.normalize(pd.DataFrame(X_train))
X_train_norm = pd.DataFrame(X_train_norm, columns=X.columns)

y_train_norm = preprocessing.normalize([np.array(y_train)]).T
y_train_norm = pd.DataFrame(y_train_norm, columns=['Concrete compressive strength(MPa, megapascals) '])

X_test_norm = preprocessing.normalize(pd.DataFrame(X_test))
X_test_norm = pd.DataFrame(X_test_norm, columns=X.columns)

y_test_norm = preprocessing.normalize([np.array(y_test)]).T
y_test_norm = pd.DataFrame(y_test_norm, columns=['Concrete compressive strength(MPa, megapascals) '])

## 3. Neural Network
• Choose the Sequential MLP.

• The input is 8, the hidden layer is 5, output is 1.

The number of neurons is concerned should be 
between the input layer size and the output layer size, usually 2/3 of the input size.

Because the number of input features are 8, the number of output is 1, so we try the above architect.

• To avoid overfitting, a dropout layer with 10% dropout is inserted between the first hidden layer and the output layer.

In [16]:
# Define model
model = Sequential()
n_features = X_train.shape[1]
model.add(Dense(8, activation='relu', input_shape=(n_features,)))
model.add(Dense(5, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1))
# Model Text Description
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 5)                 45        
                                                                 
 dropout (Dropout)           (None, 5)                 0         
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 123
Trainable params: 123
Non-trainable params: 0
_________________________________________________________________


In [22]:
# compile the model
sgd = SGD(learning_rate=0.001, momentum=0.8)
model.compile(optimizer=sgd, loss='mse', metrics=['mae'])

# evaluate the model
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)

mse, mae = model.evaluate(X_test, y_test, verbose=0)
print('MSE: %.3f, RMSE: %.3f, MAE: %.3f' % (mse, sqrt(mse), mae))

MSE: 0.135, RMSE: 0.368, MAE: 0.295


In [24]:
# make a prediction
y_pred = model.predict(X_test)
result = pd.DataFrame(y_test).reset_index().drop(['index'],axis=1)
orig =pd.DataFrame(model.predict(X_test))
outcome = pd.concat([result,orig],axis =1)
outcome.columns = ["pred","original"]
outcome

Unnamed: 0,pred,original
0,1.011704,0.457006
1,1.189101,0.439865
2,2.292640,1.625688
3,-0.033098,0.183138
4,-1.502110,-0.907780
...,...,...
201,1.203340,1.742473
202,-1.015011,-0.659862
203,0.127093,0.276450
204,2.284334,1.632775
