In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
import pandas as pd
import math
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error 

Some of the ideas and usage patterns in this notebook are borrowed from deeplearning.ai.

In [None]:
import numpy as np
import math
class DNNR:
  """Fully connected feed-forward neural network for regression, written with NumPy.

  The network is trained by mini-batch gradient descent on the squared error
  between predictions and targets, with optional inverted dropout on the hidden
  layers and optional L2 weight penalty.

  Parameters
  ----------
  learning_rate : float
      Base step size. It is never modified by training; when ``decay`` is on, the
      effective step size is derived from it each epoch.
  epochs : int
      Number of passes over the mini-batches performed by ``fit``.
  layers_dims : sequence of int
      Layer widths, input layer first and output layer last.
  layers_activation : sequence of str
      One activation per weight layer (``len(layers_dims) - 1`` entries), each of
      'relu', 'sigmoid', 'tanh', 'identity' or 'leaky_relu'.
  weight_init : {'Xavier', 'He'}
      Scale of the random normal weights: ``1/sqrt(n_in)`` or ``sqrt(2/n_in)``.
  keep_prob : float in (0, 1]
      Probability of keeping a hidden unit when dropout is applied.
  regularization : None or 'l2'
      With ``None`` the penalty strength ``lam`` is forced to 0.
  lam : float
      L2 penalty strength; contributes ``lam / m * W`` to the weight gradient.
  task : str
      Dropout is applied only while this equals "train".
  batch : int or None
      Mini-batch size; ``None`` means one batch containing every sample.
  beta1, beta2, epsilon : float
      Adam moment decay rates and denominator offset.
  solver : {'sgd', 'adam'}
      Parameter update rule.
  decay : bool
      When true, the step size is ``learning_rate / (1 + decay_rate * floor(epoch / interval))``.
  decay_rate, interval : float, int
      Parameters of the schedule above.
  verbose : int
      Stored on the instance; no method of this class reads it.

  Raises
  ------
  AssertionError
      If the configuration is rejected by ``error_msg``.
  """

  # Slope of the leaky ReLU for non-positive pre-activations
  _LEAKY_SLOPE = 0.01

  def __init__(self, learning_rate = 0.01, epochs = 3000, layers_dims = [13, 10, 5, 1], layers_activation = ['relu','relu','leaky_relu'], weight_init = 'Xavier',
               keep_prob = 0.9, regularization = 'l2', lam = 0.05, task = "train", batch = None, beta1 = 0.99, beta2 = 0.999, epsilon = 1e-8, solver='sgd', 
               decay = False, decay_rate = 0.0001, interval = 1000, verbose = 0):
    
    # Reject inconsistent settings before any state is created
    self.error_msg(layers_dims, layers_activation, weight_init, keep_prob, regularization, solver)
    self.learning_rate = learning_rate
    self.epochs = epochs
    self.keep_prob = keep_prob
    self.regularization = regularization
    self.lam = lam
    self.weight_init = weight_init
    # Copy the sequences so the shared default lists are never aliased by an instance
    self.Activation = list(layers_activation)
    self.layers_dims = list(layers_dims)
    self.task = task
    self.batch = batch
    self.solver = solver
    self.beta1 = beta1
    self.beta2 = beta2
    # Number of Adam updates performed so far; the first update must use t = 1
    self.t = 0
    self.epsilon = epsilon
    self.decay = decay
    self.decay_rate = decay_rate
    self.interval = interval
    self.verbose = verbose
    # Index of the epoch currently being trained; drives the decay schedule
    self._epoch = 0
    self.W = None
    self.b = None
    self.grads = None
    self.dA = None
    self.dW = None
    self.db = None
    self.caches = None
    self.L = None
    self.D = None
    self.v_dw = None
    self.v_db = None
    self.s_dw = None
    self.s_db = None
    self.initialize(self.layers_dims, self.Activation)
    
  
  def error_msg(self, layers_dims, layers_activation, weight_init, keep_prob, regularization, solver):
    """Validate the constructor settings; raises AssertionError on the first invalid one."""

    # one activation per weight layer
    assert len(layers_dims) == len(layers_activation) + 1, "Wrong lengths of layers_dims and layers_activation:("

    # whether the activation functions are valid
    valid_activations = ['relu','sigmoid','tanh','identity','leaky_relu']
    assert all(activation in valid_activations for activation in layers_activation), "Invalid activation function:("

    # whether the weight_init method is valid
    assert weight_init in ['Xavier','He'], "Invalid weight_init method:("

    assert 0 < keep_prob <= 1.0, "keep_prob should be between 0 and 1:("

    assert regularization in [None, 'l2'], "Invalid regularization method:("

    assert solver in ['sgd','adam'], "Invalid solver:("

  
  def initialize(self, layers_dims, layers_activation):
    """Create fresh weights, biases, Adam moments and per-layer work buffers.

    Raises ValueError if ``self.weight_init`` is neither 'Xavier' nor 'He'.
    """
    
    n_dims = len(layers_dims)
    # (n_out, n_in) of every weight matrix, first hidden layer first
    shapes = [(layers_dims[l], layers_dims[l - 1]) for l in range(1, n_dims)]

    if self.weight_init == 'Xavier':
      self.W = [np.random.randn(n_out, n_in) / np.sqrt(n_in) for n_out, n_in in shapes]
    elif self.weight_init == 'He':
      # He scaling multiplies by sqrt(2 / n_in); dividing would blow the weights up
      self.W = [np.random.randn(n_out, n_in) * np.sqrt(2 / n_in) for n_out, n_in in shapes]
    else:
      raise ValueError("Invalid weight_init method:(")
    self.L = len(self.W)
    self.layers_dims = list(layers_dims)

    # Adam first and second moment estimates, restarted together with the step counter
    self.v_dw = [np.zeros(shape) for shape in shapes]
    self.v_db = [np.zeros((n_out, 1)) for n_out, _ in shapes]
    self.s_dw = [np.zeros(shape) for shape in shapes]
    self.s_db = [np.zeros((n_out, 1)) for n_out, _ in shapes]
    self.t = 0

    # Biases and per-layer buffers
    self.b = [np.zeros((n_out, 1)) for n_out, _ in shapes]
    self.Activation = list(layers_activation)
    self.caches = [[] for _ in range(n_dims - 1)]
    self.dA = [[] for _ in range(n_dims)]
    self.dW = [[] for _ in range(n_dims - 1)]
    self.db = [[] for _ in range(n_dims - 1)]
    # One dropout mask per hidden layer
    self.D = [[] for _ in range(n_dims - 2)]

    # For regularization: if there's no regularization, set lambda to 0
    if self.regularization is None: self.lam = 0

  @staticmethod
  def _activate(Z, activation):
    """Apply the named activation element-wise to the pre-activations Z."""
    if activation == "sigmoid":
      return 1/(1 + np.exp(-Z))
    if activation == "relu":
      return np.maximum(0, Z)
    if activation == "identity":
      return Z
    if activation == "tanh":
      return np.tanh(Z)
    if activation == "leaky_relu":
      return np.fmax(DNNR._LEAKY_SLOPE * Z, Z)
    raise ValueError("Invalid activation function:(")

  @staticmethod
  def _activation_backward(dA, Z, activation):
    """Return dZ = dA * activation'(Z) without modifying dA."""
    if activation == "relu":
      return np.where(Z <= 0, 0.0, dA)
    if activation == "sigmoid":
      s = 1/(1 + np.exp(-Z))
      return dA * s * (1 - s)
    if activation == "tanh":
      return dA * (1 - np.tanh(Z) ** 2)
    if activation == "identity":
      return dA
    if activation == "leaky_relu":
      # scale (not replace) the incoming gradient where the unit was on its leaky side
      return np.where(Z <= 0, DNNR._LEAKY_SLOPE * dA, dA)
    raise ValueError("Invalid activation function:(")

  def forward(self, X):
    """Propagate X (features x samples) through the network and return the output activations.

    While ``self.task == "train"`` the masks in ``self.D`` are applied to every
    hidden layer, so ``dropout`` must have been called for the same number of
    samples. The input and pre-activation of each layer are stored in
    ``self.caches`` for ``backward``.
    """

    A = X

    for l in range(self.L):

        A_prev = A 

        # the linear part
        Z = np.dot(self.W[l], A_prev) + self.b[l]

        # a = activation(z)
        A = self._activate(Z, self.Activation[l])
        
        # Inverted dropout on hidden layers only, and only in training mode
        if l != self.L - 1 and self.task == "train":
          A = (A * self.D[l]) / self.keep_prob
  
        # keep those in caches --> don't need to recompute it during back propagation
        self.caches[l] = [A_prev, Z]

    return A
  
 
  def backward(self, Y_predict, Y):
    """Back-propagate the squared-error gradient and fill ``self.dW`` / ``self.db``.

    Uses the values cached by the most recent ``forward`` call.
    """
  
    m = Y_predict.shape[1]

    # gradient of 1/2 * (Y_predict - Y)^2 with respect to the output activations
    self.dA[-1] = (Y_predict - Y)
    
    for l in range(self.L - 1, -1, -1):

        A_prev, Z = self.caches[l]
        W = self.W[l]

        dZ = self._activation_backward(self.dA[l + 1], Z, self.Activation[l])

        # regularization is implemented
        self.dW[l] = 1/m * np.dot(dZ, A_prev.T) + (self.lam / m) * W
        self.db[l] = 1/m * np.sum(dZ, axis = 1, keepdims = True)
        dA_prev = np.dot(W.T, dZ)
        
        # Mirror the forward pass: the mask is applied only when forward applied it
        if l != 0 and self.task == "train":
          dA_prev = (dA_prev * self.D[l - 1]) / self.keep_prob
        self.dA[l] = dA_prev
        
  def _current_learning_rate(self):
    """Return the step size for the current epoch; ``self.learning_rate`` is left untouched."""
    if not self.decay:
      return self.learning_rate
    epoch = getattr(self, "_epoch", 0)
    return self.learning_rate / (1 + self.decay_rate * math.floor(epoch / self.interval))
    
  def update_sgd(self):
    """Take one plain gradient-descent step using ``self.dW`` / ``self.db``."""
    
    step = self._current_learning_rate()

    for l in range(self.L):
        self.W[l] = self.W[l] - step * self.dW[l]
        self.b[l] = self.b[l] - step * self.db[l]
  
  def update_adam(self):
    """Take one Adam step using ``self.dW`` / ``self.db``; advances the step counter ``self.t``."""

    step = self._current_learning_rate()
    # The counter is advanced here so the bias correction of the first update uses t = 1
    self.t += 1

    for l in range(self.L):
      # The corrected version of v and s -- a combination of momentum and rmsprop
      self.v_dw[l] = self.beta1 * self.v_dw[l] + (1 - self.beta1) * self.dW[l]
      self.v_db[l] = self.beta1 * self.v_db[l] + (1 - self.beta1) * self.db[l]
      v_dw_c = self.v_dw[l] / (1 - self.beta1 ** self.t)
      v_db_c = self.v_db[l] / (1 - self.beta1 ** self.t)
      self.s_dw[l] = self.beta2 * self.s_dw[l] + (1 - self.beta2) * np.square(self.dW[l])
      self.s_db[l] = self.beta2 * self.s_db[l] + (1 - self.beta2) * np.square(self.db[l])
      s_dw_c = self.s_dw[l] / (1 - self.beta2 ** self.t)
      s_db_c = self.s_db[l] / (1 - self.beta2 ** self.t)

      # Update w and b
      self.W[l] = self.W[l] - step * v_dw_c / (np.sqrt(s_dw_c) + self.epsilon)
      self.b[l] = self.b[l] - step * v_db_c / (np.sqrt(s_db_c) + self.epsilon)


  def dropout(self, m):
    """Draw a fresh boolean keep-mask of m columns for every hidden layer."""

    for hidden_layer in range(self.L - 1):
      self.D[hidden_layer] = np.random.rand(self.layers_dims[hidden_layer + 1], m) < self.keep_prob
  
  def fit(self, X, Y):
    """Train on X (samples x features) and Y (one target per sample).

    The samples are shuffled once and cut into mini-batches that are reused for
    every epoch. Raises ValueError for an empty data set, mismatched sample
    counts, or a batch size below 1.
    """

    X = np.array(X).T
    Y = np.array(Y).reshape(1, -1)

    m = Y.shape[1]
    if m == 0:
      raise ValueError("Cannot fit on an empty data set")
    if X.ndim != 2 or X.shape[1] != m:
      raise ValueError("X and Y must contain the same number of samples")

    # Resolve the batch size locally so self.batch keeps the value the user configured
    batch_size = m if self.batch is None else self.batch
    if batch_size < 1:
      raise ValueError("batch must be a positive integer")

    # Random pick the mini_batches; the last one holds the remainder when m is not a multiple
    permutation = np.random.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]
    mini_batches = [(shuffled_X[:, start : start + batch_size], shuffled_Y[:, start : start + batch_size])
                    for start in range(0, m, batch_size)]

    for epoch in range(self.epochs):
      self._epoch = epoch
      for (minibatch_X, minibatch_Y) in mini_batches:
        self.dropout(minibatch_Y.shape[1])
        Y_predict = self.forward(minibatch_X)
        self.backward(Y_predict, minibatch_Y)

        # sgd and adam update methods  
        if self.solver == 'sgd':
          self.update_sgd()
        elif self.solver == 'adam':
          self.update_adam()
  
  def MSE(self, X, Y):
    """Return the mean squared error of the network on X (samples x features) and Y.

    Dropout is switched off for the evaluation and the previous ``task`` value
    is restored afterwards, even if the forward pass raises.
    """

    X = np.array(X).T
    Y = np.array(Y).reshape(1, -1)

    previous_task = self.task
    self.task = 'test'
    try:
      Y_predict = self.forward(X)
    finally:
      self.task = previous_task

    return np.mean((Y_predict - Y) ** 2)  
  

In [None]:
# Fixed seed so the shuffle, and therefore the train/test split, is the same on every run
SEED = 42
boston = pd.read_csv("http://43.143.180.76/data/boston.csv").sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# Integer split position (70% of the rows, rounded up). A float used with the
# inclusive label slices .loc[:n] / .loc[n:] puts row n in BOTH sets whenever
# len(boston) * 0.7 is a whole number; half-open positional slices cannot overlap.
n = math.ceil(len(boston) * 0.7)
columns = list(boston.columns)

# Every column except the last is a feature; the last column is the target
X_train = np.array(boston.iloc[:n][columns[:-1]])
Y_train = np.array(boston.iloc[:n][columns[-1]])
X_test = np.array(boston.iloc[n:][columns[:-1]])
Y_test = np.array(boston.iloc[n:][columns[-1]])

# The two sets must partition the table exactly
assert len(X_train) + len(X_test) == len(boston)

print("X.shape:", X_train.shape)
print("Y.shape", Y_train.shape)

X.shape: (355, 13)
Y.shape (355,)


In [None]:
# Train the hand-written network with its default hyper-parameters, then report
# the mean squared error on the training split followed by the test split.
model = DNNR()
model.fit(X_train, Y_train)
mse_train, mse_test = model.MSE(X_train, Y_train), model.MSE(X_test, Y_test)
print(mse_train, mse_test, sep="\n")

6.522385101882724
8.71071611879156


In [None]:
# Baseline: scikit-learn's MLPRegressor with the same iteration budget (3000) and
# initial learning rate (0.01) as the DNNR defaults, scored on both splits.
MLP = MLPRegressor(max_iter = 3000, learning_rate_init=0.01).fit(X_train, Y_train)
MLP_Y_train_predict, MLP_Y_test_predict = MLP.predict(X_train), MLP.predict(X_test)
mse_mlp_train = mean_squared_error(Y_train, MLP_Y_train_predict)
mse_mlp_test = mean_squared_error(Y_test, MLP_Y_test_predict)
print(mse_mlp_train, mse_mlp_test, sep="\n")

6.994014705859778
8.861343306833792


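- Comparing the custom DNN with scikit-learn's MLP, the results are fairly similar when the notebook is run multiple times: both MSEs are around 10. Individual runs differ because the weights are initialized randomly.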