- Make sure not to forget the biases

In [210]:
import numpy as np

In [211]:
class Activation:
    
    def __init__(self, activation):
        self.activation = activation
        
    def output_scalar(self, x):
        if self.activation == 'sigmoid':
            return 1/(1+np.exp(-x))
        elif self.activation == 'relu':
            return np.max(0, x)
        else:
            print('Activation function not yet defined')
            return None
        
    def gradient_scalar(self, x):
        if self.activation == 'sigmoid':
            return x*(1-x)
        elif self.activation == 'relu':
            if x > 0:
                return 1
            else:
                return 0
        else:
            print('Activation function not yet defined')
            return None
        
    def output_vectorized(self, x):
        if self.activation == 'sigmoid':
            return 1/(1+np.exp(-x))
        elif self.activation == 'relu':
            return np.maximum(0, x)
        elif self.activation == 'linear':
            return x
        else:
            print('Activation function not yet defined')
            return None
        
    def gradient_vectorized(self, x):
        if self.activation == 'sigmoid':
            return x*(1-x)
        elif self.activation == 'relu':
            return 1 * (x > 0)
        elif self.activation == 'linear':
            return np.ones(len(x))
        else:
            print('Activation function not yet defined')
            return None
            
        

In [230]:
class NeuralNetwork:
    
    def __init__(self, nodes_lst, input_shape, activations):
        self.nodes_lst = nodes_lst
        self.input_shape = input_shape
        self.activations = [Activation(a) for a in activations] # List of Activation objects
        self.w = self.initialize_weights(nodes_lst, input_shape)
        
    def initialize_weights(self, nodes_lst, input_shape):
        w_lst = []
        w_lst.append(np.random.normal(size = (input_shape + 1, nodes_lst[0]))) # + 1 because we are including the biases as the first row of the weights matrix for each layer
        for x, y in zip(nodes_lst, nodes_lst[1:]):
            w_lst.append(np.random.normal(size = (x + 1, y)))                  # Again, "+ 1" to account for the biases
        
        return w_lst
    
    def get_w_shapes(self):
        return [w_layer.shape for w_layer in self.w]
    
    # Forward propagation through a single layer
    # x - input to the current layer
    def propagate_forward(self, x, w, activation_object):
        #print('x: ', x.shape)
        x = np.append(1, x) # To account for the biases; first element of the input to a layer needs to be a one
        #print('x: ', x.shape)
        #print('w: ', w.shape)
        weighted_sum = np.dot(x, w)
        output = activation_object.output_vectorized(weighted_sum)
        
        return weighted_sum, output
    
    # A "full" forward pass, from input layer to output layer
    # x - input to the network (i.e. the first layer)
    def forward_pass(self, x):
        
        weighted_sum_lst = []
        output_lst = []
        
        for weights, activation in zip(self.w, self.activations):
            weighted_sum, output = self.propagate_forward(x, weights, activation)
            weighted_sum_lst.append(weighted_sum)
            output_lst.append(output)
            x = output # Output of this layer becomes input to next layer
            
        return weighted_sum_lst, output_lst
    
    # Calculating the gradients of the final layer
    # x            - input to the final layer (vector)
    # weighted_sum - the weighted sums of all the nodes in the final layer (vector)
    # y            - the output of the final layer
    # t            - the targets (vector)
    # w            - the weights of the final layer (matrix) - THIS ONE DOES NOT SEEM TO BE NEEDED!
    # activation   - the activation of the final layer (activation object)
    def gradients_final_layer(self, x, weighted_sum, y, t, w, activation):
        dE_dy = -(t - y)
        dy_dweighted_sum = activation.gradient_vectorized(weighted_sum)
        dweighted_sum_dw = np.append(1, x) # Accounting for the bias node
#        print(dE_dy.shape)
#        print(dy_dweighted_sum.shape)
#        print(dweighted_sum_dw.shape)

        delta = - np.reshape(dE_dy * dy_dweighted_sum, (1, -1))
        dE_dw = np.dot(np.reshape(dweighted_sum_dw, (-1, 1)), delta)
        
        return delta, dE_dw
    
    # Calculating the gradients of a hidden layer
    # x            - input to the hidden layer (vector)
    # weighted_sum - the weighted sums of all the nodes in the hidden layer (vector)
    # y            - the output of the hidden layer
    # delta        - the delta from the layer after the hidden layer
    # w_hidden     - the weights of the hidden layer (matrix) - THIS ONE DOES NOT SEEM TO BE NEEDED!
    # w_next       - the weights of the next layer (matrix)
    # activation   - the activation of the hidden layer (activation object)
    def gradients_hidden_layer(self, x, weighted_sum, y, delta, w_next, activation):
        #print('INSIDE w_next: ', w_next.shape)
        #print('INSIDE delta: ', delta.shape)
        w_next = w_next[1:]
        #print('INSIDE w_next: ', w_next.shape)
        dE_dy = - np.dot(w_next, np.reshape(delta, (-1, 1)))
        dy_dweighted_sum = activation.gradient_vectorized(weighted_sum)
        dweighted_sum_dw = np.reshape(np.append(1, x), (-1, 1))
        
        #print('INSIDE ', dE_dy.shape, dy_dweighted_sum.shape)
        
        delta = - np.reshape(np.reshape(dE_dy, (-1)) * dy_dweighted_sum, (1, -1))
        #print('INSIDE dweighted_sum_dw: ', dweighted_sum_dw.shape)
        #print('INSIDE delta: ', delta.shape)
        dE_dw = np.dot(dweighted_sum_dw, delta)
        
        #print(dE_dy.shape)
        #print(dy_dweighted_sum.shape)
        #print(dweighted_sum_dw.shape)
        
        return delta, dE_dw
        
    # Compute all gradients (w.r.t. the weights) of the network
    # x - input to the network
    # t - targets
    def compute_all_gradients(self, x, t):
        weighted_sum_lst, output_lst = self.forward_pass(x)
        output_lst = [x] + output_lst # Adding the input to the beginning of the output_lst for convenience
        delta_lst = []
        dE_dw_lst = []
        
        # Compute gradients of the final layer
        delta, dE_dw = model.gradients_final_layer(x = output_lst[-2], 
                                                   weighted_sum = weighted_sum_lst[-1], 
                                                   y = output_lst[-1], 
                                                   t = t, 
                                                   w = model.w[-1], 
                                                   activation = model.activations[-1])
        
        delta_lst.append(delta)
        dE_dw_lst.append(dE_dw)
        
        for i in range(1, len(weighted_sum_lst)):
            print('i = ', i)
            print('output_lst[-(i+1)] = ', output_lst[-(i+1)])
            delta, dE_dw = model.gradients_hidden_layer(x = output_lst[-(i+2)], 
                                                        weighted_sum = weighted_sum_lst[-(i+1)], 
                                                        y = output_lst[-(i+1)], 
                                                        delta = delta_lst[-1], # The 'last' delta produced
                                                        w_next = model.w[-i], 
                                                        activation = model.activations[-(i+1)])
            
            
#            delta_hidden, dE_dw = model.gradients_hidden_layer(x = x, 
 #                                     weighted_sum = weighted_sum_lst[-2], 
  #                                    y = output_lst[-2], 
   #                                   delta = delta_final,
    #                                  w_next = model.w[-1], 
     #                                 activation = model.activations[-2])
            
            
            
            delta_lst.append(delta)
            dE_dw_lst.append(dE_dw)
            
        delta_lst = list(reversed(delta_lst)) # Making sure an increases index means we're going from input layer to output layer
        dE_dw_lst = list(reversed(dE_dw_lst)) # Making sure an increases index means we're going from input layer to output layer
        
        return delta_lst, dE_dw_lst

        
        
    def update_weights(self, dE_dw_lst, learning_rate):
        self.w = [w - learning_rate*dE_dw for w, dE_dw in zip(self.w, dE_dw_lst)]
        return
    
    def train_model(self, x, t, num_epochs, learning_rate):
        
        history = []
    
        for ep in range(num_epochs):
            _, output_lst = self.forward_pass(x)
            output_final_layer = output_lst[-1]
            error = np.dot(t-output_final_layer, t-output_final_layer)
            history.append(error)
                
            delta_lst, dE_dw_lst = self.compute_all_gradients(x, t)
            self.update_weights(dE_dw_lst, learning_rate)
            
        return history
        
            
        
        
        
        

In [231]:
lst = [np.array(range(3)), np.array(range(5))]
print(lst)
for a in lst:
    print(a)
    a = a+1
print(lst)

[array([0, 1, 2]), array([0, 1, 2, 3, 4])]
[0 1 2]
[0 1 2 3 4]
[array([0, 1, 2]), array([0, 1, 2, 3, 4])]


In [232]:
### Complete forward pass ###
model = NeuralNetwork(nodes_lst = [4, 3], input_shape = 5, activations = ['sigmoid', 'relu'])
print('weight shapes: ', model.get_w_shapes())
x = np.array(list(range(5)))
weighted_sum_lst, output_lst = model.forward_pass(x)
print('weigted_sum_lst: ', len(weigted_sum_lst), weigted_sum_lst[0].shape, weigted_sum_lst[1].shape)
print('output_lst: ', len(output_lst), output_lst[0].shape, output_lst[1].shape)

weight shapes:  [(6, 4), (5, 3)]
weigted_sum_lst:  2 (4,) (3,)
output_lst:  2 (4,) (3,)


In [233]:
### Calculate gradients of the last layer ###
t = weighted_sum_lst[-1] + 0.1 # Simulating the targets
delta_final, dE_dw = model.gradients_final_layer(x = output_lst[-2], 
                                           weighted_sum = weighted_sum_lst[-1], 
                                           y = output_lst[-1], 
                                           t = t, 
                                           w = model.w[-1], 
                                           activation = model.activations[-1])
print('delta_final = ', delta_final.shape)
print('dE_dw = ', dE_dw.shape)
print('This seems to be OK!')

delta_final =  (1, 3)
dE_dw =  (5, 3)
This seems to be OK!


In [234]:
### Calculate gradients of the last hidden layer ###
x = np.array(list(range(5)))
delta_hidden, dE_dw = model.gradients_hidden_layer(x = x, 
                                      weighted_sum = weighted_sum_lst[-2], 
                                      y = output_lst[-2], 
                                      delta = delta_final,
                                      w_next = model.w[-1], 
                                      activation = model.activations[-2])

print('delta_hidden = ', delta_hidden.shape)
print('dE_dw = ', dE_dw.shape)
print('This seems to be OK!')

delta_hidden =  (1, 4)
dE_dw =  (6, 4)
This seems to be OK!


In [235]:
### Compute all gradients ###
x = np.array(list(range(5)))
t = np.array(list(range(3)))+1
delta_lst, dE_dw_lst = model.compute_all_gradients(x, t)
print('delta_lst = ', len(delta_lst), delta_lst[0].shape, delta_lst[1].shape)
print('dE_dw_lst = ', len(dE_dw_lst), dE_dw_lst[0].shape, dE_dw_lst[1].shape)

i =  1
output_lst[-(i+1)] =  [0.29794241 0.98041342 0.00770862 0.97329948]
delta_lst =  2 (1, 4) (1, 3)
dE_dw_lst =  2 (6, 4) (5, 3)


In [236]:
### Update weights ###
print('w before: ', model.get_w_shapes())
print(dE_dw_lst, len(dE_dw_lst))
model.update_weights(dE_dw_lst = dE_dw_lst, learning_rate = 0.5)
print('w after :', model.get_w_shapes())

w before:  [(6, 4), (5, 3)]
[array([[0.66236278, 0.13185854, 0.93901224, 2.01088436],
       [0.        , 0.        , 0.        , 0.        ],
       [0.66236278, 0.13185854, 0.93901224, 2.01088436],
       [1.32472557, 0.26371708, 1.87802449, 4.02176873],
       [1.98708835, 0.39557562, 2.81703673, 6.03265309],
       [2.64945114, 0.52743416, 3.75604897, 8.04353746]]), array([[ 0.        , -0.30986135,  0.        ],
       [ 0.        , -0.09232084,  0.        ],
       [ 0.        , -0.30379223,  0.        ],
       [ 0.        , -0.0023886 ,  0.        ],
       [ 0.        , -0.30158789,  0.        ]])] 2
w after : [(6, 4), (5, 3)]


In [243]:
x = np.array(list(range(5)))
t = np.array(list(range(3)))+1
history = model.train_model(x, t, num_epochs = 3, learning_rate = 0.001)
print(history)

i =  1
output_lst[-(i+1)] =  [nan nan nan nan]
i =  1
output_lst[-(i+1)] =  [nan nan nan nan]
i =  1
output_lst[-(i+1)] =  [nan nan nan nan]
[nan, nan, nan]


