In [1]:
import numpy as np
import pandas as pd

### Back propagation formula
The four equations for doing back propagation:
$$\begin{eqnarray}\delta^L&=&\nabla_aC\odot\sigma'(z^L) \\
\delta^l&=&((w^{l+1})^T\delta^{l+1})\odot\sigma'(z^l) \\ \frac{\partial C}{\partial b_j^l}&=&\delta_j^l \\
\frac{\partial C}{\partial w_{jk}^l}&=&a_k^{l-1}\delta_j^l
\end{eqnarray}$$

where $C$ is the cost function, ${a^l_k}$ is the activation of the ${k_{th}}$ neuron in the ${l_{th}}$ layer, ${w^l}$ and ${b^l}$ are the weights and the bias vector connecting to the ${l_{th}}$ layer, and $z^l$ is the weighted input. Thus ${a^l}$ can be written as $\sigma(z^l)$ where $\sigma$ is the activation function. ${\delta^l}$ as the vector error in layer $l$ and ${\delta^L}$ as the vector error for the output layer.  
Credit: [Neural Networks and Deep Learning, Ch. 2](http://neuralnetworksanddeeplearning.com/chap2.html)

#### Be mindful of element-wise product $\odot$ vs. dot product

In [2]:
# element wise product 
a = np.arange(8)
print(a)
a*a

[0 1 2 3 4 5 6 7]


array([ 0,  1,  4,  9, 16, 25, 36, 49])

In [3]:
# dot product
a.dot(a)

140

In [4]:
class NN():
    def __init__(self,architecture,learning_rate=0.01,activation=lambda x:x):
        '''This is a fully connected NN. The architecture is a list, 
        with each element specifying the number of nodes in each layer'''
        self.arch=architecture
        self.activation=activation # your activation function
        self.lr=learning_rate
        self.initialized=False
        # add more attributes if needed
        
    def init_weights(self):
        self.weights=[]
        self.biases=[]
        for n in range(sum(self.arch[:2])*2):
            # use random number to initial weights here
            self.weights.append(np.random.random())
            if(n % 4 == 0):
                self.biases.append(np.random.random())
        self.weights_12 = np.array(self.weights[:12]).reshape((6,2))
        self.weights_23 = np.array(self.weights[12:]).reshape((2,2))
        self.biases = np.array(self.biases).reshape((2,2))
        self.initialized=True
        return self.weights,self.biases
        
    def feed_forward(self,x):
        if self.initialized:
            # define necessary variables
            # you may want to keep a record of values for each layer
            # loop over forward steps
            self.inputs = x
            inputs = np.array(x)
            self.z1 = inputs.dot(self.weights_12) + self.biases[0,:]
            self.act_2nd_layer = self.activation(self.z1)
            self.z2 = self.act_2nd_layer.dot(self.weights_23) + self.biases[1,:]
            self.act_output = self.activation(self.z2)
            return self.act_output
        else:
            print("Please initialize the weights first!")
        
    
    def back_prop(self):
        # Using back-propagation, update weights with a learning parameter
        dc_outerb = self.error_output
        dc_hiddenb = self.error_hidden
        dc_wouter = np.vstack((self.act_2nd_layer, self.act_2nd_layer)).T * self.error_output
        dc_whidden = np.vstack((self.inputs, self.inputs)).T * self.error_hidden
        self.biases[1,:] = self.biases[1,:] - ((self.lr) * dc_outerb)
        self.biases[0,:] = self.biases[0, :] - ((self.lr) * dc_hiddenb)
        self.weights_23 = self.weights_23 - ((self.lr) * dc_wouter)
        self.weights_12 = self.weights_12 - ((self.lr) * dc_whidden)
        return self.biases,self.weights_12, self.weights_23
            
    def fit(self,x,y,activation_grad):
        # train with (x, y) using feed-forward back-propagation
        # activation_grad: gradient function for your activation function
        pred_output = self.predict(x)
        print('Predicted Ouput')
        print(pred_output)
        lay_errors = self.layer_errors(activation_grad)
        print('Output & Hidden Layer Errors in That Order')
        print(lay_errors)
        new_weights = self.back_prop()
        print('New Biases, New Weights connecting Input to Hidden, New Weights connecting Hidden to Output')
        print(new_weights)
        
    def predict(self,x):
        return self.feed_forward(x)
    
    def layer_errors(self, activation_grad):
        grad_cost = np.array([(-1 - self.act_output[0]), (-1 - self.act_output[1])])
        self.error_output = grad_cost * activation_grad(self.z2)
        self.error_hidden = (self.weights_23.T.dot(self.error_output)) * (activation_grad(self.z1))
        return self.error_output, self.error_hidden
        

In [5]:
def tanh(x):
    return np.tanh(x)

def tanh_grad(x):
    return 1-np.tanh(x)**2

nn=NN([6,2,2], activation=tanh)
nn.init_weights()
nn.predict([-1,1,-1,-1,1,-1])
nn.fit([-1,1,-1,-1,1,-1], [-1,-1], tanh_grad)

Predicted Ouput
[-0.40806365 -0.73304806]
Output & Hidden Layer Errors in That Order
(array([-0.49336951, -0.12350279]), array([-0.25441521, -0.30135118]))
New Biases, New Weights connecting Input to Hidden, New Weights connecting Hidden to Output
(array([[0.61982275, 0.07659692],
       [0.65819187, 0.04793348]]), array([[0.54903414, 0.0373644 ],
       [0.89000455, 0.40458017],
       [0.49118796, 0.72942045],
       [0.38447091, 0.31411679],
       [0.01492478, 0.74039311],
       [0.96016532, 0.7592653 ]]), array([[0.78895717, 0.67050514],
       [0.93149726, 0.89964125]]))


## 2a:

In [6]:
nn=NN([6,2,2], activation=tanh)
weight, bias = nn.init_weights()
print(len(weight))
weight

16


[0.2132003908166632,
 0.23435013712904007,
 0.10002727308763193,
 0.16498758438894945,
 0.3230570998725264,
 0.8515414593015405,
 0.14317349923896794,
 0.5679163559208277,
 0.49357463484088704,
 0.011513949360452913,
 0.6595961610778186,
 0.37019428703675916,
 0.44767204931209714,
 0.8642870386850569,
 0.3354986300941968,
 0.019868237932705868]

In [7]:
bias

array([[0.74384522, 0.4832384 ],
       [0.21670742, 0.52172496]])

## 2b:

In [8]:
temp = nn.predict([-1,1,-1,-1,1,-1])
temp

array([-0.07819903,  0.4644163 ])

I am getting different outputs for the same input of [-1,1,-1,-1,1,-1] and I am assuming that is due to the use of random weights and random biases. One of the outputs that I got was [-0.22, -0.44] and given that both of the values of the output is negative, I am designating its corresponding secondary structure as a coil. I chose coil since its predicted output is (-1,-1), so both values of the output are negative just like the example output that I obtained.

## 2c:

$C = 1/2 * ∑_j(y_j−a^L_j)^2$


$ ∂C/∂a^L_j=(a^L_j−y_j) $

In [9]:
nn.layer_errors(tanh_grad)

(array([-0.91616407, -1.14856733]), array([-0.79548183, -0.18753859]))

## 2d:

In [10]:
# Function updates weights & biases
nn.back_prop()

(array([[0.75180004, 0.48511378],
        [0.22586906, 0.53321063]]),
 array([[0.20524557, 0.23247475],
        [0.10798209, 0.16686297],
        [0.31510228, 0.84966607],
        [0.13521868, 0.56604097],
        [0.50152945, 0.01338934],
        [0.65164134, 0.3683189 ]]),
 array([[0.44765757, 0.86426889],
        [0.32746041, 0.00979096]]))

In [11]:
# Clearly new weights were successfully updated for connection of input to hidden layer
nn.weights_12

array([[0.20524557, 0.23247475],
       [0.10798209, 0.16686297],
       [0.31510228, 0.84966607],
       [0.13521868, 0.56604097],
       [0.50152945, 0.01338934],
       [0.65164134, 0.3683189 ]])

In [12]:
# Clearly new weights were successfully updated for connection of hidden to output
nn.weights_23

array([[0.44765757, 0.86426889],
       [0.32746041, 0.00979096]])

In [13]:
# Clearly new biases were successfully updated as well
nn.biases

array([[0.75180004, 0.48511378],
       [0.22586906, 0.53321063]])

In [14]:
# new Run
nn.fit([-1,1,-1,-1,1,-1], [-1,-1], tanh_grad)

Predicted Ouput
[-0.03622421  0.51636087]
Output & Hidden Layer Errors in That Order
(array([-0.96251113, -1.11205577]), array([-0.79270695, -0.19852233]))
New Biases, New Weights connecting Input to Hidden, New Weights connecting Hidden to Output
(array([[0.75972711, 0.48709901],
       [0.23549417, 0.54433119]]), array([[0.1973185 , 0.23048953],
       [0.11590916, 0.16884819],
       [0.30717521, 0.84768085],
       [0.12729161, 0.56405575],
       [0.50945652, 0.01537456],
       [0.64371427, 0.36633368]]), array([[4.48177820e-01, 8.64869968e-01],
       [3.19044973e-01, 6.80260669e-05]]))
