In [1]:
import numpy as np
class Network(object):
    def __init__(self):
        self.inputLayerSize = 2
        self.outputLayerSize = 1
        self.hiddenLayerSize = 3
        self.W1 = np.absolute(np.random.randn(self.inputLayerSize, self.hiddenLayerSize))
        self.W2 = np.absolute(np.random.randn(self.hiddenLayerSize, self.outputLayerSize))
        self.Lambda = .0001
        
    def forward(self, hours):
        self.hours = hours
        self.z2 = np.dot(hours, self.W1)
        self.a2 = self.ReLU(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        self.yHat = self.ReLU(self.z3)
        return self.yHat
    
    def backward(self, known, L):
        dL_dyHat = -(known - self.yHat)
        dy_dz3 = np.square(self.dReLU(self.z3))
        dL_dz3 = dy_dz3 * dL_dyHat
        self.dL_dW2 = np.dot(self.a2.T, dL_dz3) + self.Lambda*self.W2
        dL_da2 = np.dot(dL_dz3, self.W2.T)
        da2_dz2 = self.dReLU(self.z2)
        dL_dz2 = da2_dz2 * dL_da2
        self.dL_dW1 = np.dot(self.hours.T, dL_dz2) + self.Lambda*self.W1
        return self.dL_dW1, self.dL_dW2
        
    def sigmoid(self, z):
        return 1/(1+np.exp(-z))
    
    def dsigmoid(self, z):
        return (1/(np.square(1+np.exp(-z)))*(-np.exp(-z)))
    
    def tanh(self, z):
        return np.tanh(z)
    
    def dtanh(self, z):
        return 1 - np.square(self.tanh(z))
    
    def ReLU(self, z):
        return np.maximum(np.array([0]), z)
    
    def dReLU(self, z):
        return np.where(z > np.array([0]), np.ones(z.shape), np.zeros(z.shape))
    
    def dmatrix(self, z, W):
        ans = np.ones(z.shape[0], z.shape[1])
        return ans * W.T
    
    def getParams(self):
        return np.concatenate((self.W1.ravel(), self.W2.ravel()))
    
    def setParams(self, params):
        W1_start = 0
        W1_end = self.inputLayerSize * self.hiddenLayerSize
        self.W1 = np.reshape(params[W1_start:W1_end], (self.inputLayerSize, self.hiddenLayerSize))
        self.W2 = np.reshape(params[W1_end:], (self.hiddenLayerSize, self.outputLayerSize))
    
    def computeNumericalGrad(self, X, y):
        paramsInitial = self.getParams()
        perturb = np.zeros(paramsInitial.shape)
        numgrad = np.zeros(paramsInitial.shape)
        e = .0001
        for i in range(len(paramsInitial)):
            perturb[i] = e
            self.setParams(paramsInitial + perturb)
            loss1 = self.cost(self.forward(X), y)
            self.setParams(paramsInitial - perturb)
            loss2 = self.cost(self.forward(X), y)
            numgrad[i] = np.sum((loss1 - loss2)/(2*e))
            perturb[i] = 0
        self.setParams(paramsInitial)
        return numgrad
        
    def cost(self, y, yHat):
        return np.sum(.5 * np.square(yHat - y)) + (self.Lambda/2)*(np.sum(self.W1**2) + np.sum(self.W2**2))
        
    def costFunction(self, X, y):
        yHat = self.forward(X)
        return 0.5 * np.sum(np.square((y - yHat)))/X.shape[0] + (self.Lambda/2)*(np.sum(self.W1**2)+np.sum(self.W2**2))
    
    def computeGradients(self, X, y):
        dJdW1, dJdW2 = self.backward(y, self.cost(y, self.forward(X)))
        return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))

In [2]:
net = Network()
hours = np.array(([3,5], [5,1], [10,2], [6,1.5], [7, 2]))
scores = np.array(([75], [82], [93], [70], [74]))

In [3]:
L = net.costFunction(hours, scores)
print net.backward(scores, L)

(array([[-2108.59706236,  -875.87804399,  -207.75665087],
       [ -744.06891775,  -309.07452158,   -73.3118423 ]]), array([[-2795.57844801],
       [-3499.2270925 ],
       [ -717.43311975]]))


In [4]:
print net.computeNumericalGrad(hours, scores)

[-2108.59706236  -875.87804399  -207.75665086  -744.06891776  -309.07452158
   -73.31184229 -2795.57844799 -3499.22709252  -717.43311975]


In [5]:
from scipy import optimize

In [6]:
class trainer(object):
    def __init__(self, NN):
        self.NN = NN
        
    def costWrapper(self, params, X, y):
        self.NN.setParams(params)
        cost = self.NN.costFunction(X, y)
        grad = self.NN.computeGradients(X, y)
        print cost, grad
        print params
        return cost, grad
    
    def callBackF(self, params):
        self.NN.setParams(params)
        self.J.append(self.NN.cost(self.NN.forward(self.X), self.y))
    
    def train(self, X, y): 
        self.X = X
        self.y = y
        self.J = []
        params0 = self.NN.getParams()
        options = {'maxiter':10000, 'disp': True}
        magic = optimize.minimize(self.costWrapper, params0, jac=True, method='BFGS', args=(X,y), \
                                  options=options)
        self.NN.setParams(magic.x)
        self.optimization = magic

In [7]:
training = trainer(net)
training.train(hours, scores)

2162.09457443 [-2108.59706236  -875.87804399  -207.75665087  -744.06891775  -309.07452158
   -73.3118423  -2795.57844801 -3499.2270925   -717.43311975]
[ 0.75009112  1.31616073  0.06878586  1.70232129  1.06165763  0.78745041
  1.01885337  0.42321571  0.10038597]
2162.09457443 [-2108.59706236  -875.87804399  -207.75665087  -744.06891775  -309.07452158
   -73.3118423  -2795.57844801 -3499.2270925   -717.43311975]
[ 0.75009112  1.31616073  0.06878586  1.70232129  1.06165763  0.78745041
  1.01885337  0.42321571  0.10038597]
1135.16144528 [-2292.86502419 -1623.58800785  -352.79449011  -812.19400469  -575.11825984
  -124.96917282 -2660.23608215 -2758.18908885  -575.72883321]
[ 1.16390972  1.48805447  0.10955873  1.84834709  1.12231445  0.80183808
  1.56749417  1.1099497   0.24118441]
3382.57279804 [  9996.21811984  10248.18771568   2137.32504659   3243.84033406
   3325.60608103    693.57646625   9588.28015874   6957.82048168
   1465.47239965]
[ 2.81918414  2.17562942  0.27265024  2.43245026 

In [8]:
print net.forward(hours)
print scores

[[  74.44606241]
 [  52.89075896]
 [ 105.78151792]
 [  66.38106578]
 [  79.8713726 ]]
[[75]
 [82]
 [93]
 [70]
 [74]]
