In [15]:
import numpy as np
class Network(object):
    def __init__(self):
        self.inputLayerSize = 2
        self.outputLayerSize = 1
        self.hiddenLayerSize = 3
        self.W1 = np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
        self.W2 = np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
        self.Lambda = .0001
        
    def forward(self, hours):
        self.hours = hours
        self.z2 = np.dot(hours, self.W1)
        self.a2 = self.tanh(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        self.yHat = self.tanh(self.z3) * 100
        return self.yHat
    
    def backward(self, known, L):
        dL_dyHat = -(known - self.yHat)
        dy_dz3 = self.dtanh(self.z3) * 100
        dL_dz3 = dy_dz3 * dL_dyHat
        self.dL_dW2 = np.dot(self.a2.T, dL_dz3) + self.Lambda*self.W2
        dL_da2 = np.dot(dL_dz3, self.W2.T)
        da2_dz2 = self.dtanh(self.z2)
        dL_dz2 = da2_dz2 * dL_da2
        self.dL_dW1 = np.dot(self.hours.T, dL_dz2) + self.Lambda*self.W1
        return self.dL_dW1, self.dL_dW2
        
    def sigmoid(self, z):
        return 1/(1+np.exp(-z))
    
    def dsigmoid(self, z):
        return (1/(np.square(1+np.exp(-z)))*(-np.exp(-z)))
    
    def tanh(self, z):
        return np.tanh(z)
    
    def dtanh(self, z):
        return 1 - np.square(self.tanh(z))
    
    def dmatrix(self, z, W):
        ans = np.ones(z.shape[0], z.shape[1])
        return ans * W.T
    
    def getParams(self):
        return np.concatenate((self.W1.ravel(), self.W2.ravel()))
    
    def setParams(self, params):
        W1_start = 0
        W1_end = self.inputLayerSize * self.hiddenLayerSize
        self.W1 = np.reshape(params[W1_start:W1_end], (self.inputLayerSize, self.hiddenLayerSize))
        self.W2 = np.reshape(params[W1_end:], (self.hiddenLayerSize, self.outputLayerSize))
    
    def computeNumericalGrad(self, X, y):
        paramsInitial = self.getParams()
        perturb = np.zeros(paramsInitial.shape)
        numgrad = np.zeros(paramsInitial.shape)
        e = .0001
        for i in range(len(paramsInitial)):
            perturb[i] = e
            self.setParams(paramsInitial + perturb)
            loss1 = self.cost(self.forward(X), y)
            self.setParams(paramsInitial - perturb)
            loss2 = self.cost(self.forward(X), y)
            numgrad[i] = np.sum((loss1 - loss2)/(2*e))
            perturb[i] = 0
        self.setParams(paramsInitial)
        return numgrad
        
    def cost(self, y, yHat):
        return np.sum(.5 * np.square(yHat - y)) + (self.Lambda/2)*(np.sum(self.W1**2) + np.sum(self.W2**2))
    
    def computeGradients(self, X, y):
        dJdW1, dJdW2 = self.backward(y, self.cost(self.forward(X), y))
        return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))

In [16]:
net = Network()
hours = np.array(([3,5], [5,1], [10,2], [6,1.5]))
scores = np.array(([75], [82], [93], [70]))
L = net.forward(hours)
print L

[[-22.95100843]
 [ 61.52707749]
 [ 41.37534713]
 [ 79.2698273 ]]


In [17]:
L = net.cost(L, scores)
print net.backward(scores, L)

(array([[  1.83216282e+04,   3.09386946e-01,  -5.74563994e+02],
       [  3.56341770e+03,   3.47185105e-01,  -5.20356629e+02]]), array([[ 5285.96577458],
       [-4072.65209661],
       [ 3994.70160604]]))


In [18]:
print net.computeNumericalGrad(hours, scores)

[  1.83216243e+04   3.09386978e-01  -5.74564107e+02   3.56341768e+03
   3.47185160e-01  -5.20356702e+02   5.28596571e+03  -4.07265199e+03
   3.99470151e+03]


In [19]:
from scipy import optimize

In [22]:
class trainer(object):
    def __init__(self, NN):
        self.NN = NN
        
    def costWrapper(self, params, X, y):
        self.NN.setParams(params)
        cost = self.NN.cost(self.NN.forward(X), y)
        grad = self.NN.computeGradients(X, y)
        print cost, grad
        print params
        return cost, grad
    
    def callBack(self, params):
        self.NN.setParams(params)
        self.J.append(self.NN.cost(self.NN.forward(self.X), self.y))
    
    def train(self, X, y): 
        self.X = X
        self.y = y
        self.J = []
        params = self.NN.getParams()
        options = {'maxiter':200, 'disp': True}
        magic = optimize.minimize(self.costWrapper, params, jac=True, method='BFGS', args=(X,y), \
                                  options=options, callback=self.callBack)
        self.X = magic.x
        self.optimization = magic

In [23]:
training = trainer(net)
training.train(hours, scores)
print net.forward(hours)
print scores

0.00356724401079 [-0.80517472 -0.01694018 -1.54560817 -0.13022794 -0.1123198  -2.57741657
 -0.23826484 -0.32256218  0.00360125]
[-0.17769375 -0.62744504  6.22815859  0.60660152  0.81406    -3.64923343
 -2.29737294  2.19099581  2.67532502]
0.00356724401079 [-0.80517472 -0.01694018 -1.54560817 -0.13022794 -0.1123198  -2.57741657
 -0.23826484 -0.32256218  0.00360125]
[-0.17769375 -0.62744504  6.22815859  0.60660152  0.81406    -3.64923343
 -2.29737294  2.19099581  2.67532502]
45143.7666273 [  2.73753731e+04  -3.10155349e+03   7.85425630e-03   5.80066588e+03
  -6.56511140e+02   1.16875225e-02  -7.53077434e+03   8.72443033e+03
  -8.77009227e+03]
[ 0.08114626 -0.62199927  6.72502619  0.64846598  0.85016752 -2.82066977
 -2.2207778   2.29469006  2.67416732]
0.00355527405975 [ 1.36031275  0.0249859   0.45667541  0.30316341  0.03629322  0.75972274
  0.09319353  0.13886023 -0.04305575]
[-0.17769053 -0.62744497  6.22816476  0.60660204  0.81406045 -3.64922315
 -2.29737199  2.19099709  2.675325  ]
0