In [1]:
import numpy as np
import math
# Helper function!
def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))

def softmax(z):
    return np.exp(z) /np.sum(np.exp(z),axis = 1).reshape(-1,1)


def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return z*(1-z)


In [15]:

class Network(object):
    def __init__(self,size): # (784 , 256, 10) 说是三层layer 其实就两个
        ## size should be a list of int or tuple of int!
        
        self.num_mt = len(size) - 1 # 2 这里我们要取2 就是我们有两个矩阵
        self.size = size  #无聊存一下 
        self.output = [0] * len(size) # output 也是每次的input 一共三个
        self.errorterm = [0] * (len(size)-1)  # 这就是就误差 乘以sigmoid 导数
        self.delta = []
        self.error = None # 就是最后的误差

        self.bias_dict = {'bias_{}'.format(k): self._init_b(size[k-1],size[k]) for k in range(1,len(size))}  
        # One is 256 and two has 10 bias.
        
        self.weights_dict = {'weights_{}'.format(k): self._init_W(size[k-1],size[k]) for k in range(1,len(size))}

    def _init_W(self, n_in, n_out):
        bound = 2.0 / (n_in + n_out)
        W = np.random.uniform(-bound, bound, (n_in, n_out))
        return W
    
    def _init_b(self,n_in,n_out):
        bound = 2.0 / (n_in + n_out)
        b = np.random.uniform(-bound, bound, (1, n_out))
        return b
        
    
    def forward(self,input_): # update output in each layers
        self.output[0] = input_
        for n,(w,b) in enumerate(zip(self.weights_dict.values(),self.bias_dict.values())): # 0,1
            n += 1
            if n == self.num_mt: #last layer ! is 2 就是说到了最后一层就softmax
                input_ = softmax(np.dot(input_,w) + b) # when it comes to the last layers
                
            else:
                input_ = sigmoid(np.dot(input_,w) + b)
            self.output[n] = input_
        return input_

    def loss(self,output,target):
        loss = 0.0
        # target is a array , and output is a matrix.
        self.bs = output.shape[0]
        log_likelihood = -np.log(output[range(self.bs),target])
        loss = np.sum(log_likelihood) / self.bs
        return loss
    
    def backpro(self,target,learnrate = 0.001): # devivertive of softmax is looks like sigmoid !

        self.lr = learnrate
        deltaw = [np.zeros(w.shape) for w in self.weights_dict.values()] # 变化量矩阵 有两个！
        deltab = [np.zeros(b.shape) for b in self.bias_dict.values()]
        y_idx = np.argmax(softmax(self.output[-1]),axis =1)
        y_matrix = np.eye(self.size[-1])[target]
        
        #print(y_idx)
        for i in range(self.num_mt)[::-1]: # we have 2 matrix here 1,0
            #print(self.error)
            self.errorterm[i] = (self.output[-1] - y_matrix)  #errer term softmax and loss dev

            if i == self.num_mt -1 : #== 1
                hidden_error = np.dot(self.weights_dict['weights_{}'.format(i+1)], self.errorterm[i].sum(axis=0))
                #print(hidden_error.shape)
            
            elif i != self.num_mt -1 : # 0
                self.errorterm[i] = hidden_error * sigmoid_prime(self.output[i+1])
                #print(hidden_error.shape)
          
        
        for j in range(self.num_mt): # get delta ~ 0,
            #print(deltaw[j].shape)
            deltaw[j] += np.dot(self.output[j].T,self.errorterm[j])
            deltab[j] += np.sum(self.errorterm[j],axis=0)
            

        
        for k in range(self.num_mt):
            #print(deltab[k].shape)
            self.weights_dict['weights_{}'.format(k+1)] -= self.lr * deltaw[k]/self.bs # update hidden-to-output weights with gradient descent step
            self.bias_dict['bias_{}'.format(k+1)] -= self.lr * deltab[k]/self.bs
    
        return self.weights_dict,self.bias_dict
  
        
        

In [37]:
from sklearn.model_selection import train_test_split
nn = Network([784,256,10])
x_train = np.load('mnist.train.npy')
y_train = np.load('mnist.trainlabel.npy')
x_test = np.load('mnist.test.npy')
x,xt,y,yt = train_test_split(x_train,y_train,test_size = 0.2,random_state = 224)
for e in range(10):
    for i in range(0,x.shape[0],32):
        data,target = x[i:i+32],y[i:i+32]
        data = data.reshape(-1,784)
        output = nn.forward(data)
        loss = nn.loss(output,target)
        if (i+1)%50 == 0:
            print(loss)
        nn.backpro(target,learnrate=0.01)
    
        

In [None]:
(nn.output[-1] - np.eye(nn.size[-1])[[2,5]]).sum(axis=0).shape



In [32]:

x[0:0+100].shape


(100, 28, 28)

In [40]:
output = nn.forward(xt.reshape(-1,784))
sum(np.argmax(softmax(output),axis=1)==yt)/len(yt)

0.8682142857142857

In [41]:
output

array([[7.25569395e-01, 4.56553399e-07, 8.36464449e-04, ...,
        1.90678206e-05, 5.50787785e-03, 1.67895064e-04],
       [1.18179855e-01, 5.81255813e-06, 5.32236834e-02, ...,
        1.58255263e-03, 2.62219985e-03, 2.55653119e-02],
       [7.59665884e-02, 1.62972683e-05, 5.53559847e-04, ...,
        4.39139010e-03, 1.31456531e-02, 1.01101204e-02],
       ...,
       [2.72618597e-05, 6.34925199e-07, 9.70451935e-06, ...,
        4.56800824e-04, 4.07842238e-04, 6.04758843e-02],
       [1.95580593e-04, 9.27223883e-06, 1.56488068e-02, ...,
        4.40424476e-05, 2.82022074e-04, 2.59933588e-03],
       [9.94219941e-05, 1.36568025e-04, 2.51112163e-04, ...,
        1.50123572e-01, 2.22722852e-02, 7.71600609e-01]])