## Summary

### function

In [112]:
import numpy as np

def sigmoid(x):
    return 1/(1+np.exp(-x))

def relu(x):
    return np.maximum(0,x)

def softmax(x):
    c = np.max(x,axis=1).reshape(-1,1)
    x = x-c
    return np.exp(x)/np.sum(np.exp(x),axis=1).reshape(-1,1)

def categorical_crossentropy(y,t):
    return np.mean(-t*np.log(y))

def make_one(x):
    result = np.zeros((x.size, np.unique(x).size))
    for idx1,idx2 in enumerate(x):
        result[idx1,idx2] = 1
    return result

### Relu, Sigmoid, Affine, Loss

In [113]:
class Relu:
    def __init__(self):
        self.mask = None    
        
    def forward(self,x):
        self.mask = (x <=0)
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self,dout):
        dout[self.mask] = 0
        dx = dout
        return dx
    
class Sigmoid:
    def __init__(self):
        self.out = None
    
    def forward(self,x):
        out = sigmoid(x)
        self.out = out
        return self.out
    
    def backward(self,dout):
        dx = ((1-self.out)*self.out)*dout
        return dx
    
class Affine:
    def __init__(self,W,b):
        self.W = W
        self.b = b
        self.x = None
        self.origin_shape = None
        self.dW = None
        self.db = None
    
    def forward(self,x):
        self.origin_shape = x.shape
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out
    
    def backward(self,dout):
        dx = np.dot(dout, self.W.T) 
        self.dW = np.dot(self.x.T, dout) 
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(self.origin_shape)
        return dx

class Loss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None
    
    def forward(self,y,t):
        self.y = softmax(y)
        self.t = t    
        self.loss = categorical_crossentropy(self.y, self.t)
        return self.loss
    
    def backward(self,dout=1):
        dx = (self.y - self.t)*dout
        return dx

### Network => Layers

In [114]:
class Layers:
    def __init__(self):
        self.layers = {}
    
    def add(self,x1,x2,activation):
        activation_dict = {
            "sigmoid" : Sigmoid,
            "relu" : Relu,
            "softmax" : Loss,
        }
        W = np.random.randn(x1,x2)
        b = np.zeros(x2)
        Affine_layer = "Affine"+str(int(len(self.layers)/2+1))
        activation_layer = "activation"+str(int(len(self.layers)/2+1))
        self.layers[Affine_layer] = Affine(W,b)
        self.layers[activation_layer] = activation_dict[activation]()

    def predict(self,x):
        out = x.copy()
        ind = 1
        layer_len = len(self.layers)
        for key,layer in self.layers.items():
            if ind < layer_len:
                out = layer.forward(out)
            ind += 1
        return out
    
    def loss(self,x,t):
        y = self.predict(x)
        out = list(self.layers.values())[-1].forward(y,t)
        return out
    
    def accuracy(self,x,t):
        y = self.predict(x)
        y = np.argmax(y,axis=1)
        t = np.argmax(t,axis=1)
        self.acc = np.sum(y==t)/t.size
        return self.acc
    
    def gradient(self,x,t):
        lr = 1e-4
        self.loss(x,t)
        dout = 1
        dout = list(self.layers.values())[-1].backward(dout)
        layers = list(self.layers.values())[::-1][1:]
        self.layers_key = list(self.layers.keys())[::-1][1:]
        for layer in layers:
            dout = layer.backward(dout)
        self.grad = {}
        for layer_key in self.layers_key:
            if "Affine" in layer_key:
                self.grad[layer_key] = [self.layers[layer_key].dW, self.layers[layer_key].db]
        return self.grad
    
    def new_gradient(self,x,t):
        self.loss(x,t)
        
        dout = 1
        dout = list(self.layers.values())[-1].backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        
        grads = {
            "W3":self.layers["Affine3"].dW, "b3":self.layers["Affine3"].db,
            "W2":self.layers["Affine2"].dW, "b2":self.layers["Affine2"].db,
            "W1":self.layers["Affine1"].dW, "b1":self.layers["Affine1"].db,
        }
        return grads
    
    def fit(self,x,t,epochs,lr):
        self.lr = lr
        self.history = {}
        loss = []
        accuracy = []
        for epoch in range(epochs):
            self.gradient(x,t)
            loss.append(self.err)
            accuracy.append(self.accuracy(x,t))
            if epoch % 100 == 0:
                print(f'loss : {self.err} === accuracy : {self.accuracy(x,t)}')
        self.history["loss"] = loss
        self.history["accuracy"] = accuracy

In [115]:
network = Layers()

In [116]:
network.add(4,10,"relu")

In [117]:
network.add(10,3,"sigmoid")

In [118]:
network.add(3,4,"softmax")

In [119]:
x = np.random.randn(100,4)

In [120]:
network.predict(x).shape

(100, 4)

In [121]:
t = make_one(np.random.randint(0,4,100))
t.shape

(100, 4)

In [122]:
network.loss(x,t)

0.40467358583349095

In [123]:
network.gradient(x,t)

{'Affine3': [array([[ 1.24836301,  2.14520321, -2.50675793, -0.88680829],
         [ 7.67789456,  5.19293005, -3.65621699, -9.21460762],
         [ 4.67921647, -0.15506991, -3.71965605, -0.80449052]]),
  array([ 9.12430472,  5.38975319, -7.53032953, -6.98372837])],
 'Affine2': [array([[ 3.7012035 , -0.36348058,  5.07028725],
         [ 0.36722149, -0.74703103,  2.32579649],
         [ 0.28212938,  1.02209394,  0.84033867],
         [-0.02259275,  0.04867313,  0.85583793],
         [ 3.97318248,  0.20622796,  3.88408897],
         [-1.14749635,  1.20900891,  0.68091603],
         [ 1.62951106, -0.61567642,  0.53145892],
         [ 1.81605387, -0.30284671,  0.47666206],
         [-0.78855665,  0.58021444,  0.30388771],
         [ 1.45214726,  0.02245651,  2.52236848]]),
  array([ 0.59311688, -0.2862043 ,  2.61605603])],
 'Affine1': [array([[ 3.07359022,  0.29626232,  2.67754134, -0.35518682, -2.54246433,
           1.98977001, -0.39980398, -0.42157326, -0.19311129, -0.8973613 ],
        

In [124]:
network.new_gradient(x,t)

{'W3': array([[ 3.65772233,  4.27581985,  4.5304669 ,  3.58067933],
        [ 9.54577191,  9.61609326,  9.26202457, 11.81582374],
        [ 4.15743213,  4.82219674,  5.15468986,  3.9959718 ]]),
 'b3': array([20.27674049, 20.76570176, 20.57938721, 20.8583902 ]),
 'W2': array([[ 3.63705958,  2.45760934, -9.1037414 ],
        [ 2.31442459,  4.28550013, -4.62948269],
        [ 4.57372672,  3.57222659, -2.58805521],
        [ 1.20034006,  4.01938163, -3.67136722],
        [ 4.17259345,  2.87840938, -6.66873721],
        [ 3.91545214,  4.88500862, -6.8470903 ],
        [ 5.53171182,  4.61161421, -1.93004835],
        [ 3.14797777,  3.07564576, -1.7969338 ],
        [ 5.45938141,  1.68882714, -4.87480852],
        [ 4.50809114,  2.85249517, -8.49722189]]),
 'b2': array([ 4.5646162 ,  5.33863465, -7.29669821]),
 'W1': array([[ 1.50438558, -3.98478846, -3.41006493,  2.35991827, -0.28144333,
         -3.33526662,  2.08310243, -1.21508028,  1.24867354,  1.54611593],
        [ 1.46193766,  1.16340

In [125]:
print(network.new_gradient(x,t)["W3"].shape)
print(network.new_gradient(x,t)["b3"].shape)
print(network.new_gradient(x,t)["W2"].shape)
print(network.new_gradient(x,t)["b2"].shape)
print(network.new_gradient(x,t)["W1"].shape)
print(network.new_gradient(x,t)["b1"].shape)

(3, 4)
(4,)
(10, 3)
(3,)
(4, 10)
(10,)


In [126]:
# lr = 1e-4
# network.layers["Affine3"].dW -= lr*network.gradient(x,t)["Affine3"][0]
# network.layers["Affine3"].db -= lr*network.gradient(x,t)["Affine3"][1]
# network.layers["Affine2"].dW -= lr*network.gradient(x,t)["Affine2"][0]
# network.layers["Affine2"].db -= lr*network.gradient(x,t)["Affine2"][1]
# network.layers["Affine1"].dW -= lr*network.gradient(x,t)["Affine1"][0]
# network.layers["Affine1"].db -= lr*network.gradient(x,t)["Affine1"][1]