In [269]:
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
import matplotlib.pyplot as plt

# layers
class ReLU:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout

        return dout
    
class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        
        self.x = None
        self.original_x_shape = None        
        self.dW = None
        self.db = None

    def forward(self, x):
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b

        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        
        dx = dx.reshape(*self.original_x_shape)
        return dx
    
# create data
X = np.linspace(-1, 1, 200)
np.random.shuffle(X)    # randomize the data
Y = -(0.5 * X + 2 + np.random.normal(0, 0.05, (200, )))

# plot data
#plt.scatter(X, Y)
#plt.show()

X_train, Y_train = X[:1], Y[:1]     # first 160 data points
print (X_train)
X_test, Y_test = X[160:], Y[160:]       # last 40 data points

# build a neural network from the 1st layer to the last layer
model = Sequential()

# relu expertment
model.add(Dense(units=2, activation='relu', input_dim=1)) 
model.add(Dense(units=1)) 

# choose loss function and optimizing method
model.compile(loss='mse', optimizer=SGD(lr=0.01))

model.fit(X_train, Y_train, batch_size = 10, epochs = 20)

[-0.70854271]
[[-1.37442088]
 [-0.94376338]]


In [318]:
W1, b1 = model.layers[0].get_weights()
W2, b2 = model.layers[1].get_weights()
affine1 = Affine(W1, b1)
relu = ReLU()
affine2 = Affine(W2, b2)

# forwoard

for i in range(3):
    print ('No. ' + str(i) + ' x data')
    X_train = X[[i]] 
    affine_f1 = affine1.forward(X_train)
    relu_f = relu.forward(affine_f1)
    affine_f2 = affine2.forward(relu_f)

    Y_train = Y_train.reshape(1, 1)

    # backwoard
    dy = Y_train - affine_f2
    affine_b2 = affine2.backward(dy)
    relu_b = relu.backward(affine_b2)
    affine_b1 = affine1.backward(relu_b)

    print ('Layer1 W : ', affine1.W)
    print ('Layer1 dW : ', affine1.dW)
    print ('relu_f : ', relu_f)
    
# 一直以為這樣ReLU會造成weight=0，那這樣x都乘上0不就都沒辦法訓練，
# 但觀察每筆x data的dW結果，當dW=0時頂多不會update W，並不會影響下一筆x data training
# 重點在Backforward後，ReLU是讓dW=0不是W=0

No. 0 x data
('Layer1 W : ', array([[ 0.44152594,  1.33729184]], dtype=float32))
('Layer1 dW : ', array([[ 0.,  0.]]))
('relu_f : ', array([[ 0.,  0.]]))
No. 1 x data
('Layer1 W : ', array([[ 0.44152594,  1.33729184]], dtype=float32))
('Layer1 dW : ', array([[ 0.31580914,  0.21685432]]))
('relu_f : ', array([[ 0.07765532,  0.23520208]]))
No. 2 x data
('Layer1 W : ', array([[ 0.44152594,  1.33729184]], dtype=float32))
('Layer1 dW : ', array([[ 0.,  0.]]))
('relu_f : ', array([[ 0.,  0.]]))
