# 练习4

训练浅层NN解决XOR问题

训练数据:

{0,0,1, 0}
{0,1,1, 1}
{1,0,1, 1}
{1,1,1, 0}


In [1]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)

X_train = np.array([[0,0,1],[0,1,1], [1,0,1], [1,1,1]])

Y_train = np.array([[0],[1],[1],[0]])


class NN:
    def __init__(self, in_hidden_out:list, if_bias=False):
        self.input_size = in_hidden_out[0]
        self.output_size = in_hidden_out[-1]
        self.hidden_layers = len(in_hidden_out) - 2
        self.in_hidden_out = in_hidden_out
        self.if_bias = if_bias
        self.weights = []
        self.biases = []
        self.loss = []
        self.activations = []

        # Initialize weights and biases
        self.initialize_weights_and_biases()

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        return x * (1 - x)

    def initialize_weights_and_biases(self):
        for i in range(self.hidden_layers + 1):
            self.weights.append(np.random.rand(self.in_hidden_out[i], self.in_hidden_out[i+1]) * 2 -1)
            if self.if_bias:
                self.biases.append(np.random.rand((1, self.in_hidden_out[i+1]))*2 -1)
            else:
                self.biases.append(np.zeros((1, self.in_hidden_out[i+1])))

    def forward(self, X):
        self.activations = [X]
        for i in range(self.hidden_layers + 1):
            z = np.dot(self.activations[-1], self.weights[i]) + self.biases[i]
            a = self.sigmoid(z)
            self.activations.append(a)
        return self.activations[-1]

    def backward(self, X, y, learning_rate):
        _ = self.forward(X)   
        error = y - self.activations[-1]
        self.loss.append(np.mean(error ** 2))

        delta = error * self.sigmoid_derivative(self.activations[-1])
        self.weights[-1] += learning_rate * np.dot(self.activations[-2].T, delta)
        if self.if_bias:
            self.biases[-1] += learning_rate * np.sum(delta, axis=0, keepdims=True)

        for i in range(self.hidden_layers, 0, -1):
            delta = np.dot(delta, self.weights[i].T) * self.sigmoid_derivative(self.activations[i])
            self.weights[i-1] += learning_rate * np.dot(self.activations[i-1].T, delta)
            if self.if_bias:
                self.biases[i-1] += learning_rate * np.sum(delta, axis=0, keepdims=True)

    def train(self, X, y, epochs, learning_rate):
        for _ in range(epochs):
            self.backward(X, y, learning_rate)




In [36]:
NN_model_1 = NN([3,4,1])


print(NN_model_1.forward(X_train))

# print(NN_model_1.activations)
# print((NN_model_1.weights[-2]))

NN_model_1.train(X_train, Y_train, epochs=2000, learning_rate=0.9)

print(NN_model_1.forward(X_train))

[[0.35151934]
 [0.350779  ]
 [0.2994554 ]
 [0.30222388]]
[[0.01977956]
 [0.9671274 ]
 [0.97083262]
 [0.03572509]]


从训练的结果来看，模型的输出结果已经开始收敛到目标值 Y_train，已经达到较好效果。

# 练习5

尝试改变隐层节点个数(3、5、2?)，观察能否解决XOR问题？如何避免不收敛？

In [40]:
for i in [3,5,2]:
    NN_model = NN([3,i,1])
    NN_model.train(X_train, Y_train, epochs=2000, learning_rate=0.9)
    print("=== | hidden layer size={} | ===".format(i))
    print(NN_model.forward(X_train))

=== | hidden layer size=3 | ===
[[0.01455553]
 [0.95400736]
 [0.95551864]
 [0.05386896]]
=== | hidden layer size=5 | ===
[[0.0276844 ]
 [0.97142323]
 [0.975299  ]
 [0.02476168]]
=== | hidden layer size=2 | ===
[[0.02822207]
 [0.96421735]
 [0.49864679]
 [0.50209081]]


从结果来看，当Hidden layer size 过小时，模型无法解决XOR问题，避免问题的一个方法为设计更深、更宽的网络，解决更复杂的问题。

# 练习6

用动量算法训练浅层NN求解XOR问题

In [2]:
class NN_momentum:
    def __init__(self, in_hidden_out:list, if_bias=False, momentum=0.9):
        self.input_size = in_hidden_out[0]
        self.output_size = in_hidden_out[-1]
        self.hidden_layers = len(in_hidden_out) - 2
        self.in_hidden_out = in_hidden_out
        self.if_bias = if_bias
        self.weights = []
        self.biases = []
        self.loss = []
        self.activations = []
        self.momentum = momentum

        self.velocity_weights = []
        self.velocity_biases = []


        # Initialize weights and biases
        self.initialize_weights_and_biases()

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        return x * (1 - x)

    def initialize_weights_and_biases(self):
        for i in range(self.hidden_layers + 1):
            self.weights.append(np.random.rand(self.in_hidden_out[i], self.in_hidden_out[i+1]) * 2 -1)
            if self.if_bias:
                self.biases.append(np.random.rand((1, self.in_hidden_out[i+1]))*2 -1)
            else:
                self.biases.append(np.zeros((1, self.in_hidden_out[i+1])))

    def forward(self, X):
        self.activations = [X]
        for i in range(self.hidden_layers + 1):
            z = np.dot(self.activations[-1], self.weights[i]) + self.biases[i]
            a = self.sigmoid(z)
            self.activations.append(a)
        return self.activations[-1]

    def backward(self, X, y, learning_rate):
        _ = self.forward(X)
        error = y - self.activations[-1]
        self.loss.append(np.mean(error ** 2))

        delta = error * self.sigmoid_derivative(self.activations[-1])

        # Update the velocities
        updated_velocity_weights = [np.zeros_like(w) for w in self.weights]
        updated_velocity_biases = [np.zeros_like(b) for b in self.biases]

        # Update output layer weights and biases with momentum
        updated_velocity_weights[-1] = (self.momentum * updated_velocity_weights[-1]) + (learning_rate * np.dot(self.activations[-2].T, delta))
        self.weights[-1] += updated_velocity_weights[-1]
        if self.if_bias:
            updated_velocity_biases[-1] = (self.momentum * updated_velocity_biases[-1]) + (learning_rate * np.sum(delta, axis=0, keepdims=True))
            self.biases[-1] += updated_velocity_biases[-1]

        # Update hidden layers weights and biases with momentum
        for i in range(self.hidden_layers, 0, -1):
            delta = np.dot(delta, self.weights[i].T) * self.sigmoid_derivative(self.activations[i])
            updated_velocity_weights[i-1] = (self.momentum * updated_velocity_weights[i-1]) + (learning_rate * np.dot(self.activations[i-1].T, delta))
            self.weights[i-1] += updated_velocity_weights[i-1]
            if self.if_bias:
                updated_velocity_biases[i-1] = (self.momentum * updated_velocity_biases[i-1]) + (learning_rate * np.sum(delta, axis=0, keepdims=True))
                self.biases[i-1] += updated_velocity_biases[i-1]

        # Update the velocities for the next iteration
        self.velocity_weights = updated_velocity_weights
        self.velocity_biases = updated_velocity_biases


    def train(self, X, y, epochs, learning_rate):
        for _ in range(epochs):
            self.backward(X, y, learning_rate)

In [3]:
NN_model = NN_momentum([3,4,1])

NN_model.train(X_train, Y_train, epochs=2000, learning_rate=0.9)

In [4]:
print(NN_model.forward(X_train))

[[0.01977956]
 [0.9671274 ]
 [0.97083262]
 [0.03572509]]
