In [1]:
import numpy as np

# ============= 1. Hyperparameters =============
N = 200                # number of training samples; adjust as needed
input_dim = 128        # input-layer width (number of features)
hidden_size = 64       # width of every hidden layer; free to change
num_hidden_layers = 5  # how many hidden layers the network has
output_dim = 1         # single sigmoid output unit, used for binary classification

learning_rate = 0.01
epochs = 2000

# ============= 2. Random data set =============
# X has shape (N, input_dim): features drawn from a standard normal.
X = np.random.standard_normal((N, input_dim))

# y has shape (N, 1): 0/1 labels drawn at random, purely as example data.
y = np.random.randint(low=0, high=2, size=(N, 1))

# ============= 3. Layer layout =============
# The weights and biases of all layers are kept in plain lists, so the only
# thing needed here is the sequence of layer widths:
# [input_dim, hidden_size, ..., hidden_size, output_dim]
layer_dims = [input_dim, *([hidden_size] * num_hidden_layers), output_dim]

# Randomly initialise W and b for every layer.
def init_parameters(layer_dims):
    """
    Build the initial weights and biases of a fully connected network.

    Weights are drawn with Xavier/Glorot normal initialisation: zero mean and
    standard deviation sqrt(2 / (fan_in + fan_out)). A small fixed scale such
    as 0.01 makes every sigmoid unit output roughly 0.5 and shrinks the
    back-propagated gradient at each layer, so a deep sigmoid stack stops
    learning; scaling by the layer widths avoids that collapse at the start
    of training.

    Args:
        layer_dims: layer widths [l0, l1, ..., lL], input layer first.

    Returns:
        (W_list, b_list) where, for k = 0 .. L-1,
          W_list[k] has shape (layer_dims[k], layer_dims[k+1])
          b_list[k] has shape (layer_dims[k+1],) and is all zeros.
        Both lists are empty when layer_dims has fewer than two entries.
    """
    W_list = []
    b_list = []
    # Walk over consecutive (input width, output width) pairs.
    for fan_in, fan_out in zip(layer_dims[:-1], layer_dims[1:]):
        # max(..., 1) only guards the degenerate 0-width case against a
        # division by zero; the resulting array is empty anyway.
        std = np.sqrt(2.0 / max(fan_in + fan_out, 1))
        W_list.append(np.random.randn(fan_in, fan_out) * std)
        b_list.append(np.zeros((fan_out,)))
    return W_list, b_list

# Create the initial weights and biases for the layer widths in layer_dims;
# W_list[k] / b_list[k] belong to the k-th layer transition.
W_list, b_list = init_parameters(layer_dims)

# ============= 4. Activation function and its derivative =============
def sigmoid(z):
    """
    Logistic sigmoid 1 / (1 + exp(-z)), evaluated element-wise.

    Only exp(-|z|) is ever computed, so large-magnitude inputs cannot
    overflow: for z >= 0 the result is 1 / (1 + exp(-z)) and for z < 0 it is
    the algebraically identical exp(z) / (1 + exp(z)).

    Args:
        z: scalar or array-like of pre-activations.

    Returns:
        Sigmoid of z with the same shape as the input, values in [0, 1].
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # always in (0, 1], never overflows
    # Numerator is 1 on the non-negative side and exp(z) on the negative side.
    return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

def sigmoid_derivative(a):
    """
    Derivative of the sigmoid, expressed through the sigmoid's own output.

    Args:
        a: sigmoid activation(s), i.e. a = sigmoid(z).

    Returns:
        a * (1 - a), which equals d sigmoid(z) / dz.
    """
    complement = 1 - a
    return a * complement

# ============= 5. Training =============
# Full-batch gradient descent on the MSE between the sigmoid output and y.
for epoch in range(epochs):
    # ------- Forward pass --------
    # Keep every layer's pre-activation z and activation a; the backward pass
    # reads the cached activations.
    a_cache = [X]  # a_cache[0] is the network input
    z_cache = []

    current_a = X  # output of the previous layer, starting from the input
    for W_l, b_l in zip(W_list, b_list):
        z = np.dot(current_a, W_l) + b_l  # (N, width of this layer)
        a = sigmoid(z)

        z_cache.append(z)
        a_cache.append(a)

        current_a = a  # becomes the input of the next layer

    # The last activation is the network prediction.
    y_pred = current_a  # shape: (N, 1)

    # ------- Loss (MSE) --------
    loss = np.mean((y_pred - y)**2)

    # ------- Backward pass --------
    # For the MSE above, dLoss/dy_pred = 2 * (y_pred - y) / N.
    dLoss_dy = 2.0 * (y_pred - y) / N

    # Walk the layers from last to first, computing dLoss/dW and dLoss/db for
    # each one. dA always holds the gradient w.r.t. the activation that
    # layer l produced (a_cache[l+1]).
    dA = dLoss_dy
    for l in reversed(range(len(W_list))):
        # dZ = dA * sigmoid'(z), with sigmoid' taken from the cached activation.
        dZ = dA * sigmoid_derivative(a_cache[l+1])  # (N, layer_dims[l+1])

        # dW = a_l^T . dZ, where a_l is the input this layer received.
        dW = np.dot(a_cache[l].T, dZ)  # (layer_dims[l], layer_dims[l+1])

        # db sums dZ over the sample axis.
        db = np.sum(dZ, axis=0)  # (layer_dims[l+1],)

        # Propagate the gradient to the previous layer BEFORE touching
        # W_list[l]: the chain rule needs the weights the forward pass used,
        # not the freshly updated ones. The input layer has no predecessor,
        # so nothing is propagated for l == 0.
        if l > 0:
            dA = np.dot(dZ, W_list[l].T)

        # Gradient-descent step for this layer.
        W_list[l] -= learning_rate * dW
        b_list[l] -= learning_rate * db

    # Report the loss every 200 epochs to watch convergence.
    if (epoch+1) % 200 == 0:
        print(f"Epoch: {epoch+1}, Loss: {loss:.6f}")

# ============= 6. Inspect the final outputs =============
# loss and y_pred are the values computed in the last training epoch.
print("\nFinal training loss:", loss)
# Show the first predictions (at most 10) next to their labels. Slicing keeps
# this safe when there are fewer than 10 samples.
print("\nSample predictions:")
for i, (label, pred) in enumerate(zip(y[:10, 0], y_pred[:10, 0])):
    print(f"X[{i}]: Label={label}, Pred={pred:.4f}")


Epoch: 200, Loss: 0.246401
Epoch: 400, Loss: 0.246400
Epoch: 600, Loss: 0.246400
Epoch: 800, Loss: 0.246400
Epoch: 1000, Loss: 0.246400
Epoch: 1200, Loss: 0.246400
Epoch: 1400, Loss: 0.246400
Epoch: 1600, Loss: 0.246400
Epoch: 1800, Loss: 0.246400
Epoch: 2000, Loss: 0.246400

Final training loss: 0.24640000000361428

Sample predictions:
X[0]: Label=1, Pred=0.4400
X[1]: Label=1, Pred=0.4400
X[2]: Label=1, Pred=0.4400
X[3]: Label=1, Pred=0.4400
X[4]: Label=0, Pred=0.4400
X[5]: Label=1, Pred=0.4400
X[6]: Label=0, Pred=0.4400
X[7]: Label=1, Pred=0.4400
X[8]: Label=0, Pred=0.4400
X[9]: Label=1, Pred=0.4400
