In [None]:
import numpy as np

def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The exponential is only ever evaluated at ``-|x|``, so very negative
    inputs do not overflow ``np.exp`` (the naive formula computes
    ``exp(+large)`` for them and triggers an overflow warning).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A float scalar for scalar input, otherwise an ndarray with the
        same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) lies in (0, 1], so it can never overflow.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar; arrays pass through.
    return out[()]

def dsigmoid_from_activation(a):
    """Return the sigmoid derivative given the already-computed activation.

    Args:
        a: Output of ``sigmoid(z)``, i.e. the activation, not the
            pre-activation ``z``.

    Returns:
        ``a * (1 - a)``, element-wise.
    """
    complemento = 1 - a
    return a * complemento

def relu(x):
    """Return the rectified linear unit of ``x``: the element-wise maximum of 0 and ``x``."""
    piso = 0
    ativado = np.maximum(piso, x)
    return ativado

def drelu_from_preactivation(z):
    """Return the ReLU derivative evaluated at the pre-activation ``z``.

    Args:
        z: Pre-activation values; the comparison result must support
            ``.astype`` (e.g. a NumPy array).

    Returns:
        A float array holding 1.0 where ``z > 0`` and 0.0 elsewhere.
    """
    positivos = z > 0
    return positivos.astype(float)

def mse(y_pred, y_true):
    """Return the mean squared error between prediction and target.

    Args:
        y_pred: Predicted values.
        y_true: Target values, compatible with ``y_pred`` under subtraction.

    Returns:
        The mean over all elements of ``(y_pred - y_true) ** 2``.
    """
    residuo = y_pred - y_true
    quadrados = residuo ** 2
    return np.mean(quadrados)


In [None]:
class CamadaDensa:
    """Fully connected layer operating on a single (unbatched) input vector.

    ``forward`` caches the input, pre-activation and activation so that
    ``backward`` can compute gradients without recomputing them.

    Args:
        n_inputs: Size of the input vector.
        n_neuronios: Number of neurons (size of the output vector).
        ativacao: ``"relu"``, ``"sigmoid"``, or any other value for a
            linear (identity) activation.
        seed: Seed passed to ``np.random.default_rng`` for the weight
            initialisation; ``None`` gives a non-reproducible draw.
    """

    def __init__(self, n_inputs, n_neuronios, ativacao="relu", seed=None):
        rng = np.random.default_rng(seed)
        # Simple standard-normal initialisation (could be improved later).
        self.W = rng.standard_normal((n_neuronios, n_inputs))
        self.b = rng.standard_normal(n_neuronios)
        self.ativacao = ativacao
        # Caches written by forward() and read by backward().
        self.x = None
        self.z = None
        self.a = None

    def forward(self, x):
        """Compute the layer output for ``x`` and cache the intermediates.

        Args:
            x: Input vector of shape ``(n_inputs,)``.

        Returns:
            The activation vector of shape ``(n_neuronios,)``.
        """
        self.x = x  # (n_inputs,)
        self.z = self.W @ x + self.b  # pre-activation
        if self.ativacao == "relu":
            self.a = relu(self.z)
        elif self.ativacao == "sigmoid":
            self.a = sigmoid(self.z)
        else:
            self.a = self.z  # linear
        return self.a

    def backward(self, grad_a):
        """Back-propagate ``dL/da`` through the layer.

        Must be called after ``forward``, since it relies on the cached
        ``x``, ``z`` and ``a``.

        Args:
            grad_a: Gradient of the loss with respect to this layer's
                activation, coming from the layer in front.

        Returns:
            A tuple ``(grad_x, grad_W, grad_b)``: the gradient with respect
            to the layer input, the weights and the bias, respectively.
        """
        # Chain rule through the activation: dL/dz = dL/da * da/dz.
        if self.ativacao == "relu":
            grad_z = grad_a * drelu_from_preactivation(self.z)
        elif self.ativacao == "sigmoid":
            grad_z = grad_a * dsigmoid_from_activation(self.a)
        else:
            grad_z = grad_a  # linear

        # Gradients of W and b.
        grad_W = np.outer(grad_z, self.x)     # (n_out, n_in)
        grad_b = grad_z                        # (n_out,)
        # Gradient flowing backwards, to the previous layer.
        grad_x = self.W.T @ grad_z             # (n_in,)

        return grad_x, grad_W, grad_b


In [None]:
class MLP:
    """Two-layer perceptron: a ReLU hidden layer followed by a sigmoid output layer."""

    def __init__(self, n_inputs, n_hidden, n_outputs, seed=None):
        """Build both layers; the output layer is seeded with ``seed + 1`` when a seed is given."""
        self.l1 = CamadaDensa(n_inputs, n_hidden, ativacao="relu", seed=seed)
        seed_saida = seed + 1 if seed is not None else None
        self.l2 = CamadaDensa(n_hidden, n_outputs, ativacao="sigmoid", seed=seed_saida)

    def forward(self, x):
        """Run ``x`` through the hidden and output layers and return the prediction."""
        return self.l2.forward(self.l1.forward(x))

    def backward(self, y_pred, y_true):
        """Back-propagate the MSE loss and return the parameter gradients.

        The exact MSE gradient is ``2 * (y_pred - y_true) / n``; the constant
        ``2 / n`` is dropped here and treated as folded into the learning rate.

        Returns:
            ``(grad_W1, grad_b1, grad_W2, grad_b2)``.
        """
        erro = y_pred - y_true  # dL/da2, up to the constant factor

        # Output layer (sigmoid) first, then the hidden layer (ReLU).
        grad_oculta, grad_W2, grad_b2 = self.l2.backward(erro)
        grads_l1 = self.l1.backward(grad_oculta)

        # The gradient with respect to the network input (index 0) is not needed.
        return grads_l1[1], grads_l1[2], grad_W2, grad_b2

    def step(self, grads, lr=1e-2):
        """Apply one in-place gradient-descent update.

        Args:
            grads: ``(grad_W1, grad_b1, grad_W2, grad_b2)`` as returned by ``backward``.
            lr: Learning rate.
        """
        gW1, gb1, gW2, gb2 = grads
        for camada, gW, gb in ((self.l1, gW1, gb1), (self.l2, gW2, gb2)):
            camada.W -= lr * gW
            camada.b -= lr * gb


In [None]:
# Example: 3 inputs -> 5 hidden units -> 2 outputs, fitted to a single sample.
lr = 0.05
epochs = 1000

x = np.array([0.5, -1.2, 3.3])
y_true = np.array([1.0, 0.0])

# Module-level generator; not consumed in this loop.
rng = np.random.default_rng(42)
rede = MLP(n_inputs=3, n_hidden=5, n_outputs=2, seed=0)

for ep in range(1, epochs + 1):
    y_pred = rede.forward(x)
    loss = mse(y_pred, y_true)  # measured before this epoch's update

    rede.step(rede.backward(y_pred, y_true), lr=lr)

    # Report the first epoch and then every 100th.
    if ep == 1 or ep % 100 == 0:
        print(f"época {ep:4d} | loss {loss:.6f} | y_pred {y_pred}")


In [None]:
# Small synthetic dataset: 8 samples with 3 features and 2 binary targets.
X = rng.standard_normal((8, 3))
alvo_1 = (X[:, 0] + 0.5 * X[:, 2] > 0).astype(float)   # binary target 1
alvo_2 = (X[:, 1] < 0).astype(float)                   # binary target 2
Y = np.stack([alvo_1, alvo_2], axis=1)

rede = MLP(n_inputs=3, n_hidden=6, n_outputs=2, seed=123)
lr = 0.05
epochs = 1000

for ep in range(1, epochs + 1):
    # Shuffle the samples.
    idx = rng.permutation(len(X))
    Xb, Yb = X[idx], Y[idx]

    # Full-batch pass: one accumulator per parameter, in the order
    # (W1, b1, W2, b2) returned by rede.backward().
    sum_loss = 0.0
    acumulados = [
        np.zeros_like(param)
        for param in (rede.l1.W, rede.l1.b, rede.l2.W, rede.l2.b)
    ]

    for xi, yi in zip(Xb, Yb):
        yp = rede.forward(xi)
        sum_loss += mse(yp, yi)
        for acc, grad in zip(acumulados, rede.backward(yp, yi)):
            acc += grad

    # Update with the mean gradient over the batch.
    n = len(Xb)
    rede.step(tuple(acc / n for acc in acumulados), lr=lr)

    # Report the first epoch and then every 100th.
    if ep == 1 or ep % 100 == 0:
        print(f"época {ep:4d} | loss médio {sum_loss/n:.6f}")
