In [34]:
import numpy as np

def sigmoid(x):
    """Sigmoid activation: Maps to (0,1). Prone to vanishing gradients."""
    return 1 / (1 + np.exp(-x))

def d_sigmoid(x):
    """Derivative of sigmoid."""
    return sigmoid(x) * (1 - sigmoid(x))

def tanh(x):
    """Tanh activation: Maps to (-1,1). Zero-centered."""
    return np.tanh(x)

def d_tanh(x):
    """Derivative of tanh."""
    return 1 - np.tanh(x) ** 2

def relu(x):
    """ReLU activation: Fast, but can cause dying neurons."""
    return np.maximum(0, x)

def d_relu(x):
    """Derivative of ReLU."""
    return np.where(x > 0, 1, 0)

def leaky_relu(x, alpha=0.01):
    """Leaky ReLU: Allows small gradient for x <= 0 to prevent dying neurons."""
    return np.where(x > 0, x, alpha * x)

def d_leaky_relu(x, alpha=0.01):
    """Derivative of Leaky ReLU."""
    return np.where(x > 0, 1, alpha)


In [35]:
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])


In [36]:
import numpy as np

# Network architecture (unchanged)
input_size = 2
hidden_size = 4
output_size = 1

# Define sigmoid function
sigmoid = lambda z: 1 / (1 + np.exp(-z))

def train_network(activation, d_activation, learning_rate, epochs=5000):
    # Hardcoded weights with formulas (instead of random)
    # W1: 2x4 matrix with patterned values for symmetry breaking
    W1 = np.array([[0.5, -0.2, 0.1, 0.4],
                   [-0.3, 0.8, -0.5, 0.2]])
    # b1: Small incremental biases
    b1 = np.array([[0.1, -0.1, 0.05, 0.0]])
    # W2: 4x1 matrix with mixed signs
    W2 = np.array([[0.4],
                   [-0.6],
                   [0.3],
                   [0.2]])
    # b2: Small constant bias
    b2 = np.array([[0.05]])
    
    for epoch in range(epochs):
        # Forward pass
        z1 = np.dot(X, W1) + b1  # Added z1 for derivative clarity
        a1 = activation(z1)
        z2 = np.dot(a1, W2) + b2  # Added z2 for derivative clarity
        a2 = sigmoid(z2)
        
        # Loss (MSE)
        loss = np.mean((y - a2) ** 2)
        
        # Backpropagation
        d_a2 = a2 - y
        d_z2 = d_a2 * (a2 * (1 - a2))  # d_sigmoid using a2
        d_W2 = np.dot(a1.T, d_z2)
        d_b2 = np.sum(d_z2, axis=0, keepdims=True)
        
        d_a1 = np.dot(d_z2, W2.T)
        d_z1 = d_a1 * d_activation(z1)  # Use stored z1
        d_W1 = np.dot(X.T, d_z1)
        d_b1 = np.sum(d_z1, axis=0, keepdims=True)
        
        # Gradient descent updates
        W2 -= learning_rate * d_W2
        b2 -= learning_rate * d_b2
        W1 -= learning_rate * d_W1
        b1 -= learning_rate * d_b1
        
        if epoch % 1000 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    # Final predictions are sigmoid output values, no thresholding
    predictions = a2.flatten()
    print("Final Predictions (sigmoid output):", predictions)
    print("-" * 50)
    return predictions

# Sample data for test (XOR example)
X = np.array([[0,0],[0,1],[1,0],[1,1]])
y = np.array([[0],[1],[1],[0]])

# Define example activations (ReLU and its derivative) for testing
def relu(z):
    return np.maximum(0, z)
def d_relu(z):
    return (z > 0).astype(float)

# Run modified training with hardcoded weights
train_network(relu, d_relu, learning_rate=0.1, epochs=5000)


Epoch 0, Loss: 0.2553
Epoch 1000, Loss: 0.0077
Epoch 2000, Loss: 0.0022
Epoch 3000, Loss: 0.0012
Epoch 4000, Loss: 0.0008
Final Predictions (sigmoid output): [0.0411175  0.98157763 0.98629735 0.01380053]
--------------------------------------------------


array([0.0411175 , 0.98157763, 0.98629735, 0.01380053])

In [37]:
# Try different activation functions and learning rates

print("=== Using Sigmoid Activation, LR=0.1 ===")
train_network(sigmoid, d_sigmoid, learning_rate=0.1)

print("=== Using Tanh Activation, LR=0.1 ===")
train_network(tanh, d_tanh, learning_rate=0.1)

print("=== Using ReLU Activation, LR=0.01 ===")
train_network(relu, d_relu, learning_rate=0.01)

print("=== Using Leaky ReLU Activation, LR=0.01 ===")
train_network(leaky_relu, d_leaky_relu, learning_rate=0.01)


=== Using Sigmoid Activation, LR=0.1 ===
Epoch 0, Loss: 0.2530
Epoch 1000, Loss: 0.2500
Epoch 2000, Loss: 0.2496
Epoch 3000, Loss: 0.2478
Epoch 4000, Loss: 0.2281
Final Predictions (sigmoid output): [0.22660326 0.50993486 0.7518223  0.4932818 ]
--------------------------------------------------
=== Using Tanh Activation, LR=0.1 ===
Epoch 0, Loss: 0.2577
Epoch 1000, Loss: 0.0151
Epoch 2000, Loss: 0.0033
Epoch 3000, Loss: 0.0017
Epoch 4000, Loss: 0.0012
Final Predictions (sigmoid output): [0.01336453 0.96785792 0.96869476 0.0363606 ]
--------------------------------------------------
=== Using ReLU Activation, LR=0.01 ===
Epoch 0, Loss: 0.2553
Epoch 1000, Loss: 0.2048
Epoch 2000, Loss: 0.1587
Epoch 3000, Loss: 0.1284
Epoch 4000, Loss: 0.0848
Final Predictions (sigmoid output): [0.31953311 0.76920593 0.87433917 0.15196081]
--------------------------------------------------
=== Using Leaky ReLU Activation, LR=0.01 ===
Epoch 0, Loss: 0.2553
Epoch 1000, Loss: 0.2061
Epoch 2000, Loss: 0.1610


array([0.32464031, 0.76538026, 0.87026712, 0.15579364])

In [None]:
import numpy as np
import time

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def d_sigmoid(x):
    s = sigmoid(x)
    return s * (1 - s)

def tanh(x):
    return np.tanh(x)

def d_tanh(x):
    return 1 - np.tanh(x)**2

def relu(x):
    return np.maximum(0, x)

def d_relu(x):
    return np.where(x > 0, 1, 0)

def leaky_relu(x, alpha=0.01):
    return np.where(x > 0, x, alpha * x)

def d_leaky_relu(x, alpha=0.01):
    return np.where(x > 0, 1, alpha)

ACTIVATIONS = {
    'sigmoid': (sigmoid, d_sigmoid),
    'tanh': (tanh, d_tanh),
    'relu': (relu, d_relu),
    'leaky_relu': (leaky_relu, d_leaky_relu)
}


def train_network(X, y, activation_name, learning_rate, epochs=10000):
    print(f"--- Training with {activation_name.upper()} | LR: {learning_rate} ---")
    start_time = time.time()

    input_size, hidden_size, output_size = 2, 4, 1
    activation_func, d_activation_func = ACTIVATIONS[activation_name]

    np.random.seed(42)
    W1 = np.random.randn(input_size, hidden_size) * 0.1
    b1 = np.zeros((1, hidden_size))
    W2 = np.random.randn(hidden_size, output_size) * 0.1
    b2 = np.zeros((1, output_size))


    for epoch in range(epochs):
     
        z1 = np.dot(X, W1) + b1
        a1 = activation_func(z1)
        z2 = np.dot(a1, W2) + b2
        a2 = sigmoid(z2) 

        loss = np.mean((y - a2)**2)

    
        d_loss_a2 = a2 - y
        d_loss_z2 = d_loss_a2 * (a2 * (1 - a2))
        d_loss_W2 = np.dot(a1.T, d_loss_z2)
        d_loss_b2 = np.sum(d_loss_z2, axis=0, keepdims=True)
        d_loss_a1 = np.dot(d_loss_z2, W2.T)
        d_loss_z1 = d_loss_a1 * d_activation_func(z1)
        d_loss_W1 = np.dot(X.T, d_loss_z1)
        d_loss_b1 = np.sum(d_loss_z1, axis=0, keepdims=True)


        W1 -= learning_rate * d_loss_W1
        W2 -= learning_rate * d_loss_W2
        b1 -= learning_rate * d_loss_b1
        b2 -= learning_rate * d_loss_b2
        
        if epoch % 2000 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.6f}")
    
    final_predictions_binary = (a2 > 0.5).astype(int)
    accuracy = np.mean(final_predictions_binary == y.flatten()) * 100
    end_time = time.time()

    print(f"Epoch {epochs}, Loss: {loss:.6f}")
    print(f"Input: [0,0], [0,1], [1,0], [1,1]")
    print(f"Predicted Output (binary): {final_predictions_binary}")
    print(f"Expected Output:         {y.flatten()}")
    print(f"Final Accuracy: {accuracy:.2f}%")
    print(f"Training Time: {end_time - start_time:.2f} seconds\n")


if __name__ == "__main__":
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    y = np.array([[0], [1], [1], [0]])

    experiments = [
        {'activation': 'sigmoid', 'lr': 0.5},
        {'activation': 'tanh', 'lr': 0.5},
        {'activation': 'relu', 'lr': 0.1},
        {'activation': 'leaky_relu', 'lr': 0.1}
    ]

    for exp in experiments:
        train_network(X, y, exp['activation'], exp['lr'], epochs=10000)


--- Training with SIGMOID | LR: 0.5 ---
Epoch 0, Loss: 0.250132
Epoch 2000, Loss: 0.250000
Epoch 4000, Loss: 0.250000
Epoch 6000, Loss: 0.250000
Epoch 8000, Loss: 0.250000
Epoch 10000, Loss: 0.250000
Input: [0,0], [0,1], [1,0], [1,1]
Predicted Output (binary): [[0]
 [1]
 [0]
 [1]]
Expected Output:         [0 1 1 0]
Final Accuracy: 50.00%
Training Time: 1.81 seconds

--- Training with TANH | LR: 0.5 ---
Epoch 0, Loss: 0.250028
Epoch 2000, Loss: 0.001220
Epoch 4000, Loss: 0.000237
Epoch 6000, Loss: 0.000126
Epoch 8000, Loss: 0.000085
Epoch 10000, Loss: 0.000064
Input: [0,0], [0,1], [1,0], [1,1]
Predicted Output (binary): [[0]
 [1]
 [1]
 [0]]
Expected Output:         [0 1 1 0]
Final Accuracy: 50.00%
Training Time: 1.07 seconds

--- Training with RELU | LR: 0.1 ---
Epoch 0, Loss: 0.250081
Epoch 2000, Loss: 0.166924
Epoch 4000, Loss: 0.166747
Epoch 6000, Loss: 0.166718
Epoch 8000, Loss: 0.166704
Epoch 10000, Loss: 0.166693
Input: [0,0], [0,1], [1,0], [1,1]
Predicted Output (binary): [[1]
 [