In [2]:
import numpy as np

class NeuralNetwork:
    def __init__(self, layer_sizes, activation='linear'):
        """
        Initialize a neural network with specified layer sizes
        
        Parameters:
        layer_sizes -- list containing the number of nodes in each layer
                      (including input and output layers)
        activation -- activation function to use ('linear' or 'sigmoid')
        """
        self.layer_sizes = layer_sizes
        self.num_layers = len(layer_sizes)
        self.activation = activation
        self.parameters = {}
        self.initialize_parameters()
        
    def initialize_parameters(self):
        """Initialize weights and biases with random values"""
        np.random.seed(42)  # For reproducibility
        
        for l in range(1, self.num_layers):
            # He initialization for better training with deep networks
            self.parameters[f'W{l}'] = np.random.randn(self.layer_sizes[l], 
                                                      self.layer_sizes[l-1]) * np.sqrt(2 / self.layer_sizes[l-1])
            self.parameters[f'b{l}'] = np.zeros((self.layer_sizes[l], 1))
    
    def activation_function(self, Z, derivative=False):
        """Apply the activation function"""
        if self.activation == 'linear':
            if derivative:
                return np.ones_like(Z)  # Derivative of linear function is 1
            return Z  # Linear function: f(x) = x
        
        elif self.activation == 'sigmoid':
            if derivative:
                # Derivative of sigmoid: f'(x) = f(x)(1-f(x))
                sigmoid = 1 / (1 + np.exp(-Z))
                return sigmoid * (1 - sigmoid)
            # Sigmoid function: 1 / (1 + e^(-x))
            return 1 / (1 + np.exp(-Z))
    
    def forward_propagation(self, X):
        """
        Perform forward propagation through the network
        
        Parameters:
        X -- input data, shape (input_size, number_of_examples)
        
        Returns:
        AL -- output of the network
        caches -- list of caches containing values needed for backpropagation
        """
        caches = []
        A = X
        
        # Forward propagation through each layer
        for l in range(1, self.num_layers):
            A_prev = A
            W = self.parameters[f'W{l}']
            b = self.parameters[f'b{l}']
            
            # Linear forward
            Z = np.dot(W, A_prev) + b
            
            # Activation
            A = self.activation_function(Z)
            
            # Save values for backpropagation
            cache = {
                "A_prev": A_prev,
                "W": W,
                "b": b,
                "Z": Z,
                "A": A
            }
            caches.append(cache)
        
        return A, caches
    
    def compute_cost(self, AL, Y):
        """
        Compute the mean squared error cost
        
        Parameters:
        AL -- output of the forward propagation (output of the last activation)
        Y -- target values
        
        Returns:
        cost -- mean squared error
        """
        m = Y.shape[1]  # Number of examples
        
        # Mean squared error cost
        cost = (1/m) * np.sum(np.square(AL - Y))
        
        return cost
    
    def backward_propagation(self, Y, caches):
        """
        Perform backward propagation through the network
        
        Parameters:
        Y -- target values
        caches -- list of caches from forward propagation
        
        Returns:
        gradients -- dictionary containing gradients
        """
        gradients = {}
        m = Y.shape[1]  # Number of examples
        L = len(caches)  # Number of layers (excluding input layer)
        
        # Initialize backpropagation with the output layer
        AL = caches[L-1]["A"]
        
        # Derivative of cost with respect to AL
        dAL = 2 * (AL - Y) / m
        
        # Backpropagate through layers
        dA_curr = dAL
        
        for l in reversed(range(L)):
            cache = caches[l]
            A_prev = cache["A_prev"]
            W = cache["W"]
            Z = cache["Z"]
            
            # Compute gradients
            dZ = dA_curr * self.activation_function(Z, derivative=True)
            dW = np.dot(dZ, A_prev.T)
            db = np.sum(dZ, axis=1, keepdims=True)
            
            if l > 0:
                # Propagate to previous layer (not needed for the first layer)
                dA_curr = np.dot(W.T, dZ)
            
            # Store gradients
            gradients[f'dW{l+1}'] = dW
            gradients[f'db{l+1}'] = db
        
        return gradients
    
    def update_parameters(self, gradients, learning_rate):
        """
        Update weights and biases using gradient descent
        
        Parameters:
        gradients -- dictionary containing gradients
        learning_rate -- learning rate for gradient descent
        """
        for l in range(1, self.num_layers):
            self.parameters[f'W{l}'] -= learning_rate * gradients[f'dW{l}']
            self.parameters[f'b{l}'] -= learning_rate * gradients[f'db{l}']
    
    def train(self, X, Y, num_iterations=1000, learning_rate=0.1, print_cost=False, print_interval=100):
        """
        Train the neural network
        
        Parameters:
        X -- input data, shape (input_size, number_of_examples)
        Y -- target values, shape (output_size, number_of_examples)
        num_iterations -- number of iterations for training
        learning_rate -- learning rate for gradient descent
        print_cost -- if True, print the cost every print_interval iterations
        print_interval -- interval at which to print costs
        
        Returns:
        costs -- list of costs during training
        """
        costs = []
        
        for i in range(num_iterations):
            # Forward propagation
            AL, caches = self.forward_propagation(X)
            
            # Compute cost
            cost = self.compute_cost(AL, Y)
            
            # Backward propagation
            gradients = self.backward_propagation(Y, caches)
            
            # Update parameters
            self.update_parameters(gradients, learning_rate)
            
            # Print cost
            if print_cost and i % print_interval == 0:
                print(f"Cost after iteration {i}: {cost}")
            
            # Record cost
            if i % 100 == 0:
                costs.append(cost)
        
        return costs
    
    def predict(self, X):
        """
        Make predictions using the trained network
        
        Parameters:
        X -- input data
        
        Returns:
        predictions -- output of the network
        """
        AL, _ = self.forward_propagation(X)
        return AL
    
    def get_parameters(self):
        """Return the network parameters"""
        return self.parameters

# Example usage:
if __name__ == "__main__":
    # Define network architecture [input_size, hidden_size, output_size]
    layer_sizes = [2, 3, 2]
    
    # Create neural network with linear activation
    nn = NeuralNetwork(layer_sizes, activation='linear')
    
    # Create sample data
    X = np.array([[1, 2, 3], [4, 5, 6]])  # 2 features, 3 examples
    Y = np.array([[4, 5, 6], [7, 8, 9]])  # 2 outputs, 3 examples
    
    # Train the network
    costs = nn.train(X, Y, num_iterations=1000, learning_rate=0.1, print_cost=True)
    
    # Make predictions
    predictions = nn.predict(X)
    print("Predictions:", predictions)
    print("Targets:", Y)

Cost after iteration 0: 117.36915667029264
Cost after iteration 100: nan
Cost after iteration 200: nan
Cost after iteration 300: nan
Cost after iteration 400: nan
Cost after iteration 500: nan
Cost after iteration 600: nan
Cost after iteration 700: nan
Cost after iteration 800: nan
Cost after iteration 900: nan
Predictions: [[nan nan nan]
 [nan nan nan]]
Targets: [[4 5 6]
 [7 8 9]]


  cost = (1/m) * np.sum(np.square(AL - Y))
