In [1]:
import numpy as np

# ReLU activation function
def relu(x):
    return np.maximum(0, x)

# Softmax activation function with max subtraction for numerical stability
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

# Categorical cross-entropy loss function
def categorical_crossentropy(y_true, y_pred):
    epsilon = 1e-10
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.sum(y_true * np.log(y_pred))

# BatchNormalization layer class
class BatchNormalization:
    def __init__(self, input_dim):
        self.gamma = np.ones(input_dim)
        self.beta = np.zeros(input_dim)
        self.epsilon = 1e-8
        self.mean = None
        self.var = None
        self.input = None
        self.normalized = None

    def forward(self, input_data):
        self.input = input_data
        self.mean = np.mean(input_data, axis=0)
        self.var = np.var(input_data, axis=0)
        self.normalized = (input_data - self.mean) / np.sqrt(self.var + self.epsilon)
        return self.gamma * self.normalized + self.beta

    def backward(self, d_output):
        batch_size = d_output.shape[0]
        d_normalized = d_output * self.gamma
        d_var = np.sum(d_normalized * (self.input - self.mean) * -0.5 * (self.var + self.epsilon) ** (-1.5), axis=0)
        d_mean = np.sum(d_normalized * -1 / np.sqrt(self.var + self.epsilon), axis=0) + d_var * np.mean(-2 * (self.input - self.mean), axis=0)
        d_input = d_normalized / np.sqrt(self.var + self.epsilon) + d_var * 2 * (self.input - self.mean) / batch_size + d_mean / batch_size
        d_gamma = np.sum(d_output * self.normalized, axis=0)
        d_beta = np.sum(d_output, axis=0)
        return d_input, d_gamma, d_beta

# Dense layer class
class DenseLayer:
    def __init__(self, input_dim, output_dim, activation):
        self.weights = np.random.randn(input_dim, output_dim) * np.sqrt(2 / input_dim)
        self.bias = np.zeros(output_dim)
        self.activation = activation
        self.input = None
        self.z = None
        self.batch_norm = BatchNormalization(output_dim)  # Add BatchNormalization

    def forward(self, input_data):
        self.input = input_data
        self.z = np.dot(input_data, self.weights) + self.bias
        normalized = self.batch_norm.forward(self.z)  # Use BatchNormalization
        return self.activation(normalized)

    def backward(self, d_output):
        d_activation = np.where(self.z > 0, 1, 0)
        d_z = d_output * d_activation
        d_input, d_weights, d_bias = self.batch_norm.backward(d_z)  # Use BatchNormalization
        d_input = np.dot(d_input, self.weights.T)
        d_weights = np.dot(self.input.T, d_z)
        d_bias = np.sum(d_z, axis=0)
        return d_input, d_weights, d_bias

# Model class
class DNN:
    def __init__(self):
        self.layers = []

    def add_layer(self, layer):
        self.layers.append(layer)

    def compile(self, loss, optimizer):
        self.loss = loss
        self.optimizer = optimizer

    def forward(self, input_data):
        output = input_data
        for layer in self.layers:
            output = layer.forward(output)
        return output

    def backward(self, d_output):
        for layer in reversed(self.layers):
            d_output, d_weights, d_bias = layer.backward(d_output)
            # Update weights and biases using the optimizer
            layer.weights -= self.optimizer * d_weights
            layer.bias -= self.optimizer * d_bias

    def fit(self, X_train, y_train, epochs, batch_size=32):
        for epoch in range(epochs):
            epoch_loss = 0
            for i in range(0, len(X_train), batch_size):
                batch_X = X_train[i:i+batch_size]
                batch_y = y_train[i:i+batch_size]

                # Forward pass
                predictions = self.forward(batch_X)

                # Compute loss
                loss = self.loss(batch_y, predictions)
                epoch_loss += loss

                # Backward pass
                d_output = predictions - batch_y  # Fixed the backward pass
                self.backward(d_output)

            epoch_loss /= len(X_train)
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")

# Generating some dummy data to test the model
X_train = np.random.randn(1000, 28, 28)
y_train = np.random.randint(0, 10, (1000, 10))
y_train = (y_train == np.arange(10)).astype(float)

# Creating and training the model
model = DNN()
model.add_layer(DenseLayer(input_dim=28*28, output_dim=256, activation=relu))
model.add_layer(DenseLayer(input_dim=256, output_dim=128, activation=relu))
model.add_layer(DenseLayer(input_dim=128, output_dim=10, activation=softmax))
model.compile(loss=categorical_crossentropy, optimizer=0.001)
model.fit(X_train.reshape(-1, 28*28), y_train, epochs=30)


Epoch 1/30, Loss: 2.9475
Epoch 2/30, Loss: 2.6365
Epoch 3/30, Loss: 2.4741
Epoch 4/30, Loss: 2.3779
Epoch 5/30, Loss: 2.3144
Epoch 6/30, Loss: 2.2685
Epoch 7/30, Loss: 2.2284
Epoch 8/30, Loss: 2.1967
Epoch 9/30, Loss: 2.1673
Epoch 10/30, Loss: 2.1415
Epoch 11/30, Loss: 2.1195
Epoch 12/30, Loss: 2.0996
Epoch 13/30, Loss: 2.0762
Epoch 14/30, Loss: 2.0575
Epoch 15/30, Loss: 2.0319
Epoch 16/30, Loss: 2.0038
Epoch 17/30, Loss: 1.9796
Epoch 18/30, Loss: 1.9551
Epoch 19/30, Loss: 1.9272
Epoch 20/30, Loss: 1.8935
Epoch 21/30, Loss: 1.8625
Epoch 22/30, Loss: 1.8293
Epoch 23/30, Loss: 1.8013
Epoch 24/30, Loss: 1.7667
Epoch 25/30, Loss: 1.7260
Epoch 26/30, Loss: 1.6854
Epoch 27/30, Loss: 1.6473
Epoch 28/30, Loss: 1.6121
Epoch 29/30, Loss: 1.5785
Epoch 30/30, Loss: 1.5502


**Schema for the Deep Neural Network (DNN):**

```
Input Data (X_train) [Batch Size x Input Size]
 |
 V
[Dense Layer 1]
   |
   V
[Batch Normalization 1]
   |
   V
[ReLU Activation 1]
   |
   V
[Dense Layer 2]
   |
   V
[Batch Normalization 2]
   |
   V
[ReLU Activation 2]
   |
   V
[Dense Layer 3]
   |
   V
[Softmax Activation]
   |
   V
Predicted Probabilities [Batch Size x Output Size]
```

**Explanation:**

1. **Input Data (X_train):** The input data consists of a batch of samples, where each sample represents a flattened 28x28 image (784-dimensional vector). The size of the batch is determined by the chosen batch size, and each sample is fed into the DNN for training.

2. **Dense Layer 1:** The first dense layer receives the input data, which is a 784-dimensional vector for each sample. The layer has 256 neurons, and each neuron is fully connected to the input. The neurons in this layer learn to extract features from the input data.

3. **Batch Normalization 1:** After the dense layer, batch normalization is applied to normalize the output of the layer. It helps stabilize training by reducing internal covariate shift and speeding up convergence.

4. **ReLU Activation 1:** The ReLU (Rectified Linear Unit) activation function is applied after batch normalization. ReLU introduces non-linearity to the model, allowing it to learn complex patterns and making the training process more efficient.

5. **Dense Layer 2:** The second dense layer receives the output from ReLU Activation 1. It has 128 neurons, and each neuron is fully connected to the ReLU activation output. This layer further learns higher-level features from the input.

6. **Batch Normalization 2:** Similar to Batch Normalization 1, batch normalization is applied to the output of Dense Layer 2 to improve training stability.

7. **ReLU Activation 2:** ReLU is applied after Batch Normalization 2 to introduce non-linearity and enhance the model's representational power.

8. **Dense Layer 3:** The third dense layer receives the output from ReLU Activation 2. It has 10 neurons, each representing one digit class (0 to 9). This is the final layer before the softmax activation.

9. **Softmax Activation:** The softmax activation function is applied after Dense Layer 3. It converts the raw scores of the neurons into probabilities, representing the model's prediction of each input sample belonging to each digit class.

10. **Predicted Probabilities:** The final output of the DNN is a matrix of predicted probabilities, where each row corresponds to a sample in the batch, and each column represents the predicted probability for one of the 10 digit classes.

During the training process, the model adjusts the weights and biases of each dense layer to minimize the categorical cross-entropy loss between the predicted probabilities and the actual one-hot encoded labels (y_train). The optimization is performed using the Adam optimizer with a learning rate of 0.001. The training proceeds through 35 epochs, and the loss is monitored at each epoch to track the model's performance and convergence.

By training this DNN on a dataset of images and corresponding labels, the model can learn to recognize and classify the digits in the images accurately. This is a simplified version of a DNN, and in practice, more complex architectures and techniques are used to achieve better performance on real-world datasets.