In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Simulating Vanishing & Exploding Gradients in a Deep Network
class DeepNetwork(tf.keras.Model):
    """Deep stack of equally sized Dense layers followed by a Dense output layer.

    Args:
        input_dim: Unused by this class; the Dense layers are created without
            an explicit input size.
        hidden_dim: Number of units in every hidden Dense layer.
        output_dim: Number of units in the final Dense layer.
        activation: 'sigmoid' or 'relu', applied after each hidden layer.
            Any other value leaves the hidden layers without a non-linearity.
        init: 'normal' uses RandomNormal(mean=0, stddev=2.0) kernels,
            'xavier' uses GlorotUniform kernels. Any other value keeps the
            Dense layer's default kernel initializer.
        num_layers: Number of hidden layers (defaults to 50).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, activation='sigmoid', init='normal', num_layers=50):
        super().__init__()
        self.activation = activation

        # The initializer is handed to the Dense constructor instead of being
        # patched onto the layer afterwards, so it no longer depends on the
        # layer reading the attribute lazily when it is built.
        self.layers_list = [
            layers.Dense(hidden_dim, **self._initializer_kwargs(init))
            for _ in range(num_layers)
        ]

        self.output_layer = layers.Dense(output_dim)

    @staticmethod
    def _initializer_kwargs(init):
        """Return the Dense keyword arguments that select the kernel initializer."""
        # A new initializer object is created on every call (one per layer, as
        # in the original loop) rather than sharing a single instance.
        if init == 'normal':
            # Deliberately large weights (stddev 2.0).
            return {'kernel_initializer': tf.keras.initializers.RandomNormal(mean=0., stddev=2.0)}
        if init == 'xavier':
            # Xavier / Glorot uniform initialization.
            return {'kernel_initializer': tf.keras.initializers.GlorotUniform()}
        return {}

    def call(self, x):
        """Run `x` through every hidden layer (plus activation) and the output layer."""
        # Resolve the activation once per call instead of once per layer.
        if self.activation == 'sigmoid':
            activation_fn = tf.nn.sigmoid
        elif self.activation == 'relu':
            activation_fn = tf.nn.relu
        else:
            activation_fn = None

        for layer in self.layers_list:
            x = layer(x)
            if activation_fn is not None:
                x = activation_fn(x)
        return self.output_layer(x)

# Input, hidden, and output dimensions shared by both models
input_dim = 10
hidden_dim = 100
output_dim = 1

# Seed Python, NumPy and TensorFlow before any weights or data are drawn.
# This is intended to make a fresh-kernel "Run All" print the same numbers.
SEED = 42
keras.utils.set_random_seed(SEED)

# Both models use init='normal' (stddev=2.0 kernels); they differ only in the
# activation function.
# NOTE(review): the "vanishing" model shares the large-weight initialization of
# the "exploding" one -- confirm this is the intended setup for demonstrating
# vanishing gradients.
model_vanishing = DeepNetwork(input_dim, hidden_dim, output_dim, activation='sigmoid', init='normal')
model_exploding = DeepNetwork(input_dim, hidden_dim, output_dim, activation='relu', init='normal')

# Loss function and one plain-SGD optimizer per model
loss_fn = keras.losses.MeanSquaredError()
optimizer_vanishing = keras.optimizers.SGD(learning_rate=0.01)
optimizer_exploding = keras.optimizers.SGD(learning_rate=0.01)

# Random regression batch: 32 samples of inputs and targets
x = np.random.randn(32, input_dim).astype(np.float32)
y = np.random.randn(32, output_dim).astype(np.float32)

# Training and Observing Gradients
def train_model(model, optimizer, name, inputs=None, targets=None, loss_function=None):
    """Run one gradient step on `model` and print the global gradient norm.

    Args:
        model: Callable Keras model exposing `trainable_variables`.
        optimizer: Optimizer whose `apply_gradients` is used for the update.
        name: Label printed in front of the gradient norm.
        inputs: Optional input batch; defaults to the notebook-level `x`.
        targets: Optional target batch; defaults to the notebook-level `y`.
        loss_function: Optional callable `(y_true, y_pred) -> loss`; defaults
            to the notebook-level `loss_fn`.

    Returns:
        None. The norm is printed, not returned.
    """
    # The notebook globals are only looked up when no override is supplied.
    batch_inputs = x if inputs is None else inputs
    batch_targets = y if targets is None else targets
    compute_loss = loss_fn if loss_function is None else loss_function

    with tf.GradientTape() as tape:
        output = model(batch_inputs)
        loss = compute_loss(batch_targets, output)
    grads = tape.gradient(loss, model.trainable_variables)

    # Global L2 norm over all gradient tensors: sqrt(sum of squared per-tensor
    # norms). The per-tensor norms are converted to Python floats first so the
    # squaring happens in double precision.
    per_tensor_norms = [float(tf.norm(g).numpy()) for g in grads if g is not None]
    total_norm = sum(n * n for n in per_tensor_norms) ** 0.5
    print(f"{name} - Gradient Norm: {total_norm:.4f}")
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

# Take one training step with each model and print its gradient norm
train_model(
    model=model_vanishing,
    optimizer=optimizer_vanishing,
    name="Vanishing Gradients Model",
)
train_model(
    model=model_exploding,
    optimizer=optimizer_exploding,
    name="Exploding Gradients Model",
)

Vanishing Gradients Model - Gradient Norm: 3404836.7383
Exploding Gradients Model - Gradient Norm: nan


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Simulating Vanishing & Exploding Gradients in a Deep Network
class DeepNetwork(nn.Module):
    """Deep stack of Linear layers followed by a Linear output layer.

    Args:
        input_dim: Number of input features of the first hidden layer.
        hidden_dim: Number of units in every hidden layer.
        output_dim: Number of units in the output layer.
        activation: 'sigmoid' or 'relu', applied after each hidden layer.
            Any other value leaves the hidden layers without a non-linearity.
        init: 'normal' re-initializes hidden weights from N(0, 2.0^2),
            'xavier' uses Xavier-uniform. Any other value keeps the default
            nn.Linear initialization. Biases and the output layer are never
            re-initialized.
        num_layers: Number of hidden layers (defaults to 50, must be >= 1).

    Raises:
        ValueError: If `num_layers` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, activation='sigmoid', init='normal', num_layers=50):
        super().__init__()
        if num_layers < 1:
            # The output layer expects `hidden_dim` features, so at least one
            # hidden layer is needed to map `input_dim` to `hidden_dim`.
            raise ValueError("num_layers must be at least 1")
        self.activation = activation

        # The first layer maps input_dim -> hidden_dim; all others map
        # hidden_dim -> hidden_dim.
        in_features = [input_dim] + [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            [self._make_hidden_layer(n_in, hidden_dim, init) for n_in in in_features]
        )

        self.output_layer = nn.Linear(hidden_dim, output_dim)

    @staticmethod
    def _make_hidden_layer(in_features, out_features, init):
        """Create one hidden Linear layer and apply the requested weight init."""
        layer = nn.Linear(in_features, out_features)
        if init == 'normal':
            nn.init.normal_(layer.weight, mean=0, std=2.0)
        elif init == 'xavier':
            nn.init.xavier_uniform_(layer.weight)
        return layer

    def forward(self, x):
        """Run `x` through every hidden layer (plus activation) and the output layer."""
        # Resolve the activation once per call instead of once per layer.
        if self.activation == 'sigmoid':
            activation_fn = torch.sigmoid
        elif self.activation == 'relu':
            activation_fn = torch.relu
        else:
            activation_fn = None

        for layer in self.layers:
            x = layer(x)
            if activation_fn is not None:
                x = activation_fn(x)
        return self.output_layer(x)

# Input, hidden, and output dimensions shared by both models
input_dim = 10
hidden_dim = 100
output_dim = 1

# Seed PyTorch before the models are constructed: the weights are drawn inside
# DeepNetwork.__init__ and the data batch is drawn below.
SEED = 42
torch.manual_seed(SEED)

# Both models use init='normal' (std=2.0 weights); they differ only in the
# activation function.
# NOTE(review): the "vanishing" model shares the large-weight initialization of
# the "exploding" one -- confirm this is the intended setup for demonstrating
# vanishing gradients.
model_vanishing = DeepNetwork(input_dim, hidden_dim, output_dim, activation='sigmoid', init='normal')
model_exploding = DeepNetwork(input_dim, hidden_dim, output_dim, activation='relu', init='normal')

# Loss function and one plain-SGD optimizer per model
criterion = nn.MSELoss()
optimizer_vanishing = optim.SGD(model_vanishing.parameters(), lr=0.01)
optimizer_exploding = optim.SGD(model_exploding.parameters(), lr=0.01)

# Random regression batch: 32 samples of inputs and targets
x = torch.randn(32, input_dim)
y = torch.randn(32, output_dim)

# Training and Observing Gradients
def train_model(model, optimizer, name, inputs=None, targets=None, loss_function=None):
    """Run one optimization step on `model` and print the global gradient norm.

    Args:
        model: Module to train; called on the input batch.
        optimizer: Optimizer holding the parameters of `model`.
        name: Label printed in front of the gradient norm.
        inputs: Optional input batch; defaults to the notebook-level `x`.
        targets: Optional target batch; defaults to the notebook-level `y`.
        loss_function: Optional callable `(output, target) -> loss`; defaults
            to the notebook-level `criterion`.

    Returns:
        None. The norm is printed, not returned.
    """
    # The notebook globals are only looked up when no override is supplied.
    batch_inputs = x if inputs is None else inputs
    batch_targets = y if targets is None else targets
    compute_loss = criterion if loss_function is None else loss_function

    optimizer.zero_grad()
    output = model(batch_inputs)
    loss = compute_loss(output, batch_targets)
    loss.backward()

    # Global L2 norm over all parameter gradients: sqrt(sum of squared
    # per-tensor norms). `.item()` yields Python floats, so the squaring
    # happens in double precision.
    per_tensor_norms = [p.grad.norm().item() for p in model.parameters() if p.grad is not None]
    total_norm = sum(n * n for n in per_tensor_norms) ** 0.5
    print(f"{name} - Gradient Norm: {total_norm:.4f}")
    optimizer.step()

# Take one training step with each model and print its gradient norm
train_model(
    model=model_vanishing,
    optimizer=optimizer_vanishing,
    name="Vanishing Gradients Model",
)
train_model(
    model=model_exploding,
    optimizer=optimizer_exploding,
    name="Exploding Gradients Model",
)

Vanishing Gradients Model - Gradient Norm: 507532.2496
Exploding Gradients Model - Gradient Norm: nan
