In [None]:
# Essential imports for ReLU analysis
import warnings

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
# Silence every warning category for the rest of the session. This keeps the
# output short, but it also hides deprecation warnings raised by the libraries.
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# The banner previously contained a mis-decoded UTF-8 emoji; restored to U+1F680.
print("🚀 Ready to solve the ReLU problem!")
print(f"TensorFlow version: {tf.__version__}")

# Set random seed (NumPy's legacy global generator and TensorFlow's global generator)
np.random.seed(42)
tf.random.set_seed(42)


In [None]:
# Implement ReLU and its variants
def relu_activation(x):
    """Rectified linear unit, f(x) = max(0, x).

    Negative entries of ``x`` are replaced by zero; non-negative entries
    pass through unchanged. Works element-wise on arrays.
    """
    zero_floor = 0
    rectified = np.maximum(zero_floor, x)
    return rectified

def relu_derivative(x):
    """ReLU derivative: f'(x) = 1 if x > 0, else 0.

    Args:
        x: Scalar, sequence, or NumPy array of pre-activation values.

    Returns:
        Float value(s) with 1.0 where ``x > 0`` and 0.0 elsewhere
        (the derivative at exactly 0 is taken to be 0).
    """
    # np.greater accepts plain Python scalars and lists as well as arrays;
    # the bare `x > 0` comparison has no `.astype` on a Python bool and
    # fails outright on a list.
    return np.greater(x, 0).astype(float)

def leaky_relu(x, alpha=0.01):
    """Leaky ReLU: f(x) = x if x > 0, else alpha * x.

    ``alpha`` is the slope applied to non-positive inputs.
    """
    is_positive = x > 0
    leaked = alpha * x
    return np.where(is_positive, x, leaked)

def leaky_relu_derivative(x, alpha=0.01):
    """Leaky ReLU derivative: f'(x) = 1 if x > 0, else alpha."""
    is_positive = x > 0
    return np.where(is_positive, 1.0, alpha)

def parametric_relu(x, alpha):
    """Parametric ReLU: f(x) = x if x > 0, else alpha * x.

    Same formula as Leaky ReLU, but ``alpha`` has no default because it is
    meant to be a learned parameter supplied by the caller.
    """
    scaled_negative = alpha * x
    keep_identity = x > 0
    return np.where(keep_identity, x, scaled_negative)

def parametric_relu_derivative(x, alpha):
    """PReLU derivative with respect to x: 1 if x > 0, else alpha."""
    slope_when_positive = 1.0
    return np.where(x > 0, slope_when_positive, alpha)

def elu_activation(x, alpha=1.0):
    """ELU: f(x) = x if x > 0, else alpha * (e^x - 1).

    Args:
        x: Scalar or NumPy array of pre-activation values.
        alpha: Scale of the negative saturation; the output approaches
            ``-alpha`` as ``x`` goes to minus infinity.

    Returns:
        ELU applied element-wise.
    """
    # The exponential branch only matters for x <= 0, so cap its argument at 0:
    # this avoids overflow for large positive x without an arbitrary clip bound.
    non_positive = np.minimum(x, 0)
    # expm1 keeps precision for tiny |x|, where exp(x) - 1 cancels to exactly 0.
    return np.where(x > 0, x, alpha * np.expm1(non_positive))

def elu_derivative(x, alpha=1.0):
    """ELU derivative: f'(x) = 1 if x > 0, else alpha * e^x.

    The exponent is clipped to [-500, 500] before exponentiation.
    """
    bounded = np.clip(x, -500, 500)
    negative_side_slope = alpha * np.exp(bounded)
    return np.where(x > 0, 1.0, negative_side_slope)

# Demonstrate the dying ReLU problem
def simulate_dying_relu():
    """Toy simulation of ReLU units that stop responding ("die").

    Builds a single layer of 100 units over 10 input features with
    negative-mean weights and biases, feeds 1000 random samples through it for
    10 "epochs", and after each forward pass counts the units whose ReLU
    output is zero for every sample. Units that receive a non-zero ReLU
    gradient for at least one sample get a small random weight perturbation;
    the rest are left untouched.

    Reseeds NumPy's global random generator with 42 on every call.

    Returns:
        list: one dead-unit count per epoch (10 entries).
    """
    np.random.seed(42)

    n_units, n_features, n_samples, n_epochs = 100, 10, 1000, 10

    # Parameters are drawn around negative means.
    # NOTE(review): the inputs below are zero-mean, so negative weights alone do
    # not push pre-activations negative; only the bias (mean -1) does. Confirm
    # the simulation actually yields dead units as intended.
    weights = np.random.normal(-2, 0.5, (n_units, n_features))
    biases = np.random.normal(-1, 0.2, n_units)

    # Sample inputs
    inputs = np.random.normal(0, 1, (n_samples, n_features))

    dead_per_epoch = []
    for _ in range(n_epochs):
        # Forward pass
        pre_activation = inputs @ weights.T + biases
        outputs = relu_activation(pre_activation)

        # A unit counts as dead when it outputs 0 for every sample
        silent_units = np.all(outputs == 0, axis=0)
        dead_per_epoch.append(np.sum(silent_units))

        # Units with zero gradient on every sample receive no update
        receives_gradient = np.any(relu_derivative(pre_activation) > 0, axis=0)
        n_alive = np.sum(receives_gradient)
        weights[receives_gradient] += np.random.normal(0, 0.01, (n_alive, n_features))

    return dead_per_epoch

# Run dying ReLU simulation
dead_neuron_counts = simulate_dying_relu()

# The header previously contained a mis-decoded UTF-8 emoji; restored to U+1F480.
print("💀 Dying ReLU Simulation:")
print("=" * 40)
# One line per simulated epoch; the simulation uses 100 units.
for epoch, count in enumerate(dead_neuron_counts):
    print(f"Epoch {epoch}: {count}/100 neurons are dead")

# Visualize ReLU variants
x = np.linspace(-3, 3, 1000)

# Evaluate every variant and its derivative on the shared grid, one variant per line
relu_vals, relu_deriv = relu_activation(x), relu_derivative(x)
leaky_vals, leaky_deriv = leaky_relu(x, alpha=0.1), leaky_relu_derivative(x, alpha=0.1)
prelu_vals, prelu_deriv = parametric_relu(x, alpha=0.2), parametric_relu_derivative(x, alpha=0.2)
elu_vals, elu_deriv = elu_activation(x, alpha=1.0), elu_derivative(x, alpha=1.0)

# Create visualization: a 2x2 grid with functions, derivatives, the dead-unit
# simulation, and a close-up of the negative input range.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('ReLU Variants: Functions and Derivatives', fontsize=16, fontweight='bold')

# Plot activation functions
# (legend labels previously showed a mis-decoded Greek alpha; restored to "α")
ax1.plot(x, relu_vals, label='ReLU', linewidth=2, color='red')
ax1.plot(x, leaky_vals, label='Leaky ReLU (α=0.1)', linewidth=2, color='blue')
ax1.plot(x, prelu_vals, label='PReLU (α=0.2)', linewidth=2, color='green')
ax1.plot(x, elu_vals, label='ELU (α=1.0)', linewidth=2, color='purple')
ax1.set_title('Activation Functions')
ax1.set_xlabel('Input (x)')
ax1.set_ylabel('Output f(x)')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.axhline(y=0, color='k', linestyle='-', alpha=0.3)
ax1.axvline(x=0, color='k', linestyle='-', alpha=0.3)

# Plot derivatives
ax2.plot(x, relu_deriv, label="ReLU f'(x)", linewidth=2, color='red')
ax2.plot(x, leaky_deriv, label="Leaky ReLU f'(x)", linewidth=2, color='blue')
ax2.plot(x, prelu_deriv, label="PReLU f'(x)", linewidth=2, color='green')
ax2.plot(x, elu_deriv, label="ELU f'(x)", linewidth=2, color='purple')
ax2.set_title('Derivatives')
ax2.set_xlabel('Input (x)')
ax2.set_ylabel("Derivative f'(x)")
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.axhline(y=0, color='k', linestyle='-', alpha=0.3)
ax2.axvline(x=0, color='k', linestyle='-', alpha=0.3)

# Plot dying ReLU progression (one point per simulated epoch)
ax3.plot(range(len(dead_neuron_counts)), dead_neuron_counts, 'ro-', linewidth=2)
ax3.set_title('Dying ReLU Problem Progression')
ax3.set_xlabel('Training Epoch')
ax3.set_ylabel('Number of Dead Neurons')
ax3.grid(True, alpha=0.3)

# Compare negative region behavior
x_neg = np.linspace(-2, 0, 100)
ax4.plot(x_neg, relu_activation(x_neg), label='ReLU (dies)', linewidth=3, color='red')
ax4.plot(x_neg, leaky_relu(x_neg, 0.1), label='Leaky ReLU (survives)', linewidth=3, color='blue')
ax4.plot(x_neg, elu_activation(x_neg), label='ELU (smooth)', linewidth=3, color='purple')
ax4.set_title('Negative Input Behavior')
ax4.set_xlabel('Negative Input')
ax4.set_ylabel('Output')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary of the plots above. The strings previously contained mis-decoded
# UTF-8 (emoji, bullets and an arrow); the intended characters are restored.
print("\n📊 Key Insights:")
print("• ReLU has zero gradient for negative inputs → neurons can die")
print("• Leaky ReLU provides small gradient for negative inputs")
print("• PReLU learns the optimal slope for negative region")
print("• ELU provides smooth transition and non-zero gradients")


In [None]:
# Create a challenging dataset that might cause dying ReLU:
# 2000 samples, 20 features (15 informative + 5 redundant), binary labels.
X, y = make_classification(
    n_samples=2000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    random_state=42,
    flip_y=0.1  # Add some noise
)

# Split and scale (80/20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# StandardScaler lives in sklearn.preprocessing and must be imported in the
# notebook's import cell; without that import this line raises NameError.
scaler = StandardScaler()
# Fit on the training split only, then reuse the same statistics for the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Custom PReLU layer
class PReLU(keras.layers.Layer):
    """Parametric ReLU with one learnable negative-side slope per feature.

    Outputs ``x`` where ``x >= 0`` and ``alpha * x`` elsewhere. ``alpha`` is a
    trainable weight whose length equals the size of the input's last axis.

    Args:
        alpha_initializer: Initializer for ``alpha`` (anything accepted by
            ``add_weight``). With the default ``'zeros'`` the layer initially
            outputs 0 for negative inputs, i.e. it starts out as a plain ReLU.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, alpha_initializer='zeros', **kwargs):
        super().__init__(**kwargs)
        self.alpha_initializer = alpha_initializer

    def build(self, input_shape):
        """Create the per-feature ``alpha`` weight from the input's last dimension."""
        self.alpha = self.add_weight(
            shape=(input_shape[-1],),
            initializer=self.alpha_initializer,
            trainable=True,
            name='alpha'
        )
        super().build(input_shape)

    def call(self, x):
        """Apply the activation element-wise; ``alpha`` broadcasts over the last axis."""
        return tf.where(x >= 0, x, self.alpha * x)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and cloned.

        Layers that take extra ``__init__`` arguments need this override for
        model serialization to round-trip.
        """
        config = super().get_config()
        # Normalise through `get` so strings, dicts and Initializer instances
        # all serialize to something `__init__` accepts again.
        config['alpha_initializer'] = keras.initializers.serialize(
            keras.initializers.get(self.alpha_initializer)
        )
        return config

# Create models with different ReLU variants
def _make_activation_layer(activation_type):
    """Return a fresh activation layer for ``activation_type``; raise ValueError if unknown."""
    factories = {
        'relu': keras.layers.ReLU,
        'leaky_relu': lambda: keras.layers.LeakyReLU(alpha=0.1),
        'prelu': PReLU,
        'elu': keras.layers.ELU,
    }
    if activation_type not in factories:
        raise ValueError(
            f"Unknown activation_type {activation_type!r}; "
            f"expected one of {sorted(factories)}"
        )
    return factories[activation_type]()


def create_relu_model(activation_type, input_dim):
    """Build and compile a small binary classifier using one ReLU variant.

    Architecture: Dense(64) -> activation -> Dense(32) -> activation ->
    Dense(1, sigmoid). The activation is its own layer, so ``model.layers[1]``
    is the first hidden activation.

    Args:
        activation_type: One of ``'relu'``, ``'leaky_relu'``, ``'prelu'``, ``'elu'``.
        input_dim: Number of input features.

    Returns:
        A ``keras.Sequential`` model compiled with Adam, binary cross-entropy
        loss and an accuracy metric.

    Raises:
        ValueError: If ``activation_type`` is not recognised. An unknown value
            used to build a network with no hidden activations at all.
    """
    model = keras.Sequential([
        keras.layers.Dense(64, input_dim=input_dim),
        _make_activation_layer(activation_type),
        keras.layers.Dense(32),
        _make_activation_layer(activation_type),
        keras.layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

# Train models with different ReLU variants
relu_variants = ['relu', 'leaky_relu', 'prelu', 'elu']
relu_results = {}


# Monitor dead neurons during training
class DeadNeuronCallback(keras.callbacks.Callback):
    """Record, after every epoch, how many units of one layer are dead.

    A unit is counted as dead when its output is exactly zero for every probe
    sample. Counts accumulate in ``self.dead_neurons`` (one entry per epoch).

    Args:
        probe_inputs: Samples fed through the model to inspect activations.
            Defaults to the first 100 rows of the notebook-level
            ``X_train_scaled``.
        layer_index: Index into ``model.layers`` of the layer to inspect.
            Index 1 is the first activation layer of models built by
            ``create_relu_model``.
    """

    def __init__(self, probe_inputs=None, layer_index=1):
        super().__init__()
        self.probe_inputs = X_train_scaled[:100] if probe_inputs is None else probe_inputs
        self.layer_index = layer_index
        self.dead_neurons = []
        self._probe_model = None

    def on_train_begin(self, logs=None):
        # Drop any probe built for a previous fit so it always matches self.model.
        self._probe_model = None

    def on_epoch_end(self, epoch, logs=None):
        # Build the sub-model once per fit instead of once per epoch; it shares
        # layers with the trained model, so it always sees the current weights.
        if self._probe_model is None:
            self._probe_model = keras.Model(
                inputs=self.model.input,
                outputs=self.model.layers[self.layer_index].output
            )
        activations = self._probe_model.predict(self.probe_inputs, verbose=0)

        # Count neurons that are always zero. Strict equality matters: leaky,
        # parametric and ELU units emit negative values for negative inputs and
        # must not be reported as dead.
        dead_count = np.sum(np.all(activations == 0, axis=0))
        self.dead_neurons.append(dead_count)


# The header previously contained a mis-decoded UTF-8 emoji; restored to U+1F52C.
print("🔬 Comparing ReLU Variants:")
print("=" * 50)

for variant in relu_variants:
    print(f"Training with {variant}...")

    model = create_relu_model(variant, X_train_scaled.shape[1])

    dead_callback = DeadNeuronCallback(X_train_scaled[:100])

    # NOTE(review): the test split is also passed as validation data; it is only
    # monitored here (no early stopping), but consider a separate validation split.
    history = model.fit(
        X_train_scaled, y_train,
        validation_data=(X_test_scaled, y_test),
        epochs=50,
        batch_size=32,
        callbacks=[dead_callback],
        verbose=0
    )

    # Evaluate
    train_loss, train_acc = model.evaluate(X_train_scaled, y_train, verbose=0)
    test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)

    relu_results[variant] = {
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'dead_neurons': dead_callback.dead_neurons,
        'history': history
    }

    print(f"  {variant:12} | Test Accuracy: {test_acc:.4f} | Final Dead Neurons: {dead_callback.dead_neurons[-1]}")

# Visualize results: validation curves, dead-unit counts, final accuracy, final losses
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('ReLU Variants Performance Comparison', fontsize=16, fontweight='bold')

# Per-variant curves: validation accuracy (top-left) and dead neurons (top-right)
for variant, results in relu_results.items():
    history = results['history']
    dead_neurons = results['dead_neurons']
    ax1.plot(history.history['val_accuracy'], label=f'{variant}', alpha=0.8)
    ax2.plot(dead_neurons, label=f'{variant}', alpha=0.8)

# Both curve panels share the same x-axis label, legend and grid settings
for panel, panel_title, panel_ylabel in (
    (ax1, 'Validation Accuracy During Training', 'Validation Accuracy'),
    (ax2, 'Dead Neurons During Training', 'Number of Dead Neurons'),
):
    panel.set_title(panel_title)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(panel_ylabel)
    panel.legend()
    panel.grid(True, alpha=0.3)

# Final accuracy comparison
variants = list(relu_results)
test_accs = [relu_results[name]['test_accuracy'] for name in variants]
colors = ['red', 'blue', 'green', 'purple']

bars = ax3.bar(variants, test_accs, color=colors, alpha=0.7)
ax3.set_title('Final Test Accuracy')
ax3.set_ylabel('Test Accuracy')
ax3.set_ylim([0, 1])

# Write each accuracy just above its bar, centred horizontally
for bar, acc in zip(bars, test_accs):
    height = bar.get_height()
    ax3.text(
        bar.get_x() + bar.get_width()/2.,
        height + 0.01,
        f'{acc:.4f}',
        ha='center',
        va='bottom',
    )

# Loss comparison: last-epoch training vs. validation loss, side by side
train_losses = [relu_results[name]['history'].history['loss'][-1] for name in variants]
val_losses = [relu_results[name]['history'].history['val_loss'][-1] for name in variants]

x_pos = np.arange(len(variants))
width = 0.35

ax4.bar(x_pos - width/2, train_losses, width, label='Train Loss', alpha=0.7)
ax4.bar(x_pos + width/2, val_losses, width, label='Val Loss', alpha=0.7)
ax4.set_title('Final Loss Comparison')
ax4.set_xlabel('ReLU Variant')
ax4.set_ylabel('Loss')
ax4.set_xticks(x_pos)
ax4.set_xticklabels(variants)
ax4.legend()

plt.tight_layout()
plt.show()

# Closing summary. These lines are fixed text, not derived from the results
# above. The strings previously contained mis-decoded UTF-8 (emoji and
# bullets); the intended characters are restored.
print("\n🎯 Key Findings:")
print("• Standard ReLU may suffer from dead neurons")
print("• Leaky ReLU prevents neuron death with minimal computation overhead")
print("• PReLU learns optimal negative slope but adds parameters")
print("• ELU provides smooth gradients but requires more computation")
print("\n💡 Practical Recommendation:")
print("Leaky ReLU offers the best balance of performance and simplicity!")
