# Activation Function Exploration

Visual comparison of activation functions: **ReLU, LeakyReLU, Sigmoid, Tanh, Softmax**.

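For the element-wise activations (ReLU, LeakyReLU, Sigmoid, Tanh) we plot:
1. The function itself
2. Its derivative
3. Impact on gradient flow (Sigmoid, Tanh and ReLU only)

Softmax is covered separately in the temperature-scaling section.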

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path

from src.core.activations import ReLU, LeakyReLU, Sigmoid, Tanh, Softmax

# Every figure produced by this notebook is written below ../outputs/plots.
output_dir = Path('..', 'outputs', 'plots')
# Create the directory (and any missing parents); an existing one is fine.
output_dir.mkdir(exist_ok=True, parents=True)

## 1. Compute Activations and Derivatives

In [None]:
# Evaluation grid: 500 evenly spaced pre-activations in [-5, 5] as a column.
Z = np.linspace(-5, 5, 500)[:, np.newaxis]

# Display name -> activation instance; the insertion order is the plot order.
activations = dict([
    ('ReLU', ReLU()),
    ('LeakyReLU(0.1)', LeakyReLU(alpha=0.1)),
    ('Sigmoid', Sigmoid()),
    ('Tanh', Tanh()),
])

# For each activation keep the forward output ('A') and the array returned by
# backward for an all-ones upstream gradient ('dZ'), which is plotted later
# as the derivative.
results = {}
for label, fn in activations.items():
    # forward receives a copy, so the shared grid Z cannot be altered by it.
    activated = fn.forward(Z.copy())
    local_grad = fn.backward(np.ones_like(Z))
    results[label] = {'A': activated, 'dZ': local_grad}
    print(
        f'{label:20s} | output range: [{activated.min():.3f}, {activated.max():.3f}]'
        f' | grad range: [{local_grad.min():.3f}, {local_grad.max():.3f}]'
    )

## 2. Plot Activation Functions

In [None]:
# One panel per activation: the solid curve is the stored output ('A'), the
# dashed curve is the stored gradient ('dZ').
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
z_values = Z.ravel()

for panel, label in zip(axes.flat, results):
    curves = results[label]
    panel.plot(z_values, curves['A'].ravel(), label=f'{label}(z)', linewidth=2)
    panel.plot(z_values, curves['dZ'].ravel(), '--', label=f"{label}'(z)", linewidth=2)
    # Thin reference lines through the origin.
    for draw_reference_line in (panel.axhline, panel.axvline):
        draw_reference_line(0, color='k', linewidth=0.5)
    panel.set_title(label, fontsize=14)
    panel.set_xlabel('z')
    panel.grid(True, alpha=0.3)
    panel.legend()

fig.suptitle('Activation Functions & Their Derivatives', fontsize=16)
fig.tight_layout()
figure_path = output_dir / 'activation_functions.png'
fig.savefig(figure_path, dpi=150)
# Close the figure explicitly so it is released once written to disk.
plt.close(fig)
print('Saved activation_functions.png')

## 3. Vanishing Gradient Problem

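Sigmoid and Tanh saturate for large |z|, so their derivatives (and the gradients passed back through them) shrink towards 0.
ReLU does not saturate for z > 0, but its gradient is 0 for z < 0, so a unit stuck in that region "dies".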

In [None]:
# Push an all-ones upstream gradient through `n_layers` stacked activations
# and record its norm after every layer.
n_layers = 10
layer_shape = (1, 64)
candidates = {'Sigmoid': Sigmoid, 'Tanh': Tanh, 'ReLU': ReLU}
gradient_norms = {label: [] for label in candidates}

for label, make_activation in candidates.items():
    # Re-seeding per activation gives every candidate the same random
    # pre-activations, so the curves are directly comparable.
    sampler = np.random.default_rng(42)
    upstream = np.ones(layer_shape)

    for _depth in range(n_layers):
        layer = make_activation()
        # forward is called on fresh random input before backward; its return
        # value is not needed here.
        layer.forward(sampler.standard_normal(layer_shape))
        upstream = layer.backward(upstream)
        gradient_norms[label].append(np.linalg.norm(upstream))

# Plot the recorded gradient norm against layer depth, one line per activation.
fig, ax = plt.subplots(figsize=(10, 5))
for name, norms in gradient_norms.items():
    ax.plot(range(1, n_layers + 1), norms, 'o-', label=name, linewidth=2)
ax.set_xlabel('Layer depth')
ax.set_ylabel('Gradient norm')
# Take the depth from n_layers so the title cannot drift from the simulation.
ax.set_title(f'Gradient Flow Through Activations ({n_layers} layers)')
ax.legend()
# Log scale makes multiplicative decay across layers readable.
# NOTE(review): a log axis cannot place a norm of exactly 0; if any curve
# reaches 0 those points will not be drawn at their true value - confirm
# that this is acceptable for the figure.
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(output_dir / 'gradient_flow.png', dpi=150)
plt.close(fig)
print('Saved gradient_flow.png')

## 4. Softmax Temperature

Softmax with temperature T: `softmax(z/T)` — higher T = more uniform.

In [None]:
# Grouped bar chart: softmax probabilities of one logit vector at several
# temperatures (the logits are divided by T before the softmax).
Z_sm = np.array([[2.0, 1.0, 0.1, -1.0, 3.0]])
temperatures = [0.5, 1.0, 2.0, 5.0]
sm = Softmax()

fig, ax = plt.subplots(figsize=(10, 5))
n_classes = Z_sm.shape[1]
x_ = np.arange(n_classes)
width = 0.18  # bar width; one bar per temperature inside each class group

for i, T in enumerate(temperatures):
    probs = sm.forward(Z_sm / T)
    # Shift each temperature's bars right by one bar width.
    ax.bar(x_ + i * width, probs.ravel(), width, label=f'T={T}')

ax.set_xlabel('Class')
ax.set_ylabel('Probability')
ax.set_title('Softmax Temperature Scaling')
ax.legend()
# Centre each tick under its group of bars. The offset and the label count
# are derived from the data so they stay correct if `temperatures` or `Z_sm`
# change (a fixed label count would no longer match the number of ticks).
ax.set_xticks(x_ + (len(temperatures) - 1) / 2 * width)
ax.set_xticklabels([f'Class {k}' for k in range(n_classes)])
fig.tight_layout()
fig.savefig(output_dir / 'softmax_temperature.png', dpi=150)
plt.close(fig)
print('Saved softmax_temperature.png')

## Summary

| Activation | Pros | Cons |
|-----------|------|------|
| **ReLU** | Fast, no saturation for z>0 | Dead neurons (z<0 gives 0 grad) |
| **LeakyReLU** | No dead neurons | Extra hyperparameter α |
| **Sigmoid** | Output ∈ (0,1), interpretable | Vanishing gradients, not zero-centred |
| **Tanh** | Zero-centred, output ∈ (-1,1) | Vanishing gradients |
| **Softmax** | Proper probability distribution | Typically used only at the output layer |