# Implementation: The Impact of Initialization

**Goal**: Visualize how activations change with different weight initializations.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 1. Setup
# Width of the hidden layer used in the experiment.
hidden_size = 500
# Batch of standard-normal inputs: 1000 samples (rows) x 500 features (columns).
inputs = np.random.standard_normal(size=(1000, 500))

def test_init(scale_factor, x=None, n_hidden=None):
    """Run a batch through one tanh layer with randomly initialized weights.

    The weights are drawn from a standard normal distribution and multiplied
    by ``scale_factor``, so ``scale_factor`` is the standard deviation of the
    weight distribution.

    Args:
        scale_factor: Multiplier applied to the standard-normal weights.
        x: 2-D array-like of shape (n_samples, n_features). When omitted, the
            module-level ``inputs`` batch is used.
        n_hidden: Number of units in the layer. When omitted, the module-level
            ``hidden_size`` is used.

    Returns:
        Array of shape (n_samples, n_hidden) containing ``tanh(x @ W)``.
    """
    if x is None:
        x = inputs
    if n_hidden is None:
        n_hidden = hidden_size
    x = np.asarray(x)

    # Size the weight matrix from the data and the layer width instead of
    # repeating literal dimensions, so it always matches the batch.
    W = np.random.randn(x.shape[1], n_hidden) * scale_factor
    # Forward pass (linear part only, no bias).
    z = np.dot(x, W)
    # Tanh squashes the pre-activations into (-1, 1).
    return np.tanh(z)


# The name matches pytest's ``test_*`` discovery pattern, but this is an
# experiment helper, not a test case; opt out of collection.
test_init.__test__ = False

# 2. Compare
# Fan-in of the layer, read from the data so the Xavier scale stays correct
# if the feature count in the setup is changed.
fan_in = inputs.shape[1]

# A. Too Small (0.01)
a_small = test_init(0.01)
# B. Too Large (1.0)
a_large = test_init(1.0)
# C. Xavier: std = sqrt(1 / fan_in); for fan_in = 500 this is about 0.045
a_xavier = test_init(np.sqrt(1 / fan_in))

# 3. Plot Histograms of Activations
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

# One (activations, title) pair per subplot, left to right.
panels = (
    (a_small, "Small Weights: All outputs near 0"),
    (a_large, "Large Weights: Output saturated at -1 and 1"),
    (a_xavier, "Xavier: Nice Normal Distribution"),
)

# Same bins and range on every panel so the three shapes are directly
# comparable; (-1, 1) is the output range of tanh.
for axis, (activations, title) in zip(ax, panels):
    axis.hist(activations.ravel(), bins=100, range=(-1, 1))
    axis.set_title(title)

plt.show()

## Conclusion
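*   **Small**: Pre-activations stay close to 0, where tanh is approximately linear, so the activations cluster near 0. The layer behaves almost like a linear map and the network loses expressive power.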
*   **Large**: Tanh saturates at -1 and 1, where its derivative, 1 - tanh²(z), is close to 0, so gradients become nearly 0 (Vanishing Gradient).
*   **Xavier**: Activations are well spread across (-1, 1) without saturating, so gradients will flow nicely.