# Imports

In [None]:
import numpy as np
import kagglehub

# Download latest version

Collecting kagglehub
  Downloading kagglehub-0.4.1-py3-none-any.whl.metadata (38 kB)
Collecting kagglesdk<1.0,>=0.1.14 (from kagglehub)
  Downloading kagglesdk-0.1.15-py3-none-any.whl.metadata (13 kB)
Downloading kagglehub-0.4.1-py3-none-any.whl (69 kB)
Downloading kagglesdk-0.1.15-py3-none-any.whl (160 kB)
Installing collected packages: kagglesdk, kagglehub
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [kagglehub]
[1A[2KSuccessfully installed kagglehub-0.4.1 kagglesdk-0.1.15


  from .autonotebook import tqdm as notebook_tqdm


Downloading to /Users/king/.cache/kagglehub/datasets/jeffheaton/glasses-or-no-glasses/2.archive...


100%|██████████| 6.11G/6.11G [03:41<00:00, 29.7MB/s]

Extracting files...





# Random Noise Sampler

- Implement a function to generate random noise vectors (latent codes) $z$, which serve as input to the Generator.

In [27]:
def random_noise_sampler(batch_size: int, latent_dim: int, mode: str = 'gaussian', seed: int = None) -> np.ndarray:
    """
    Draw a batch of latent noise vectors for the generator.

    Args:
        batch_size: Number of noise vectors to draw.
        latent_dim: Length of each noise vector.
        mode: 'gaussian' samples from a normal distribution with mean 0 and
            standard deviation 1; 'uniform' samples from [-1, 1).
        seed: If not None, passed to np.random.seed before sampling. This
            reseeds NumPy's global generator, so later np.random calls are
            affected as well.

    Returns:
        float32 array of shape (batch_size, latent_dim).

    Raises:
        ValueError: If mode is neither 'gaussian' nor 'uniform'.
    """
    # Seeding happens before the mode check, so it takes effect even when
    # the requested mode turns out to be invalid.
    if seed is not None:
        np.random.seed(seed)

    shape = (batch_size, latent_dim)

    if mode == 'uniform':
        noise = np.random.uniform(-1.0, 1.0, size=shape)
    elif mode == 'gaussian':
        noise = np.random.normal(0.0, 1.0, size=shape)
    else:
        raise ValueError("mode must be 'gaussian' or 'uniform'")

    return noise.astype(np.float32)

# Generator Forward Pass (Linear)

- Implement the forward pass of a Generator network using only linear (fully-connected) layers and activation functions.
- Activations:
    - The first $N-1$ layers use ReLU activation: $h_{\text{new}} = \mathrm{ReLU}(h_{\text{old}} W + b)$.
    - The final layer uses Tanh activation: $y = \tanh(h_{\text{old}} W + b)$.

- Inputs:
    - `z`: Latent input vectors of shape `(batch_size, input_dim)`.
    - `weights`: A list of weight matrices $[W_0, W_1, \dots, W_{N-1}]$.
        Each $W_i$ has shape `(in_dim, out_dim)`.
    - `biases`: A list of bias vectors $[b_0, b_1, \dots, b_{N-1}]$.
        Each $b_i$ has shape `(out_dim,)`.



In [None]:
def generator_forward(z: np.ndarray, weights: list[np.ndarray], biases: list[np.ndarray]) -> tuple[np.ndarray, list[np.ndarray]]:
    """
    Run the generator's fully-connected forward pass.

    Every layer computes ``h @ W + b``. The last layer is followed by Tanh,
    all earlier layers by ReLU.

    Args:
        z: Input latent vectors (batch_size, latent_dim).
        weights: List of weight matrices, one per layer.
        biases: List of bias vectors, indexed in step with ``weights``.

    Returns:
        output: Activation of the final layer (``z`` itself when ``weights``
            is empty).
        activations: List [z, h1, ..., output] holding the input followed by
            the post-activation value of every layer.
    """
    trace = [z]
    hidden = z
    last = len(weights) - 1

    for idx, W in enumerate(weights):
        pre_act = hidden @ W + biases[idx]
        # Tanh on the output layer, ReLU on every layer before it.
        hidden = np.tanh(pre_act) if idx == last else np.maximum(0, pre_act)
        trace.append(hidden)

    return hidden, trace

# Discriminator Forward Pass (Linear)

- The discriminator consists of $N$ layers (typically small, e.g., 2 layers for simple problems).

    The first $N-1$ layers use LeakyReLU activation with slope $0.2$: $h_{\text{new}} = \mathrm{LeakyReLU}(h_{\text{old}} W + b)$.
    The final layer uses Sigmoid activation: $y = \sigma(h_{\text{old}} W + b)$.


In [29]:
def discriminator_forward(x: np.ndarray, weights: list[np.ndarray], biases: list[np.ndarray]) -> tuple[np.ndarray, list[np.ndarray]]:
    """
    Computes the forward pass of the discriminator.

    Every layer computes ``h @ W + b``. The last layer is followed by a
    sigmoid, all earlier layers by LeakyReLU with negative slope 0.2.

    Args:
        x: Input data (batch_size, input_dim).
        weights: List of weight matrices, one per layer.
        biases: List of bias vectors, indexed in step with ``weights``.

    Returns:
        output: Sigmoid output of the final layer, i.e. the probability of
            "real"; (batch_size, 1) when the last weight matrix has a single
            output column.
        activations: List of activations [x, h1, ..., output].
    """
    activations = [x]
    h = x
    slope = 0.2
    last = len(weights) - 1

    for i, W in enumerate(weights):
        h = h @ W + biases[i]

        if i < last:
            # Leaky ReLU
            h = np.where(h > 0, h, h * slope)
        else:
            # Numerically stable sigmoid. exp() is only ever evaluated on
            # non-positive values, so it cannot overflow for large |h|
            # (the naive 1 / (1 + exp(-h)) overflows for very negative h).
            e = np.exp(-np.abs(h))
            h = np.where(h >= 0, 1 / (1 + e), e / (1 + e))

        activations.append(h)

    return h, activations

# Binary Cross Entropy (BCE) Loss

The Binary Cross Entropy (BCE) loss function is widely used in Generative Adversarial Networks (GANs) for both the Generator and Discriminator.

**Definition:**

$$
\mathcal{L} = - \frac{1}{N} \sum_{i=1}^{N} \Big[ y_i \log(\hat{y}_i + \epsilon) + (1 - y_i) \log(1 - \hat{y}_i + \epsilon) \Big]
$$

**Where:**

- \(N\) is the batch size  
- \(y_i\) is the true label, typically 0 or 1  
- \(\hat{y}_i\) is the predicted probability from the discriminator, in the range [0, 1]  
- \(\epsilon = 1 \times 10^{-8}\) ensures numerical stability to avoid \(\log(0)\)  

**Notes:**

- The loss is averaged over the batch.  
- For GANs, the discriminator typically outputs \(\hat{y}_i\) through a sigmoid activation to represent probabilities.

In [30]:
def bce_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float:
    """
    Mean Binary Cross Entropy between predictions and labels.

    A small constant (1e-8) is added inside both logarithms so that
    predictions of exactly 0 or 1 do not produce log(0).

    Args:
        y_pred: Predicted probabilities (batch_size, 1).
        y_true: True labels (batch_size, 1).

    Returns:
        loss: The loss averaged over all elements, as a Python float.
    """
    eps = 1e-8
    positive_term = y_true * np.log(y_pred + eps)
    negative_term = (1 - y_true) * np.log(1 - y_pred + eps)
    log_likelihood = positive_term + negative_term
    return float(-np.mean(log_likelihood))
    

## Binary Cross Entropy Loss (Backward)

Implement the backward pass (gradient) of the Binary Cross Entropy loss function with respect to the predicted probabilities `y_pred`.

---

### Formula

Given the loss:

$$
\mathcal{L} = -\frac{1}{N} \sum_{i=1}^{N}
\left[
y_i \log(\hat{y}_i + \epsilon)
+ (1 - y_i)\log(1 - \hat{y}_i + \epsilon)
\right]
$$

The gradient with respect to $\hat{y}_i$ is:

$$
\frac{\partial \mathcal{L}}{\partial \hat{y}_i}
=
\frac{1}{N}
\left(
\frac{1 - y_i}{1 - \hat{y}_i + \epsilon}
-
\frac{y_i}{\hat{y}_i + \epsilon}
\right)
$$

---

### Where

- $N$ is the batch size  
- $\epsilon = 1 \times 10^{-8}$ for numerical stability  

---

### Inputs

- `y_pred`: Predicted probabilities of shape `(batch_size, 1)`
- `y_true`: True labels of shape `(batch_size, 1)`

---

### Output

- `grad`: The gradient array of shape `(batch_size, 1)`, representing

$$
\frac{\partial \mathcal{L}}{\partial \hat{y}}
$$

In [31]:
def bce_loss_backward(y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
    """
    Gradient of the mean BCE loss with respect to the predictions.

    Implements (1/N) * ((1 - y) / (1 - y_hat + eps) - y / (y_hat + eps))
    with eps = 1e-8 and N taken as ``len(y_pred)``.

    Args:
        y_pred: Predicted probabilities (batch_size, 1).
        y_true: True labels (batch_size, 1).

    Returns:
        grad: Gradient dL/dy_pred (batch_size, 1).
    """
    eps = 1e-8
    batch = len(y_pred)

    # Contribution of the (1 - y) * log(1 - y_hat) term.
    from_negatives = (1 - y_true) / (1 - y_pred + eps)
    # Contribution of the y * log(y_hat) term.
    from_positives = y_true / (y_pred + eps)

    return (1 / batch) * (from_negatives - from_positives)

## Generator Backward Pass (Linear)

Implement the backward pass (backpropagation) for the Generator network.

---

### Inputs

- `d_output`: The gradient of the loss with respect to the generator's output,  
  $$
  \frac{\partial \mathcal{L}}{\partial h_N}
  $$  
  Shape `(batch_size, out_dim)`.

- `activations`: A list of activations  
  $$
  [h_0, h_1, \dots, h_N]
  $$  
  computed during the forward pass.

- `weights`: List of weight matrices  
  $$
  [W_0, \dots, W_{N-1}]
  $$

- `biases`: List of bias vectors  
  $$
  [b_0, \dots, b_{N-1}]
  $$

---

### Architecture Reminder

- Layers $0$ to $N-2$: ReLU activation  
- Layer $N-1$ (last): Tanh activation

---

### Derivatives

- **Tanh Derivative**:  
  If  
  $$
  y = \tanh(x)
  $$  
  then  
  $$
  y' = 1 - y^2
  $$

- **ReLU Derivative**:  
  If  
  $$
  y = \mathrm{ReLU}(x)
  $$  
  then  
  $$
  y' =
  \begin{cases}
  1 & \text{if } x > 0 \\
  0 & \text{otherwise}
  \end{cases}
  $$
  (Strictly, if $y > 0$)

---

### Algorithm

Iterate backwards from the last layer $l = N-1$ down to $0$.

#### Compute delta

- **Last layer**:
  $$
  \delta_N = d\_output \odot (1 - h_N^2)
  $$

- **Hidden layers**:
  $$
  \delta_{l+1} = (\delta_{l+2} W_{l+1}^T) \odot (h_{l+1} > 0)
  $$

---

### Compute Gradients

- **Weight gradients**:
  $$
  dW_l = h_l^T \delta_{l+1}
  $$

- **Bias gradients**:
  $$
  db_l = \sum \delta_{l+1}
  $$
  (sum over the batch dimension)

---

### Output

Returns a tuple `(grads_w, grads_b)`:

- `grads_w`: List of weight gradients, same shape as `weights`
- `grads_b`: List of bias gradients, same shape as `biases`


In [None]:
import numpy as np

def generator_backward(d_output: np.ndarray, activations: list[np.ndarray], weights: list[np.ndarray], biases: list[np.ndarray]) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """
    Backpropagate through the generator (ReLU hidden layers, Tanh output).

    Args:
        d_output: Gradient dL/d(output), shape (batch_size, output_dim)
        activations: List of activations [h0, h1, ..., output], where h0 is input z
        weights: List of weight matrices, each shape (in_dim, out_dim)
        biases: List of bias vectors, each shape (out_dim,). Not read here;
            it is part of the signature only.

    Returns:
        grads_w: List of weight gradients, ordered from first to last layer
        grads_b: List of bias gradients, ordered from first to last layer
    """
    num_layers = len(weights)
    grads_w = [None] * num_layers
    grads_b = [None] * num_layers

    # Gradient through the output Tanh: d/dx tanh(x) = 1 - tanh(x)^2,
    # expressed with the stored output activation.
    output = activations[-1]
    delta = d_output * (1 - output ** 2)

    # Walk from the last layer to the first, writing each gradient straight
    # into its final slot so no reversal is needed afterwards.
    for layer in range(num_layers - 1, -1, -1):
        layer_input = activations[layer]
        grads_w[layer] = layer_input.T @ delta      # (in_dim, out_dim)
        grads_b[layer] = np.sum(delta, axis=0)      # summed over the batch

        if layer > 0:
            # ReLU derivative, taken from the sign of the stored activation.
            relu_mask = layer_input > 0
            delta = (delta @ weights[layer].T) * relu_mask

    return grads_w, grads_b

# Discriminator Backward Pass (Linear)

## Inputs
- **d_output**: Gradient of the loss with respect to the discriminator output  
  Shape: `(batch_size, 1)`
- **activations**: List of activations `[h₀, h₁, ..., h_N]`
- **weights**: List of weight matrices
- **biases**: List of bias vectors

---

## Architecture
- Layers `0` to `N−2`: **LeakyReLU** activation (slope = 0.2)
- Layer `N−1` (last): **Sigmoid** activation

---

## Derivatives

### Sigmoid
If  
\[
y = \sigma(x)
\]

Then  
\[
y' = y(1 - y)
\]

---

### LeakyReLU
If  
\[
y = \text{LeakyReLU}(x)
\]

Then  
\[
y' =
\begin{cases}
1, & y \ge 0 \\
0.2, & y < 0
\end{cases}
\]

---

## Backpropagation Algorithm

Iterate backwards from layer `l = N−1` to `0`.

### 1. Delta Computation

- **Last layer**:
\[
\delta_N = d\_output \odot h_N (1 - h_N)
\]

- **Hidden layers**:
\[
\delta_{l+1} = (\delta_{l+2} W_{l+1}^T) \odot \text{LeakyReLU}'(h_{l+1})
\]

---

### 2. Gradient Computation

- **Weights**:
\[
dW_l = h_l^T \delta_{l+1}
\]

- **Biases**:
\[
db_l = \sum \delta_{l+1}
\]

---

## Output
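Returns a tuple `(grads_w, grads_b)`:

- `grads_w`: List of weight gradients, same shapes as `weights`
- `grads_b`: List of bias gradients, same shapes as `biases`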

In [None]:
def discriminator_backward(d_output: np.ndarray, activations: list[np.ndarray], weights: list[np.ndarray], biases: list[np.ndarray]) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """
    Backpropagate through the discriminator (LeakyReLU hidden layers,
    Sigmoid output).

    Args:
        d_output: Gradient dL/d(output) (batch_size, 1).
        activations: List of [x, h1, ..., output].
        weights: List of weights.
        biases: List of biases. Not read here; it is part of the signature
            only.

    Returns:
        grads_w: List of weight gradients, ordered from first to last layer.
        grads_b: List of bias gradients, ordered from first to last layer.
    """
    negative_slope = 0.2
    depth = len(weights)
    grads_w = [None] * depth
    grads_b = [None] * depth

    # Gradient through the output Sigmoid: sigma'(x) = sigma(x) * (1 - sigma(x)).
    prob = activations[-1]
    delta = d_output * prob * (1 - prob)

    # Fill the gradient slots from the last layer back to the first.
    for layer in range(depth - 1, -1, -1):
        layer_input = activations[layer]
        grads_w[layer] = layer_input.T @ delta      # (in_dim, out_dim)
        grads_b[layer] = np.sum(delta, axis=0)      # summed over the batch

        if layer > 0:
            # LeakyReLU derivative from the stored activation: 1 where it is
            # non-negative, the negative slope elsewhere.
            local_grad = np.where(layer_input >= 0, 1.0, negative_slope)
            delta = (delta @ weights[layer].T) * local_grad

    return grads_w, grads_b


#  def discriminator_backward(activations: list[np.ndarray], y_true: np.ndarray, weights: list[np.ndarray], biases: list[np.ndarray]) -> tuple[list[np.ndarray], list[np.ndarray]]:
#     """
#     Args:
#         activations: List of [x, h1, ..., y_pred]
#         y_true: True labels (batch_size, 1) - You must pass this in!
#         weights: List of weights
#         biases: List of biases
#     """
#     N = len(weights)
#     y_pred = activations[-1]
#     batch_size = y_true.shape[0]

#     # STABLE GRADIENT: (Prediction - Target) / Batch_Size
#     # This combines BCE backward and Sigmoid backward into one step
#     delta = (y_pred - y_true) / batch_size 
    
#     grads_w = []
#     grads_b = []
#     alpha = 0.2
    
#     for i in reversed(range(N)):
#         # Calculate gradients for this layer
#         grads_w.append(activations[i].T @ delta)
#         grads_b.append(np.sum(delta, axis=0))
        
#         if i > 0:
#             # Backpropagate delta to the previous layer using LeakyReLU derivative
#             leaky_grad = np.where(activations[i] >= 0, 1.0, alpha)
#             delta = (delta @ weights[i].T) * leaky_grad
    
#     return grads_w[::-1], grads_b[::-1]


# Minibatch Discrimination (GANs)

Minibatch Discrimination is a technique used in GAN discriminators to reduce **mode collapse** by allowing the discriminator to consider **relationships between samples in a minibatch**, rather than evaluating each sample independently.

---

## Definitions

- **N**: batch size  
- **A**: input feature dimension  
- **B**: number of kernels  
- **C**: kernel dimension  

---

## Inputs

- **features**  
  Shape: `(N, A)`  
  Activations from a discriminator layer.

- **T**  
  Shape: `(A, B, C)`  
  Learnable transformation tensor.

---

## Step 1: Linear Projection

We compute:

$$
M = f(x) \cdot T
$$

Implementation details:

- Reshape \( T \) from `(A, B, C)` to `(A, B * C)`
- Perform matrix multiplication
- Reshape the result to `(N, B, C)`

Final shape:

$$
M \in \mathbb{R}^{N \times B \times C}
$$

---

## Step 2: Pairwise L1 Distances

For each kernel \( b \in \{1, \dots, B\} \), compute the L1 distance between all samples:

$$
d_{i,j}^{(b)} = \sum_{c=1}^{C} \left| M_{i,b,c} - M_{j,b,c} \right|
$$

This produces:

$$
d \in \mathbb{R}^{N \times N \times B}
$$

---

## Step 3: Exponential Similarity

Convert distances into similarity scores:

$$
o_b(x_i) = \sum_{j=1}^{N} \exp\left(-d_{i,j}^{(b)}\right)
$$

---

## Step 4: Remove Self-Comparison (Optional)

Since:

$$
d_{i,i}^{(b)} = 0 \Rightarrow \exp(0) = 1
$$

We subtract self-similarity:

$$
o_b(x_i) \leftarrow o_b(x_i) - 1
$$

---

## Output

- **mb_features**  
  Shape: `(N, B)`

These features are typically concatenated with discriminator activations.


In [34]:
def minibatch_discrimination(features, T):
    """
    Compute minibatch-discrimination features for a batch.

    Each sample is projected through T into B kernels of dimension C. For
    every kernel, the output for sample i is the sum over the other samples
    j of exp(-L1 distance between the projections of i and j).

    Args:
        features: (N, A)
        T: (A, B, C)

    Returns:
        mb_features: (N, B)
    """
    num_samples, in_dim = features.shape
    _, num_kernels, kernel_dim = T.shape

    # Project: (N, A) @ (A, B*C) -> (N, B*C), then split back into (N, B, C).
    projected = features @ T.reshape(in_dim, num_kernels * kernel_dim)
    projected = projected.reshape(num_samples, num_kernels, kernel_dim)

    # Broadcast (N, 1, B, C) against (1, N, B, C) to get every pairwise
    # difference, then collapse the kernel dimension into an L1 distance.
    pairwise_diff = projected[:, np.newaxis, :, :] - projected[np.newaxis, :, :, :]
    l1_dist = np.abs(pairwise_diff).sum(axis=3)          # (N, N, B)

    # Sum similarities over the second sample axis.
    similarity = np.exp(-l1_dist).sum(axis=1)            # (N, B)

    # Each sample's distance to itself is 0 and contributes exp(0) = 1;
    # remove that self-comparison term.
    return similarity - 1

# WGAN-GP Gradient Penalty

The **Gradient Penalty** used in **WGAN-GP (Wasserstein GAN with Gradient Penalty)** enforces the **1-Lipschitz constraint** on the discriminator (also called the *critic*).  
Instead of weight clipping, WGAN-GP penalizes deviations of the gradient norm from 1.

---

## Intuition

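For the Wasserstein distance to be valid, the critic must be **1-Lipschitz**.  
This means the gradient of the critic with respect to its input should have **norm at most 1 everywhere**; WGAN-GP enforces the stronger, two-sided condition that the norm be close to 1 at the interpolated points.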

The gradient penalty softly enforces this by penalizing gradients whose L2 norm is not close to 1.

---

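## Formula

$$
\mathrm{GP} = \lambda \, \mathbb{E}_{\hat{x}} \left[ \left( \left\lVert \nabla_{\hat{x}} D(\hat{x}) \right\rVert_2 - 1 \right)^2 \right]
$$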

## Definitions

- Gradient penalty coefficient

$$
\lambda = 10
$$

- Interpolated samples between real and generated data

$$
\hat{x}
$$

- Critic output evaluated at interpolated samples

$$
D(\hat{x})
$$

- Gradient of critic output with respect to interpolated input

$$
\nabla_{\hat{x}} D(\hat{x})
$$

- L2 (Euclidean) norm

$$
\lVert \cdot \rVert_2
$$

---

## Interpolated Samples

The interpolated inputs are defined as:

$$
\hat{x} = \epsilon x_{\text{real}} + (1 - \epsilon) x_{\text{fake}}
$$

where:

$$
\epsilon \sim \mathcal{U}(0, 1)
$$

---

## Inputs

- **gradients**  
  Shape: `(batch_size, input_dim)`  
  Gradients of the critic output with respect to interpolated inputs.

- **lambda_gp**  
  Scalar \( \lambda \) controlling penalty strength.

---

## Computation Steps

1. Compute the L2 norm of each gradient:
   $$
   \left\| \nabla_{\hat{x}} D(\hat{x}_i) \right\|_2
   $$

2. Subtract 1 from each norm:
   $$
   \left\| \nabla_{\hat{x}} D(\hat{x}_i) \right\|_2 - 1
   $$

3. Square the result:
   $$
   \left(
   \left\| \nabla_{\hat{x}} D(\hat{x}_i) \right\|_2 - 1
   \right)^2
   $$

4. Take the mean over the batch:
   $$
   \mathbb{E}[\cdot]
   $$

5. Multiply by \( \lambda \).

---

## Output

- **penalty**  
  Scalar value representing the mean gradient penalty over the batch.

---

## Why This Works

- Enforces the Lipschitz constraint smoothly
- More stable than weight clipping
- Improves convergence and training stability
- Prevents exploding or vanishing gradients in the critic

In [None]:
def gradient_penalty(gradients: np.ndarray, lambda_gp: float = 10.0) -> float:
    """
    Computes WGAN-GP gradient penalty.

    The penalty is lambda_gp * mean_i((||g_i||_2 - 1)^2), where g_i is the
    gradient belonging to sample i.

    Args:
        gradients: Gradients of D w.r.t interpolated inputs.
                   Shape: (batch_size, input_dim). Inputs with more than two
                   dimensions (e.g. image-shaped gradients) are flattened to
                   one vector per sample before the norm is taken.
        lambda_gp: Gradient penalty weight

    Returns:
        Scalar gradient penalty as a Python float
    """
    grads = np.asarray(gradients)

    # The L2 norm must cover all of a sample's components; with more than two
    # dimensions a plain axis=1 norm would only reduce one of them.
    if grads.ndim > 2:
        grads = grads.reshape(grads.shape[0], int(np.prod(grads.shape[1:])))

    grad_norm = np.linalg.norm(grads, axis=1)  # (batch_size,)

    # Compute penalty
    penalty = lambda_gp * np.mean((grad_norm - 1.0) ** 2)

    # Convert the NumPy scalar so the declared return type actually holds.
    return float(penalty)

# Feature Matching Loss

In [None]:
def feature_matching_loss(real_features: np.ndarray, fake_features: np.ndarray) -> float:
    """
    Computes feature matching loss.

    The loss is the squared L2 distance between the per-feature means of the
    real batch and the fake batch.

    Args:
        real_features: Features from real data (N, D).
        fake_features: Features from fake data (M, D).

    Returns:
        loss: Scalar squared L2 distance between means, as a Python float.
    """
    mu_real = np.mean(real_features, axis=0)
    mu_fake = np.mean(fake_features, axis=0)
    loss = np.sum(np.square(mu_real - mu_fake))

    # Convert the NumPy scalar so the declared return type actually holds.
    return float(loss)

# Hyper Parameters Definition

- batch size usually between 32-128
- learning rates usually ~ 1e-4
- num_epochs: number of full passes over the training data

In [None]:
# Noise input
latent_dim = 100  # presumably the latent_dim passed to random_noise_sampler; confirm at the call site

# Training schedule
batch_size = 64
num_epochs = 10

# Learning rates; the _d rate is set higher than the _g rate
learning_rate_d = 3e-4
learning_rate_g = 2e-4
