In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import numpy as np

For each practical exercise (TP), please work in groups of two or three. Then, create a **private GitHub repository** and add me (my GitHub is  **arthur-75**) to your project. Finally, share the link to your project (or TP) under  [Practical Exercises](https://docs.google.com/spreadsheets/d/1V-YKgHn71FnwjoFltDhWsPJS7uIuAh9lj6SP2DSCvlY/edit?usp=sharing) and make sure to choose your **team name** :-)

# **Variational Autoencoders on Fashion MNIST**

## **1: Data Preparation and Visualization**

**Identical to the First Practical Exercise**

**Goal**:

* Load the **Fashion MNIST** dataset and **pad** images to 32×32.  
* Create **DataLoaders** for training and validation.  
* Visualize a few samples to confirm data integrity.

**Key Points to Recall**:

* **Why** do we pad the images from 28×28 to 32×32?  
* **Which** transformations can help (e.g., normalization)?  
* **How** do we shuffle and batch the data for efficient training?

(Refer back to **Step 1** of the first exercise for detailed guidance.)

## **2: Define the VAE Model**

**Key Difference from a Standard Autoencoder**:

* Instead of directly learning a **latent vector** $z$, the VAE learns a **distribution** $\mathcal{N}(\mu, \sigma^2)$ by predicting **μ** (mean) and **log(σ²)** (log-variance).  
* Use a **reparameterization trick**: $z = \mu + \sigma \times \epsilon$, with $\epsilon \sim \mathcal{N}(0, 1)$.

**Guided Outline**:

1. **Encoder**:  
   * Convolutional layers to reduce spatial dimensions and extract features.  
   * Two separate heads: one for **μ** (mean) and one for **log(σ²)** (log-variance).  
2. **Reparameterization** (in a function like `reparameterize(mu, logvar)`).  
3. **Decoder**:  
   * Transposed convolution layers to reconstruct the image from $z$.  
   * Output uses **Sigmoid** to ensure pixel values remain between 0 and 1.

Why do we need separate heads for **μ** and **log(σ²)**?  
How does the reparameterization trick help with backpropagation through stochastic nodes?

For each practical exercise (TP), please work in groups of two or three. Then, create a private GitHub repository and add me (my GitHub is  arthur-75) to your project. Finally, share the link to your project (or TP) under  Practical Exercises and make sure to choose your team name :-)
Variational Autoencoders on Fashion MNIST


## 1: Data Preparation and Visualization

Identical to the First Practical Exercise  
Goal:  
Load the Fashion MNIST dataset and pad images to 32×32.  
Create DataLoaders for training and validation.  
Visualize a few samples to confirm data integrity.  
Key Points to Recall:  
Why do we pad the images from 28×28 to 32×32?  
Which transformations can help (e.g., normalization)?  
How do we shuffle and batch the data for efficient training?  
(Refer back to Step 1 of the first exercise for detailed guidance.)  

### Why do we pad the images from 28×28 to 32×32?

### Which transformations can help (e.g., normalization)?

### How do we shuffle and batch the data for efficient training?

## 2: Define the VAE Model

Key Difference from a Standard Autoencoder:  
Instead of directly learning a latent vector $z$, the VAE learns a distribution $\mathcal{N}(\mu, \sigma^2)$ by predicting $\mu$ (mean) and $\log(\sigma^2)$ (log-variance).  
Use a reparameterization trick:  
$$z=\mu+\sigma\times\epsilon,\quad\epsilon\sim\mathcal{N}(0,1)$$
Guided Outline:
Encoder:  
Convolutional layers to reduce spatial dimensions and extract features.  
Two separate heads: one for $\mu$ (mean) and one for $\log(\sigma^2)$ (log-variance).  
Reparameterization (in a function like reparameterize(mu, logvar)).  
Decoder:  
Transposed convolution layers to reconstruct the image from $z$.
Output uses Sigmoid to ensure pixel values remain between 0 and 1.
Why do we need separate heads for $\mu$ and $\log(\sigma^2)$?
How does the reparameterization trick help with backpropagation through stochastic nodes?


### Why do we need separate heads for $\mu$ and $\log(\sigma^2)$?

### How does the reparameterization trick help with backpropagation through stochastic nodes?

In [None]:
class VAE(nn.Module):
    """Convolutional Variational Autoencoder.

    The encoder is three stride-2 convolutions followed by two linear heads
    that predict the mean and the log-variance of a diagonal Gaussian over
    the latent space. The decoder maps a latent sample back to image space
    with a linear layer and three stride-2 transposed convolutions, ending
    in a Sigmoid so outputs lie in [0, 1].

    Note:
        Each encoder convolution halves the spatial size (rounding up) and
        each decoder layer exactly doubles it, so the reconstruction only
        has the same height/width as the input when both are divisible by 8
        (e.g. the default 32x32).
    """

    def __init__(self, input_shape=(1, 32, 32), latent_dim=2):
        """
        Variational Autoencoder initialization.

        Args:
            input_shape (tuple): Shape (channels, height, width) of one input
                image, without the batch dimension. Default is (1, 32, 32).
            latent_dim (int): Dimension of the latent representation.
        """
        super().__init__()

        # The number of image channels drives both the first encoder layer
        # and the last decoder layer, so that `input_shape` is honoured
        # instead of silently assuming grayscale input.
        in_channels = input_shape[0]

        # ------------------
        #     1. Encoder
        # ------------------
        # This block progressively reduces the spatial dimension of the input.
        # Each Conv2d layer uses stride=2 to downsample.
        self.encoder = nn.Sequential(
            # First convolutional layer: image channels -> 32 feature maps.
            nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),  # activation function

            # Second convolutional layer: reduce spatial dimensions further.
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),  # activation function

            # Third convolutional layer: extract higher-level features.
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),  # activation function
        )

        # Dynamically calculate the flattened size after all convolution layers.
        # `decode_shape` is the encoder output shape for a batch of one:
        # (1, channels, height, width).
        flattened_size, decode_shape = self.calculate_flattened_size(self.encoder, input_shape)

        # ------------------------------------------------
        #     2. Fully Connected Layers for Latent Space
        # ------------------------------------------------

        # Two separate heads for the VAE:
        #  - fc_mu: Predicts the mean of the latent distribution
        #  - fc_logvar: Predicts the log-variance of the latent distribution
        self.fc_mu = nn.Sequential(
            nn.Flatten(start_dim=1),
            nn.Linear(flattened_size, latent_dim),  # Latent mean
        )
        self.fc_logvar = nn.Sequential(
            nn.Flatten(start_dim=1),
            nn.Linear(flattened_size, latent_dim),  # Latent log-variance (same dimension as mu)
        )

        # ------------------
        #     3. Decoder
        # ------------------
        # Inverts the encoder process with ConvTranspose2d (a.k.a. deconvolutions).
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, flattened_size),  # Map latent space back to the feature map
            # Reshape (batch, flattened_size) -> (batch, C, H, W). The dimension
            # to unflatten is always 1 (the feature dimension); it must not be
            # taken from the dummy batch size stored in decode_shape[0].
            nn.Unflatten(1, tuple(decode_shape[1:])),

            # Transposed convolution layers: each one doubles height and width.
            nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),  # activation function

            # Next deconvolution layer: 64 -> 32 channels.
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),  # activation function

            # Final deconvolution layer: back to the original number of image channels.
            nn.ConvTranspose2d(32, in_channels, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.Sigmoid(),  # Output values should be between 0 and 1
        )

    def reparameterize(self, mu, logvar):
        """
        Reparameterization trick:

        z = mu + sigma * eps, where eps ~ N(0, I)

        The randomness is isolated in `eps`, so the result is a
        differentiable function of `mu` and `logvar`.

        Args:
            mu (Tensor): Mean of the latent distribution.
            logvar (Tensor): Log-variance of the latent distribution.

        Returns:
            z (Tensor): Latent variable sampled from N(mu, sigma^2).
        """
        std = torch.exp(0.5 * logvar)  # Convert log-variance to standard deviation
        eps = torch.randn_like(std)    # Sample noise from a standard normal distribution
        return mu + std * eps

    def calculate_flattened_size(self, model, input_shape):
        """
        Run a dummy all-zero batch of size one through `model` to discover
        its output size.

        Args:
            model (nn.Module): Module to probe (here, the encoder).
            input_shape (tuple): Shape of one input sample, without the
                batch dimension.

        Returns:
            tuple: (number of elements in the output for one sample,
            full output shape including the leading batch dimension of 1).
        """
        # No gradients are needed for a pure shape probe.
        with torch.no_grad():
            dummy_input = torch.zeros(1, *input_shape)
            output = model(dummy_input)
            return output.numel(), output.shape

    def forward(self, x):
        """
        Forward pass for the VAE:

        1) Encode input into latent distribution parameters (mu, logvar).
        2) Sample z using the reparameterization trick.
        3) Decode z back to a reconstructed image.

        Args:
            x (Tensor): Input images of shape (batch, *input_shape).

        Returns:
            recon_x (Tensor): Reconstructed images.
            mu (Tensor): Mean of latent distribution, shape (batch, latent_dim).
            logvar (Tensor): Log-variance of latent distribution, shape (batch, latent_dim).
            z (Tensor): Sampled latent variable, shape (batch, latent_dim).
        """
        # Encode input
        x_encoded = self.encoder(x)

        # Compute mu and logvar
        mu = self.fc_mu(x_encoded)
        logvar = self.fc_logvar(x_encoded)

        # Sample from the latent distribution
        z = self.reparameterize(mu, logvar)

        # Decode latent vector to reconstruct the input
        recon_x = self.decoder(z)

        return recon_x, mu, logvar, z
