**GROUP 1 : ASSIGNMENT 1 : GAN**


In [None]:
import json
import os
import random
import time
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from PIL import Image
from scipy.linalg import sqrtm
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from torchvision import transforms, models

In [None]:
# Device: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [None]:
# Mount Google Drive when running inside Google Colab.
# The import is guarded so that the notebook still runs top-to-bottom on a local
# machine / cluster node, where the `google.colab` package does not exist.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

**Organising Data for Butterfly and Animal dataset and also Data Augmentation**

In [None]:
# We convert the entire dataset into a torch tensor of shape (90,60,3,128,128) and work with this file
# transforms.ToTensor also scales pixel values to be in [0,1]
# NOTE(review): folder_path points at a saved .pt file, not at an image folder - confirm
# which path is meant to be passed to load_images_as_tensors.
folder_path = "/content/drive/MyDrive/adrl/training_images.pt"
transform = transforms.Compose([transforms.Resize((128, 128)),transforms.ToTensor()])

def load_images_as_tensors(folder):
    """Load every image below `folder` into one tensor, grouped by directory.

    Each directory visited by os.walk that contains at least one readable image
    contributes one stacked tensor of shape (n_images, 3, 128, 128); these are
    stacked again along a new leading axis.

    Args:
        folder: root directory to walk.

    Returns:
        Tensor of shape (n_dirs, n_images, 3, 128, 128); for the animal dataset
        the expected shape is (90, 60, 3, 128, 128).

    Raises:
        RuntimeError: from torch.stack when no image was loaded at all, or when
            directories hold different numbers of readable images.
    """
    all_images = []
    # Iterate over subfolders
    for i, (subdir, dirs, files) in enumerate(os.walk(folder)):
        # Sorting in place makes os.walk visit sub-folders in a fixed order,
        # so the class axis of the result is reproducible between runs.
        dirs.sort()
        print(i)
        subfolder_images = []
        # Sort files to maintain consistent order
        files.sort()
        for file in files:
            file_path = os.path.join(subdir, file)
            try:
                # Load a single image, force 3 channels (grayscale / RGBA files would
                # otherwise give tensors that cannot be stacked), resize and convert.
                # The context manager closes the underlying file handle.
                with Image.open(file_path) as img:
                    img_tensor = transform(img.convert('RGB'))
                subfolder_images.append(img_tensor)
            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"Error processing {file_path}: {e}")
        if subfolder_images:
            # One (n_images, 3, 128, 128) tensor per directory
            all_images.append(torch.stack(subfolder_images))
    return torch.stack(all_images)


In [None]:
# Butterfly dataset
# train has 6499 images from 75 classes
# test has 2786 images

# Resize every image to 128x128 and convert it to a tensor scaled to [0, 1]
transform = transforms.Compose([transforms.Resize((128, 128)),transforms.ToTensor()])
# Load CSV containing filenames and labels
df = pd.read_csv(r"C:\ADRL data\Butterfly dataset\Training_set.csv")
N = len(df)
# Create a mapping of distinct labels to indices
class_names = sorted(df['label'].unique())
class_to_idx = {class_name: idx for idx, class_name in enumerate(class_names)}

# Save the class_to_idx dictionary to a JSON file and read it back as a sanity check
with open(r"C:\ADRL data\Butterfly dataset\class_to_idx.json", 'w') as f:
    json.dump(class_to_idx, f)
with open(r"C:\ADRL data\Butterfly dataset\class_to_idx.json", 'r') as f:
    loaded_class_to_idx = json.load(f)
print(loaded_class_to_idx)

# Initialize a tensor to store the images
# Shape: (N, 3 (channels), 128 (H), 128 (W))
image_tensor = torch.zeros((N, 3, 128, 128))
# Initialize a tensor to store class indices
# Shape: (N,)
class_indices = torch.zeros(N, dtype=torch.long)  # Store class indices

# Process each image and store it at its row position.
# enumerate() gives a positional index, so the write into image_tensor stays correct
# even if the DataFrame index is not the default 0..N-1 range.
for i, (filename, label) in enumerate(zip(df['filename'], df['label'])):
    if i % 500 == 0:
        print(i)  # coarse progress indicator instead of one line per image
    img_path = os.path.join(r"C:\ADRL data\Butterfly dataset\train", filename)  # Assuming images are in the 'train' folder

    # Load the image in RGB mode and apply the transform -> (3, 128, 128);
    # the context manager closes the file handle once the pixels are read.
    with Image.open(img_path) as img:
        img_tensor = transform(img.convert('RGB'))

    # Store the image tensor and class index
    image_tensor[i] = img_tensor
    class_indices[i] = class_to_idx[label]

torch.save(image_tensor, r"C:\ADRL data\Butterfly dataset\butterfly_training_images.pth")
torch.save(class_indices, r"C:\ADRL data\Butterfly dataset\butterfly_training_classindices.pth")
# Now, image_tensor contains the (N, 3, 128, 128) tensor.
print(f'Successfully created tensor with shape: {image_tensor.shape}')


In [None]:
# Loading the dataset
# Note that variable name change wrt A1
# All four tensors were produced beforehand and are read back from disk here.
saved_tensor_paths = {
    "animal_images": "/home/sahapthank/saha_adrl/training_images.pt",
    "butterfly_images": "/home/sahapthank/saha_adrl/butterfly_training_images.pth",
    "butterfly_labels": "/home/sahapthank/saha_adrl/butterfly_training_classindices.pth",
    "anime_images": "/home/sahapthank/saha_adrl/anime_faces_images.pt",
}
animal_images = torch.load(saved_tensor_paths["animal_images"])
butterfly_images = torch.load(saved_tensor_paths["butterfly_images"])
butterfly_labels = torch.load(saved_tensor_paths["butterfly_labels"])
anime_images = torch.load(saved_tensor_paths["anime_images"])


In [None]:
# DATA AUGMENTATION FOR ANIMAL
# animal_images is indexed as [class][image]; every augmentation below produces a
# tensor of the same shape, so the sizes are read from the data instead of hard-coded.
horizontal_flip = transforms.RandomHorizontalFlip(p=1)    # p=1: every image is flipped horizontally
rotation = transforms.RandomRotation(degrees=20)    # Random rotation; `expand` is not set, so the image size is kept
color_jitter = transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)  # Random color adjustments

tensor_shape = tuple(animal_images.shape)
flipped_images = torch.zeros(tensor_shape)
rotated_images = torch.zeros(tensor_shape)
color_jittered_images = torch.zeros(tensor_shape)

n_classes, n_per_class = tensor_shape[0], tensor_shape[1]
for i in range(n_classes):
    for j in range(n_per_class):
        # Each transform is applied per image so that rotation / jitter draw
        # independent random parameters for every sample.
        flipped_images[i][j] = horizontal_flip(animal_images[i][j])
        rotated_images[i][j] = rotation(animal_images[i][j])
        color_jittered_images[i][j] = color_jitter(animal_images[i][j])

# Axis 0 selects the variant: 0 = original, 1 = flipped, 2 = rotated, 3 = colour-jittered
animal_images_augmented = torch.stack([animal_images,flipped_images,rotated_images,color_jittered_images])


In [None]:
# DATA AUGMENTATION FOR BUTTERFLY
# butterfly_images is indexed as [image]; the number of images is read from the
# tensor itself rather than hard-coded.
horizontal_flip = transforms.RandomHorizontalFlip(p=1)    # p=1: every image is flipped horizontally
rotation = transforms.RandomRotation(degrees=20)    # Random rotation; `expand` is not set, so the image size is kept
color_jitter = transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)  # Random color adjustments

tensor_shape = tuple(butterfly_images.shape)
flipped_images = torch.zeros(tensor_shape)
rotated_images = torch.zeros(tensor_shape)
color_jittered_images = torch.zeros(tensor_shape)

for i in range(tensor_shape[0]):
    # Applied per image so rotation / jitter draw fresh random parameters each time
    flipped_images[i] = horizontal_flip(butterfly_images[i])
    rotated_images[i] = rotation(butterfly_images[i])
    color_jittered_images[i]= color_jitter(butterfly_images[i])

# Axis 0 selects the variant: 0 = original, 1 = flipped, 2 = rotated, 3 = colour-jittered
butterfly_images_augmented = torch.stack([butterfly_images,flipped_images,rotated_images,color_jittered_images])


In [None]:
# CHANGE THIS CODE BASED ON DATASET
# Collapse every leading axis into a single batch axis while keeping the last three
# (C, H, W) axes, e.g. (90, 60, 3, 128, 128) -> (5400, 3, 128, 128). A tensor that
# is already (N, 3, 128, 128) passes through with the same shape, so only the
# source tensor name has to change when switching dataset.
training_images = torch.reshape(animal_images, (-1, *animal_images.shape[-3:]))
# Convert values from [0, 1] to [-1, 1]
training_images = ((2 * training_images) - 1)
print(training_images.size())
# Undo the scaling for display and move channels last, as plt.imshow expects (H, W, C)
first_image = (training_images[0] + 1)/2
first_image = first_image.permute(1, 2, 0)
plt.imshow(first_image)
plt.axis('off')
plt.show()
print(torch.min(training_images))


**Functions to evaluate conjugate of various F-divergences**

In [None]:
# u is real-valued!
# When a GAN is trained by Variational Divergence Minimisation with an f-divergence,
# the loss only needs f*, the conjugate of the generator function f of the divergence.
# Every function below evaluates f*(u) element-wise, where u is the critic output.
# (Usual_GAN is the one exception - see its docstring.)

epsilon = 1e-15  # Small constant for numerical stability


def JSD(u):
    """Conjugate for the Jensen-Shannon divergence: -log(2 - exp(u))."""
    return torch.neg(torch.log(2 - u.exp()))


def Usual_GAN(u):
    """Plain log(u), for the usual GAN loss written WITHOUT the f* framework.

    Without f* the usual GAN loss is (BCE + sigmoid), which only needs a log.
    Inside the f* framework one would instead use (sigmoid + another activation,
    the g_f mentioned in the paper); that general form is not what is implemented here.
    """
    return u.log()


def total_variation(u):
    """Conjugate for total variation: the identity."""
    return u


def KL(u):
    """Conjugate for the KL divergence: exp(u - 1)."""
    return (u - 1).exp()


def Reverse_KL(u):
    """Conjugate for the reverse KL divergence: -log(-u) - 1."""
    return torch.log(u.neg()).neg() - 1


def Pearson(u):
    """Conjugate for the Pearson chi-squared divergence: u^2 / 4 + u."""
    quadratic = u.pow(2) / 4
    return quadratic + u


def Neyman(u):
    """Conjugate for the Neyman chi-squared divergence: 2 * (1 - sqrt(1 - u))."""
    root = (1 - u).sqrt()
    return 2 * (1 - root)


def Squared_Hellinger(u):
    """Conjugate for the squared Hellinger distance: u / (1 - u)."""
    return torch.div(u, 1 - u)


**[Q1, Q2 AND Q3] Vanilla DC-GAN Implementation for 2 datasets**

**CRITIC NETWORK (Denote by D/T)**

* Below is a DC-GAN where the Critic does not have any FFNN and relies only on Strided Convolutions(instead of Pooling) to continuously downsample the image input to a single real value (which is converted to Probability using Sigmoid)
* BatchNormalisation and Activation functions are chosen as in the DC-GAN paper and there are no skip connections
* The class instantiation is made flexible by making the architectural parameters as inputs

In [None]:
class Critic_DCGAN(nn.Module):
    """Fully-convolutional critic / discriminator built from a layer description.

    `architecture` is a list `[conv_params, use_batchnorm, activation_fn]` of three
    equally long lists with one entry per convolutional stage:
      * conv_params[k]   = [in_channels, out_channels, kernel_size, stride, padding]
      * use_batchnorm[k] = True to insert BatchNorm2d after the k-th convolution
      * activation_fn[k] = activation module instance applied last in the k-th stage
                           (e.g. nn.LeakyReLU(0.2), ..., nn.Sigmoid())
    """

    def __init__(self,architecture):
        super().__init__()
        conv_params, use_batchnorm, activation_fn = architecture[0], architecture[1], architecture[2]
        assert len(conv_params) == len(use_batchnorm) == len(activation_fn)
        self.conv_params = conv_params
        self.use_batchnorm = use_batchnorm
        self.activation_fn = activation_fn

        # Each stage is: Conv2d -> (optional BatchNorm2d) -> activation
        stages = []
        for spec, with_batchnorm, activation in zip(conv_params, use_batchnorm, activation_fn):
            stages.append(
                nn.Conv2d(
                    in_channels=spec[0],
                    out_channels=spec[1],
                    kernel_size=spec[2],
                    stride=spec[3],
                    padding=spec[4],
                )
            )
            if with_batchnorm:
                stages.append(nn.BatchNorm2d(spec[1]))
            stages.append(activation)
        # In the configurations used in this notebook the last Conv2d reduces the
        # input to [1,1,1] and the last activation is a Sigmoid.
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Return the critic's opinion T(x); the value must lie in dom(f*)."""
        return self.model(x)

    def set_requires_grad(self, requires_grad):
        """Switch gradient tracking on/off for every parameter of T.

        Used to avoid computing gradients w.r.t. T's parameters while the generator G
        is being trained.
        """
        for parameter in self.parameters():
            parameter.requires_grad_(requires_grad)


In [None]:
# Critic configurations, following the DC-GAN paper (no BatchNormalisation at the very end).
# Input is [3,128,128]; stride-2 convolutions halve the resolution until a final Conv2d gives [1,1,1]
# output_size = ({input_size - kernel_size + 2*padding}/stride) + 1
# TRIAL 1
# conv_params_1 = [[3,64,4,2,1],[64,128,4,2,1],[128,256,4,2,1],[256,512,4,2,1],[512,1024,4,2,1],[1024,1,4,4,1]]
# use_batchnorm_1 = [True for _ in range(5)] + [False]
# activation_fn_1 = [nn.LeakyReLU(0.2, inplace=False) for _ in range(5)] +[nn.Sigmoid()]
# architecture_1 = [conv_params_1,use_batchnorm_1,activation_fn_1]

# TRIAL 2
# conv_params_1= [[3,32,4,2,1],[32,64,1,1,0],[64,64,4,2,1],[64,128,4,4,0],[128,1,8,1,0]]
# use_batchnorm_1 = [True for _ in range(4)] + [False]
# activation_fn_1= [nn.LeakyReLU(0.2, inplace=False) for _ in range(4)] +[nn.Sigmoid()]
# architecture_1= [conv_params_1,use_batchnorm_1,activation_fn_1]

# TRIAL 3
# conv_params_1 = [[3,32,4,2,1],[32,64,4,2,1],[64,64,4,2,1],[64,128,4,2,1],[128,256,4,2,1],[256,1,4,4,1]]
# use_batchnorm_1 = [True for _ in range(5)] + [False]
# activation_fn_1 = [nn.LeakyReLU(0.2, inplace=False) for _ in range(5)] +[nn.Sigmoid()]
# architecture_1 = [conv_params_1,use_batchnorm_1,activation_fn_1]

# TRIAL 4 (active)
# Five stride-2 stages (kernel 4, padding 1) walk through these channel widths,
# then one stride-4 convolution maps 256 channels down to a single output channel.
_critic_channels = [3, 32, 64, 128, 128, 256]
conv_params_1 = [
    [c_in, c_out, 4, 2, 1]
    for c_in, c_out in zip(_critic_channels[:-1], _critic_channels[1:])
]
conv_params_1.append([256, 1, 4, 4, 1])
use_batchnorm_1 = [True] * 5 + [False]
activation_fn_1 = [nn.LeakyReLU(0.2, inplace=False) for _ in range(5)]
activation_fn_1.append(nn.Sigmoid())
architecture_1 = [conv_params_1,use_batchnorm_1,activation_fn_1]


In [None]:
# Just checking: push a random batch of five 3x128x128 "images" through the critic
# and report the output shape, the first output and the total parameter count.
T_1 = Critic_DCGAN(architecture_1)
random_tensor = torch.randn(5, 3, 128, 128)
random_output = T_1(random_tensor)
print(random_output.shape)
print(random_output[0])
TP1 = sum(map(lambda weight: weight.numel(), T_1.parameters()))
print(TP1)


**GENERATOR NETWORK (Denote by G)**


* Strided 2D-Convolutional transpose layers are being used to up-sample latent space vectors to images
*  Existence of the BatchNorm after the Conv-transpose layers is the critical contribution of the DCGAN paper which helps in the flow of the gradients during training





In [None]:
class Generator_DCGAN(nn.Module):
    """Generator that up-samples a latent tensor with strided transposed convolutions.

    `architecture` is a list `[transpose_conv_params, use_batchnorm, activation_fn]`
    of three equally long lists with one entry per stage:
      * transpose_conv_params[k] = [in_channels, out_channels, kernel_size, stride, padding]
      * use_batchnorm[k]         = True to insert BatchNorm2d after the k-th ConvTranspose2d
      * activation_fn[k]         = activation module instance applied last in the k-th stage
    """

    def __init__(self, architecture):
        super().__init__()
        transpose_conv_params, use_batchnorm, activation_fn = architecture[0], architecture[1], architecture[2]
        assert len(transpose_conv_params) == len(use_batchnorm) == len(activation_fn)
        self.transpose_conv_params = transpose_conv_params
        self.use_batchnorm = use_batchnorm
        self.activation_fn = activation_fn

        # The first stage consumes the latent tensor (channels = latent dimension);
        # each stage is: ConvTranspose2d -> (optional BatchNorm2d) -> activation
        stages = []
        for spec, with_batchnorm, activation in zip(transpose_conv_params, use_batchnorm, activation_fn):
            stages.append(
                nn.ConvTranspose2d(
                    in_channels=spec[0],
                    out_channels=spec[1],
                    kernel_size=spec[2],
                    stride=spec[3],
                    padding=spec[4],
                )
            )
            if with_batchnorm:
                stages.append(nn.BatchNorm2d(spec[1]))
            stages.append(activation)
        # In the configurations used in this notebook tanh() is the final activation
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Sample images from G: map a batch of latent tensors to generated images."""
        return self.model(x)

    def set_requires_grad(self, requires_grad):
        """Switch gradient tracking on/off for every parameter of G.

        Used to avoid computing gradients w.r.t. G's parameters while the
        discriminator T is being trained.
        """
        for parameter in self.parameters():
            parameter.requires_grad_(requires_grad)


In [None]:
# Generator configurations, following the DC-GAN paper (no BatchNormalisation on the output stage).
# The input is [latent_dim,1,1]; the six-stage configurations below end at [3,128,128]
# (spatial size 1 -> 4 -> 8 -> 16 -> 32 -> 64 -> 128).
# output_size = {(input_size - 1) * stride} - (2*padding) + kernel_size}
# TRIAL 1
# transpose_conv_params_2 = [[100,512,4,1,0],[512,256,4,2,1],[256,128,4,2,1],[128,64,4,2,1],[64,32,4,2,1],[32,3,4,2,1]]
# use_batchnorm_2 = [True for _ in range(5)] + [False]
# activation_fn_2 = [nn.ReLU(inplace = False) for _ in range(5)] +[nn.Tanh()]
# architecture_2 = [transpose_conv_params_2, use_batchnorm_2, activation_fn_2]

# TRIAL 2
# transpose_conv_params_2 = [[100,256,4,1,0],[256,128,4,2,1],[128,64,4,2,1],[64,32,4,2,1],[32,3,4,4,0]]
# use_batchnorm_2 = [True for _ in range(4)] + [False]
# activation_fn_2 = [nn.ReLU(inplace = False) for _ in range(4)] +[nn.Tanh()]
# architecture_2 = [transpose_conv_params_2, use_batchnorm_2, activation_fn_2]

# TRIAL 3 (active): latent dimension 200
# The first stage (stride 1, no padding) turns the 1x1 latent into a 4x4 map;
# every later stage (kernel 4, stride 2, padding 1) doubles the resolution.
_generator_channels = [512, 256, 128, 64, 32, 3]
transpose_conv_params_2 = [[200, 512, 4, 1, 0]]
transpose_conv_params_2.extend(
    [c_in, c_out, 4, 2, 1]
    for c_in, c_out in zip(_generator_channels[:-1], _generator_channels[1:])
)
use_batchnorm_2 = [True] * 5 + [False]
activation_fn_2 = [nn.ReLU(inplace = False) for _ in range(5)]
activation_fn_2.append(nn.Tanh())
architecture_2 = [transpose_conv_params_2,use_batchnorm_2,activation_fn_2]


In [None]:
# Just checking: push a random batch of five latent tensors through the generator and
# report the output shape, the output range and the total parameter count.
G_1 = Generator_DCGAN(architecture_2)
# The number of latent channels must equal in_channels of the first generator stage;
# reading it from the architecture keeps this check valid for every TRIAL above
# (a hard-coded 100 fails against the 200-channel configuration).
latent_channels = architecture_2[0][0][0]
random_tensor = torch.randn((5, latent_channels, 1, 1))
random_output = G_1(random_tensor)
print(random_output.shape)
print(torch.max(random_output),torch.min(random_output))
TP2 = sum(p.numel() for p in G_1.parameters())
print(TP2)


**Generator using BilinearInterpolation and Conv2d**

In [None]:
class Generator_BilinearUpsample(nn.Module):
    """Generator that up-samples with bilinear interpolation followed by Conv2d.

    `architecture` is a list `[conv_params, use_batchnorm, activation_fn, Bilinear_size]`
    of four equally long lists with one entry per stage:
      * conv_params[k]   = [in_channels, out_channels, kernel_size, stride, padding]
      * use_batchnorm[k] = True to insert BatchNorm2d after the k-th convolution
      * activation_fn[k] = activation module instance applied last in the k-th stage
      * Bilinear_size[k] = scale factor of the bilinear up-sampling that opens the k-th stage

    Raises:
        AssertionError: if the four lists do not all have the same length.
    """

    def __init__(self, architecture):
        super(Generator_BilinearUpsample, self).__init__()
        # All four per-stage lists must line up, including the upsample factors:
        # a short Bilinear_size would otherwise only fail later with an IndexError
        # and a long one would be silently truncated.
        assert len(architecture[0]) == len(architecture[1]) == len(architecture[2]) == len(architecture[3])
        self.conv_params = architecture[0]
        self.use_batchnorm = architecture[1]
        self.activation_fn = architecture[2]
        self.Bilinear_size = architecture[3]
        layers = []
        # The first stage consumes the latent tensor.
        # Upsampling acts on h,w of an input shaped (bs,c,h,w)
        for j, i in enumerate(self.conv_params):
            layers.append(nn.Upsample(scale_factor = self.Bilinear_size[j], mode='bilinear', align_corners=False))
            layers.append(nn.Conv2d(in_channels=i[0],out_channels=i[1],kernel_size=i[2],stride=i[3],padding=i[4]))
            if self.use_batchnorm[j]:
                layers.append(nn.BatchNorm2d(i[1]))
            layers.append(self.activation_fn[j])
        # In the configuration used in this notebook tanh() is the final activation
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of latent tensors to generated images."""
        return self.model(x)

    # We will use this to prevent calculating gradients wrt parameters of G
    # During training the discriminator T
    def set_requires_grad(self, requires_grad):
        """Switch gradient tracking on/off for every parameter of G."""
        for param in self.parameters():
            param.requires_grad = requires_grad


In [None]:
# Bilinear-upsampling generator configuration (no BatchNormalisation on the output stage).
# The input is [100,1,1]; the scale factors 1,2,2,2,4,4 multiply up to 128, so the
# final output is [3,128,128].
# output_size = ({input_size - kernel_size + 2*padding}/stride) + 1
# Except for the 1x1 first convolution we take (s=1 and k=3 to match 2p = 2),
# so the convolutions keep the spatial size and only the upsampling changes it.
Bilinear_size_3 = [1,2,2,2,4,4]
_bilinear_channels = [512, 256, 128, 64, 32, 3]
conv_params_3 = [[100, 512, 1, 1, 0]]
conv_params_3.extend(
    [c_in, c_out, 3, 1, 1]
    for c_in, c_out in zip(_bilinear_channels[:-1], _bilinear_channels[1:])
)
use_batchnorm_3 = [True] * 5 + [False]
activation_fn_3 = [nn.ReLU(inplace = False) for _ in range(5)]
activation_fn_3.append(nn.Tanh())
architecture_3 = [conv_params_3, use_batchnorm_3, activation_fn_3 ,Bilinear_size_3]

# TRIAL 2
# TO BE ADDED

In [None]:
# Sanity check for the bilinear-upsampling generator: run five random latent
# tensors through it, then report the output shape and the parameter count.
G_2 = Generator_BilinearUpsample(architecture_3)
random_tensor = torch.randn(5, 100, 1, 1)
random_output = G_2(random_tensor)
print(random_output.shape)
TP2 = sum(map(lambda weight: weight.numel(), G_2.parameters()))
print(TP2)


**Sampling from various distributions**

In [None]:
# Sample from a Gaussian for inference
def sample_gaussian(mean, covariance_matrix , n_samples):
    """Draw `n_samples` vectors from a multivariate normal distribution.

    Args:
        mean: tensor of size ([d]).
        covariance_matrix: tensor of size ([d,d]); it is passed as the covariance of
            torch.distributions.MultivariateNormal.
        n_samples: number of independent draws.

    Returns:
        Tensor of size ([n_samples,d]).
    """
    gaussian = torch.distributions.MultivariateNormal(
        loc=mean, covariance_matrix=covariance_matrix
    )
    draws = gaussian.sample((n_samples,))
    return draws

# Sample from "CURRENT TRAINING IMAGES"
def sample_train(n_samples):
    """Return `n_samples` images drawn uniformly, with replacement, from `training_images`.

    The global `training_images` is indexed along its first axis; the result stacks
    the chosen entries along a new leading axis of length `n_samples`.
    """
    chosen = torch.randint(0, training_images.size(0), (n_samples,)).tolist()
    return torch.stack([training_images[index] for index in chosen])

# Sample from Real_images [BUTTERFLY]
def sample_real_butterfly(n_samples):
    """Return `n_samples` images drawn uniformly, with replacement, from `butterfly_images`.

    The number of available images is read from the first axis of the global
    `butterfly_images` instead of being hard-coded, so the sampler stays valid if the
    dataset size changes. The result is a new tensor with leading axis `n_samples`.
    """
    i_indices = torch.randint(0, butterfly_images.size(0), (n_samples,))
    # Index-tensor lookup gathers all chosen images in one call
    return butterfly_images[i_indices]

# Sample from Real_images [ANIMAL]
def sample_real_animal(n_samples):
    """Return `n_samples` images drawn uniformly, with replacement, from `animal_images`.

    The global `animal_images` is indexed as [class, image]; a class index and an
    image index are drawn independently for every sample. Both ranges are read from
    the tensor's first two axes instead of being hard-coded.
    """
    n_classes, n_per_class = animal_images.shape[0], animal_images.shape[1]
    i_indices = torch.randint(0, n_classes, (n_samples,)) # chooses class
    j_indices = torch.randint(0, n_per_class, (n_samples,)) # chooses image in class
    # Paired index tensors pick element (i_k, j_k) for every k
    return animal_images[i_indices, j_indices]

# Sample from Real_images [ANIMAL] including augmented
def sample_real_augmented_animal(n_samples):
    """Return `n_samples` images drawn uniformly from `animal_images_augmented`.

    The global `animal_images_augmented` is indexed as [augmentation, class, image];
    one index per axis is drawn independently for every sample. All three ranges are
    read from the tensor's shape instead of being hard-coded.
    """
    n_augmentations, n_classes, n_per_class = animal_images_augmented.shape[:3]
    i_indices = torch.randint(0, n_classes, (n_samples,)) # chooses class
    j_indices = torch.randint(0, n_per_class, (n_samples,)) # chooses image in class
    k_indices = torch.randint(0, n_augmentations, (n_samples,)) # chooses augmentation type
    # Paired index tensors pick element (k_n, i_n, j_n) for every n
    return animal_images_augmented[k_indices, i_indices, j_indices]

# Sample from Real_images [BUTTERFLY] including augmented
def sample_real_augmented_butterfly(n_samples):
    """Return `n_samples` images drawn uniformly from `butterfly_images_augmented`.

    The global `butterfly_images_augmented` is indexed as [augmentation, image]; an
    image index and an augmentation index are drawn independently for every sample.
    Both ranges are read from the tensor's shape instead of being hard-coded.
    """
    n_augmentations, n_images = butterfly_images_augmented.shape[:2]
    i_indices = torch.randint(0, n_images, (n_samples,)) # chooses image
    k_indices = torch.randint(0, n_augmentations, (n_samples,)) # chooses augmentation type
    # Paired index tensors pick element (k_n, i_n) for every n
    return butterfly_images_augmented[k_indices, i_indices]

# Sample from Real_images [ANIME]
def sample_real_anime(n_samples):
    """Return `n_samples` images drawn uniformly, with replacement, from `anime_images`.

    The number of available images is read from the first axis of the global
    `anime_images` instead of being hard-coded.
    """
    i_indices = torch.randint(0, anime_images.size(0), (n_samples,))
    # Index-tensor lookup gathers all chosen images in one call
    return anime_images[i_indices]


In [None]:
# Example usage: draw 10 latent vectors around a random mean, then draw 10 training
# images and display the first one.
t = sample_gaussian(torch.randn(100), torch.eye(100), 10)
print(t.size())
t1 = sample_train(10)
print(t1.size())
# Training images live in [-1, 1]; rescale to [0, 1] and move channels last for imshow
image_1 = ((t1[0] + 1) / 2).permute(1, 2, 0)
plt.imshow(image_1)
plt.axis('off')
plt.show()


**Different types of Optimizer classes to be experimented with**

* In the W-GAN paper they have used RMSProp for optimisation
* RMSProp (Root Mean Square Propagation) is an adaptive learning rate optimization algorithm designed to address some of the challenges faced by traditional SGD especially in dealing with non-stationary and noisy gradients
* In the DC-GAN paper they use Adam Optimizer with beta = 0.5 for both G,D
* In some other papers SGD is used for Discriminator and Adam for Generator



**DC-GAN Training Algorithm**

**Experiments and Observations on using Vanilla DC-GAN**

* Using the DC-GAN architecture with the loss function from the original GAN paper, the gradients quickly became zero for both the generator and the discriminator. Switching the generator objective to maximising log(D(G(z))) instead of minimising log(1 - D(G(z))) improved training stability, but G_loss still fluctuated a bit and D_loss hovered around zero when n_critic = 5.
* Even after implementing the (-log D) trick, the loss sometimes explodes to "nan" in the middle of training, or else D_loss stays at zero while G_loss does not decrease and stays quite high. So we need to move to W-GAN or try various hyperparameters and architectures.




In [None]:
# Assuming "Critic/Discriminator/D" and "Generator/G" are defined as classes derived from nn.Module
# Hyperparameters := {lr, mini_batch_size, # D updates per G update, latent_space dimension} to be chosen for optimisation

# Initialize model, loss function, and optimizer
# lr_D / lr_G: learning rates, m: mini-batch size, n_critic: D updates per G update
lr_D, lr_G, m, n_critic = 2e-4, 2e-4, 128, 5
D_architecture = architecture_1
G_architecture = architecture_2
# d: latent dimension. It must equal in_channels of the generator's first stage, so it
# is read from the chosen architecture; a hard-coded value breaks as soon as a
# configuration with a different latent size is selected.
d = G_architecture[0][0][0]
D = Critic_DCGAN(D_architecture).to(device)
G = Generator_DCGAN(G_architecture).to(device)
# Loss histories, appended to by the training loop
D_loss = []
G_loss = []

# We will use 2 optimizers for convenience
# Adam optimizer
D_optimizer = optim.Adam(D.parameters(), lr=lr_D, betas=(0.5, 0.99), eps=1e-8, weight_decay=0)
G_optimizer = optim.Adam(G.parameters(), lr=lr_G, betas=(0.5, 0.99), eps=1e-8, weight_decay=0)
# RMSPROP
# D_optimizer = optim.RMSprop(D.parameters(), lr = lr_D, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
# G_optimizer = optim.RMSprop(G.parameters(), lr = lr_G, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)


In [None]:
# This part is used for manual changing if needed inbetween of the training
lr_D, lr_G, m = 1e-4 , 1e-4 , 128
# The latent dimension is fixed by the generator that is already built, so it is read
# from its architecture instead of being re-typed here.
d = G_architecture[0][0][0]
# NOTE: creating new optimizers also discards Adam's accumulated moment estimates.
D_optimizer = optim.Adam(D.parameters(), lr=lr_D, betas=(0.5, 0.99), eps=1e-8, weight_decay=0)
G_optimizer = optim.Adam(G.parameters(), lr=lr_G, betas=(0.5, 0.99), eps=1e-8, weight_decay=0)


In [None]:
num_epochs = 20
n_critic = 5
n_generator = 1
mini_batch_epochs = 50
epsilon = 1e-15


def _draw_latent_batch():
    """Draw m latent vectors z ~ N(0, I_d), shaped (m, d, 1, 1) and moved to `device`."""
    flat_latents = sample_gaussian(torch.zeros((d,)), torch.eye(d), m)
    return flat_latents.unsqueeze(-1).unsqueeze(-1).to(device)


# Training loop
# One "epoch" here is `mini_batch_epochs` rounds; each round performs n_critic updates
# of D followed by n_generator updates of G and records the last loss of each.
# We will track image quality and gradient norm and decide when to halt
for epoch in range(num_epochs):
    for _ in range(mini_batch_epochs):
        # start = time.time()

        # ---------------- Critic / discriminator updates ----------------
        for t in range(n_critic):
            # How freezing works in PyTorch:
            # parameters with requires_grad = False get no gradient of their own
            # (their .grad attribute is not updated), so the optimizer leaves them
            # unchanged. Gradients still flow *through* a frozen network in the
            # backward pass whenever they are needed for earlier layers.
            G.set_requires_grad(False)
            D.set_requires_grad(True)
            D_optimizer.zero_grad()
            G_optimizer.zero_grad()
            x = sample_train(m).to(device)
            z = _draw_latent_batch()
            y = G(z)
            # f-GAN objective F(θ,w) maximizes wrt w and minimises wrt θ
            # loss_D is a 0-dimensional tensor; epsilon keeps the logs finite
            real_term = torch.log(D(x) + epsilon).mean()
            fake_term = torch.log(1 - D(y) + epsilon).mean()
            loss_D = -(real_term + fake_term)
            loss_D.backward()
            D_optimizer.step()
            D_optimizer.zero_grad()

        # ---------------- Generator updates ----------------
        for t in range(n_generator):
            G.set_requires_grad(True)
            D.set_requires_grad(False)
            D_optimizer.zero_grad()
            G_optimizer.zero_grad()
            # Sample another batch of noise z from prior p(z)
            z1 = _draw_latent_batch()
            y1 = G(z1)
            # Non-saturating generator loss: minimise -log D(G(z))
            loss_G = -torch.log(D(y1) + epsilon).mean()
            loss_G.backward()
            G_optimizer.step()
            G_optimizer.zero_grad()

        # Record the loss of the last D step and the last G step of this round
        D_loss.append(loss_D.item())
        G_loss.append(loss_G.item())
        # Optional: Logging after each epoch
        # print(f"Epoch [{epoch+1}/{num_epochs}] | D Loss: {loss_D.item():.4f} | G Loss: {loss_G.item():.4f}")
        # stop = time.time()
        # print(stop - start)


In [None]:
# Plotting the Loss curves so far
# One figure with two side-by-side panels: D_loss on the left, G_loss on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
D_steps = range(len(D_loss))
G_steps = range(len(G_loss))
for panel, steps, losses, title in (
    (ax1, D_steps, D_loss, 'D_loss'),
    (ax2, G_steps, G_loss, 'G_loss'),
):
    panel.plot(steps, losses)
    panel.set_xlabel('Mini-Batch')
    panel.set_ylabel('loss')
    panel.set_title(title)
# Display the plots
plt.tight_layout()
plt.show()


In [None]:
# Inference: generate 100 images from fresh latent samples and show them in a 10x10 grid
with torch.no_grad():
    z = (sample_gaussian(torch.zeros((d,)),torch.eye(d),100)).unsqueeze(-1).unsqueeze(-1).to(device)
    # The generator ends in tanh, so map its output from [-1, 1] back to [0, 1]
    y = (G(z) + 1) / 2
    images = y.detach().cpu().numpy()
    # Create a figure for the grid of images
    fig, axes = plt.subplots(nrows=10, ncols=10, figsize=(15, 15))
    # Pair every axis of the grid with one generated image
    for ax, image in zip(axes.flat, images):
        img = np.transpose(image, (1, 2, 0))  # (C, H, W) -> (H, W, C)
        ax.imshow(img)
        ax.axis('off')
    plt.tight_layout()
    plt.show()


**FOLLOWING WAS THE BEST DCGAN IMAGES OBTAINED FOR BUTTERFLY DATASET**
* D has 760,929 params and G has 3,608,995 params . Latent_dim = d = 100 ; lr = 2e-4 ; m = 128 ; Adam Optimizer with (0.5,0.99) was used.
* Less parameters in G caused very blurry images and making D,G somewhat equally powerful seems to give best results.
* Augmentation was not used . {n_critic and n_generator} were varied manually every 20/30 epochs taking {5,1},{5,2},{5,3} values progressively.

**FOLLOWING WAS THE BEST DCGAN IMAGES OBTAINED FOR ANIMAL DATASET**
* Same hyperparameters as the [butterfly-best] was used along with augmentations{flip,colour-jitter,rotations} for the first 130 epochs and then only flip was used.
* Animal images seemed to require more iterations when using augmentations and also seem to require More parameters to model the distribution due to much higher variety than butterflies.



**Increasing Dimension of LatentSpace**

* After increasing the dimension of the input latent space to d = 200 and using a different architecture for D and G, some slight improvements seem to appear visually, but the generated images are still not recognisable and the loss function saturates
* Below are few images of d = 200




**[Q5] Latent Space Traversal**
* We randomly sample 2 Gaussians z1,z2
* We then try Linear and Non-linear interpolations on some of the saved weights of various GANs below



In [None]:
# Linear Interpolation
# Load the pretrained generator
# A dedicated device variable is used for this experiment so that the notebook-wide
# `device` (chosen at the top) is not silently rebound to the CPU for later cells.
interp_device = torch.device("cpu")
d_t = 200  # latent dimension the saved generator was trained with
# NOTE(review): torch.load unpickles a whole model object - only load trusted checkpoints.
G_t = torch.load("A2_weights/1c.New_530_G.pth", map_location=torch.device('cpu')).to(interp_device)

n_pairs = 10  # Number of random pairs
n_steps = 10  # Number of interpolations between each pair

# Store all interpolations
all_interpolations = []
for _ in range(n_pairs):
    # Sample two random latent vectors (z1, z2), each of shape (d_t, 1, 1)
    z = sample_gaussian(torch.zeros((d_t,)), torch.eye(d_t), 2).unsqueeze(-1).unsqueeze(-1)
    z1 = z[0]
    z2 = z[1]
    # Interpolate between z1 and z2 (alpha = 0 gives z1, alpha = 1 gives z2)
    interpolations = []
    for alpha in np.linspace(0, 1, n_steps):
        z_interpolated = (1 - alpha) * z1 + alpha * z2  # Linear interpolation
        interpolations.append(z_interpolated)
    all_interpolations.append(torch.stack(interpolations))

# Concatenate all interpolated latents into one tensor of shape (n_pairs * n_steps, d_t, 1, 1)
all_interpolations = torch.cat(all_interpolations, dim=0).to(interp_device)
with torch.no_grad():
    # Generator output is in [-1, 1]; rescale to [0, 1] for display
    generated_images = (G_t(all_interpolations) + 1) / 2

# Plot the images row-wise
fig, axs = plt.subplots(n_pairs, n_steps, figsize=(n_steps * 2, n_pairs * 2))

# Display the images in rows, each row corresponding to one interpolation sequence
for i in range(n_pairs):
    for j in range(n_steps):
        img_idx = i * n_steps + j
        img = generated_images[img_idx].permute(1, 2, 0).cpu().numpy()  # Convert from (C, H, W) to (H, W, C)
        axs[i, j].imshow(img)
        axs[i, j].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Spherical interpolation
# Linear interpolation moves along a straight line between two points, whereas
# spherical interpolation moves along the shortest arc on a sphere between them.
# It can produce more natural transitions in the latent space of a GAN
def s_i(val, low, high):
    """Spherical linear interpolation between two vectors.

    Args:
        val: interpolation parameter; 0 returns `low`, 1 returns `high`.
        low, high: 1-D tensors (torch.dot is applied to their normalised versions).

    Returns:
        The interpolated vector. When the sine of the angle between the two
        vectors is exactly zero, plain linear interpolation is used instead.
    """
    unit_low = low / low.norm()
    unit_high = high / high.norm()
    # Clamp guards acos against dot products that stray outside [-1, 1] by rounding
    cos_omega = torch.clamp(torch.dot(unit_low, unit_high), -1, 1)
    omega = torch.acos(cos_omega)
    sin_omega = torch.sin(omega)
    if sin_omega == 0:
        # Linear interpolation as a fallback
        return (1.0 - val) * low + val * high
    weight_low = torch.sin((1.0 - val) * omega) / sin_omega
    weight_high = torch.sin(val * omega) / sin_omega
    return weight_low * low + weight_high * high

# Load the pretrained generator
G_t = torch.load("A2_weights/1c.New_530_G.pth",map_location=torch.device('cpu')).to(device)
d_t = 200

n_pairs = 10  # Number of random pairs
n_steps = 9  # Number of interpolation intervals; each row shows n_steps + 1 images
n_columns = n_steps + 1

# Store all interpolations
all_interpolations = []
for _ in range(n_pairs):
    # Sample two random latent vectors (z1, z2)
    z = sample_gaussian(torch.zeros((d_t,)), torch.eye(d_t), 2).unsqueeze(-1).unsqueeze(-1) # shape (2, d_t, 1, 1)
    z1, z2 = z[0], z[1]
    # s_i works on flat vectors, so drop the two singleton axes before interpolating
    start_vec, end_vec = z1.squeeze(), z2.squeeze()
    # alpha = step / n_steps runs from 0 to 1 inclusive; restore the (d_t, 1, 1) layout afterwards
    interpolations = [
        s_i(step / n_steps, start_vec, end_vec).unsqueeze(-1).unsqueeze(-1)
        for step in range(n_columns)
    ]
    all_interpolations.append(torch.stack(interpolations))

# Concatenate all interpolated latents into one tensor of shape (n_pairs * (n_steps + 1), d_t, 1, 1)
all_interpolations = torch.cat(all_interpolations, dim=0).to(device)
with torch.no_grad():
    # Generator output is in [-1, 1]; rescale to [0, 1] for display
    generated_images = (G_t(all_interpolations) + 1) / 2

# Plot the images row-wise
fig, axs = plt.subplots(n_pairs, n_columns, figsize=(n_columns * 2, n_pairs * 2))
# Display the images in rows, each row corresponding to one interpolation sequence
for img_idx, image in enumerate(generated_images):
    i, j = divmod(img_idx, n_columns)
    img = image.permute(1, 2, 0).cpu().numpy()  # Convert from (C, H, W) to (H, W, C)
    axs[i, j].imshow(img)
    axs[i, j].axis('off')

plt.tight_layout()
plt.show()


**[Q7] W-GAN Implementation**

**Observations**
* The last activation of the discriminator was tried with both a sigmoid and a linear (identity) output. The principled choice is the linear one, since the optimisation is over the space of all K-Lipschitz functions
* The G_loss increases for most choices of the hyperparameters of W-GAN when using weight/gradient clipping
* Increasing the lr to order of -4 causes excessive instability in training and {lr = 5e-5} as outlined in the paper was used
* W-GAN was significantly "slower" than DC-GAN training due to learning rate and convergence properties . In the paper for GNP the authors suggest to use Adam optimizer over RMSProp.
* Weight-clipping, Gradient Clipping and Gradient Norm Penalty were tried .







In [None]:
# Function to clip the gradients of the critic
def clip_gradients(critic, c):
    """Clamp every available parameter gradient of `critic` element-wise to [-c, c].

    Parameters whose `.grad` is None are left untouched.
    """
    for _name, weight in critic.named_parameters():
        if weight.grad is None:
            continue
        # print(f"Param: {_name}, Grad: {weight.grad}")
        weight.grad.data = weight.grad.data.clamp(-c, c)

def clip_weights(critic, c):
    """Clamp every parameter of ``critic`` element-wise into ``[-c, c]``.

    The update is applied through ``.data`` so it is invisible to autograd.
    """
    for param in critic.parameters():
        param.data.clamp_(-c, c)

# Gradient Norm Penalty
def compute_gp(netD, x, y):
    """Return the gradient-norm penalty ``mean((||grad_xhat netD(xhat)||_2 - 1)^2)``.

    ``xhat`` is a per-sample random convex combination of a real and a
    generated sample.

    Args:
        netD: critic; called once on the interpolated batch.
        x: batch of real images; its first dimension is the batch size and it
            must be 4-D so that the ``(m, 1, 1, 1)`` mixing weights broadcast.
        y: batch of generated images with the same shape as ``x``.

    Returns:
        0-dim tensor holding the penalty. It stays differentiable with respect
        to the critic parameters (``create_graph=True``).
    """
    m = x.size(0)
    # Treat both batches as constants: the penalty regularises the critic only,
    # so nothing should be back-propagated into the generator through `y`.
    real = x.detach()
    fake = y.detach()
    # One mixing weight per sample, created directly on the data's device/dtype.
    eps = torch.rand(m, 1, 1, 1, device=real.device, dtype=real.dtype)
    # Interpolation between real data and fake data (a fresh leaf tensor).
    interpolation = (eps * real + (1 - eps) * fake).requires_grad_(True)
    # Critic scores for the interpolated images
    interp_logits = netD(interpolation)

    # Gradients wrt the interpolated images; create_graph=True keeps the graph
    # (retain_graph defaults to create_graph) so the penalty can be back-propagated.
    gradients = torch.autograd.grad(
        outputs=interp_logits,
        inputs=interpolation,
        grad_outputs=torch.ones_like(interp_logits),
        create_graph=True,
    )[0]

    # Per-sample L2 norm; reshape (not view) also accepts non-contiguous gradients.
    grad_norm = gradients.reshape(m, -1).norm(2, dim=1)
    return torch.mean((grad_norm - 1) ** 2)


In [None]:
# (D) (TRIAL_3) -- critic without batch norm and with an identity (linear) output.
# Each row presumably follows the [in_channels, out_channels, kernel_size, stride, padding]
# layout documented on Decoder_GAN -- confirm against Critic_DCGAN.
conv_params_1 = [
    [3, 32, 4, 2, 1],
    [32, 64, 4, 2, 1],
    [64, 64, 4, 2, 1],
    [64, 128, 4, 2, 1],
    [128, 256, 4, 2, 1],
    [256, 1, 4, 4, 1],
]
use_batchnorm_1 = [False] * 6
activation_fn_1 = [nn.LeakyReLU(0.2, inplace=False) for _ in range(5)]
activation_fn_1.append(nn.Identity())
architecture_1 = [conv_params_1, use_batchnorm_1, activation_fn_1]
# Reported parameter count: 760,929 [760k]

# (G) (DCGAN/TRIAL_1) -- transposed-conv generator with a tanh output.
transpose_conv_params_2 = [
    [100, 512, 4, 1, 0],
    [512, 256, 4, 2, 1],
    [256, 128, 4, 2, 1],
    [128, 64, 4, 2, 1],
    [64, 32, 4, 2, 1],
    [32, 3, 4, 2, 1],
]
use_batchnorm_2 = [True] * 5 + [False]
activation_fn_2 = [nn.ReLU(inplace=False) for _ in range(5)]
activation_fn_2.append(nn.Tanh())
architecture_2 = [transpose_conv_params_2, use_batchnorm_2, activation_fn_2]
# Reported parameter count: 3,608,995 [3M]

# Why the critic has no batch norm (quoted from the gradient-penalty paper):
# "Most prior GAN implementations [22, 23, 2] use batch normalization
# in both the generator and the discriminator to help stabilize training, but batch normalization
# changes the form of the discriminator's problem from mapping a single input to a single output to
# mapping from an entire batch of inputs to a batch of outputs [23]. Our penalized training objective
# is no longer valid in this setting, since we penalize the norm of the critic's gradient with respect
# to each input independently, and not the entire batch"


In [None]:
"""Assuming "Critic/Discriminator/D" and "Generator/G" are defined as classes derived from nn.Module
Hyperparameters := {lr,clipping_parameter,mini_batch_size,# D updates per G update,latent_space dimension} to be chosen for optimisation"""

# Hyperparameters: learning rates, clipping bound, mini-batch size, latent dimension
lr_D = 5e-5
lr_G = 5e-5
c = 0.01
m = 64
d = 100

# Networks -- critic is built before the generator, as in the original cell
D_architecture, G_architecture = architecture_1, architecture_2
D = Critic_DCGAN(D_architecture).to(device)
G = Generator_DCGAN(G_architecture).to(device)

# Loss histories, filled in by the training cell
D_loss, G_loss = [], []

# Optimizers: Adam is used here. RMSprop was the alternative tried, i.e.
#   optim.RMSprop(<params>, lr=<lr>, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
D_optimizer = optim.Adam(
    D.parameters(),
    lr=lr_D,
    betas=(0.5, 0.99),
    eps=1e-8,
    weight_decay=0,
)
G_optimizer = optim.Adam(
    G.parameters(),
    lr=lr_G,
    betas=(0.5, 0.99),
    eps=1e-8,
    weight_decay=0,
)


In [None]:
# Optional cell: re-run by hand to change hyperparameters in the middle of training.
# NOTE(review): building new optimizers discards the Adam moment estimates accumulated so far,
# and beta2 here is 0.999 whereas the initial cell uses 0.99 -- confirm which one is intended.
lr_D = 5e-5
lr_G = 5e-5
m = 64
d = 100
c = 0.01

# RMSprop [used in paper] was the alternative:
#   optim.RMSprop(<params>, lr=<lr>, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
# Adam optimizer
D_optimizer = optim.Adam(
    D.parameters(),
    lr=lr_D,
    betas=(0.5, 0.999),
    eps=1e-8,
    weight_decay=0,
)
G_optimizer = optim.Adam(
    G.parameters(),
    lr=lr_G,
    betas=(0.5, 0.999),
    eps=1e-8,
    weight_decay=0,
)


In [None]:
# W-GAN training with gradient norm penalty.
# Each inner iteration performs n_critic critic updates followed by n_generator generator
# updates, then logs the LAST critic loss and the LAST generator loss of that iteration.
num_epochs = 50
n_critic = 5             # critic updates per inner iteration
n_generator = 2          # generator updates per inner iteration
gp_factor = 10           # weight of the gradient norm penalty in the critic loss
mini_batch_epochs = 84   # inner iterations per epoch
epsilon = 1e-15          # not referenced anywhere in this cell
# Training loop
for epoch in range(num_epochs):
    for _ in range(mini_batch_epochs):
      # start = time.time()
      # We will track image quality and gradient norm and decide when to halt
      for t in range(n_critic):
          # Critic phase: only D's parameters require gradients
          G.set_requires_grad(False)
          D.set_requires_grad(True)
          D_optimizer.zero_grad()
          G_optimizer.zero_grad()
          # presumably sample_train draws m real training images -- defined outside this chunk
          x = sample_train(m).to(device)
          # Latent batch reshaped with two trailing singleton dims, i.e. (m, d, 1, 1)
          z = (sample_gaussian(torch.zeros((d,)),torch.eye(d),m)).unsqueeze(-1).unsqueeze(-1).to(device)
          y = G(z)
          # The critic should maximise D(x) - D(y) to approximate W(P_r, P_theta);
          # equivalently we minimise D(y) - D(x), plus the weighted gradient norm penalty.
          # The generator then minimises the estimated Wasserstein distance.
          gradient_norm_penalty = (gp_factor) * compute_gp(D,x,y)
          loss_D = (D(y).mean() - D(x).mean()) + gradient_norm_penalty
          loss_D.backward()
          D_optimizer.step()
          D_optimizer.zero_grad()
          # Weight clipping is disabled in this gradient-penalty variant
          # clip_weights(D,c)

      for t in range(n_generator):
          # Generator phase: sample another batch of noise z from prior p(z); only G is trainable
          G.set_requires_grad(True)
          D.set_requires_grad(False)
          D_optimizer.zero_grad()
          G_optimizer.zero_grad()
          z1 = (sample_gaussian(torch.zeros((d,)),torch.eye(d),m)).unsqueeze(-1).unsqueeze(-1).to(device)
          y1 = G(z1)
          # Compute gradients for G: minimise -D(G(z)), i.e. raise the critic score of fakes
          loss_G = -(D(y1).mean())
          loss_G.backward()
          G_optimizer.step()
          G_optimizer.zero_grad()

      # One history entry per inner iteration (values from the final critic/generator step)
      D_loss.append(loss_D.item())
      G_loss.append(loss_G.item())
      # Optional: logging after each inner iteration
      # print(f"Epoch [{epoch+1}/{num_epochs}] | D Loss: {loss_D:.4f} | G Loss: {loss_G:.4f}")
      # stop = time.time()
      # print(stop - start)


In [None]:
# Loss curves recorded so far: critic on the left, generator on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Critic loss history
D_steps = range(len(D_loss))
ax1.plot(D_steps, D_loss)
ax1.set(xlabel='Mini-batch iteration', ylabel='loss', title='D_loss')

# Generator loss history
G_steps = range(len(G_loss))
ax2.plot(G_steps, G_loss)
ax2.set(xlabel='Mini-batch iterations', ylabel='loss', title='G_loss')

# Render
plt.tight_layout()
plt.show()


In [None]:
# Inference: draw 100 latent vectors and show the generated images in a 10 x 10 grid
with torch.no_grad():
    z = sample_gaussian(torch.zeros((d,)), torch.eye(d), 100)
    z = z.unsqueeze(-1).unsqueeze(-1).to(device)
    # Shift/scale generator output by (v + 1) / 2 before display
    y = (G(z) + 1) / 2
    images = y.detach().cpu().numpy()

# One axis per generated image
fig, axes = plt.subplots(nrows=10, ncols=10, figsize=(15, 15))
for i, ax in enumerate(axes.flat):
    # (C, H, W) -> (H, W, C) for imshow
    img = np.moveaxis(images[i], 0, -1)
    ax.imshow(img)
    ax.axis('off')
plt.tight_layout()
plt.show()


In [None]:
# Persist the complete critic and generator objects (whole pickled modules, not just
# state_dicts), so the class definitions must be available again when loading them.
torch.save(D, 'Animal_1d.New_85_D.pth')
torch.save(G, 'Animal_1d.New_85_G.pth')

# To restore a previously saved pair of whole models (note: different file names):
# # Load entire model
# D = torch.load('1d_650_D.pth')
# G = torch.load('1d_650_G.pth')

**Below are the best images obtained using the Gradient Norm Penalty**
* gp_factor = 10 and Adam(0.5, 0.99)
* lr = 5e-5 for stability, and no batch norm in the critic, with the last layer being nn.Identity()





**Below are the results of weight clipping / gradient clipping**
* Same architecture as before; d = 100; Adam
* Clipping factor c = 0.01



**[Q8 AND Q9 ] AUTO ENCODING GAN / LATENT-VARIABLE INFERENCE**

* There exist various ways in the literature to implement a decoder and an encoder along with GAN architectures, such as Cycle-GAN, DE-GAN, AAA VAE, VAE/GAN and AutoEncoding GAN.
* Most of these use a lower-dimensional latent space as the intermediate representation. For this assignment, however, we simply implement a single decoder that converts generated images back into the random noise that was fed to the generator.
* We train (GAN + Decoder) together with (GAN_loss + Reconstruction_loss between the predicted noise and the actual noise).
Note that the decoder has no direct relation to the real images.
* The architecture used for the decoder is very similar to that of the critic (a stack of convolutions), but this time the final output is not [1,1,1] but [100,1,1].
* Hopefully the "predicted noise" is also distributed as the Gaussian that was the true input. This is enforced using the reconstruction loss.









**Decoder that tries to predict the noise that would generate the image**

In [None]:
class Decoder_GAN(nn.Module):
    """Convolutional stack that maps an image batch to a predicted latent code.

    ``architecture`` is ``[conv_params, use_batchnorm, activation_fn]`` with one
    entry per layer in each of the three lists:

    * ``conv_params``: ``[in_channels, out_channels, kernel_size, stride, padding]``
    * ``use_batchnorm``: whether a ``BatchNorm2d`` follows that convolution
    * ``activation_fn``: the activation module appended after the layer

    In the notebook's examples the input is ``(bs, 3, 128, 128)`` and the last
    convolution outputs ``(bs, 100, 1, 1)``. Because the target is Gaussian
    noise (any real number), the final activation is expected to be an identity.
    """

    def __init__(self, architecture):
        super().__init__()
        conv_params = architecture[0]
        use_batchnorm = architecture[1]
        activation_fn = architecture[2]
        assert len(conv_params) == len(use_batchnorm) == len(activation_fn)
        self.conv_params = conv_params
        self.use_batchnorm = use_batchnorm
        self.activation_fn = activation_fn

        modules = []
        for spec, with_bn, act in zip(conv_params, use_batchnorm, activation_fn):
            out_channels = spec[1]
            modules.append(
                nn.Conv2d(
                    in_channels=spec[0],
                    out_channels=out_channels,
                    kernel_size=spec[2],
                    stride=spec[3],
                    padding=spec[4],
                )
            )
            if with_bn:
                modules.append(nn.BatchNorm2d(out_channels))
            modules.append(act)
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Run the batch ``x`` through the sequential conv stack."""
        return self.model(x)

    def set_requires_grad(self, requires_grad):
        """Set ``requires_grad`` on every parameter (used to freeze/unfreeze the net)."""
        for weight in self.parameters():
            weight.requires_grad = requires_grad


In [None]:
# Example usage
# output_size = ({input_size - kernel_size + 2*padding}/stride) + 1
# NOTE: this cell re-binds conv_params_1 / architecture_1 (now with batch norm and a
# sigmoid output), replacing the W-GAN critic configuration defined earlier.

# (E) (TRIAL 1) -- decoder/encoder: four stride-2 convs, then an 8x8 conv to 100 channels
conv_params_4 = [
    [3, 8, 4, 2, 1],
    [8, 16, 4, 2, 1],
    [16, 32, 4, 2, 1],
    [32, 64, 4, 2, 1],
    [64, 100, 8, 1, 0],
]
use_batchnorm_4 = [True] * 4 + [False]
activation_fn_4 = [nn.LeakyReLU(0.4, inplace=False) for _ in range(4)]
activation_fn_4.append(nn.Identity())
architecture_4 = [conv_params_4, use_batchnorm_4, activation_fn_4]
# Parameter count: 453,452

# (D) (TRIAL_3) -- discriminator with batch norm and sigmoid output
conv_params_1 = [
    [3, 32, 4, 2, 1],
    [32, 64, 4, 2, 1],
    [64, 64, 4, 2, 1],
    [64, 128, 4, 2, 1],
    [128, 256, 4, 2, 1],
    [256, 1, 4, 4, 1],
]
use_batchnorm_1 = [True] * 5 + [False]
activation_fn_1 = [nn.LeakyReLU(0.2, inplace=False) for _ in range(5)]
activation_fn_1.append(nn.Sigmoid())
architecture_1 = [conv_params_1, use_batchnorm_1, activation_fn_1]
# Reported parameter count: 760,929 [760k]

# (G) (DCGAN/TRIAL_1) -- transposed-conv generator with tanh output
transpose_conv_params_2 = [
    [100, 512, 4, 1, 0],
    [512, 256, 4, 2, 1],
    [256, 128, 4, 2, 1],
    [128, 64, 4, 2, 1],
    [64, 32, 4, 2, 1],
    [32, 3, 4, 2, 1],
]
use_batchnorm_2 = [True] * 5 + [False]
activation_fn_2 = [nn.ReLU(inplace=False) for _ in range(5)]
activation_fn_2.append(nn.Tanh())
architecture_2 = [transpose_conv_params_2, use_batchnorm_2, activation_fn_2]


In [None]:
# Sanity check: push a random batch through a fresh decoder, then print the
# output shape and the total number of parameters.
Decoder_1 = Decoder_GAN(architecture_4)
random_tensor = torch.randn(5, 3, 128, 128)
random_output = Decoder_1(random_tensor)
print(random_output.shape)
TP4 = sum(map(torch.numel, Decoder_1.parameters()))
print(TP4)


**Training Decoder along with the GAN**

In [None]:
"""Assuming "Critic/Discriminator/D" and "Generator/G" are defined as classes derived from nn.Module
Hyperparameters := {lr,mini_batch_size,# D updates per G update,latent_space dimension} to be chosen for optimisation
This time we also have the Decoder, named E because it actually encodes !!"""

# Hyperparameters: learning rates, mini-batch size, latent dimension
lr_D = 2e-4
lr_G = 2e-4
lr_E = 2e-4
m = 128
d = 100

# Networks -- built in the order D, G, E, as in the original cell
D_architecture, G_architecture, E_architecture = architecture_1, architecture_2, architecture_4
D = Critic_DCGAN(D_architecture).to(device)
G = Generator_DCGAN(G_architecture).to(device)
E = Decoder_GAN(E_architecture).to(device)

# One Adam optimizer per network. RMSprop was the alternative for D and G:
#   optim.RMSprop(<params>, lr=<lr>, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
D_optimizer = optim.Adam(
    D.parameters(),
    lr=lr_D,
    betas=(0.5, 0.99),
    eps=1e-8,
    weight_decay=0,
)
G_optimizer = optim.Adam(
    G.parameters(),
    lr=lr_G,
    betas=(0.5, 0.99),
    eps=1e-8,
    weight_decay=0,
)
E_optimizer = optim.Adam(
    E.parameters(),
    lr=lr_E,
    betas=(0.5, 0.99),
    eps=1e-8,
    weight_decay=0,
)

# Loss histories, filled in by the training cell
D_loss, G_loss, E_loss = [], [], []


In [None]:
# Optional cell: re-run by hand to change hyperparameters in the middle of training.
# NOTE(review): re-creating the optimizers discards the Adam moment estimates accumulated so far.
lr_D = 1e-4
lr_G = 1e-4
lr_E = 1e-4
m = 64
d = 100
D_optimizer = optim.Adam(
    D.parameters(),
    lr=lr_D,
    betas=(0.5, 0.99),
    eps=1e-8,
    weight_decay=0,
)
G_optimizer = optim.Adam(
    G.parameters(),
    lr=lr_G,
    betas=(0.5, 0.99),
    eps=1e-8,
    weight_decay=0,
)
E_optimizer = optim.Adam(
    E.parameters(),
    lr=lr_E,
    betas=(0.5, 0.99),
    eps=1e-8,
    weight_decay=0,
)


In [None]:
# Joint training of the GAN (D, G) and the decoder E.
# Each inner iteration runs n_critic discriminator updates, n_generator generator updates
# and n_decoder decoder updates, then logs the LAST loss of each phase.
num_epochs = 10
n_critic = 4             # discriminator updates per inner iteration
n_generator = 1          # generator updates per inner iteration
n_decoder = 1            # decoder updates per inner iteration
mini_batch_epochs = 42   # inner iterations per epoch
epsilon = 1e-15          # added inside the logs below so that log(0) is never evaluated
mse_loss = nn.MSELoss()  # reconstruction loss between sampled and predicted noise
# Training loop
for epoch in range(num_epochs):
    for _ in range(mini_batch_epochs):
        # We will track image quality and gradient norm and decide when to halt
        for t in range(n_critic):
            # Discriminator phase: only D's parameters require gradients
            G.set_requires_grad(False)
            D.set_requires_grad(True)
            E.set_requires_grad(False)
            D_optimizer.zero_grad()
            G_optimizer.zero_grad()
            E_optimizer.zero_grad()
            # presumably sample_train draws m real training images -- defined outside this chunk
            x = sample_train(m).to(device)
            # Latent batch reshaped with two trailing singleton dims, i.e. (m, d, 1, 1)
            z = (sample_gaussian(torch.zeros((d,)),torch.eye(d),m)).unsqueeze(-1).unsqueeze(-1).to(device)
            y = G(z)
            # f-GAN objective F(θ,w) maximizes wrt w and minimises wrt θ
            # Compute gradients for D where loss_D is a 0-dimensional tensor:
            # negative of mean log D(x) + mean log(1 - D(G(z)))
            loss_D = -((torch.log(D(x) + epsilon)).mean() + (torch.log(1 - D(y) + epsilon)).mean())
            loss_D.backward()
            D_optimizer.step()
            D_optimizer.zero_grad()

        for t in range(n_generator):
            # Generator phase: sample another batch of noise z from prior p(z); only G is trainable
            G.set_requires_grad(True)
            D.set_requires_grad(False)
            E.set_requires_grad(False)
            E_optimizer.zero_grad()
            D_optimizer.zero_grad()
            G_optimizer.zero_grad()
            z1 = (sample_gaussian(torch.zeros((d,)),torch.eye(d),m)).unsqueeze(-1).unsqueeze(-1).to(device)
            y1 = G(z1)
            # E's weights are frozen here, but the reconstruction error still reaches G through E
            z2 = E(y1)
            # Compute gradients for G: -mean log D(G(z)) plus the noise reconstruction error
            loss_G = -(torch.log(D(y1) + epsilon).mean()) + (mse_loss(z1,z2))
            loss_G.backward()
            G_optimizer.step()
            G_optimizer.zero_grad()

        for t in range(n_decoder):
            # Decoder phase: sample yet another batch of noise z from prior p(z); only E is trainable
            D.set_requires_grad(False)
            G.set_requires_grad(False)
            E.set_requires_grad(True)
            D_optimizer.zero_grad()
            G_optimizer.zero_grad()
            E_optimizer.zero_grad()
            z3 = (sample_gaussian(torch.zeros((d,)),torch.eye(d),m)).unsqueeze(-1).unsqueeze(-1).to(device)
            y2 = G(z3)
            z4 = E(y2)
            # Compute gradients for E: mean squared error between predicted and true noise
            loss_E = (mse_loss(z4,z3))
            loss_E.backward()
            E_optimizer.step()
            E_optimizer.zero_grad()

        # One history entry per inner iteration (values from the final step of each phase)
        D_loss.append(loss_D.item())
        G_loss.append(loss_G.item())
        E_loss.append(loss_E.item())

    # Optional: Logging after each epoch
    # print(f"Epoch [{epoch+1}/{num_epochs}] | D Loss: {loss_D.item():.4f} | G Loss: {loss_G.item():.4f} | E_loss: {loss_E.item():.4f}")
    # stop = time.time()
    # print(stop - start)


In [None]:
# Loss curves for the three networks, one colour-coded panel each
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 6))

# Discriminator loss (red)
D_steps = range(len(D_loss))
ax1.plot(D_steps, D_loss, color='red')
ax1.set(xlabel='Mini-batch iteration', ylabel='loss', title='D_loss')

# Generator loss (blue)
G_steps = range(len(G_loss))
ax2.plot(G_steps, G_loss, color='blue')
ax2.set(xlabel='Mini-batch iterations', ylabel='loss', title='G_loss')

# Decoder loss (green)
E_steps = range(len(E_loss))
ax3.plot(E_steps, E_loss, color='green')
ax3.set(xlabel='Mini-batch iterations', ylabel='loss', title='E_loss')

# Render
plt.tight_layout()
plt.show()


In [None]:
# Inference: draw 100 latent vectors and show the generated images in a 10 x 10 grid
with torch.no_grad():
    z = sample_gaussian(torch.zeros((d,)), torch.eye(d), 100)
    z = z.unsqueeze(-1).unsqueeze(-1).to(device)
    # Shift/scale generator output by (v + 1) / 2 before display
    y = (G(z) + 1) / 2
    images = y.detach().cpu().numpy()

# One axis per generated image
fig, axes = plt.subplots(nrows=10, ncols=10, figsize=(15, 15))
for i, ax in enumerate(axes.flat):
    # (C, H, W) -> (H, W, C) for imshow
    img = np.moveaxis(images[i], 0, -1)
    ax.imshow(img)
    ax.axis('off')
plt.tight_layout()
plt.show()


**Following were the best images obtained**
*  lr = 1e-4 then reduced to 5e-5 using Adam(0.5,0.99)
*  We update {5,2,1} for D,G,E respectively then slowly change it to {5,3,1} and finally {4,2,1} with updates to G,E happening separately



**A simple MLP for classifying using Decoder outputs**

In [None]:
class MLP(nn.Module):
    """Fully connected network assembled from a layer specification.

    ``architecture`` is ``[linear_params, use_batchnorm, activation_fn]`` with one
    entry per layer in each of the three lists:

    * ``linear_params``: ``[in_dim, out_dim]``
    * ``use_batchnorm``: whether a ``BatchNorm1d`` follows that linear layer
    * ``activation_fn``: the activation module appended after the layer

    The network returns raw scores (shape ``(bs, 90)`` in the notebook's
    configuration); no softmax is applied here because ``nn.CrossEntropyLoss``
    applies it internally.
    """

    def __init__(self, architecture):
        super().__init__()
        linear_params, use_batchnorm, activation_fn = architecture[0], architecture[1], architecture[2]
        assert len(linear_params) == len(use_batchnorm) == len(activation_fn)
        self.linear_params = linear_params
        self.use_batchnorm = use_batchnorm
        self.activation_fn = activation_fn

        modules = []
        for dims, with_bn, act in zip(linear_params, use_batchnorm, activation_fn):
            in_dim, out_dim = dims
            modules.append(nn.Linear(in_dim, out_dim))
            if with_bn:
                modules.append(nn.BatchNorm1d(out_dim))
            modules.append(act)
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Run the batch ``x`` through the sequential stack."""
        return self.model(x)

    def set_requires_grad(self, requires_grad):
        """Set ``requires_grad`` on every parameter (used to freeze/unfreeze the net)."""
        for weight in self.parameters():
            weight.requires_grad = requires_grad


In [None]:
# Classifier head: 100 -> 512 -> 256 -> 90, batch norm + ReLU on the two hidden
# layers and an identity on the output layer.
linear_params_5 = [
    [100, 512],
    [512, 256],
    [256, 90],
]
use_batchnorm_5 = [True] * 2 + [False]
activation_fn_5 = [nn.ReLU(inplace=False) for _ in range(2)]
activation_fn_5.append(nn.Identity())
architecture_5 = [linear_params_5, use_batchnorm_5, activation_fn_5]

# Shape check on a random batch of 7 latent vectors
MLP_1 = MLP(architecture_5)
random_input = torch.randn(7, 100)
random_output = MLP_1(random_input)
print(random_output.shape)


**Training the MLP to classify**

In [None]:
# Classifier N (trained here) on top of a frozen, previously saved encoder E.
learning_rate = 2e-3
N = MLP(architecture_5).to(device)
N_optimizer = optim.SGD(N.parameters(), lr = learning_rate, momentum=0.2)
N_loss = []
# The checkpoint is a whole pickled module, so:
#  * map_location=device lets a checkpoint saved on a GPU load on a CPU-only runtime
#    (a bare torch.load(...) followed by .to(device) fails before .to() is reached);
#  * weights_only=False is needed on PyTorch versions whose default is weights_only=True.
#    This un-pickles arbitrary objects -- only load checkpoints you created yourself.
E = torch.load("E_2a_New_330.pth", map_location=device, weights_only=False).to(device)
E.set_requires_grad(False)
# Frozen feature extractor: eval mode makes its BatchNorm layers use their running
# statistics, so an image's encoding does not depend on the rest of the batch.
E.eval()


In [None]:
# Optional cell: re-run by hand to continue training with a different learning rate.
# NOTE(review): a new SGD instance starts with empty momentum buffers.
learning_rate = 1e-5
N_optimizer = optim.SGD(
    N.parameters(),
    lr=learning_rate,
    momentum=0.2,
)


In [None]:
# Training the MLP on encodings produced by the frozen encoder E
num_epochs = 100   # number of mini-batch updates performed by this cell
m = 32             # mini-batch size
criterion = nn.CrossEntropyLoss()

# Only the classifier is trained (loop-invariant, so set once)
N.set_requires_grad(True)

# animal_images holds pixel values in [0,1]; they are rescaled to [-1,1] before encoding
for epoch in range(num_epochs):
    # start = time.time()
    N_optimizer.zero_grad()
    # Sample (class, image-in-class) index pairs on the CPU, in the same draw order as before
    i_indices = torch.randint(0, 90, (m,))   # chooses class
    j_indices = torch.randint(0, 60, (m,))   # chooses image in class
    i_indices_0 = i_indices.to(device)       # class labels = classification targets

    # One indexed gather instead of a per-sample Python loop followed by torch.stack
    x = animal_images[i_indices, j_indices].to(device)
    x_1 = (2*x - 1)
    # Get the encodings using E of shape (bs,100,1,1); E is frozen, so no graph is needed
    with torch.no_grad():
        z = (E(x_1).squeeze(-1).squeeze(-1))
    y = N(z) # Output of shape (bs,90)
    loss_N = criterion(y, i_indices_0)
    loss_N.backward()
    N_optimizer.step()
    N_optimizer.zero_grad()
    N_loss.append(loss_N.item())

    # print(f"Epoch [{epoch+1}/{num_epochs}] | N Loss: {N_loss[-1]:.4f}")
    # stop = time.time()
    # print(stop - start)


In [None]:
# Training curve of the MLP classifier
fig, ax = plt.subplots(figsize=(8, 6))
N_steps = range(len(N_loss))  # x-axis: iteration index
ax.plot(N_steps, N_loss)      # y-axis: recorded N_loss values
ax.set(xlabel='Mini-batch iteration', ylabel='Loss', title='N_loss')
# Display the plot
plt.show()


In [None]:
# Determine the final classification accuracy (in percent) on the TRAINING SET
n_classes, n_per_class = animal_images.shape[0], animal_images.shape[1]
Correct = 0
# Evaluate the classifier in eval mode: each batch below contains images of a single
# class, so BatchNorm1d batch statistics would not match what N saw during training.
N_was_training = N.training
N.eval()
try:
    with torch.no_grad():
        for i in range(n_classes):
            # All images of class i form one batch
            x = animal_images[i].to(device)
            # Same preprocessing as in the MLP training loop: map [0,1] -> [-1,1] before encoding.
            # (Feeding raw [0,1] images gives E inputs it was not used with during MLP training.)
            z = (E(2*x - 1).squeeze(-1).squeeze(-1))
            y = N(z)
            prediction = torch.argmax(y, dim=1)
            # Count the predictions that equal the class index
            Correct += int((prediction == i).sum())
finally:
    # Leave N in whatever mode it was in, so further training cells behave as before
    N.train(N_was_training)
print("Final Accuracy of MLP based on decoder outputs", 100 * Correct / (n_classes * n_per_class))


*  An MLP trained on the latents (after 2000 mini-batch epochs for the {GAN + Decoder}) initially gives between 1% and around 4% accuracy.
*  On further training, the CE loss decreases to around 0.3 but the accuracy drops to 1% again, which suggests that the latents learnt by the previous decoder aren't good and that the CE loss doesn't directly correlate with accuracy.

