In [None]:
import matplotlib.pyplot as plt

Import the [dataset](https://www.kaggle.com/datasets/lokeshbolisetty/speech-to-image-dataset) from Kaggle.

In [None]:
import os

DATA_DIR = '../input/speech-to-image-dataset/Photos/CatImages' #directory containing cat images

Load the dataset using the `ImageFolder` class from torchvision. Resize the images to 64x64 and normalize the pixels so that all the pixel values are in the range (-1, 1).

In [None]:
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import torchvision.transforms as T

In [None]:
# To normalize the pixel values we use a mean and a standard deviation of 0.5 for each channel.
# With inputs in the range (0, 1) this maps the pixel values to the range (-1, 1),
# which is a convenient input range for training the discriminator.

image_size = 64 # side length, in pixels, of the square images used for training
batch_size = 128 # number of images per batch; used to create the data loader
stats = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) # (per-channel means, per-channel standard deviations)




In [None]:
# Dataset: every image is resized, centre-cropped, converted to a tensor and normalized.
train_ds = ImageFolder(
    DATA_DIR,
    transform=T.Compose([
        T.Resize(image_size),      # scale so the shorter side is image_size pixels
        T.CenterCrop(image_size),  # keep the central image_size x image_size square
        T.ToTensor(),              # convert the image into a tensor with values in (0, 1)
        T.Normalize(*stats),       # shift/scale the values from (0, 1) to (-1, 1)
    ]),
)

# Data loader: yields shuffled batches of (images, labels).
train_dl = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,     # visit the images in a different random order each epoch
    num_workers=3,    # read the images with several worker processes in parallel
    pin_memory=True,
)

# Ignore the warning here

In [None]:
import torch
torch.manual_seed(0)
from torchvision.utils import make_grid
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# as we normalized the pixel values into (-1, 1) 
# this denormalization brings the pixel values back 
# into the range of (0, 1) we use this while we view images

def denorm(img_tensors):
    """Undo the normalization so the values are back in the (0, 1) range for viewing.

    The mean and standard deviation of the first channel in `stats` are applied
    to every channel.
    """
    mean, std = stats[0][0], stats[1][0]
    return img_tensors * std + mean

In [None]:
# show_images takes image tensors and maximum number of images it should show and plots them in a grid
def show_images(images, nmax=64):
    """Plot up to `nmax` images from a batch as a grid with 8 images per row.

    Args:
        images: batch of normalized image tensors (values in (-1, 1)); may live
            on any device and may still be attached to an autograd graph.
        nmax: maximum number of images to show.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xticks([]); ax.set_yticks([])
    # Detach from the graph and copy to host memory first: matplotlib cannot
    # convert tensors that live on a GPU.
    batch = images.detach()[:nmax].cpu()
    # make_grid returns (C, H, W); imshow expects (H, W, C).
    ax.imshow(make_grid(denorm(batch), nrow=8).permute(1, 2, 0))

# show_batch takes the data loader so as get the batch of images from dataloader and show the images 
def show_batch(dl, nmax=64):
    """Show the first batch of images produced by the data loader `dl`.

    The labels that come with the batch are ignored. Nothing is shown when the
    loader yields no batches.
    """
    first_batch = next(iter(dl), None)
    if first_batch is None:
        return
    images, _ = first_batch
    show_images(images, nmax)

In [None]:
def get_default_device():
    """Return the CUDA device if PyTorch reports one as available, else the CPU device."""
    # torch.cuda.is_available() returns True only when three conditions hold:
    #   - the execution environment is connected to an Nvidia GPU / graphics card
    #   - the CUDA drivers are installed
    #   - the installed PyTorch version is compatible with the GPU
    # All of these are ensured on Colab/Kaggle.
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
# to_device takes data and move it onto a target device
def to_device(data, device):
    """Move a tensor, or a (possibly nested) list/tuple of tensors, to `device`.

    Lists and tuples are processed element by element and always come back as
    a list; anything else is moved with a non-blocking `.to()` call.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

class DeviceDataLoader():
    """Wrapper around a data loader that moves every batch to a device."""

    def __init__(self, dl, device):
        # dl: the wrapped data loader; device: where its batches are moved to
        self.dl, self.device = dl, device

    def __len__(self):
        """Return the number of batches of the wrapped data loader."""
        return len(self.dl)

    def __iter__(self):
        """Yield the batches of the wrapped data loader, each moved to the device."""
        yield from (to_device(batch, self.device) for batch in self.dl)

In [None]:
# Pick the device used by the rest of the notebook and display it, to check
# whether a GPU is in use: it shows 'cuda' when a GPU is used, otherwise 'cpu'.

device = get_default_device()
device



We can now move our training data loader using `DeviceDataLoader` for automatically transferring batches of data to the GPU (if available).

In [None]:
# Wrap the training data loader in a DeviceDataLoader so that each batch is moved to `device`.
# Note: this rebinds `train_dl`, so re-running this cell wraps the loader once more.

train_dl = DeviceDataLoader(train_dl, device)


## Discriminator Network

In [None]:
import torch.nn as nn

In [None]:
# Discriminator: four stride-2 convolutions halve the spatial size each time
# (64 -> 32 -> 16 -> 8 -> 4) while the channel count grows; a final 4x4
# convolution reduces each image to a single value that Sigmoid maps into (0, 1).
discriminator = nn.Sequential(
    # in: 3 x 64 x 64

    nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1, bias=False),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(0.2, inplace=True), # leaky ReLU activation with negative slope 0.2
    # out: 64 x 32 x 32

    nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1, bias=False),
    nn.BatchNorm2d(128),
    nn.LeakyReLU(0.2, inplace=True), # leaky ReLU activation with negative slope 0.2
    # out: 128 x 16 x 16

    nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1, bias=False),
    nn.BatchNorm2d(256),
    nn.LeakyReLU(0.2, inplace=True), # leaky ReLU activation with negative slope 0.2
    # out: 256 x 8 x 8

    nn.Conv2d(256, 512, kernel_size=4, stride=2, padding=1, bias=False),
    nn.BatchNorm2d(512),
    nn.LeakyReLU(0.2, inplace=True), # leaky ReLU activation with negative slope 0.2
    # out: 512 x 4 x 4

    nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0, bias=False),
    # out: 1 x 1 x 1

    nn.Flatten(), # flatten 1 x 1 x 1 into a single value per image
    nn.Sigmoid()) # squash that value into (0, 1): the predicted probability that the image is real

Note that we're using the Leaky ReLU activation for the discriminator.

Just like any other binary classification model, the output of the discriminator is a single number between 0 and 1, which can be interpreted as the probability of the input image being real i.e. picked from the original dataset.

In [None]:
discriminator = to_device(discriminator, device) #moving the discriminator model to device

## Generator Network

We use `ConvTranspose2d` to perform *transposed convolution*. This will convert a latent tensor of shape (128, 1, 1) into an image tensor of shape (3, 64, 64).

In [None]:
latent_size = 128

In [None]:
# Generator: a 4x4 transposed convolution expands the latent vector to a 4 x 4
# feature map; four stride-2 transposed convolutions then double the spatial
# size each time (4 -> 8 -> 16 -> 32 -> 64) while the channel count shrinks.
generator = nn.Sequential(
    # in: latent_size x 1 x 1

    nn.ConvTranspose2d(latent_size, 512, kernel_size=4, stride=1, padding=0, bias=False),
    nn.BatchNorm2d(512),
    nn.ReLU(True), # in-place ReLU activation
    # out: 512 x 4 x 4

    nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1, bias=False),
    nn.BatchNorm2d(256),
    nn.ReLU(True), # in-place ReLU activation
    # out: 256 x 8 x 8

    nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1, bias=False),
    nn.BatchNorm2d(128),
    nn.ReLU(True), # in-place ReLU activation
    # out: 128 x 16 x 16

    nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1, bias=False),
    nn.BatchNorm2d(64),
    nn.ReLU(True), # in-place ReLU activation
    # out: 64 x 32 x 32

    nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1, bias=False),
    nn.Tanh() # squashes the pixel values into the range (-1, 1)
    # out: 3 x 64 x 64
)
# The generator therefore outputs pixel values in the range (-1, 1) with shape 3 x 64 x 64,
# the same range and shape as the dataset images after normalization.


We use the TanH activation function for the output layer of the generator.

In [None]:
# Sanity check: pass a batch of random latent tensors through the generator,
# print the shape of its output and display the generated images.
xb = torch.randn(batch_size, latent_size, 1, 1) # random latent tensors
fake_images = generator(xb)
print(fake_images.shape)
show_images(fake_images)

In [None]:
generator = to_device(generator, device)
# moving the generator model to the device

## Training the discriminator

In [None]:
def train_discriminator(real_images, opt_d):
    """Perform one optimisation step of the discriminator.

    The discriminator is scored on a batch of real images (target 1) and on a
    batch of freshly generated fake images (target 0); the two binary
    cross-entropy losses are summed and back-propagated.

    Args:
        real_images: batch of real images, already on `device`.
        opt_d: optimizer that updates the discriminator's parameters.

    Returns:
        Tuple `(loss, real_score, fake_score)` of Python floats: the summed
        loss, and the mean discriminator output on the real and on the fake
        batch respectively.
    """
    # Clear discriminator gradients
    opt_d.zero_grad()

    # Pass real images through discriminator
    # targets are set to ones for all the real images
    real_preds = discriminator(real_images)
    real_targets = torch.ones(real_images.size(0), 1, device=device)
    real_loss = F.binary_cross_entropy(real_preds, real_targets)
    real_score = torch.mean(real_preds).item()

    # Generate fake images. Only the discriminator is updated in this step, so
    # the generator is run without gradient tracking: no generator graph is
    # built, the backward pass stops at the discriminator input, and no stale
    # gradients are accumulated on the generator's parameters.
    latent = torch.randn(batch_size, latent_size, 1, 1, device=device)
    with torch.no_grad():
        fake_images = generator(latent)

    # Pass fake images through discriminator
    # targets are set to zero for all the fake images
    fake_targets = torch.zeros(fake_images.size(0), 1, device=device)
    fake_preds = discriminator(fake_images)
    fake_loss = F.binary_cross_entropy(fake_preds, fake_targets)
    fake_score = torch.mean(fake_preds).item()

    # Update discriminator weights
    loss = real_loss + fake_loss
    loss.backward()
    opt_d.step()
    return loss.item(), real_score, fake_score

## Generator Training

- We generate a batch of images using the generator and pass them into the discriminator.

- We calculate the loss by setting the target labels to 1 i.e. real. We do this because the generator's objective is to "fool" the discriminator. 

- We use the loss to perform gradient descent i.e. change the weights of the generator, so it gets better at generating real-like images to "fool" the discriminator.


In [None]:
def train_generator(opt_g):
    """Perform one optimisation step of the generator and return its loss.

    A batch of fake images is generated from random latent tensors and scored
    by the discriminator. The loss is the binary cross-entropy between those
    scores and an all-ones ("real") target, so it is small when the
    discriminator is fooled.

    Args:
        opt_g: optimizer that updates the generator's parameters.

    Returns:
        The generator loss for this batch as a Python float.
    """
    # Start from clean generator gradients
    opt_g.zero_grad()

    # Fake images from random latent tensors, scored by the discriminator
    noise = torch.randn(batch_size, latent_size, 1, 1, device=device)
    disc_out = discriminator(generator(noise))

    # The generator wants every fake image to be classified as real (1)
    want_real = torch.ones(batch_size, 1, device=device)
    g_loss = F.binary_cross_entropy(disc_out, want_real)

    # Back-propagate and update the generator weights
    g_loss.backward()
    opt_g.step()

    return g_loss.item()

Saving the intermediate outputs from the generator to understand the speed of the training.

In [None]:
# For visual inspection, after every epoch
# we generate a batch of images and save it to a file.

from torchvision.utils import save_image


In [None]:
import os
sample_dir = 'generated'
os.makedirs(sample_dir, exist_ok=True)

In [None]:
#f= open("./generated/demofile.txt",'w')

In [None]:
def save_samples(index, latent_tensors, show=True):
    """Generate images from `latent_tensors`, save them as one grid image and optionally show it.

    The grid is written to `sample_dir` as 'generated-images-<index>.png', with
    the index zero-padded to four digits.

    Args:
        index: integer used in the file name (e.g. the epoch number).
        latent_tensors: batch of latent tensors fed to the generator.
        show: when True, also plot the grid inline.
    """
    # Inference only: no autograd graph is needed for saving/plotting.
    with torch.no_grad():
        # Map the generator output from (-1, 1) back to (0, 1) once, so that
        # the saved file and the inline plot show the same pixel values.
        fake_images = denorm(generator(latent_tensors))
    fake_fname = 'generated-images-{0:0=4d}.png'.format(index)
    save_image(fake_images, os.path.join(sample_dir, fake_fname), nrow=8)
    print('Saving', fake_fname)
    if show:
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.set_xticks([]); ax.set_yticks([])
        # make_grid returns (C, H, W); imshow expects (H, W, C) on the CPU.
        ax.imshow(make_grid(fake_images.cpu(), nrow=8).permute(1, 2, 0))

We'll use a fixed set of input vectors to the generator to see how the individual generated images evolve over time as we train the model. Let's save one set of images before we start training our model.

In [1]:
# creating a set of latent tensors that we can use after each epoch.
fixed_latent = torch.randn(64, latent_size, 1, 1, device=device)


In [None]:
#saving the samples before training
save_samples(0, fixed_latent)


## Training

In [None]:
from tqdm.notebook import tqdm
import torch.nn.functional as F

In [None]:
def fit(epochs, lr, start_idx=1):
    """Train the discriminator and the generator alternately on `train_dl`.

    For every batch the discriminator is updated first, then the generator.
    After each epoch the statistics of that epoch's LAST batch are recorded and
    printed, and a grid of images generated from `fixed_latent` is saved.

    Args:
        epochs: number of passes over `train_dl`.
        lr: learning rate used for both Adam optimizers.
        start_idx: index used in the file name of the first saved sample grid;
            epoch `e` (0-based) is saved with index `e + start_idx`.

    Returns:
        Tuple `(losses_g, losses_d, real_scores, fake_scores)` of lists with
        one entry per epoch.
    """
    torch.cuda.empty_cache() #to remove unused data from GPU  
    
    # Losses & scores
    losses_g = [] # generator losses
    losses_d = [] # discriminator losses
    real_scores = []
    fake_scores = []
    
    # Create optimizers (new ones on every call to fit)
    opt_d = torch.optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))
    opt_g = torch.optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))
    
    for epoch in range(epochs):
        # The labels yielded with each batch are not used
        for real_images, _ in tqdm(train_dl):
            # Train discriminator
            loss_d, real_score, fake_score = train_discriminator(real_images, opt_d)
            # Train generator
            loss_g = train_generator(opt_g)
            
        # Record losses & scores
        # (values left over from the last batch of the epoch; they are only
        # defined if train_dl yielded at least one batch)
        losses_g.append(loss_g)
        losses_d.append(loss_d)
        real_scores.append(real_score)
        fake_scores.append(fake_score)
        
        # Log losses & scores (last batch)
        print("Epoch [{}/{}], loss_g: {:.4f}, loss_d: {:.4f}, real_score: {:.4f}, fake_score: {:.4f}".format(
            epoch+1, epochs, loss_g, loss_d, real_score, fake_score))
    
        # Save generated images
        save_samples(epoch+start_idx, fixed_latent, show=False)
    
    return losses_g, losses_d, real_scores, fake_scores

We are now ready to train the model. Try different learning rates to see if you can maintain the fine balance between training the generator and training the discriminator.

In [None]:
lr = 0.0002 # learning rate passed to fit()
epochs = 1000 # The number of epochs is not settled yet; decent results seem to need considerably more than this.

In [None]:
history = fit(epochs, lr)

In [None]:
losses_g, losses_d, real_scores, fake_scores = history

## Saving the checkpoints

In [None]:
# Save the model checkpoints 
torch.save(generator.state_dict(), 'G.pth')
torch.save(discriminator.state_dict(), 'D.pth')

Here's how the generated images look after the 1st, 5th, 10th, 20th and 25th epochs of training.

In [None]:
!zip -r 'generated200.zip' ./generated

In [None]:
from IPython.display import Image

In [None]:
Image('./generated/generated-images-0001.png')

In [None]:
Image('./generated/generated-images-0005.png')

In [None]:
Image('./generated/generated-images-0010.png')

In [None]:
Image('./generated/generated-images-0020.png')

In [None]:
Image('./generated/generated-images-0025.png')

In [None]:
def save_samples(index, latent_tensors, show=True):
    """Preview the generator output for `latent_tensors` without writing a file.

    NOTE: this redefinition shadows the earlier `save_samples`, which writes the
    grid to `sample_dir`. Once this cell has run, later calls (including those
    made by `fit`) no longer save any image.

    Args:
        index: integer used to build the file name that is reported.
        latent_tensors: batch of latent tensors fed to the generator.
        show: when True, plot the generated images as a grid.
    """
    # Inference only: no autograd graph is needed for plotting.
    with torch.no_grad():
        # Map the generator output from (-1, 1) back to (0, 1) so that imshow
        # does not clip the negative half of the value range.
        fake_images = denorm(generator(latent_tensors))
    fake_fname = 'generated-images-{0:0=4d}.png'.format(index)
    #save_image(fake_images, os.path.join(sample_dir, fake_fname), nrow=8)
    # Saving is disabled above, so report a preview instead of claiming a save.
    print('Previewing', fake_fname, '(not saved)')
    if show:
        fig, ax = plt.subplots(figsize=(8,8))
        ax.set_xticks([]); ax.set_yticks([])
        # make_grid returns (C, H, W); imshow expects (H, W, C) on the CPU.
        ax.imshow(make_grid(fake_images.cpu(), nrow=8).permute(1, 2, 0))

In [None]:
save_samples(10000000, fixed_latent, True)

In [None]:
def show_images1(images, nmax=64, figsize=(2, 2)):
    """Plot up to `nmax` images as a grid on a small figure.

    Args:
        images: batch of normalized image tensors (values in (-1, 1)); may live
            on any device and may still be attached to an autograd graph.
        nmax: maximum number of images to show.
        figsize: matplotlib figure size in inches; defaults to the small
            2 x 2 figure this helper has always used.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_xticks([]); ax.set_yticks([])
    # Detach from the graph and copy to host memory first: matplotlib cannot
    # convert tensors that live on a GPU.
    batch = images.detach()[:nmax].cpu()
    # make_grid returns (C, H, W); imshow expects (H, W, C).
    ax.imshow(make_grid(denorm(batch), nrow=8).permute(1, 2, 0))


In [None]:
# Generate a single image from one random latent tensor and display it.
xb = torch.randn(1, latent_size, 1, 1).to(device)  # sampled first, then moved to the device
fake_images = generator(xb).cpu()  # bring the result back to the CPU for plotting
print(fake_images.shape)
show_images1(fake_images)

<p align="center"> <img src="https://thumbs.gfycat.com/FlatIdealisticBoaconstrictor-size_restricted.gif" alt="Tried to add some humor but failed :(" > </p>