<a href="https://colab.research.google.com/github/Kwanikaze/vpandas/blob/master/VAE_OHE_8digits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np

## Generate Data

In [2]:
def generate_data(num=8):
    """ Generate 'num' number of one-hot encoded integers. """ 
    x_train = np.eye(num)[np.arange(num)]                       # This is a simple way to one-hot encode integers
    
    # Repeat x_train multiple times for training
    x_train = np.repeat(x_train, 100, axis=0)
    
    # The target is x_train itself!
    x_target = x_train.copy()
    return x_train, x_target

In [3]:
num = 8
np.random.seed(10)
x_train, x_target = generate_data(num=num)

In [4]:
print(x_train)
print(np.shape(x_train))
print(np.shape(x_target))

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
(800, 8)
(800, 8)


## Variational Autoencoder Parameters

In [5]:
#Parameters
latent_dims = 3
num_epochs = 2000
batch_size = 64
learning_rate = 1e-3
use_gpu = True
variational_beta = 0.00001 #tuned

## VAE Definition
https://colab.research.google.com/github/smartgeometry-ucl/dl4g/blob/master/variational_autoencoder.ipynb#scrollTo=0psoODlF9S_Y

https://stats.stackexchange.com/questions/361643/sampling-z-in-vae

https://stats.stackexchange.com/questions/318748/deriving-the-kl-divergence-loss-for-vaes

In [6]:
class VariationalAutoencoder(nn.Module):
    def __init__(self, latent_dims):
        super().__init__()
        self.fc1 = nn.Linear(num, latent_dims) # why have this additional fc layer?
        self.fc_mu = nn.Linear(latent_dims, latent_dims)
        self.fc_logvar = nn.Linear(latent_dims, latent_dims)
        self.fc_out = nn.Linear(latent_dims,num)

    def encode(self, x):
        h1 = torch.sigmoid(self.fc1(x))
        return self.fc_mu(h1), self.fc_logvar(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        if z.size()[0] == latent_dims: #resize from [3] to [1,3]
          z = z.view(1, latent_dims)
        softmax = nn.Softmax(dim=1)
        recon = softmax(self.fc_out(z))
        return recon

    def forward(self, x, latent_dims):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

def vae_loss(batch_recon, x_batch_targets, mu, logvar):
  criterion = nn.CrossEntropyLoss()
  CE = criterion(batch_recon, x_batch_targets)
  #print(CE)
  #BCE = F.binary_cross_entropy(recon, x, reduction='sum') #one hot encoded input
  KLd = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) # https://stats.stackexchange.com/questions/318748/deriving-the-kl-divergence-loss-for-vaes
  #print(KLd)
  return CE,variational_beta*KLd, CE + variational_beta*KLd

In [7]:
def trainVAE(VAE, latent_dims):
  VAE.train()
  x_train, x_target = generate_data(num=num)
  inds = list(range(x_train.shape[0]))
  N = x_train.shape[0] # 800
  freq = num_epochs // 10 # floor division

  loss_hist = []
  x_train = Variable(torch.from_numpy(x_train))
  x_target = Variable(torch.from_numpy(x_target))
  for epoch in range(num_epochs):
      inds = np.random.permutation(inds)
      x_train = x_train[inds]
      x_train = x_train.to(device)
      x_target = x_target[inds]
      x_target = x_target.to(device)
      
      loss = 0
      CE = 0
      KLd = 0
      num_batches = N / batch_size
      for b in range(0, N, batch_size):
          #get the mini-batch
          x_batch = x_train[b: b+batch_size]
          x_target_batch = x_target[b: b+batch_size]
          
          #feed forward
          batch_recon,latent_mu,latent_logvar = VAE(x=x_batch.float(),latent_dims = latent_dims)
          
          # Error
          #Convert x_batch from OHE vectors to single scalar for target class, of each sample in batch 
          _, x_batch_targets = x_batch.max(dim=1)
          train_CE, train_KLd, train_loss = vae_loss(batch_recon, x_batch_targets, latent_mu, latent_logvar)
          #print(batch_recon.size())
          #print(x_batch_targets.size())
          loss += train_loss.item() / N # update epoch loss
          CE += train_CE.item() / N 
          KLd += train_KLd.item() / N 

          #Backprop the error, compute the gradient
          optimizer.zero_grad()
          train_loss.backward()
          
          #update parameters based on gradient
          optimizer.step()
          
      #Record loss per epoch        
      loss_hist.append(loss)
      
      if epoch % freq == 0:
          print()
          print("Epoch %d/%d\t CE: %.5f, KLd: %.5f, Train loss=%.5f" % (epoch + 1, num_epochs,CE,KLd, loss), end='\t', flush=True)
          
          #Test with all training data
          VAE.eval()
          train_recon, train_mu, train_logvar = VAE(x = x_train.float(),latent_dims=latent_dims)
          _, x_targets = x_target.max(dim=1)
          CE,KLd,test_loss = vae_loss(train_recon, x_targets, train_mu, train_logvar)
          print("CE: {:.5f}, KLd: {:.5f}, Test loss: {:.5f}".format(CE,KLd,test_loss.item()), end='')
      
  print("\nTraining finished!")

## Latent dimensions set to 3

In [8]:
#  use gpu if available
device = torch.device("cuda:0" if use_gpu and torch.cuda.is_available() else "cpu")
VAE = VariationalAutoencoder(latent_dims=3)
VAE = VAE.to(device)
num_params = sum(p.numel() for p in VAE.parameters() if p.requires_grad)
print(VAE.parameters)
print("Number of parameters: %d" % num_params) #8*3 + 3 = 27, 3*8 + 8 = 32, 27+32

# optimizer object
optimizer = torch.optim.Adam(params = VAE.parameters(), lr = learning_rate)
#criterion = nn.CrossEntropyLoss()    # for target, does not accept a OHE vector
#criterion = nn.NLLLoss()

trainVAE(VAE, latent_dims=3)

<bound method Module.parameters of VariationalAutoencoder(
  (fc1): Linear(in_features=8, out_features=3, bias=True)
  (fc_mu): Linear(in_features=3, out_features=3, bias=True)
  (fc_logvar): Linear(in_features=3, out_features=3, bias=True)
  (fc_out): Linear(in_features=3, out_features=8, bias=True)
)>
Number of parameters: 83

Epoch 1/2000	 CE: 0.03392, KLd: 0.00000, Train loss=0.03392	CE: 2.08903, KLd: 0.00068, Test loss: 2.08971
Epoch 201/2000	 CE: 0.02220, KLd: 0.00011, Train loss=0.02232	CE: 1.36513, KLd: 0.09116, Test loss: 1.45629
Epoch 401/2000	 CE: 0.02077, KLd: 0.00016, Train loss=0.02093	CE: 1.27841, KLd: 0.12675, Test loss: 1.40516
Epoch 601/2000	 CE: 0.02073, KLd: 0.00013, Train loss=0.02086	CE: 1.27559, KLd: 0.10305, Test loss: 1.37864
Epoch 801/2000	 CE: 0.02072, KLd: 0.00009, Train loss=0.02080	CE: 1.27484, KLd: 0.06985, Test loss: 1.34469
Epoch 1001/2000	 CE: 0.02071, KLd: 0.00007, Train loss=0.02078	CE: 1.27445, KLd: 0.05453, Test loss: 1.32898
Epoch 1201/2000	 CE: 0

KL Divergence increases, then decreases. Cross Entropy loss (reconstruction term of the ELBO) steadily decreases.

In [9]:
print("Print prediction results:")
x_test = np.eye(num)[np.arange(num)]                        # Test data (one-hot encoded)
x_test = Variable(torch.from_numpy(x_test))
x_test = x_test.to(device)
#np.set_printoptions(2)
for x in x_test:
    print("\tInput: {} \t Output: {}".format(x.cpu().detach().numpy(), np.round(VAE(x=x.float(),latent_dims=3)[0].cpu().detach().numpy(),decimals=2)))

Print prediction results:
	Input: [1. 0. 0. 0. 0. 0. 0. 0.] 	 Output: [[1. 0. 0. 0. 0. 0. 0. 0.]]
	Input: [0. 1. 0. 0. 0. 0. 0. 0.] 	 Output: [[0. 1. 0. 0. 0. 0. 0. 0.]]
	Input: [0. 0. 1. 0. 0. 0. 0. 0.] 	 Output: [[0. 0. 1. 0. 0. 0. 0. 0.]]
	Input: [0. 0. 0. 1. 0. 0. 0. 0.] 	 Output: [[0. 0. 0. 1. 0. 0. 0. 0.]]
	Input: [0. 0. 0. 0. 1. 0. 0. 0.] 	 Output: [[0. 0. 0. 0. 1. 0. 0. 0.]]
	Input: [0. 0. 0. 0. 0. 1. 0. 0.] 	 Output: [[0. 0. 0. 0. 0. 1. 0. 0.]]
	Input: [0. 0. 0. 0. 0. 0. 1. 0.] 	 Output: [[0. 0. 0. 0. 0. 0. 1. 0.]]
	Input: [0. 0. 0. 0. 0. 0. 0. 1.] 	 Output: [[0. 0. 0. 0. 0. 0. 0. 1.]]


## Extract intermediate features using Forward Hook

In [10]:
def printnorm_encoder(self, input1, output):
    # input is a tuple of packed inputs
    # output is a Tensor. output.data is the Tensor we are interested
    print('\tInside ' + self.__class__.__name__ + ' forward')
    #print('\t input:', input1.cpu().detach().numpy())
    print('\t output rounded to 2 decimals:', np.round(output.cpu().detach().numpy(),decimals=2))
    print('\t output rounded to integer:', np.round(output.cpu().detach().numpy(),decimals=0))

In [11]:
def inside_decoder(self, input1, output):
    # input is a tuple of packed inputs
    # output is a Tensor. output.data is the Tensor we are interested
    print('\tInside ' + self.__class__.__name__ + ' forward')
    #print('\t input:', input1[0].cpu().detach().numpy())
    print('\t input rounded to 2 decimals:', np.round(input1[0].cpu().detach().numpy(),2))
    #print('\t output:', output.cpu().detach().numpy())
    print('\t output rounded to 2 decimals:', np.round(output.cpu().detach().numpy(),2))

In [12]:
#AE.encoder_layer.register_forward_hook(printnorm_encoder)
decoder_hook = VAE.fc_mu.register_forward_hook(inside_decoder)
for x in x_test:
    print('INPUT: {}'.format(x.cpu().detach().numpy()))
    out = VAE(x=x.float(),latent_dims=3)
    #print(out)
decoder_hook.remove() #remove hook after use

INPUT: [1. 0. 0. 0. 0. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [0.68 0.05 0.77]
	 output rounded to 2 decimals: [-0.85 -2.36 -0.66]
INPUT: [0. 1. 0. 0. 0. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [0.9  0.5  0.71]
	 output rounded to 2 decimals: [ 1.54 -1.31 -0.55]
INPUT: [0. 0. 1. 0. 0. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [0.61 0.06 0.06]
	 output rounded to 2 decimals: [ 0.05 -1.22  1.89]
INPUT: [0. 0. 0. 1. 0. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [0.11 0.05 0.55]
	 output rounded to 2 decimals: [-2.39 -0.21 -0.1 ]
INPUT: [0. 0. 0. 0. 1. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [0.02 0.64 0.83]
	 output rounded to 2 decimals: [-1.1   1.91 -1.41]
INPUT: [0. 0. 0. 0. 0. 1. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [0.51 0.85 0.82]
	 output rounded to 2 decimals: [ 1.24  1.1  -1.26]
INPUT: [0. 0. 0. 0. 0. 0. 1. 0.]
	Inside Linear forward
	 input 

In [13]:
#AE.encoder_layer.register_forward_hook(printnorm_encoder)
decoder_hook = VAE.fc_out.register_forward_hook(inside_decoder)
for x in x_test:
    print('INPUT: {}'.format(x.cpu().detach().numpy()))
    out = VAE(x=x.float(),latent_dims=3)
    #print(out)
decoder_hook.remove() #remove hook after use

INPUT: [1. 0. 0. 0. 0. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [[-1.15 -2.43 -0.85]]
	 output rounded to 2 decimals: [[ 22.21   7.66   4.29  12.17  -8.68 -10.17 -20.15 -15.57]]
INPUT: [0. 1. 0. 0. 0. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [[ 1.5  -1.04 -0.99]]
	 output rounded to 2 decimals: [[  5.13  17.29  -0.45  -9.21  -9.11   9.21   2.19 -13.59]]
INPUT: [0. 0. 1. 0. 0. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [[ 0.28 -1.22  2.12]]
	 output rounded to 2 decimals: [[  1.07   1.61  18.24  -4.6  -20.82 -13.47   5.8    3.55]]
INPUT: [0. 0. 0. 1. 0. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [[-2.1  -0.44  0.4 ]]
	 output rounded to 2 decimals: [[  7.02 -11.44   3.24  16.55   2.08 -12.72 -15.71   3.82]]
INPUT: [0. 0. 0. 0. 1. 0. 0. 0.]
	Inside Linear forward
	 input rounded to 2 decimals: [[-1.35  1.69 -1.95]]
	 output rounded to 2 decimals: [[ -3.72 -10.31 -18.99  13.41  24.83  12.06 -13.71   2.

## Generate Samples

## Latent dimensions set to 4

In [14]:
AE = Autoencoder(latent_dims=4)
AE = AE.to(device)

# optimizer object
optimizer = torch.optim.Adam(params = AE.parameters(), lr = learning_rate)
#criterion = nn.CrossEntropyLoss()    # for target, does not accept a OHE vector
criterion = nn.NLLLoss()

trainAE(AE)

NameError: ignored

In [None]:
print("Print prediction results:")
x_test = np.eye(num)[np.arange(num)]                        # Test data (one-hot encoded)
x_test = Variable(torch.from_numpy(x_test))
x_test = x_test.to(device)
#np.set_printoptions(2)
for x in x_test:
    print("\tInput: {} \t Output: {}".format(x.cpu().detach().numpy(), np.round(AE(features=x.float(),latent_dims=4).cpu().detach().numpy(),decimals=2)))

In [None]:
decoder_hook = AE.decoder_layer.register_forward_hook(inside_decoder)
for x in x_test:
    print('INPUT: {}'.format(x.cpu().detach().numpy()))
    out = AE(features=x.float(),latent_dims=4)
    #print(out)
decoder_hook.remove() #remove hook after use