# Estimating pose from images using a deep conditional latent variable model

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from deep_lvm_pose_tracking import vae
from deep_lvm_pose_tracking.vae import VAE
from deep_lvm_pose_tracking import toy_data as toy
from deep_lvm_pose_tracking import notebook_utils as nu

import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader, Subset
# Pick the compute device: CUDA when available, otherwise the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Pytorch: Train with {device_name}')
device = torch.device(device_name)

# Seed PyTorch *and* NumPy. Later cells draw random indices with np.random,
# so seeding torch alone does not make a Restart & Run All reproducible.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

import sys
import os
from functools import partial

from IPython.display import HTML, display, Markdown
from IPython.core.debugger import set_trace

# Launch the visdom server as a background child process.
# An argument list (no shell) replaces the former shell string ending in '&':
# it does not rely on shell-specific backgrounding and keeps working when the
# interpreter path contains spaces.
# You might still have to start the visdom server manually.
import subprocess
cmd = [sys.executable, '-m', 'visdom.server']
visdom_server = subprocess.Popen(cmd)  # keep the handle so the server can be terminated later
plotter = nu.VisdomLinePlotter(env_name='main')
print("Started server on port:", plotter.viz.port)

# reference timestamp for wall-clock timing of the notebook
import time
t0 = time.time()

## The problem in its simplest form: introducing hierarchical toy data

In [None]:
N = 360000  # number of training poses
batch_size = 128
d = 64  # image edge length
img_shape = (d, d)

bone_lengths, key_marker_width = nu.make_bonelengths_and_width()
poses = nu.make_poses(N=N)

# generate training data
h = toy.HierarchyImages(angles=poses, bone_lengths=bone_lengths,
                        key_marker_width=key_marker_width,
                        img_shape=img_shape)

# data loader for easy batching
data_loader = DataLoader(h, batch_size=batch_size, shuffle=True, num_workers=4,
                         drop_last=True)

# validation data: a tenth as many poses as the training set
poses_val = nu.make_poses(N=N//10)

h_val = toy.HierarchyImages(angles=poses_val, bone_lengths=bone_lengths,
                            key_marker_width=key_marker_width,
                            img_shape=img_shape)

val_loader = DataLoader(h_val, batch_size=batch_size, shuffle=False, num_workers=4,
                        drop_last=True)

# Reduced subsets of the validation data.
# The previous code drew N//5 indices *with replacement* from h_val, which was
# built from only N//10 poses: the "reduced" set ended up larger than the
# validation set and full of duplicates. Draw a fifth of the validation set
# without replacement instead.
n_subset = len(h_val) // 5
val_subset = Subset(h_val, np.random.choice(len(h_val), size=n_subset, replace=False))
# NOTE(review): the 'pretrain' loader is also drawn from the validation data - confirm this is intended.
pretrain_subset = Subset(h_val, np.random.choice(len(h_val), size=n_subset, replace=False))

# dataloader dictionary with reduced validation set size
dataloader = {'train': data_loader,
              'val': DataLoader(val_subset, drop_last=True,
                                batch_size=batch_size, num_workers=4),
              'pretrain': DataLoader(pretrain_subset, drop_last=True,
                                     batch_size=batch_size, num_workers=4)
             }

# bake notebook specific functions
plot_sample_grid = partial(nu.plot_sample_grid, img_shape=img_shape)
pose_to_image = partial(nu.pose_to_image, bone_lengths=bone_lengths, d=d)

### Example: See end of notebook for video!

In [None]:
# plot one randomly chosen training example
example_idx = np.random.randint(len(poses))
h.plot_image(example_idx)

## Deep Latent Variable Model a.k.a. Variational Autoencoder

We want to find the marginal likelihood of images $X$:

$p(X) = \int p(X|z)p(z) dz$.

If $p(X|z)$ is a density parameterized by $\eta = \text{NeuralNetwork}(z)$, we cannot solve this integral analytically.
We can still sample $z$ from the prior $p(z)$ to estimate $p(X)$, but because $X$ is very high-dimensional, $p(X|z)$ will be practically zero for most samples of $z$ unless we sample in a smart manner.

Sampling $z$ from the posterior $p(z|X)$ would result in the most likely images, but we do not have the posterior.
Let's introduce an approximation $Q(z)$ to the posterior such that $\mathcal{KL}(Q(z)||P(z|X))$ is small:

\begin{align}
\mathcal{KL}[Q(z)||P(z|X)] &= \mathbb{E}[\log Q(z) - \log P(z|X)]\\
&= \mathbb{E}[\log Q(z) - \log P(X|z) - \log P(z)] + \log P(X)\\
&= \mathbb{E}[- \log P(X|z)] + \mathcal{KL}[Q(z)|| P(z)] + \log P(X)
\end{align}

\begin{align}
\Rightarrow\log P(X) - \mathcal{KL}[Q(z)||P(z|X)] &= \mathbb{E}[\log P(X|z)] - \mathcal{KL}[Q(z)|| P(z)]
\end{align}

![](https://github.com/hse-aml/bayesian-methods-for-ml/blob/master/week5/VAE.png?raw=1)

### Bernoulli observation model

\begin{align}
\log P(X|z) = X\log(\text{NN}(z))+(1-X)\log(1-\text{NN}(z))
\end{align}

In [None]:
# Hyperparameters of the plain VAE (not optimized yet)
latent_dim, hidden, beta = 3, 600, 1

# flattened d x d image in, `latent_dim` latent variables
model = VAE(input_dim=d * d, latent_dim=latent_dim, hidden=hidden)
model = model.to(device)

In [None]:
# Bernoulli observation model for the reconstruction term.
# The loss must actually be handed to `fit` (as the cVAE training cell does);
# previously it was constructed here but never used.
loss_func = partial(vae.loss_function, likelihood='bernoulli')
val_loss = vae.fit(model, dataloader, epochs=1, device=device, beta=beta,
                   loss_func=loss_func, stop_crit=0, plotter=plotter)

The plots show $p(x|z)$ with $z \sim \mathcal{N}(0, 1)$.
Even though there is room for improvement (e.g. changing the output activation function), I think the network learned the hierarchical constraints.

In [None]:
# draw samples from the trained VAE and show them as an image grid
vae_samples = nu.draw_samples(model)
plot_sample_grid(vae_samples)

### Traversing the latent space

Maybe the 3 latent dimensions correspond to the angles already?
$\rightarrow$ Not quite...

In [None]:
# Build animations for latent indices 0..2 and render each one to an HTML5 video.
traverser = nu.LatentTraverser(model)
anims = traverser.get_anims(range(3))
vids = []
for anim in anims:
    vids.append(anim.to_html5_video())

In [None]:
# show every video under a heading that names its latent index
for i, vid in enumerate(vids):
    display(Markdown(f"#### Latent $z_{i}$"), HTML(vid))

## Conditional VAE

![](https://github.com/hse-aml/bayesian-methods-for-ml/blob/master/week5/CVAE.png?raw=1)

\begin{align}
p(x | z, c), \quad q(z | x, c)
\end{align}

## Generating poses given image

Estimating poses from images by training a shared latent variable model and then conditioning on the pose has some advantages:

- Errorbars: it's a probabilistic model
- Anomaly detection: $\mathcal{KL}[q(z|x)||\mathcal{N}(0, 1)]$ will be large if the input is very different from the training samples

In [None]:
# cVAE settings: the 3 pose values are modelled, conditioned on the flattened image
latent_dim = 1  # one dimension is enough for this task!
weight_fn, beta = None, 1

cvae = vae.cVAE(
    input_dim=3,
    condition_dim=d * d,
    latent_dim=latent_dim,
    hidden=600,
    likelihood='bernoulli',
    condition_on='image',
)
cvae = cvae.to(device)

In [None]:
# train the conditional model with the joint loss (Bernoulli likelihood)
loss_func = partial(vae.joint_loss, likelihood='bernoulli')
fit_options = dict(
    epochs=1,
    device=device,
    weight_fn=weight_fn,
    loss_func=loss_func,
    conditional=True,
    plotter=plotter,
    stop_crit=0,
    beta=beta,
)
hist = vae.fit(cvae, dataloader, **fit_options)

In [None]:
n_examples = 9  # number of validation examples to inspect

# draw random examples from the validation set; fetch each item once and
# read both its pose and its image from it
idxs = np.random.choice(len(h_val), size=n_examples)
examples = [h_val[i] for i in idxs]

# Use a dedicated name: rebinding the global `poses` would clobber the
# training poses that earlier cells index against.
poses_sample = np.array([ex['angles'] for ex in examples])
poses_degree = nu.un_normalize(poses_sample)  # presumably angles in degrees - TODO confirm

# corresponding images from the validation set
imgs = np.array([ex['image'] for ex in examples])
# generate poses from noise given images
samples = nu.draw_samples(cvae, imgs.reshape(n_examples, d**2))

# the first d**2 columns of a sample are skipped; the remaining ones are the pose
poses_recovered = nu.un_normalize(samples[:, d**2:])
# A bare expression in the middle of a cell is never displayed, so print it.
print('Std of pose error (per angle):\n',
      np.std(poses_recovered - poses_degree, axis=0, ddof=1))

for i in range(n_examples):
    label = samples[i, d**2:]
    pose_true = poses_degree[i]
    pose_recovered = nu.un_normalize(label)
    print('Ground truth:\n', pose_true)
    print('Pose recovered from image:\n', pose_recovered)
    print('Error:\n', pose_true-pose_recovered)
    fig, ax = plt.subplots(ncols=2, sharey=True)
    ax[0].set_title('pose (ground truth)')
    ax[0].imshow(pose_to_image(poses_sample[i]))
    ax[1].set_title('pose|image')
    ax[1].imshow(pose_to_image(label))
    plt.show()

What does the one-dimensional latent space look like?

In [None]:
from datetime import timedelta

# elapsed wall-clock time since t0, shown without the fractional seconds
T = time.time()
td = timedelta(seconds=T - t0)
runtime_text = str(td).partition('.')[0]
print('Total runtime:', runtime_text)