In [1]:

import os, sys, math, time
import numpy as np
import numpy.linalg as la
import plotly.graph_objects as go
import plotly.express as ex
from plotly.subplots import make_subplots
import pandas as pd

import json as js
import _pickle as pickle
import bz2
import ray

import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from collections import OrderedDict

sys.path.append("../")
import func

In [2]:
torch.cuda.device_count()

1

## Analyse features

In [9]:
data_path = "../../data/"
# load data
data = func.load(data_path+"LOCO_R2-default-locomotion.pbz2")
data_2 = func.load(data_path+"LOCO_R2-default-locomotion-small.pbz2")


## DEC-MLP Autoencoder
$
f(x,\theta) = dec(enc(x,\theta_1), \theta_2) = x,   \quad \theta = (\theta_1, \theta_2)
$

$
enc(x, \theta_1) = z, \quad   z \in Z \quad \text{ = latent space}
$

$
dec(z, \theta_2) = x, \quad   x \in X \quad \text{ = input space}
$

This model uses Deep Embedded Clustering (DEC) model for encoder

$
enc = dec(x, \theta) = z, \quad z \in Z
$

$
dec = mlp(X, \theta), \quad \theta = W,b
$

$
mlp(X, W) = f(f(X \cdot w_1 + b_1) \cdot w_2 + b_2) \cdot w_3 + b_3
$

In [7]:
"""
from fabiozinno {https://github.com/electronicarts/character-motion-vaes/blob/main/vae_motion/models.py}
"""
import torch.nn.functional as F

class AutoEncoder(nn.Module):
    def __init__(self, frame_size, latent_size, normalization):
        super().__init__()
        self.frame_size = frame_size
        self.latent_size = latent_size

        self.mode = normalization.get("mode")
        self.data_max = normalization.get("max")
        self.data_min = normalization.get("min")
        self.data_avg = normalization.get("avg")
        self.data_std = normalization.get("std")

        h1 = 256
        h2 = 128
        # Encoder
        # Takes pose | condition (n * poses) as input
        self.fc1 = nn.Linear(frame_size, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h2, latent_size)

        # Decoder
        # Takes latent | condition as input
        self.fc4 = nn.Linear(latent_size, h2)
        self.fc5 = nn.Linear(h2, h1)
        self.fc6 = nn.Linear(h1, frame_size)

    def normalize(self, t):
        if self.mode == "minmax":
            return 2 * (t - self.data_min) / (self.data_max - self.data_min) - 1
        elif self.mode == "zscore":
            return (t - self.data_avg) / self.data_std
        elif self.mode == "none":
            return t
        else:
            raise ValueError("Unknown normalization mode")

    def denormalize(self, t):
        if self.mode == "minmax":
            return (t + 1) * (self.data_max - self.data_min) / 2 + self.data_min
        elif self.mode == "zscore":
            return t * self.data_std + self.data_avg
        elif self.mode == "none":
            return t
        else:
            raise ValueError("Unknown normalization mode")

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        return self.fc3(h2)

    def decode(self, x):
        h4 = F.relu(self.fc4(x))
        h5 = F.relu(self.fc5(h4))
        return self.fc6(h5)


class Encoder(nn.Module):
    def __init__(
        self,
        frame_size,
        latent_size,
        hidden_size,
        num_condition_frames,
        num_future_predictions,
    ):
        super().__init__()
        # Encoder
        # Takes pose | condition (n * poses) as input
        input_size = frame_size * (num_future_predictions + num_condition_frames)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(frame_size + hidden_size, hidden_size)
        self.mu = nn.Linear(frame_size + hidden_size, latent_size)
        self.logvar = nn.Linear(frame_size + hidden_size, latent_size)

    def encode(self, x, c):
        h1 = F.elu(self.fc1(torch.cat((x, c), dim=1)))
        h2 = F.elu(self.fc2(torch.cat((x, h1), dim=1)))
        s = torch.cat((x, h2), dim=1)
        return self.mu(s), self.logvar(s)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x, c):
        mu, logvar = self.encode(x, c)
        z = self.reparameterize(mu, logvar)
        return z, mu, logvar


class Decoder(nn.Module):
    def __init__(
        self,
        frame_size,
        latent_size,
        hidden_size,
        num_condition_frames,
        num_future_predictions,
    ):
        super().__init__()
        # Decoder
        # Takes latent | condition as input
        input_size = latent_size + frame_size * num_condition_frames
        output_size = num_future_predictions * frame_size
        self.fc4 = nn.Linear(input_size, hidden_size)
        self.fc5 = nn.Linear(latent_size + hidden_size, hidden_size)
        self.out = nn.Linear(latent_size + hidden_size, output_size)

    def decode(self, z, c):
        h4 = F.elu(self.fc4(torch.cat((z, c), dim=1)))
        h5 = F.elu(self.fc5(torch.cat((z, h4), dim=1)))
        return self.out(torch.cat((z, h5), dim=1))

    def forward(self, z, c):
        return self.decode(z, c)


class MixedDecoder(nn.Module):
    def __init__(
        self,
        frame_size,
        latent_size,
        hidden_size,
        num_condition_frames,
        num_future_predictions,
        num_experts,
    ):
        super().__init__()

        input_size = latent_size + frame_size * num_condition_frames
        inter_size = latent_size + hidden_size
        output_size = num_future_predictions * frame_size
        self.decoder_layers = [
            (
                nn.Parameter(torch.empty(num_experts, input_size, hidden_size)),
                nn.Parameter(torch.empty(num_experts, hidden_size)),
                F.elu,
            ),
            (
                nn.Parameter(torch.empty(num_experts, inter_size, hidden_size)),
                nn.Parameter(torch.empty(num_experts, hidden_size)),
                F.elu,
            ),
            (
                nn.Parameter(torch.empty(num_experts, inter_size, output_size)),
                nn.Parameter(torch.empty(num_experts, output_size)),
                None,
            ),
        ]

        for index, (weight, bias, _) in enumerate(self.decoder_layers):
            index = str(index)
            torch.nn.init.kaiming_uniform_(weight)
            bias.data.fill_(0.01)
            self.register_parameter("w" + index, weight)
            self.register_parameter("b" + index, bias)

        # Gating network
        gate_hsize = 64
        self.gate = nn.Sequential(
            nn.Linear(input_size, gate_hsize),
            nn.ELU(),
            nn.Linear(gate_hsize, gate_hsize),
            nn.ELU(),
            nn.Linear(gate_hsize, num_experts),
        )

    def forward(self, z, c):
        coefficients = F.softmax(self.gate(torch.cat((z, c), dim=1)), dim=1)
        layer_out = c

        for (weight, bias, activation) in self.decoder_layers:
            flat_weight = weight.flatten(start_dim=1, end_dim=2)
            mixed_weight = torch.matmul(coefficients, flat_weight).view(
                coefficients.shape[0], *weight.shape[1:3]
            )

            input = torch.cat((z, layer_out), dim=1).unsqueeze(1)
            mixed_bias = torch.matmul(coefficients, bias).unsqueeze(1)
            out = torch.baddbmm(mixed_bias, input, mixed_weight).squeeze(1)
            layer_out = activation(out) if activation is not None else out

        return layer_out


class PoseMixtureVAE(nn.Module):
    def __init__(
        self,
        frame_size,
        latent_size,
        num_condition_frames,
        num_future_predictions,
        normalization,
        num_experts,
    ):
        super().__init__()
        self.frame_size = frame_size
        self.latent_size = latent_size
        self.num_condition_frames = num_condition_frames
        self.num_future_predictions = num_future_predictions

        self.mode = normalization.get("mode")
        self.data_max = normalization.get("max")
        self.data_min = normalization.get("min")
        self.data_avg = normalization.get("avg")
        self.data_std = normalization.get("std")

        hidden_size = 256
        args = (
            frame_size,
            latent_size,
            hidden_size,
            num_condition_frames,
            num_future_predictions,
        )

        self.encoder = Encoder(*args)
        self.decoder = MixedDecoder(*args, num_experts)

    def normalize(self, t):
        if self.mode == "minmax":
            return 2 * (t - self.data_min) / (self.data_max - self.data_min) - 1
        elif self.mode == "zscore":
            return (t - self.data_avg) / self.data_std
        elif self.mode == "none":
            return t
        else:
            raise ValueError("Unknown normalization mode")

    def denormalize(self, t):
        if self.mode == "minmax":
            return (t + 1) * (self.data_max - self.data_min) / 2 + self.data_min
        elif self.mode == "zscore":
            return t * self.data_std + self.data_avg
        elif self.mode == "none":
            return t
        else:
            raise ValueError("Unknown normalization mode")

    def encode(self, x, c):
        _, mu, logvar = self.encoder(x, c)
        return mu, logvar

    def forward(self, x, c):
        z, mu, logvar = self.encoder(x, c)
        return self.decoder(z, c), mu, logvar

    def sample(self, z, c, deterministic=False):
        return self.decoder(z, c)


class VectorQuantizer(nn.Module):
    def __init__(self, num_embeddings, latent_size):
        super().__init__()

        self.num_embeddings = num_embeddings
        self.latent_size = latent_size

        # self.embedding = nn.Embedding(self.num_embeddings, self.latent_size)
        # self.embedding.weight.data.normal_()

        embed = torch.randn(latent_size, num_embeddings)
        self.register_buffer("embed", embed)
        self.register_buffer("cluster_size", torch.zeros(num_embeddings))
        self.register_buffer("embed_avg", embed.clone())

        self.commitment_cost = 0.25
        self.decay = 0.99
        self.epsilon = 1e-5

    def forward(self, inputs):
        # Calculate distances
        dist = (
            inputs.pow(2).sum(1, keepdim=True)
            - 2 * inputs @ self.embed
            + self.embed.pow(2).sum(0, keepdim=True)
        )

        _, embed_ind = (-dist).max(1)
        embed_onehot = F.one_hot(embed_ind, self.num_embeddings).type(inputs.dtype)
        embed_ind = embed_ind.view(*inputs.shape[:-1])
        quantize = F.embedding(embed_ind, self.embed.transpose(0, 1))

        # Use EMA to update the embedding vectors
        if self.training:
            self.cluster_size.data.mul_(self.decay).add_(
                1 - self.decay, embed_onehot.sum(0)
            )

            embed_sum = inputs.transpose(0, 1) @ embed_onehot
            self.embed_avg.data.mul_(self.decay).add_(1 - self.decay, embed_sum)
            n = self.cluster_size.sum()
            cluster_size = (
                (self.cluster_size + self.epsilon)
                / (n + self.num_embeddings * self.epsilon)
                * n
            )
            embed_normalized = self.embed_avg / cluster_size.unsqueeze(0)
            self.embed.data.copy_(embed_normalized)

        loss = (quantize.detach() - inputs).pow(2).mean()
        quantize = inputs + (quantize - inputs).detach()

        avg_probs = embed_onehot.mean(dim=0)
        perplexity = torch.exp(-torch.sum(avg_probs * (avg_probs + 1e-10).log()))

        return quantize, loss, perplexity, embed_ind

In [8]:
class Enc(nn.Module):
    def __init__(self, dimensions:list, k:int, act_fn:str, keep_prob=0):
        super().__init__()

        self.dimensions = dimensions
        self.act_fn = act_fn
        self.keep_prob = keep_prob
        self.k = k

        self.model = None
        self.mu = None
        self.logvar = None

        self.build()
        self.model.apply(self.init_params)

    def build(self):
        layers = []
        for i, size in enumerate(zip(self.dimensions[0:], self.dimensions[1:])):
            layers.append(("fc"+str(i), nn.Linear(size[0], size[1])))
            if i < len(self.dimensions)-2:
                layers.append(("act"+str(i), self.activation(self.act)))
                if (self.keep_prob > 0):
                    layers.append(("drop"+str(i+1), nn.Dropout(self.keep_prob)))
        self.model = nn.Sequential(OrderedDict(layers))
        self.mu = nn.Linear(self.dimensions[-1], self.k)
        self.logvar = nn.Linear(self.dimensions[-1], self.k)

    def activation(self):
        if self.act_fn == "elu":
            return nn.ELU()
        elif self.act_fn == "relu":
            return nn.ReLU()

    @staticmethod
    def init_params(m:nn.Module):
        if type(m) == nn.Linear:
            nn.init.xavier_normal_(m.weight)
            m.bias.data.fill_(.01)

    def reparameterize(self, mu:torch.Tensor, logvar:torch.Tensor) -> torch.Tensor:
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def encode(self, x:torch.Tensor) -> (torch.Tensor,torch.Tensor):
        encoded = self.model(x)
        return self.mu(encoded), self.logvar(encoded)

    def forward(self, x:torch.Tensor) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return z, mu, logvar


class MLP(nn.Module):
    def __init__(self, dimensions:list, act_fn:str, keep_prob:float=.2, batch_size:int=1):
        super(MLP, self).__init__()
        self.dimensions = dimensions          #   [(in, h1), (h1, h2), ..., (hn, out)]
        self.act= act_fn                     #   func
        self.keep_prob = keep_prob          #   %
        self.batch_size = batch_size        #   int

        self.model = []

        assert(len(dimensions) >= 2)
        assert(batch_size > 0)
        assert(act_fn == "elu" or act_fn == "relu")
        assert(keep_prob < 1)
        for e in dimensions: assert(type(e) == int)

        self.build()
        self.model.apply(self.init_params)


    def build(self):
        layers = []
        for i, size in enumerate(zip(self.dimensions[0:], self.dimensions[1:])):
            layers.append(("fc"+str(i), nn.Linear(size[0], size[1])))
            if i < len(self.dimensions)-2:
                layers.append(("act"+str(i), self.activation(self.act)))
                layers.append(("drop"+str(i+1), nn.Dropout(self.keep_prob)))

        self.model = nn.Sequential(OrderedDict(layers))


    def forward(self, x:torch.Tensor) -> torch.Tensor:
        return self.model(x)

    @staticmethod
    def activation(fn_name):
        if fn_name == "elu":
            return nn.ELU()
        elif fn_name == "relu":
            return nn.ReLU()
        else:
            return nn.ReLU()

    @staticmethod
    def init_params(m):
        if type(m) == nn.Linear:
            nn.init.xavier_normal_(m.weight)
            m.bias.data.fill_(.01)


class VAE_AE(nn.Module):
    def __init__(self, encoder:nn.Module, decoder:nn.Module):
        super(VAE_AE, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        return self.decoder(self.encoder(x))

In [13]:
# Prepare train data
all_data = []
for d in data:
    d = pickle.loads(d)
    pos = []
    for f in d["frames"]:
        p = [jo["pos"] for jo in f]
        pos.append(p)
    all_data.append(pos)

input_data = np.array([np.concatenate([p for p in j]) for pos in all_data for j in pos])
print(input_data.shape)

(1440, 63)


In [16]:
data_ratio = (.7, .15, .15) # training, validation, testing
SEED = 2021
batch_size = 1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

x_tensor = torch.from_numpy(input_data).float()
y_tensor = torch.from_numpy(input_data).float()

dataset = TensorDataset(x_tensor, y_tensor)
N = len(dataset)

train_ratio = int(data_ratio[0]*N)
val_ratio = int(data_ratio[1] * N)
test_ratio = int(N-train_ratio-val_ratio)
print("Train: ", train_ratio, ", Validation: ", val_ratio, ", Test: ", test_ratio)

train_set, val_set, test_set = random_split(dataset, [train_ratio, val_ratio, test_ratio], generator=torch.Generator().manual_seed(SEED))

train_loader = DataLoader(dataset=train_set, batch_size=batch_size)
val_loader = DataLoader(dataset=val_set, batch_size=batch_size)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size)


cuda
Train:  1007 , Validation:  216 , Test:  217


In [21]:
# Hyper-parameters
input_dim = input_data.shape[1]
output_dim = input_data.shape[1]
latent_dim = 36         # 12 * 3
encoder_layer_sizes = [input_dim, 256, 256, latent_dim]
decoder_layer_sizes = [latent_dim, 256, 256, output_dim]
num_epochs = 100
learning_rate = 0.001
act_fn = "elu"
keep_prob = .2

# model, loss and scheduler
ae = StackedDenoisingAutoEncoder(encoder_layer_sizes, activation=nn.ELU(), final_activation=nn.ELU())
model = DEC_AE(DEC(36, 36, ae.encoder), ae.decoder)

criterion = nn.MSELoss(reduction="mean")
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

print(model)


DEC_AE(
  (encoder): DEC(
    (encoder): Sequential(
      (0): Sequential(
        (linear): Linear(in_features=63, out_features=256, bias=True)
        (activation): ELU(alpha=1.0)
      )
      (1): Sequential(
        (linear): Linear(in_features=256, out_features=256, bias=True)
        (activation): ELU(alpha=1.0)
      )
      (2): Sequential(
        (linear): Linear(in_features=256, out_features=36, bias=True)
      )
    )
    (assignment): ClusterAssignment()
  )
  (decoder): Sequential(
    (0): Sequential(
      (linear): Linear(in_features=36, out_features=256, bias=True)
      (activation): ELU(alpha=1.0)
    )
    (1): Sequential(
      (linear): Linear(in_features=256, out_features=256, bias=True)
      (activation): ELU(alpha=1.0)
    )
    (2): Sequential(
      (linear): Linear(in_features=256, out_features=63, bias=True)
      (activation): ELU(alpha=1.0)
    )
  )
)


In [22]:
total_step = len(train_loader)
i = 0
n_epochs_no_improve = 5

train_loader_len = float(len(train_loader))
val_loader_len = float(len(val_loader))
test_loader_len = float(len(test_loader))

last_avg_training_loss = 0
min_loss = np.inf
epochs_no_improve = 0
best_model_after_epoch = 0

for epoch in range(num_epochs):
    training_loss = 0
    # training
    for inputs, labels in train_loader:
        # inputs = inputs.to(device)
        # outputs = outputs.to(device)

        pred = model(inputs)
        loss = criterion(pred, labels)
        training_loss+=loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    last_avg_training_loss = training_loss / train_loader_len
    print ('Epoch [{}/{}], Loss: {:.4f}'
        .format(epoch+1, num_epochs, last_avg_training_loss))

    # early stopping
    with torch.no_grad():
        val_loss = 0
        for inputs, labels in val_loader:
            pred_val = model(inputs)
            loss_val = criterion(pred_val, labels)
            val_loss += loss_val.item()

        val_loss /= val_loader_len
        if min_loss > val_loss:
            min_loss = val_loss
            epochs_no_improve = 0
            best_model_after_epoch = epoch

        else:
            epochs_no_improve+=1
            if epochs_no_improve > n_epochs_no_improve:
                print("Early stopping at Epoch: ", epoch)
                print("last training loss: {:2f}".format(last_avg_training_loss))
                print("achieved best validation loss: {:.4f} after at Epoch {}".format(min_loss, best_model_after_epoch))
                break

# Testing
with torch.no_grad():
    test_loss = 0
    for inputs, labels in test_loader:
        pred_test = model(inputs)
        loss_test = criterion(pred_test, labels)
        test_loss += loss_test.item()

    test_loss /= test_loader_len
    print("Test loss: {:.4f}".format(test_loss))


Epoch [1/100], Loss: 0.1884
Epoch [2/100], Loss: 0.1721
Epoch [3/100], Loss: 0.1721
Epoch [4/100], Loss: 0.1721
Epoch [5/100], Loss: 0.1721
Epoch [6/100], Loss: 0.1721
Epoch [7/100], Loss: 0.1721
Early stopping at Epoch:  6
last training loss: 0.172100
achieved best validation loss: 0.1820 after at Epoch 0
Test loss: 0.1849
