In [1]:
%load_ext autoreload

%autoreload 2

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from utils import rand_rotation_tensor, test_close


Here I'm testing the layers provided by the Vector Neurons (VN) codebase. They implement a network called DGCNN that (if I remove the last 'pool' layer) takes a batch of points as input and outputs both invariant and equivariant features for those points:


$$
VN\_DGCNN : \mathbb{R}^{B \times N \times 3} \rightarrow \mathbb{R}^{B \times C \times N \times 3}
$$

Where $B$ is Batch Size, $N$ is Number of Points and $C$ is Latent Feature Size

In [3]:
from vnn.vn_models import VN_MLP, VN_DGCNN

# Arguments passed to VN_DGCNN:
#   c_dim       -- latent code size
#   dim         -- input dimension (3 for xyz point clouds)
#   hidden_dim  -- hidden dimension of the MLP
#   k           -- number of nearest neighbours in the graph convolution
#   meta_output -- 'invariant_latent' asks for the invariant latent code as well
model = VN_DGCNN(
    c_dim=9,
    dim=3,
    hidden_dim=128,
    k=20,
    meta_output='invariant_latent',
)

# A batch of random 3D vectors: 2 clouds of 25 points each, laid out as [B, N, 3].
vectors = torch.rand(2, 25, 3)

# Scale every point onto the unit sphere. This is not required (it is only done
# with spherical signals in mind); the invariance / equivariance checks below
# are expected to hold either way.
vectors = vectors / vectors.norm(dim=-1, keepdim=True)

# Conditional variables, laid out as [B, number of conditional variables, 3].
# Each one is encoded in xyz space: lat on the x axis, lon on the y axis, z set to 0.
conditional = torch.rand(2, 1, 3)
conditional[..., 2] = 0

# Append the conditional variable to the points along the point axis (25 -> 26 entries).
vectors = torch.cat((vectors, conditional), dim=1)

# Random rotation matrix used to rotate the whole input.
R = rand_rotation_tensor()

# Run the model on the original and on the rotated input. The first return
# value should be equivariant, the latent code should be invariant.
output, lc = model(vectors)
output_R, lc_r = model(vectors @ R)

print(f'Size of equivariant output: {output.shape}, [B, C, N, 3]')
print(f'Size of invariant latent code: {lc.shape}, [B, C, N, 3]')

# Sanity check: rotating the input should actually change the output.
res, eps = test_close(output, output_R)
print(f'Output: VN_MLP(P @ R) != VN_MLP(P) = {not res} {f"with eps {eps:.0e}" if res else ""}')

# Equivariance: rotating the input must equal rotating the output.
res, eps = test_close(output_R, output @ R)
print(f'Output: VN_MLP(P @ R) = VN_MLP(P) @ R --> Equivariance: {res} with eps {eps:.0e}')

# Invariance: the latent code must not change when the input is rotated.
res, eps = test_close(lc_r, lc)
print(f'Latent: VN_MLP(P @ R) = VN_MLP(P) --> Invariance: {res} with eps {eps:.0e}')


Size of equivariant output: torch.Size([2, 9, 26, 3]), [B, C, N, 3]
Size of invariant latent code: torch.Size([2, 9, 26, 3]), [B, C, N, 3]
Output: VN_MLP(P @ R) != VN_MLP(P) = False with eps 1e-02
Output: VN_MLP(P @ R) = VN_MLP(P) @ R --> Equivariance: True with eps 1e-07
Latent: VN_MLP(P @ R) = VN_MLP(P) --> Invariance: True with eps 1e-07


So this works and provides both invariant and equivariant features.

But according to the VN-Transformer paper, they use a simple VN-MLP built only from VN-Linear, VN-BatchNorm and VN-ReLU. They use no pooling layers (as VN\_DGCNN does), and they also do not use the graph convolution, which makes up the first two layers of VN\_DGCNN.

The VN paper describes the reason for this graph convolution layer as:

<em>"In the first input layer where the input pointcloud coordinates are $\mathbb{R}^{1 \times 3}$ vectors and thus applying $f$ to them would degenerate to a set of $\mathbb{R}^{C \times 3}$ vector-lists whose vector components are all linearly dependent (pointing to one direction). This is analogous to applying a per-pixel $1 \times 1$ convolution to a gray-scale image (single input channel). Therefore, we add an edge convolution at the input layer, mapping $\mathbb{R}^{1 \times 3}$ features into $\mathbb{R}^{C \times 3}$ with $C > 1$ and then continue with per-point VN-MLP operations."</em>

And VN-Transformer paper says:

<em>"The original VN paper relied on edge convolution as a pre-processing step to capture local point-cloud structure. Such feature engineering is not data-driven and requires human involvement in designing and tuning. Moreover, the sparsity of these computations makes them slow to run on accelerated hardware. Our proposed rotation-equivariant attention mechanism learns higher-level features directly from single points for arbitrary point-clouds"</em>

So I would like to avoid edge convolutions too. But, as shown below, the networks I build without them do not have rotation equivariance/invariance :/

It seems removing that edge convolution is an issue, and I can't find a good explanation of what they did in the VN-Transformer paper.

In [40]:
# Build the individual layers of a simple VN-MLP (VN-Linear -> VN-BatchNorm ->
# VN-ReLU) without the graph / edge convolution, so each layer can be tested
# for rotation equivariance on its own in the following cells.
from vnn.vn_layers import VNLinear, VNLeakyReLU, VNBatchNorm

# I do not know if this is correct, but since we are not going to use the
# edge convolution, note that the first few layers of VN_DGCNN do this:
#
#  p = p.unsqueeze(1).transpose(2, 3) # [B, N, 3] -> [B, 1, 3, N]
#  feat = get_graph_feature_cross(p, k=self.k, dims=3) # [B, 1, 3, N] -> [B, 3, 3, N, K]
#  net = self.conv_pos(feat) # [B, 3, 3, N, K] -> [B, Z, 3, N, K]
#  net = self.pool(net, dim=-1) # [B, Z, 3, N, K] -> [B, Z, 3, N]

# The output of those layers is [B, Z, 3, N]. Here they are skipped and the
# [B, 1, 3, N] tensor is used directly, so the input to VNLinear has 1 channel.
linear_0 = VNLinear(in_channels=1, out_channels=5)

# num_features matches the 5 output channels of linear_0.
batch_norm_0 = VNBatchNorm(num_features=5, dim=5) # dim=5 results in nn.BatchNorm2d, whereas dim=3 or 4 results in nn.BatchNorm1d

# negative_slope=0.0 presumably reduces the leaky ReLU to a plain VN-ReLU -- TODO confirm against vnn.vn_layers.
leaky_relu_0 = VNLeakyReLU(5, share_nonlinearity=False, negative_slope=0.0)


In [75]:
# Random unit vectors with a single feature channel, laid out as [B, C, N, 3].
B, C, N = 2, 1, 4
vectors = torch.rand(B, C, N, 3)
vectors = vectors / vectors.norm(dim=-1, keepdim=True)

# Rotated copy of the same input.
R = rand_rotation_tensor()
vectors_R = vectors @ R

# Swap the last two axes ([B, C, N, 3] -> [B, C, 3, N]), mirroring what the
# VN_DGCNN forward pass does before its VN layers.
p = vectors.transpose(2, 3)
p_R = vectors_R.transpose(2, 3)

# Apply the linear layer, then swap the axes back so that xyz is last again and
# the result can be multiplied by a rotation matrix for the equivariance check.
x1 = linear_0(p).transpose(2, 3)
x1_R = linear_0(p_R).transpose(2, 3)

print(f'Size of equivariant output: {x1.shape}, [B, C, N, 3]')

# Sanity check: rotating the input should actually change the output.
res, eps = test_close(x1, x1_R)
print(f'Output: VN_MLP(P @ R) != VN_MLP(P) = {not res} {f"with eps {eps:.0e}" if res else ""}')

# Equivariance: rotating the input must equal rotating the output.
res, eps = test_close(x1_R, x1 @ R)
print(f'Output: VN_MLP(P @ R) = VN_MLP(P) @ R --> Equivariance: {res} with eps {eps:.0e}')

Size of equivariant output: torch.Size([2, 5, 4, 3]), [B, C, N, 3]
Output: VN_MLP(P @ R) != VN_MLP(P) = True 
Output: VN_MLP(P @ R) = VN_MLP(P) @ R --> Equivariance: True with eps 1e-07


↑↑↑↑ This appears to be working ↑↑↑↑ 

So now test VN-BatchNorm

In [77]:
# Test VN-BatchNorm on the output of the linear layer.
#
# x1 / x1_R are [B, C, N, 3]; the layer is fed [B, C, 3, N, 1] (xyz on dim 2 plus
# a trailing singleton axis, because batch_norm_0 was built with dim=5).
x2 = x1.transpose(2, 3)
x2_R = x1_R.transpose(2, 3)

# Remove only the axis that unsqueeze(4) added. A bare .squeeze() would also
# drop B, C or N whenever one of them is 1 and break the transposes below.
x2 = batch_norm_0(x2.unsqueeze(4)).squeeze(4)
x2_R = batch_norm_0(x2_R.unsqueeze(4)).squeeze(4)

# Back to [B, C, N, 3] so the result can be multiplied by the rotation matrix.
x2 = x2.transpose(2, 3)
x2_R = x2_R.transpose(2, 3)

print(f'Size of equivariant output: {x2.shape}, [B, C, N, 3]')

# NOTE(review): the inputs were normalised to unit length and linear_0 has a
# single input channel, so every point presumably enters the batch norm with
# the same per-channel norm. If VNBatchNorm normalises those norms with batch
# statistics (module in training mode), the variance is ~0 and the output is
# mostly amplified floating-point noise close to zero. In that case BOTH checks
# below pass, and only at a loose eps. Confirm by inspecting x2.abs().max(),
# and by re-running with inputs that are not unit-normalised.

# Sanity check: rotating the input should actually change the output.
res, eps = test_close(x2, x2_R)
print(f'Output: VN_MLP(P @ R) != VN_MLP(P) = {not res} {f"with eps {eps:.0e}" if res else ""}')

# Equivariance: rotating the input must equal rotating the output.
res, eps = test_close(x2_R, x2 @ R)
print(f'Output: VN_MLP(P @ R) = VN_MLP(P) @ R --> Equivariance: {res} {f"with eps {eps:.0e}" if res else ""}')

Size of equivariant output: torch.Size([2, 5, 4, 3]), [B, C, N, 3]
Output: VN_MLP(P @ R) != VN_MLP(P) = False with eps 1e-04
Output: VN_MLP(P @ R) = VN_MLP(P) @ R --> Equivariance: True with eps 1e-04


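So, as shown above, the BatchNorm layer has seriously degraded the equivariance check: `test_close` now needs a significantly larger epsilon (1e-04 instead of 1e-07) to pass. Something is wrong here. Note that the first check changed too: the outputs for the rotated and unrotated inputs are now equal to each other at that same 1e-04 tolerance, whereas after the linear layer they were different. For a random rotation this suggests the BatchNorm outputs are themselves close to zero, so the equivariance check may be passing trivially. A likely cause (to be confirmed) is that the inputs are unit-normalised and the linear layer has a single input channel, so every point has the same per-channel norm and normalising those norms over the batch leaves little more than floating-point noise.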

In [79]:
# Test the VN non-linearity on the batch-norm output: swap to [B, C, 3, N] for
# the layer, then back to [B, C, N, 3] so the rotation matrix can be applied.
x3 = leaky_relu_0(x2.transpose(2, 3)).transpose(2, 3)
x3_R = leaky_relu_0(x2_R.transpose(2, 3)).transpose(2, 3)

print(f'Size of equivariant output: {x3.shape}, [B, C, N, 3]')

# Equivariance: rotating the input must equal rotating the output.
res, eps = test_close(x3_R, x3 @ R)
print(f'Output: VN_MLP(P @ R) = VN_MLP(P) @ R --> Equivariance: {res} with eps {eps:.0e}')

Size of equivariant output: torch.Size([2, 5, 4, 3]), [B, C, N, 3]
Output: VN_MLP(P @ R) = VN_MLP(P) @ R --> Equivariance: True with eps 1e-04
