In [1]:
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers

import torch
from torch import nn

import torchvision

import numpy as np
import matplotlib.pyplot as plt

<b>TensorFlow/Keras:</b>
``` python
tf.keras.layers.GroupNormalization(
    groups=32, axis=-1, epsilon=0.001, center=True, scale=True,
    beta_initializer='zeros', gamma_initializer='ones',
    beta_regularizer=None, gamma_regularizer=None,
    beta_constraint=None, gamma_constraint=None, **kwargs)
```
Group Normalization divides the channels into groups and computes within each group the mean and variance for normalization. Empirically, its accuracy is more stable than batch norm in a wide range of small batch sizes, if learning rate is adjusted linearly with batch sizes.

Relation to Layer Normalization: If the number of groups is set to 1, then this operation becomes nearly identical to Layer Normalization (see Layer Normalization docs for details).

Relation to Instance Normalization: If the number of groups is set to the input dimension (number of groups is equal to number of channels), then this operation becomes identical to Instance Normalization. You can achieve this via groups=-1.

<b>PyTorch:</b>
``` python
torch.nn.GroupNorm(num_groups, num_channels, eps=1e-05, affine=True, device=None, dtype=None)
```
Applies Group Normalization over a mini-batch of inputs.
This layer implements the operation as described in the paper Group Normalization.

$y=\dfrac{x-\mathrm{E}[x]}{\sqrt{\mathrm{Var}[x]+\epsilon}}\ast \gamma + \beta$

The input channels are separated into num_groups groups, each containing num_channels / num_groups channels. num_channels must be divisible by num_groups. The mean and standard-deviation are calculated separately over each group. $\gamma$ and $\beta$ are learnable per-channel affine transform parameter vectors of size num_channels if affine is True. The standard-deviation is calculated via the biased estimator, equivalent to torch.var(input, unbiased=False).

This layer uses statistics computed from input data in both training and evaluation modes.

# When parameters ($\beta$ and $\gamma$) are initialized as 0 and 1, respectively

In [2]:
# Compare both implementations with their default affine parameters
# (gamma = 1, beta = 0), feeding them the same random data.
# PyTorch expects channels-first (N, C, H, W) while the Keras layer is
# configured channels-last (axis=-1), hence the permute before conversion.
input_torch = torch.randn(1, 6, 2, 2) # batch, channel, h, w
input_tf = tf.convert_to_tensor(input_torch.permute((0,2,3,1))) # batch, h, w, channel

# define group normalization layers: 6 channels in 3 groups, identical epsilon on both sides
norm_tf = layers.GroupNormalization(groups=3, axis=-1, epsilon=1e-04)
norm_torch = nn.GroupNorm(num_groups=3,num_channels=6, eps=1e-04)

out1 = norm_tf(input_tf)
out2 = norm_torch(input_torch)

# bring both results into the same channels-last layout and flatten them
out_tf_flat = out1.numpy().reshape(-1)
out_torch_flat = out2.detach().permute((0,2,3,1)).numpy().reshape(-1)

print('tensorflow output: \n', out_tf_flat)
print('pytorch output: \n', out_torch_flat)

# verify the agreement numerically instead of comparing the printouts by eye;
# the tolerances allow for float32 rounding differences between the frameworks
np.testing.assert_allclose(out_tf_flat, out_torch_flat, rtol=1e-4, atol=1e-5)

tensorflow output: 
 [-0.8580077   0.4036887   1.3407819   0.55726683 -0.21924813 -0.18920709
 -0.22301942  0.35651854 -0.913093   -0.099995    0.99335134  0.39631253
  2.0230536   0.45264104  1.4933885  -1.5806415  -2.182316    0.31435674
 -0.6877071  -1.4671675  -0.38636363 -0.41134393  1.3143917  -0.42764127]
pytorch output: 
 [-0.8580077   0.4036887   1.3407818   0.55726683 -0.21924813 -0.18920709
 -0.22301944  0.35651857 -0.913093   -0.099995    0.9933514   0.3963126
  2.0230534   0.45264104  1.4933885  -1.5806415  -2.182316    0.3143568
 -0.6877071  -1.4671676  -0.38636363 -0.41134396  1.3143919  -0.42764127]


In [3]:
# inspect the affine parameters held by each layer
print('tensorflow:')
display(norm_tf.weights)  # list of Keras variables (gamma, beta)

print('pytorch:')
for torch_param in (norm_torch.weight, norm_torch.bias):  # gamma, then beta
    display(torch_param)

tensorflow:


[<tf.Variable 'group_normalization/gamma:0' shape=(6,) dtype=float32, numpy=array([1., 1., 1., 1., 1., 1.], dtype=float32)>,
 <tf.Variable 'group_normalization/beta:0' shape=(6,) dtype=float32, numpy=array([0., 0., 0., 0., 0., 0.], dtype=float32)>]

pytorch:


Parameter containing:
tensor([1., 1., 1., 1., 1., 1.], requires_grad=True)

Parameter containing:
tensor([0., 0., 0., 0., 0., 0.], requires_grad=True)

# What if the same random weights are used?


In [4]:
n_batch = 2
n_channel = 6
n_group = 3
n_size = 3 # height & width of input

# random affine parameters that will be loaded into both layers
gamma = torch.randn(n_channel, requires_grad=True)
beta = torch.randn(n_channel, requires_grad=True)

# define group normalization layer
norm_tf = layers.GroupNormalization(groups=n_group, axis=-1, epsilon=1e-04)
norm_torch = nn.GroupNorm(num_groups=n_group,num_channels=n_channel, eps=1e-04)

# Keras layers create their variables lazily (on the first call or on build()),
# so build the layer explicitly for the channels-last input shape; this makes
# set_weights() usable without a throw-away forward pass.
norm_tf.build((n_batch, n_size, n_size, n_channel)) # batch, h, w, channel

# load the same gamma / beta into both layers
norm_tf.set_weights([gamma.detach().numpy(), beta.detach().numpy()])

norm_torch.weight = torch.nn.Parameter(gamma) # gamma
norm_torch.bias = torch.nn.Parameter(beta) # beta

# print the changed weights
print('tf gamma: \n',norm_tf.weights[0])
print('torch gamma: \n',norm_torch.weight)
print('tf beta: \n',norm_tf.weights[1])
print('torch beta: \n',norm_torch.bias)

# define random inputs
input_torch = torch.randn(n_batch, n_channel, n_size, n_size) # batch, channel, h, w
input_tf = tf.convert_to_tensor(input_torch.permute((0,2,3,1))) # batch, h, w, channel

out1 = norm_tf(input_tf)
out2 = norm_torch(input_torch)

# bring both results into the same channels-last layout and flatten them
out_tf_flat = out1.numpy().reshape((-1))
out_torch_flat = out2.detach().permute((0,2,3,1)).numpy().reshape((-1))

print('tensorflow output: \n', out_tf_flat)
print('pytorch output: \n', out_torch_flat)

# verify the agreement numerically instead of comparing the printouts by eye;
# the tolerances allow for float32 rounding differences between the frameworks
np.testing.assert_allclose(out_tf_flat, out_torch_flat, rtol=1e-4, atol=1e-5)

tf gamma: 
 <tf.Variable 'group_normalization_1/gamma:0' shape=(6,) dtype=float32, numpy=
array([ 0.3224049 , -0.06129041, -0.7555029 ,  0.45168865, -0.60566837,
        0.93623275], dtype=float32)>
torch gamma: 
 Parameter containing:
tensor([ 0.3224, -0.0613, -0.7555,  0.4517, -0.6057,  0.9362],
       requires_grad=True)
tf beta: 
 <tf.Variable 'group_normalization_1/beta:0' shape=(6,) dtype=float32, numpy=
array([-0.84223557,  0.21315366, -0.75493366,  0.40492174, -2.2166514 ,
       -0.3456627 ], dtype=float32)>
torch beta: 
 Parameter containing:
tensor([-0.8422,  0.2132, -0.7549,  0.4049, -2.2167, -0.3457],
       requires_grad=True)
tensorflow output: 
 [-1.02924109e+00  2.23873124e-01 -1.03663230e+00  6.29164875e-02
 -2.36285210e+00 -7.61391222e-02 -8.14058363e-01  8.55055302e-02
 -7.74147749e-01  7.19713628e-01 -2.17357612e+00  1.51769447e+00
 -8.52565587e-01  3.64336073e-01 -1.39513421e+00 -4.89183366e-01
 -6.63091779e-01 -2.81906128e-03 -6.24097049e-01  1.92274556e-01
 -5.1