In [1]:
#matplotlib inline
import math
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from tensorboardX import SummaryWriter
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from tqdm import tqdm, trange
import pandas as pd
import time
import mpmath
import os
import VMF

import importlib
import FVMF
importlib.reload(FVMF)
import test_ensemble
importlib.reload(test_ensemble)

prefix = "_phoneme_bg_"
# define the summary writer
writer = SummaryWriter()
sns.set()
sns.set_style("dark")
sns.set_palette("muted")
sns.set_color_codes("muted")


# select the device
DEVICE = torch.device("cuda:1")
LOADER_KWARGS = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}
cuda = torch.cuda.set_device(1)

if (torch.cuda.is_available()):
    print("GPUs are used!")
else:
    print("CPUs are used!")

# define the parameters
BATCH_SIZE = 100
TEST_BATCH_SIZE = 100
COND_OPT = False
CLASSES = 5
# TRAIN_EPOCHS = 250
SAMPLES = 1
TEST_SAMPLES = 10
TEMPER = 0.001
TEMPER_PRIOR = 0.001
epochs = 250
pepochs = 50

#prepare the data
data = pd.read_csv('http://www.uio.no/studier/emner/matnat/math/STK2100/data/phoneme.data')
data = data.drop(columns=["row.names"])
data = pd.concat([data,data.g.astype("category").cat.codes.astype(int)],sort=False, axis=1) #get_dummies(data['g'], prefix='phoneme')],sort=False, axis=1)
data = data.drop(columns=["g","speaker"])
data = data.values


np.random.seed(40590)

tr_ids = np.random.choice(4509, 3500, replace = False)
te_ids = np.setdiff1d(np.arange(4509),tr_ids)[0:1000]

dtrain = data[tr_ids,:]

data_mean = dtrain.mean(axis=0)[0:256]
data_std = dtrain.std(axis=0)[0:256]

data[:,0:256] = (data[:,0:256]  - data_mean)/data_std




dtrain = data[tr_ids,:]
dtest = data[te_ids,:]


# set prior parameters
PI = 1
SIGMA_1 = torch.cuda.FloatTensor([math.exp(-0)])
SIGMA_2 = torch.cuda.FloatTensor([math.exp(-6)])


#The net does not like to get larger at a given layer??
l1shape=(256, 10)
l2shape=(10, 10)
l3shape=(10, 10)
l4shape=(10, 5)
layershapes = [l1shape, l2shape, l3shape, l4shape]

epochs = 10
trtimes  = np.zeros(epochs)
# make inference on 10 networks
for i in range(0, 1):
    print(i)
    torch.manual_seed(i)
    net = FVMF.BayesianNetwork(layershapes=layershapes,BN='notbatchnorm',VD='Gaussian',
                               dtrain=dtrain,dtest=dtest,BATCH_SIZE = 100).to(DEVICE)
    #net = VMF.BayesianNetwork(l1=l1shape, l2=l2shape, l3=l3shape,l4=l4shape,BN='notbatchnorm').to(DEVICE)
    optimizer = optim.Adam(net.parameters(), lr=0.007)
    for epoch in range(epochs):

        trtimes[epoch] = FVMF.train(net, dtrain, SAMPLES, optimizer, epoch, i,BATCH_SIZE = 100)
        #print(net.l1.weight_mu.mean())

    res = test_ensemble.test_ensemble(net,dtest,TEST_SAMPLES,TEST_BATCH_SIZE,BATCH_SIZE,CLASSES,DEVICE)

    #np.savetxt("soundGmaccuracies_" + str(i) + ".csv", res, delimiter=",")



GPUs are used!
Classes loaded
FVMF RELOADED
GPUs are used!
Classes loaded
FVMF RELOADED
GPUs are used!
Classes loaded
GPUs are used!
0
Random Init Utilized




1
loss: tensor(353.2480, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(44.4381, device='cuda:1', grad_fn=<NllLossBackward0>)
2
loss: tensor(306.4925, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(17.9101, device='cuda:1', grad_fn=<NllLossBackward0>)
3
loss: tensor(284.4404, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(13.4873, device='cuda:1', grad_fn=<NllLossBackward0>)
4
loss: tensor(265.6693, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(13.7656, device='cuda:1', grad_fn=<NllLossBackward0>)
5
loss: tensor(247.6950, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(13.0174, device='cuda:1', grad_fn=<NllLossBackward0>)
6
loss: tensor(228.9311, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(13.9477, device='cuda:1', grad_fn=<NllLossBackward0>)
7
loss: tensor(210.0452, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihoo

In [2]:
w_mu_nodewise = []
b_mu = []
for i,layer in enumerate(net.layers):
    w_mu_nodewise += [layer.weight_mu]
    b_mu += [layer.bias_mu]
    #print('i:',i)
    #print('layer:',layer,'\n')
#print(w_mu_nodewise)
#print(b_mu_nodewise)
r"""
w_mu5 = net.l5.weight_mu
w_mu5 = w_mu5.reshape(l3shape[0]*l3shape[1]).to(DEVICE)
#net.l3.weight_rho
b_mu5 = net.l5.bias_mu.to(DEVICE) #5
#net.l3.bias_rho
"""

'\nw_mu5 = net.l5.weight_mu\nw_mu5 = w_mu5.reshape(l3shape[0]*l3shape[1]).to(DEVICE)\n#net.l3.weight_rho\nb_mu5 = net.l5.bias_mu.to(DEVICE) #5\n#net.l3.bias_rho\n'

In [4]:
#import importlib
#import os
import FVMF
importlib.reload(FVMF)

#import VMF

#import importlib
#importlib.reload(VMF)

prefix = "_phoneme_bg_"
# define the summary writer
writer = SummaryWriter()
sns.set()
sns.set_style("dark")
sns.set_palette("muted")
sns.set_color_codes("muted")


# select the device
DEVICE = torch.device("cuda:1")
LOADER_KWARGS = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}
cuda = torch.cuda.set_device(1)

# define the parameters
BATCH_SIZE = 100
TEST_BATCH_SIZE = 100
batch_size = 100
COND_OPT = False
CLASSES = 5
# TRAIN_EPOCHS = 250
SAMPLES = 1
TEST_SAMPLES = 10
TEMPER = 0.001
TEMPER_PRIOR = 0.001
pepochs = 50
epochs = 250

#prepare the data
data = pd.read_csv('http://www.uio.no/studier/emner/matnat/math/STK2100/data/phoneme.data')
data = data.drop(columns=["row.names"])
data = pd.concat([data,data.g.astype("category").cat.codes.astype(int)],sort=False, axis=1) #get_dummies(data['g'], prefix='phoneme')],sort=False, axis=1)
data = data.drop(columns=["g","speaker"])
data = data.values


np.random.seed(40590)

tr_ids = np.random.choice(4509, 3500, replace = False)
te_ids = np.setdiff1d(np.arange(4509),tr_ids)[0:1000]

dtrain = data[tr_ids,:]

data_mean = dtrain.mean(axis=0)[0:256]
data_std = dtrain.std(axis=0)[0:256]

data[:,0:256] = (data[:,0:256]  - data_mean)/data_std




dtrain = data[tr_ids,:]
dtest = data[te_ids,:]

# set prior parameters
PI = 1
SIGMA_1 = torch.cuda.FloatTensor([math.exp(-0)])
SIGMA_2 = torch.cuda.FloatTensor([math.exp(-6)])


epochs = 10
trtimes  = np.zeros(epochs)
#w_mu = [w_mu1, w_mu2, w_mu3, w_mu4]
#b_mu = [b_mu1, b_mu2, b_mu3, b_mu4]

#w_mu_nodewise = [w_mu1_nodewise,w_mu2_nodewise,w_mu3_nodewise,w_mu4_nodewise]
#b_mu_nodewise = [b_mu1_nodewise,b_mu2_nodewise,b_mu3_nodewise,b_mu4_nodewise]
# make inference on 10 networks

for i in range(0, 1):
    print(i)
    torch.manual_seed(i)
    net2 = FVMF.BayesianNetwork(#w_mu = w_mu, b_mu = b_mu, 
                                #w_mu = None, b_mu = None,
                                #w_mu = w_mu_nodewise, b_mu = b_mu,
                                layershapes = layershapes,
                                dtrain=dtrain, dtest=dtest,
                                VD='vmf',
                                b_kappa=torch.Tensor(1).uniform_(4,4.1),
                                w_kappa=torch.Tensor(1).uniform_(6,6.1),
                                Temper = 1)
    
    #for j,p in enumerate(net2.l1.parameters()):    
    #    p.requires_grad_(False)
    #    
    #for j,p in enumerate(net2.l2.parameters()):
    #    p.requires_grad_(False)
    
    optimizer = optim.Adam(net2.parameters(), lr=0.14)
    
    
    for epoch in range(epochs):

        trtimes[epoch] = FVMF.train(net2, dtrain, SAMPLES, optimizer, epoch, i,BATCH_SIZE = 100)
        print('max:',net2.weight_mu[1].max())
        print('norm:',torch.norm(net2.weight_mu[1]))

    res = test_ensemble.test_ensemble(net2,dtest,TEST_SAMPLES,TEST_BATCH_SIZE,BATCH_SIZE,CLASSES,DEVICE)

    #np.savetxt("soundGmaccuracies_" + str(i) + ".csv", res, delimiter=",")

FVMF RELOADED
GPUs are used!
Classes loaded
0
Random Init Utilized


  v0 = torch.cat([v0, torch.tensor(w0[det >= 0]).to(DEVICE)])


1
loss: tensor(193.1748, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(56.0445, device='cuda:1', grad_fn=<NllLossBackward0>)
max: tensor(3.5429, device='cuda:1', grad_fn=<MaxBackward1>)
norm: tensor(14.0965, device='cuda:1', grad_fn=<NormBackward1>)
2
loss: tensor(185.5356, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(23.6395, device='cuda:1', grad_fn=<NllLossBackward0>)
max: tensor(4.3328, device='cuda:1', grad_fn=<MaxBackward1>)
norm: tensor(15.9430, device='cuda:1', grad_fn=<NormBackward1>)
3
loss: tensor(172.7780, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(39.8553, device='cuda:1', grad_fn=<NllLossBackward0>)
max: tensor(4.5436, device='cuda:1', grad_fn=<MaxBackward1>)
norm: tensor(17.2230, device='cuda:1', grad_fn=<NormBackward1>)
4
loss: tensor(153.2977, device='cuda:1', grad_fn=<AddBackward0>)
negative_log_likelihood: tensor(25.0446, device='cuda:1', grad_fn=<NllLossBackward0>)
max: tensor(5.201

In [5]:
import torch
import matplotlib.pyplot as plt
np.random.seed(42069)
torch.manual_seed(42069) 

PL = [torch.Tensor([1, 1]),torch.Tensor([5, 1]),torch.Tensor([1, 5]),torch.Tensor([5, 5]),torch.Tensor([3, 3])]
cov = torch.eye(2)
n = 1000

plt.figure(figsize=(10, 10), dpi=500)

DF = torch.zeros((len(PL),n,1,3)) #multiple of 4
for i, MU in enumerate(PL): #enumerate starts from and including 0.
    distrib = torch.distributions.MultivariateNormal(loc=MU, covariance_matrix=cov)
    DATA_ = distrib.sample((n,1))

    DATA  = torch.zeros([n, 1, 3])
    DATA[:,:,:2] = DATA_
    DATA[:,:,2]  = i
    DF[i,:,:,:] = DATA
    
    #x = DATA[:,0,0]
    #y = DATA[:,0,1]
    #plt.plot(x,y,'.',markersize=1.25)
#plt.show()

C = int(3*n/4)

#DATA_train = torch.zeros((len(PL)*C,3))
#DATA_test = torch.zeros((len(PL)*(n-C),3))

DATA = DF.reshape(len(PL)*n,3)
print('DATA:',DATA,'len(DATA):',len(DATA),'mean dtrain:',DATA.mean(axis=0)[2])

data_mean = DATA.mean(axis=1)[0:2]
data_std = DATA.std(axis=1)[0:2]

DATA[:,0:2] = (DATA[:,0:2]  - data_mean)/data_std
print('DATA normalized:',DATA,'len(DATA) normalized:',len(DATA),'mean dtrain normalized:',DATA.mean(axis=0)[2])
tr_ids = np.random.choice(5000, 3500, replace = False)

dtrain = DATA[tr_ids,:]
dtest = DATA[-tr_ids,:]

print('\n','dtrain:',dtrain, 'len(dtrain):',len(dtrain),'mean dtrain:',dtrain.mean(axis=0)[2])
print('\n','dtest:',dtest, 'len(dtest):',len(dtest),'mean dtest:',dtest.mean(axis=0)[2])

DATA: tensor([[ 2.6927, -0.2502,  0.0000],
        [ 1.8529,  1.9803,  0.0000],
        [ 1.7387, -0.8557,  0.0000],
        ...,
        [ 2.3355,  2.9429,  4.0000],
        [ 2.4691,  3.4135,  4.0000],
        [ 2.9981,  0.6117,  4.0000]]) len(DATA): 5000 mean dtrain: tensor(2.)
DATA normalized: tensor([[ 1.1513, -1.3785,  0.0000],
        [ 0.6366,  0.6339,  0.0000],
        [ 0.5666, -1.9248,  0.0000],
        ...,
        [ 0.9324,  1.5024,  4.0000],
        [ 1.0143,  1.9270,  4.0000],
        [ 1.3385, -0.6009,  4.0000]]) len(DATA) normalized: 5000 mean dtrain normalized: tensor(2.)

 dtrain: tensor([[ 0.7709,  2.9068,  4.0000],
        [ 1.7892,  0.1191,  1.0000],
        [ 0.6227, -1.0395,  0.0000],
        ...,
        [ 2.7307,  2.7459,  3.0000],
        [-0.4424, -0.8526,  0.0000],
        [ 2.5481,  3.8702,  3.0000]]) len(dtrain): 3500 mean dtrain: tensor(1.9989)

 dtest: tensor([[ 0.1085,  0.1662,  0.0000],
        [ 3.3064,  3.8735,  3.0000],
        [ 1.0038,  4.0304,  

<Figure size 5000x5000 with 0 Axes>

In [7]:
import math
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from tensorboardX import SummaryWriter
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from tqdm import tqdm, trange
import pandas as pd
import time
import mpmath
import os
import VMF
import importlib

import FVMF
importlib.reload(FVMF)

# define the summary writer
writer = SummaryWriter()
# select the device
DEVICE = torch.device("cuda:1")
LOADER_KWARGS = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}
cuda = torch.cuda.set_device(1)

# define the parameters

COND_OPT = False
CLASSES = 5
# TRAIN_EPOCHS = 250
SAMPLES = 1
TEST_SAMPLES = 10
TEMPER = 0.001
TEMPER_PRIOR = 0.001
pepochs = 50
epochs = 250

l1shape=(2, 5)
l2shape=(5, 5)
l3shape=(5, 5)
l4shape=(5, 5)
layershapes = [l1shape, l2shape, l3shape, l4shape]

# set prior parameters
PI = 1
SIGMA_1 = torch.cuda.FloatTensor([math.exp(-0)])
SIGMA_2 = torch.cuda.FloatTensor([math.exp(-6)])
Sim_data_shape = (0,2,2,3)

epochs = 10
trtimes  = np.zeros(epochs)
#w_mu = [w_mu1, w_mu2, w_mu3, w_mu4]
#b_mu = [b_mu1, b_mu2, b_mu3, b_mu4]

#w_mu_nodewise = [w_mu1_nodewise,w_mu2_nodewise,w_mu3_nodewise,w_mu4_nodewise]
#b_mu_nodewise = [b_mu1_nodewise,b_mu2_nodewise,b_mu3_nodewise,b_mu4_nodewise]
# make inference on 10 networks

for i in range(0, 1):
    print(i)
    torch.manual_seed(i)
    net3 = FVMF.BayesianNetwork(#w_mu = w_mu, b_mu = b_mu, 
                                #w_mu = None, b_mu = None,
                                #w_mu = w_mu_nodewise, b_mu = b_mu,
                                layershapes = layershapes,
                                dtrain=dtrain, dtest=dtest,
                                VD='vmf',
                                b_kappa=torch.Tensor(1).uniform_(4,4.1),
                                w_kappa=torch.Tensor(1).uniform_(6,6.1),
                                Temper = 1)
    
    #for j,p in enumerate(net2.l1.parameters()):    
    #    p.requires_grad_(False)
    #    
    #for j,p in enumerate(net2.l2.parameters()):
    #    p.requires_grad_(False)
    
    optimizer = optim.Adam(net3.parameters(), lr=0.14)
    
    
    for epoch in range(epochs):

        trtimes[epoch] = FVMF.train(net3, dtrain, SAMPLES, optimizer, epoch, i,BATCH_SIZE = 100)
        print('max:',net2.weight_mu[1].max())
        print('norm:',torch.norm(net3.weight_mu[1]))

    res = FVMF.test_ensemble(net3,dtest,TEST_SAMPLES,TEST_BATCH_SIZE,BATCH_SIZE,CLASSES,DEVICE,shape = Sim_data_shape)

    #np.savetxt("soundGmaccuracies_" + str(i) + ".csv", res, delimiter=",")

FVMF RELOADED
GPUs are used!
Classes loaded
0
Random Init Utilized


TypeError: expected TensorOptions(dtype=long int, device=cpu, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt)) (got TensorOptions(dtype=float, device=cpu, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt)))

In [None]:
r"""
What if the problem is that since the mu's in the Gaussian is (out,in), and the vMF is out*in,
this could mean that we have "strethed" a single vMF pdf over all the parameters, while in the Gaussian we have made one for each output?

I don't know, also intuitively this should be the case, since the whole point of the vMF is the norm 1, which obviously
will require that it for each forward-pass is only one massive pdf for all inputs and outputs.

Ok, this can get rough, I will try my best to reshape the w_mu's and b_mu's the best I can, perhaps this will work just fine without too much tuning...
Hopefully.

This doe not seem to ave worked. I don't konw exactly what the problem is. Per haps it is a good idea to consult the new loss function
suggested in the paper that made the code we based our vMF on?

It is very strange that the loss is not at all affected by completely ridiculous learning rates...

Remember that vMF makes the norm of the weights and biases 1, not the forward pass of the x's. Hence the advantage is that the gradient
will not explode, since the backward pass of it will also be approx. 1. In batchnorm, maybe the gradient can explode? Since the weights 
can be whatever?


IMPORTANT: The gaussian neuralnet will also collapse to 257 if I apply more than 3 layers. This must be somehow related to the similar
behavior in the vMF when the size of each layer exceeds 3. Per haps there is an error in the loss afterall?
However, in the Gaussian case, increasing the learning rate by a factor of 10 solved the issue. This makes me suspect it is the mathematical
properties of the loss function, rather than incorrect implementation.
"""

#When l4 is (3,5):
r"""
File ~/projects/BNN/AliaksandrFolder/FVMF.py:289, in vMF.sample(self, N, rsf)
    287 e1mu = torch.zeros(d, 1).to(DEVICE)
    288 e1mu[0, 0] = 1.0
--> 289 e1mu = e1mu - self.mu if len(self.mu.shape) == 2 else e1mu - self.mu.unsqueeze(1) #e1mu.shape = (1,self.x_dim). mu_unnorm.shape = (mu_unnorm)
    290 e1mu = e1mu / norm(e1mu, dim=0).to(DEVICE)
    291 samples = samples - 2 * (samples @ e1mu) @ e1mu.t()

RuntimeError: The size of tensor a (15) must match the size of tensor b (9) at non-singleton dimension 0
"""

#When l4 is (5,5):
r"""
File ~/projects/BNN/AliaksandrFolder/FVMF.py:289, in vMF.sample(self, N, rsf)
    287 e1mu = torch.zeros(d, 1).to(DEVICE)
    288 e1mu[0, 0] = 1.0
--> 289 e1mu = e1mu - self.mu if len(self.mu.shape) == 2 else e1mu - self.mu.unsqueeze(1) #e1mu.shape = (1,self.x_dim). mu_unnorm.shape = (mu_unnorm)
    290 e1mu = e1mu / norm(e1mu, dim=0).to(DEVICE)
    291 samples = samples - 2 * (samples @ e1mu) @ e1mu.t()

RuntimeError: The size of tensor a (25) must match the size of tensor b (15) at non-singleton dimension 0
"""
#These errors above were caused by my initialization being wrong. I copy paster mu_3 for layer4, and forgot to change to mu_4. So now 
#I always get the error below.


#in all cases now: 

r"""
It seems that the whole thing does not progress at all. We just get the warning and then no further output.

self.l4(x, sample)

--> self.bias.sample()

It always get's stuck there!!

Specifically, it get's stuck in the while loop:

while len(v0) < N:
            eps = beta.sample([1, rsf * (N - len(v0))]).squeeze().to(DEVICE)
            uns = uniform.sample([1, rsf * (N - len(v0))]).squeeze().to(DEVICE)
            w0 = (1 - (1 + bb) * eps) / (1 - (1 - bb) * eps)
            t0 = (2 * aa * bb) / (1 - (1 - bb) * eps)
            det = (d - 1) * t0.log() - t0 + dd - uns.log()
            v0 = torch.cat([v0, torch.tensor(w0[det >= 0]).to(DEVICE)])
            if len(v0) > N:
                v0 = v0[:N]
                break
"""

r"""
From further investigations it is clear that the error lies in w0[det >= 0] consistently being an empty Tensor.

Even further, bb is 0 here which it usually is not. That must definitely indicate something is wrong.

Adjusting the initialization of kappa to be 9 or less on both weights and biases makes the code run, 
but posterior collapse is back. Increasing kappa seems to increase the compute aswell... however, getting the kappa inits
closer to 10 seems to also help avoid the posterior collapse. And the lower bound increased also helps, looks like 3 is optimal.
"""


r"""
It seems that Torch likes that each of the bias_mu's and weight_mu's from each layer are separately registered with name as an nn.parameter.
"""

r"""
Currently, for 4 layers with 3 hiddenwidth, it seems around 10 epochs with .14 learning rate is optimal for testperformance.
I suspect this is because we do not have the modelcapacity to go beyond the overfiting case just yet, and must settle for the classical
best-fit.
"""

r"""
The problem for both the random initialization and the variable length Gaussian has the same root. It is that the mu's and rho's 
are not being registered as parameters per layer to begin with!

And since this registering works just fine when we are directly assigning self.layer's to be each layer the parameters are registered correctly,
it must be the case that this part in the BayesianNetwork's initializer is where the problem originates.
"""

r"""
Part of the reason why the layerwise vMF might not be that performant, 
is that it forces one kappa on every weight in the entire layer.
"""

r"""
The way the mu of the weights are used in the vMF is that any mu is accepted, and then the mu's are normalized. 
I think this is the reason for why the net becomes intractably slow as we increase the layers. 
If it is possible to set the weight_mu parameters to be the normalized versions for every epoch I think that would be great.

Or perhaps build into the loss a term that penalizes the weights from deviating from norm=1...

I don't know exactly where to put this
"""

In [None]:
r"""
Plan:

First solve the uncertainty estimation.

Then solve a couple more image-classification data sets
OPTIONALLY: Solve some more regression or non-image classification sets.

Solve the nn.Parameter(the Mu's) normalization business.

Solve having separate kappas for each dimension of the vMF.
"""