In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pyro
import numpy as np
import pylab as pl
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from pyro import distributions as dist
from pyro.infer import SVI, Trace_ELBO, Predictive
from pyro.infer.mcmc import MCMC, HMC, NUTS
import seaborn as sns

sns.set_theme()

from scripts import *
from models import *

torch.backends.cudnn.benchmark = True

## Data Loading

In [2]:
from torchvision import transforms as T

transforms = nn.Sequential(
    T.Normalize([128.], [128.])
)
scripted_transforms = torch.jit.script(transforms)

dataset = load_images("../Datasets/notMNIST_small/notMNIST_small", scripted_transforms)

In [3]:
train_size = int(len(dataset)*0.7)
test_size = len(dataset)-train_size
train_set, val_set = torch.utils.data.random_split(dataset, [train_size, test_size])

In [4]:
train_loader = DataLoader(train_set, batch_size=128, num_workers=2, pin_memory=True)
val_loader = DataLoader(train_set, batch_size=1024, pin_memory=True)

### Train classical CNN

In [5]:
cnn = CNN(1, [2,2,2], [32,64,128], n_classes=10, activation=nn.ReLU)
optimizer = torch.optim.Adam(cnn.parameters(), lr=2e-3)
criterion = nn.CrossEntropyLoss()
cnn.fit(train_loader, optimizer, criterion, n_epochs=10)

Epoch: 10	 loss: 0.17304631904924958	 time: 1.7615087032318115


In [6]:
cnn.accuracy(val_loader)

tensor(0.9504, device='cuda:0')

### Train classical MLP

In [7]:
mlp = MLP(784, 10, 1, 512, activation=nn.GELU)
optimizer = torch.optim.Adam(mlp.parameters())
criterion = nn.CrossEntropyLoss()
mlp.fit(train_loader, optimizer, criterion, 10)

Epoch: 10	 loss: 0.08301731591899562	 time: 0.8531670570373535


In [8]:
mlp.accuracy(val_loader)

tensor(0.9696, device='cuda:0')

### Train Bayesian MLP

In [12]:
# I use bayesian optimization to find the best hyperparameters
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Integer, Categorical
from time import time

space  = [Integer(32, 512, "log-uniform", name='batch_size'),
          Real(10**-5, 10**-1, "log-uniform", name='lr'),
          Integer(1, 5, name='num_layers'),
          Integer(1, 1024, "log-uniform", name='hidden_size'),
          Categorical([nn.ReLU, nn.SiLU, nn.LeakyReLU, nn.GELU], name='activation')]

@use_named_args(space)
def objective(**params):
    start = time()
    device = "cuda"
        
    net = MLP(784, device=device,
              num_layers=params["num_layers"],
              hidden_size=params["hidden_size"],
              activation=params["activation"],
              n_classes=10)
    bayesian_net = BayesianNetwork(net)
    
    train_loader = DataLoader(train_set, batch_size=params["batch_size"].item())
    optimizer = pyro.optim.ClippedAdam({"lr":params["lr"]})
    elbo = Trace_ELBO()
    bayesian_net.fit(train_loader, optimizer, elbo, n_epochs=5)
    
    return (1-bayesian_net.accuracy(val_loader, n_samples=10)).item(), time()-start

res_gp = gp_minimize(objective, space, acq_func='EIps', n_calls=50, verbose=True)

Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 9.9972
Function value obtained: 0.8214
Current minimum: 0.8109


In [13]:
# Best parameters
res_gp.x

[512, 0.06783915277773545, 1, 1024, torch.nn.modules.activation.LeakyReLU]

In [11]:
net = MLP(784, 10, 1, 1024, device = "cuda", activation=nn.LeakyReLU)
bayesian_net = BayesianNetwork(net)

train_loader = DataLoader(train_set, batch_size=512)

optimizer = pyro.optim.ClippedAdam({"lr":0.06783915277773545})
elbo = Trace_ELBO()
bayesian_net.fit(train_loader, optimizer, elbo, n_epochs=5)

Epoch: 5	 loss: 1673180.3992585402	 time: 1.7568480968475342


In [12]:
bayesian_net.accuracy(val_loader, n_samples=10)

tensor(0.1700, device='cuda:0')

We see that the accuracty is way worse than a classical MLP, hinting that the posterior distribution isn't well approximated by the guide