We train models with batchout layers on MNIST. The architecture is the one described in the paper. 
There are seven models in total: the first has no batchout, and the others have batchout at different locations. The model architectures can be found in the "architectures.py" module.
The results of the batchout models on the test set are consistent with the paper.

In [4]:
# Import packages, use cuda, initialize the data loaders
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from math import exp
from batchout import BatchOut
import time

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# MNIST train and test splits, downloaded into ./data on first use and
# converted to tensors by ToTensor.
mnist_train = datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
mnist_test = datasets.MNIST(
    root="./data",
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Batches of 128; only the training batches are reshuffled.
train_loader = DataLoader(dataset=mnist_train, batch_size=128, shuffle=True)
test_loader = DataLoader(dataset=mnist_test, batch_size=128, shuffle=False)



cuda


In [5]:
# Import the models
from architectures import *

# A function that performs standard training/evaluation
def epoch(loader, model, opt=None):
    """Run one full pass over ``loader`` and report error rate and mean loss.

    When ``opt`` is given, the model is switched to training mode and its
    parameters are updated after every batch. When ``opt`` is ``None``, the
    model is switched to evaluation mode and the forward passes run with
    autograd disabled, so no graph is built for batches that are never
    back-propagated.

    Args:
        loader: Iterable of ``(X, y)`` batches that also exposes ``dataset``;
            ``len(loader.dataset)`` is the denominator of both averages.
        model: The ``nn.Module`` to train or evaluate.
        opt: Optimizer used for the update step, or ``None`` to evaluate only.

    Returns:
        A tuple ``(error_rate, mean_loss)``: the fraction of misclassified
        samples and the per-sample average cross-entropy loss.
    """
    training = opt is not None
    criterion = nn.CrossEntropyLoss()

    # Batches are moved to wherever the model's parameters live. Only a model
    # without parameters falls back to the notebook-level ``device``.
    try:
        batch_device = next(model.parameters()).device
    except StopIteration:
        batch_device = device

    # The mode does not change during a pass, so set it once up front.
    if training:
        model.train()
    else:
        model.eval()

    total_loss, total_err = 0., 0.
    with torch.set_grad_enabled(training):
        for X, y in loader:
            X, y = X.to(batch_device), y.to(batch_device)
            yp = model(X)
            loss = criterion(yp, y)
            if training:
                opt.zero_grad()
                loss.backward()
                opt.step()

            total_err += (yp.argmax(dim=1) != y).sum().item()
            # The criterion returns a batch mean; re-weight by batch size so a
            # short final batch does not skew the dataset-level average.
            total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)


In [7]:
'''
Now we train the models that contain batchout layers. The accuracies obtained are:
1. Batchout at all locations (none removed): 94 %
2. Batchout after Conv1: 95 %
3. Batchout after Conv2: 93 %
4. Batchout after both Conv1 and Conv2: 92 %
5. Batchout after f1: 93 %
6. Batchout after Conv2 and f1: 92 %

The results are consistent with the paper
'''
# Batchout variants to train one after another; each model's position in this
# list becomes the file name of its saved weights.
models = [
    batchout_model_cnn_all,
    batchout_model_cnn_c1,
    batchout_model_cnn_c2,
    batchout_model_cnn_f1,
    batchout_model_cnn_c12,
    batchout_model_cnn_c2f1,
]

# Learning rate to switch to once the given epoch index has finished.
lr_after_epoch = {4: 1e-3, 10: 1e-5}

for i, model in enumerate(models):
    model.to(device)
    opt = optim.SGD(model.parameters(), lr=1e-2)

    print('Training the model')
    start = time.time()
    for t in range(15):
        train_err, train_loss = epoch(train_loader, model, opt)
        test_err, test_loss = epoch(test_loader, model)

        # The change is made after this epoch's passes, so it applies from the
        # next epoch onward.
        if t in lr_after_epoch:
            for param_group in opt.param_groups:
                param_group["lr"] = lr_after_epoch[t]

        # Report train/test error every fourth epoch and on the last one.
        if t == 14 or t % 4 == 0:
            print("{:.6f}".format(train_err), "{:.6f}".format(test_err), sep="\t")

    # Stop the clock before saving so the timing covers training only.
    end = time.time()
    torch.save(model.state_dict(), str(i) + '.pt')
    print(model)
    minutes = (end - start) / 60
    print("Time taken is: {}".format(minutes))


Training the model
0.112667	0.055200
0.067433	0.032300
0.063117	0.030500
0.060267	0.030500
0.061683	0.031800
Sequential(
  (0): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
  (1): BatchOut()
  (2): ReLU()
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
  (5): BatchOut()
  (6): ReLU()
  (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (8): Flatten()
  (9): Dropout(p=0.5, inplace=False)
  (10): Linear(in_features=800, out_features=256, bias=True)
  (11): BatchOut()
  (12): ReLU()
  (13): Dropout(p=0.5, inplace=False)
  (14): Linear(in_features=256, out_features=10, bias=True)
)
Time taken is: 6.276788592338562
Training the model
0.115750	0.053300
0.066017	0.033700
0.060017	0.030500
0.060400	0.031100
0.058333	0.030700
Sequential(
  (0): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
  (1): BatchOut()
  (2): ReLU()

In [None]:
'''
Consider the original architecture. Train the model and evaluate it. 
We obtain 93.5 % accuracy on the test set. Further accuracy can be obtained by decreasing the learning rate, etc.
'''
# Train model_cnn with plain SGD for 15 epochs, then save its weights.
model_cnn.to(device)
opt = optim.SGD(model_cnn.parameters(), lr=1e-2)

print('Training the model')

# Learning rate to switch to once the given epoch index has finished.
lr_after_epoch = {4: 1e-3, 10: 1e-5}

start = time.time()
for t in range(15):
    train_err, train_loss = epoch(train_loader, model_cnn, opt)
    test_err, test_loss = epoch(test_loader, model_cnn)

    # The change is made after this epoch's passes, so it applies from the
    # next epoch onward.
    if t in lr_after_epoch:
        for param_group in opt.param_groups:
            param_group["lr"] = lr_after_epoch[t]

    # Report train/test error every fourth epoch and on the last one.
    if t == 14 or t % 4 == 0:
        print("{:.6f}".format(train_err), "{:.6f}".format(test_err), sep="\t")

# Stop the clock before saving so the timing covers training only.
end = time.time()
torch.save(model_cnn.state_dict(), "mnist_cnn.pt")
minutes = (end - start)/60
print("Time taken is: {}".format(minutes))