We create a CNN network, architecture same as batchout models. We train this network adversarially and then perform it's evaluation on test set. We note the accuracies obtained against the fgsm attacks

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import foolbox as fb
from architectures import model_cnn

# Load the weights of our models
#model_cnn.load_state_dict(torch.load("models/mnist_cnn.pt"))

In [14]:
def epoch(loader, model, opt=None):
    """Standard training/evaluation epoch over the dataset"""
    total_loss, total_err = 0.,0.
    for X,y in loader:
        X,y = X.to(device), y.to(device)
        if not opt:
            model.eval()
            yp = model(X)
            loss = nn.CrossEntropyLoss()(yp,y)
        if opt:
            model.train()
            yp = model(X)
            loss = nn.CrossEntropyLoss()(yp, y)
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_err += (yp.max(dim=1)[1] != y).sum().item()
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

def epoch_adversarial(loader, model, attack, epsilon, opt=None, **kwargs):
    """Adversarial training/evaluation epoch over the dataset"""
    total_loss, total_err = 0.,0.
    for X,y in loader:
        X,y = X.to(device), y.to(device)
        # We update our delta unlike the standard epoch and the train the model to be robust
        delta = epsilon * attack(model, X, y)

        if not opt:
            model.eval()
            yp = model(X + delta)
            loss = nn.CrossEntropyLoss()(yp, y)

        if opt:
            model.train()
            yp = model(X + delta)
            loss = nn.CrossEntropyLoss()(yp, y)
            opt.zero_grad()
            loss.backward()
            opt.step()


        total_err += (yp.max(dim=1)[1] != y).sum().item()
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)




In [15]:
def fgsm(model, X, y):
    """ Construct FGSM adversarial examples on the examples X"""
    delta = torch.zeros_like(X, requires_grad=True)
    loss = nn.CrossEntropyLoss()(model(X + delta), y)
    loss.backward()
    return delta.grad.detach().sign() # We will multiply epsilon in the epoch functions


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

mnist_train = datasets.MNIST("./data", train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST("./data", train=False, download=True, transform=transforms.ToTensor())

train_loader = DataLoader(mnist_train, batch_size = 512, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size = 512, shuffle=False)


cuda


In [18]:
for i,e in enumerate([0.1, 0.2, 0.3, 0.4, 0.5]):
    from architectures import model_cnn
    model_cnn.to(device)
    opt = optim.SGD(model_cnn.parameters(), lr=1e-1)
    print('-' * 10, 'epsilon: {}'.format(e), '-'*10)
    for t in range(15):
        train_err, train_loss = epoch_adversarial(train_loader, model_cnn, fgsm, e, opt)
        test_err, test_loss = epoch(test_loader, model_cnn)
        adv_test_err, adv_test_loss = epoch_adversarial(test_loader, model_cnn, fgsm, e)
        if t == 4:
            for param_group in opt.param_groups:
                param_group["lr"] = 1e-2
        if t == 10:
            for param_group in opt.param_groups:
                param_group["lr"] = 1e-4
        print(*("{:.6f}".format(i) for i in (train_err, test_err, adv_test_err)), sep="\t")
    
    torch.save(model_cnn.state_dict(), str(int(e * 10)) + '.pt')


---------- epsilon: 0.1 ----------
0.106133	0.022100	0.095000
0.099400	0.021400	0.098000
0.094583	0.018000	0.083900
0.088017	0.015500	0.070600
0.085333	0.015500	0.072200
0.080100	0.013600	0.067500
0.079083	0.013600	0.067400
0.079267	0.013300	0.066200
0.078350	0.013200	0.067400
0.079317	0.013500	0.067500
0.079417	0.013000	0.066300
0.078117	0.013000	0.066400
0.077433	0.013000	0.066300
0.077683	0.013000	0.066500
0.077317	0.013000	0.066400
---------- epsilon: 0.2 ----------
0.177333	0.018600	0.183600
0.160933	0.016700	0.162300
0.151000	0.015600	0.146600
0.144950	0.015200	0.143700
0.133567	0.015500	0.133300
0.127717	0.015000	0.126300
0.129183	0.014700	0.123300
0.125733	0.014300	0.127700
0.126450	0.014700	0.129700
0.127067	0.014900	0.128100
0.125383	0.014700	0.120500
0.122250	0.014600	0.120300
0.121433	0.014600	0.123600
0.123583	0.014600	0.120800
0.124883	0.014600	0.126700
---------- epsilon: 0.3 ----------
0.239117	0.018600	0.244900
0.207983	0.019300	0.219700
0.188917	0.020800	0.207400
0.16

So we have trained the models adversarially and then performed attacks on them. For training and evaluating them (after traininig), we use the same value of episilon. So for example, the adversarially trained model on epsilon: 0.5, we get accuracy of 96.3%; for epsilon=0.4 we get 91.5 % but again this number can be further improved by better training strategies.
These results are consistent with the paper
