In [None]:
%matplotlib inline
from __future__ import print_function
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import numpy as np
import matplotlib.pyplot as plt
import os
# Pick the compute device once: the GPU when PyTorch can see one, otherwise the CPU.
cuda = torch.cuda.is_available()
if cuda:
    device = 'cuda'
else:
    device = 'cpu'
# Trailing expression so the notebook displays the chosen device.
device

Here we use `%%capture` to suppress output that could disclose author information. If you run this cell, you can delete the `%%capture` line to see whether the cloning worked.

In [None]:
if not os.path.exists("./Representation_Learning"):
    !git clone https://github.com/Sibylse/Representation_Learning.git
%cd Representation_Learning

In [None]:
# Update the checkout in the current working directory to the latest remote state.
!git pull

In [None]:
from models import *
from layers import *
from loss import *
from utils import *
from optimization import Optimizer

In [None]:
# Digit labels indexed by class id, and the number of classes.
classes = tuple(str(digit) for digit in range(10))
c = len(classes)

# Data
print('==> Preparing data..')
trans = transforms.ToTensor()

# Batch size of the evaluation loader; the attack cells also divide by it to
# report success rates in percent.
b = 1000

# MNIST test split, served in its stored order (no shuffling).
testset = datasets.MNIST(root='./data', train=False, download=True, transform=trans)
testloader = torch.utils.data.DataLoader(testset, batch_size=b, shuffle=False, num_workers=2)

In [None]:
# Embedding dimension of competitors.
d = 64
# Experiment tag built from the embedding dimension.
name = "MNISTd{:d}".format(d)

In [None]:
# List the files in checkpoint/ (relative to the current working directory).
!ls checkpoint/

In [None]:
# Linear classification head mapping the d-dimensional embedding to c class scores.
classifier = nn.Linear(d, c, bias=True)
net = LeNet(embedding_dim=d, classifier=classifier)
criterion = CE_Loss(c, device)
# No checkpoint is restored here, so `net` keeps its freshly initialised weights.
# Uncomment to load the saved model instead:
#net = load_net("MNISTd64LeNetLinear.t7",net)

# The evaluation batch is moved to `device` later in the notebook; keep the model on
# the same device so net(...) / net.conf(...) do not fail with a device mismatch on
# CUDA machines. NOTE(review): assumes LeNet is an nn.Module — confirm.
net.to(device)

## Utility functions

In [None]:
# Plot several examples of successful adversarial samples
def plot_adv(net, perturbed_img, min_conf=0.1):
    """Show up to ten successful adversarial examples and their perturbations.

    An attack on a sample counts as successful when the clean input was
    classified correctly, the perturbed input is misclassified, and the
    confidence of the wrong prediction exceeds ``min_conf``.

    The top row shows the perturbed images, titled with the predicted class and
    its confidence; the bottom row shows the corresponding perturbation
    (perturbed image minus clean input, multiplied by 50).

    Reads the notebook globals ``inputs``, ``targets`` (the clean batch that
    ``perturbed_img`` was derived from) and ``classes`` (label names).

    Args:
        net: model exposing ``eval()`` and ``conf(x)``; ``conf`` must return a
            2-D tensor that is reduced with ``max(1)``.
        perturbed_img: perturbed batch aligned sample-by-sample with ``inputs``.
        min_conf: minimum confidence of the wrong prediction for a sample to be shown.

    Returns:
        None. Nothing is drawn when no sample qualifies.
    """
    net.eval()
    conf_pert, pred_pert = net.conf(perturbed_img).max(1)
    conf, pred = net.conf(inputs).max(1)
    attack_success = (targets != pred_pert) & (targets == pred) & (conf_pert > min_conf)
    examples = perturbed_img[attack_success]

    # Bail out before creating a figure so that no empty figure is left behind.
    if examples.shape[0] == 0:
        return

    perturbations = (perturbed_img - inputs)[attack_success]
    shown_pred = pred_pert[attack_success]
    shown_conf = conf_pert[attack_success]
    n_cols = min(examples.shape[0], 10)

    fig = plt.figure(figsize=(18, 5), constrained_layout=True)
    # squeeze=False keeps `axs` two-dimensional even when a single example
    # qualifies; otherwise a 1-D array comes back and axs[row, col] fails.
    axs = fig.subplots(nrows=2, ncols=n_cols, squeeze=False)
    for j in range(n_cols):
        # Top row: the adversarial image with predicted class and confidence.
        ax = axs[0, j]
        ax.set_title("{} ({:.2f})".format(classes[shown_pred[j].item()], shown_conf[j].item()), fontsize=20)
        # .cpu() is required before .numpy() when the batch lives on the GPU.
        ax.imshow(examples[j].squeeze().detach().cpu().numpy(), cmap="gray")
        ax.set_xticks([])
        ax.set_yticks([])

        # Bottom row: the perturbation itself, scaled by the original factor of 50.
        ax = axs[1, j]
        ax.imshow((perturbations[j] * 50).squeeze().detach().cpu().numpy(), cmap="gray")
        ax.set_xticks([])
        ax.set_yticks([])
    plt.show()

In [None]:
# Get data to plot attack rate for increasing epsilon
def get_attacks(attack_method, net, epsilons):
    """Compute the attack success rate (in percent) for each epsilon.

    For every ``eps`` an attack is built with ``attack_method(net, eps=eps)``
    and applied to the notebook globals ``inputs`` / ``targets``. A sample
    counts as successfully attacked when the clean input was classified
    correctly, the perturbed input is misclassified, and the confidence of the
    wrong prediction exceeds 0.1.

    Args:
        attack_method: callable ``(net, eps=...)`` returning an object with a
            ``perturb(inputs, targets)`` method.
        net: model exposing ``eval()`` and ``conf(x)``.
        epsilons: iterable of perturbation budgets to evaluate.

    Returns:
        list of float: success rate in percent of the evaluated batch, one
        entry per epsilon, in the order of ``epsilons``.
    """
    net.eval()
    # The clean predictions do not depend on eps, so compute them once.
    conf, pred = net.conf(inputs).max(1)
    correct_clean = targets == pred
    # Normalise by the size of the batch that is actually attacked rather than
    # by the global loader batch size, which may differ from it.
    n_samples = targets.shape[0]

    attack_succ = []
    for eps in epsilons:
        attack = attack_method(net, eps=eps)
        perturbed = attack.perturb(inputs, targets)
        conf_pert, pred_pert = net.conf(perturbed).max(1)
        attack_success = (targets != pred_pert) & correct_clean & (conf_pert > 0.1)
        attack_succ.append(attack_success.sum().item() / n_samples * 100)
    return attack_succ

## Check for test accuracy

In [None]:
# Take the first batch of the test loader and move both tensors to the selected device;
# this fixed batch is what the attack and plotting cells operate on.
inputs, targets = (tensor.to(device) for tensor in next(iter(testloader)))

In [None]:
# Evaluate the network on the test loader via the project's Optimizer helper.
# NOTE(review): the first constructor argument is passed as None here; presumably
# it is the training loader, which this evaluation does not need — confirm.
optimizer = Optimizer(None, testloader, device)
acc, conf = optimizer.test_acc(net, criterion, testloader)

# Attack

In [None]:
!pip install --upgrade git+https://github.com/BorealisAI/advertorch.git

## Carlini & Wagner Attack

In [None]:
from advertorch.attacks import CarliniWagnerL2Attack

# Unlike the other attack cells, C&W is driven by net.conf rather than net itself;
# c is passed as the second positional argument.
net.eval()
attack = CarliniWagnerL2Attack(net.conf, c, confidence=0.1, max_iterations=100)
perturbed_images = attack.perturb(inputs, targets)

conf_pert, pred_pert = net.conf(perturbed_images).max(1)
conf, pred = net.conf(inputs).max(1)

# Successful attack: the clean sample was classified correctly and the perturbed one is not.
attack_success = (targets == pred) & (targets != pred_pert)

# Mean L2 norm of the perturbation over the successfully attacked samples.
l2Dist = (perturbed_images - inputs)[attack_success].pow(2).sum(dim=[1, 2, 3]).sqrt().mean().item()

print('successful attacks: {0:3.2f}% conf {1:3.4f}  dist {2:3.2f}'.format(
    attack_success.sum().item()/b*100,
    conf_pert[attack_success].mean().item(),
    l2Dist,
))

The resulting perturbed images:

In [None]:
# Show up to ten successful C&W examples whose wrong prediction has confidence above 0.6.
plot_adv(net, perturbed_images, min_conf=0.6)

# FGSM attack

In [None]:
from advertorch.attacks import GradientSignAttack

# Single-step gradient-sign attack with a budget of eps=0.1.
net.eval()
attack = GradientSignAttack(net, eps=0.1)
perturbed_images = attack.perturb(inputs, targets)

conf_pert, pred_pert = net.conf(perturbed_images).max(1)
conf, pred = net.conf(inputs).max(1)

# Successful attack: correct on the clean input, wrong on the perturbed input,
# and the wrong prediction has confidence above 0.1.
attack_success = (targets == pred) & (targets != pred_pert) & (conf_pert > 0.1)

print('{0:>50} classifier, successful attacks: {1:3.2f}% conf {2:3.4f}'.format(
    type(net.classifier).__name__,
    attack_success.sum().item()/b*100,
    np.round(conf_pert[attack_success].mean().item(), 2),
))

In [None]:
# Show up to ten successful FGSM examples whose wrong prediction has confidence above 0.6.
plot_adv(net, perturbed_images, min_conf=0.6)

In [None]:
# Sweep the FGSM budget over ten evenly spaced values in [0, 0.5] and plot the success rate.
epsilons = np.linspace(0, 0.5, 10)
attack_succ = get_attacks(GradientSignAttack, net, epsilons)

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(epsilons, attack_succ, "*-", label="DSR")
ax.set_title("Attack success (%) vs Epsilon")
ax.set_xlabel("Epsilon")
ax.set_ylabel("Attack success (%)")
# Place the legend outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

# Linf PGD Attack

In [None]:
from advertorch.attacks import LinfPGDAttack

# Iterative L-infinity PGD: 40 steps of size 0.01 within a budget of eps=0.1.
net.eval()
attack = LinfPGDAttack(net, eps=0.1, nb_iter=40, eps_iter=0.01)
perturbed_images = attack.perturb(inputs, targets)

conf_pert, pred_pert = net.conf(perturbed_images).max(1)
conf, pred = net.conf(inputs).max(1)

# Successful attack: correct on the clean input, wrong on the perturbed input,
# and the wrong prediction has confidence above 0.1.
attack_success = (targets == pred) & (targets != pred_pert) & (conf_pert > 0.1)

print('{0:>50} classifier, successful attacks: {1:3.2f}% conf {2:3.4f}'.format(
    type(net.classifier).__name__,
    attack_success.sum().item()/b*100,
    np.round(conf_pert[attack_success].mean().item(), 2),
))

In [None]:
# Show up to ten successful Linf PGD examples whose wrong prediction has confidence above 0.6.
plot_adv(net, perturbed_images, min_conf=0.6)

In [None]:
# Sweep the Linf PGD budget over ten evenly spaced values in [0, 0.5]; each attack
# uses 60 iterations with step size 0.01.
epsilons = np.linspace(0, 0.5, 10)
attack_succ = get_attacks(
    lambda net, eps: LinfPGDAttack(net, eps=eps, nb_iter=60, eps_iter=0.01),
    net,
    epsilons,
)

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(epsilons, attack_succ, "*-", label="DSR")
ax.set_title("Attack success (%) vs Epsilon")
ax.set_xlabel("Epsilon")
ax.set_ylabel("Attack success (%)")
# Place the legend outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

# L2 PGD Attack

In [None]:
from advertorch.attacks import L2PGDAttack

# Iterative L2 PGD: 40 steps of size 0.01 within a budget of eps=0.1.
# Put the model in evaluation mode first, as the other attack cells do, so this
# cell does not depend on a previous cell having done so.
net.eval()
attack = L2PGDAttack(net, eps=0.1, nb_iter=40, eps_iter=0.01)
perturbed_images = attack.perturb(inputs, targets)

conf_pert, pred_pert = net.conf(perturbed_images).max(1)
conf, pred = net.conf(inputs).max(1)

# Successful attack: correct on the clean input, wrong on the perturbed input,
# and the wrong prediction has confidence above 0.1.
attack_success = (targets != pred_pert) & (targets == pred) & (conf_pert > 0.1)

print('{0:>50} classifier, successful attacks: {1:3.2f}% conf {2:3.4f}'.format(
    net.classifier.__class__.__name__,
    attack_success.sum().item()/b*100,
    np.round(conf_pert[attack_success].mean().item(), 2),
))

In [None]:
# Show up to ten successful L2 PGD examples whose wrong prediction has confidence above 0.3.
plot_adv(net, perturbed_images, min_conf=0.3)

In [None]:
# Sweep the L2 PGD budget over ten evenly spaced values in [0, 0.5]; each attack
# uses 60 iterations with step size 0.01.
epsilons = np.linspace(0, 0.5, 10)
attack_succ = get_attacks(
    lambda net, eps: L2PGDAttack(net, eps=eps, nb_iter=60, eps_iter=0.01),
    net,
    epsilons,
)

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(epsilons, attack_succ, "*-", label="DSR")
ax.set_title("Attack success (%) vs Epsilon")
ax.set_xlabel("Epsilon")
ax.set_ylabel("Attack success (%)")
# Place the legend outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()