In [1]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

cuda = torch.cuda.is_available()


## From adversarial examples to training robust models

In the previous notebooks, we focused on methods for solving the maximization problem over perturbations; that is, to finding the solution to the problem
\begin{equation}
\DeclareMathOperator*{\maximize}{maximize}
\maximize_{\|\delta\| \leq \epsilon} \ell(h_\theta(x + \delta), y).
\end{equation}

In this notebook, we will focus on training a robust classifier. More precisly, we aim at solving following minimization problem, namely Adversarial Training:
\begin{equation}
\DeclareMathOperator*{\minimize}{minimize}
\minimize_\theta \frac{1}{|S|} \sum_{x,y \in S} \max_{\|\delta\| \leq \epsilon} \ell(h_\theta(x + \delta), y).
\end{equation}
The order of the min-max operations is important here.  Specially, the max is inside the minimization, meaning that the adversary (trying to maximize the loss) gets to "move" _second_.  We assume, essentially, that the adversary has full knowledge of the classifier parameters $\theta$, and that they get to specialize their attack to whatever parameters we have chosen in the outer minimization. The goal of the robust optimization formulation, therefore, is to ensure that the model cannot be attacked _even if_ the adversary has full knowledge of the model.  Of course, in practice we may want to make assumptions about the power of the adversary but it can be difficult to pin down a precise definition of what we mean by the "power" of the adversary, so extra care should be taken in evaluating models against possible "realistic" adversaries.

## Exercice 1
1. Train a robust classifier using Adversarial Training with a specific norm
2. Evaluate your classifier on natural and adversarial examples crafted with the norm of the training and other norms
3. Make an analysis and conclude

Exercice 1

1. Entraîner un classifieur robuste en utilisant l'entraînement adversarial avec une norme spécifique.

2. Evaluer votre classifieur sur des exemples naturels et adversariaux élaborés avec la norme de l'entraînement et d'autres normes

3. Faites une analyse et concluez

In [14]:
# load CIFAR10 dataset
def load_cifar(split, batch_size):
  train = True if split == 'train' else False
  dataset = datasets.CIFAR10("./docs", train=split, download=True, transform=transforms.ToTensor())
  return DataLoader(dataset, batch_size=batch_size, shuffle=train)

batch_size = 100
train_loader = load_cifar('train', batch_size)
test_loader = load_cifar('test', batch_size)

Files already downloaded and verified
Files already downloaded and verified


In [15]:
class ConvModel(torch.nn.Module):
  
  def __init__(self):
    super(ConvModel, self).__init__()
    self.conv1 = nn.Conv2d(3, 6, kernel_size=5, padding=2)
    self.relu = nn.ReLU()
    self.pool = nn.MaxPool2d(2)
    self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
    self.lin1 = nn.Linear(576, 256)
    self.lin2 = nn.Linear(256, 128)
    self.lin3 = nn.Linear(128, 10)
  
  def forward(self, x):
    x = self.conv1(x)
    x = self.relu(x)
    x = self.pool(x)
    x = self.conv2(x)
    x = self.relu(x)
    x = self.pool(x)
    x = x.view(-1, 576)
    x = self.lin1(x)
    x = self.relu(x)
    x = self.lin2(x)
    x = self.relu(x)
    x = self.lin3(x)
    return x

In [16]:
class ProjectedGradientDescent:
  
  def __init__(self, model, eps, alpha, num_iter):
    self.model = model
    self.eps = eps
    self.alpha = alpha
    self.num_iter = num_iter
  
  def compute(self, x, y):
    """ Construct PGD adversarial pertubration on the examples x."""  
    delta = torch.zeros_like(x, requires_grad=True)    
    for t in range(self.num_iter):
        loss = nn.CrossEntropyLoss()(model(x + delta), y)
        loss.backward()
        delta.data = (delta + self.alpha*delta.grad.detach().sign()).clamp(-self.eps,self.eps)
        delta.grad.zero_()
    return delta.detach()

In [17]:
def adversarial_train_model(model, criterion, optimizer, loader, attack):
  """Function to train the model"""
  total_loss, total_err = 0.,0.
  for X,y in loader:
       X,y = X.to('cuda'), y.to('cuda')
       delta = attack.compute(X, y)
       yp = model(X+delta)
       loss = nn.CrossEntropyLoss()(yp,y)
       if opt:
           opt.zero_grad()
           loss.backward()
           opt.step()
       
       total_err += (yp.max(dim=1)[1] != y).sum().item()
       total_loss += loss.item() * X.shape[0]
  return total_err / len(loader.dataset), total_loss / len(loader.dataset)
    
# adverserial training with PGD
model = ConvModel()
if cuda:
  model = model.cuda()

# define your loss
criterion = nn.CrossEntropyLoss()

# define the optimizer
opt = torch.optim.SGD(model.parameters(), lr=0.02)

# define the attack
attack = ProjectedGradientDescent(model, 0.1, 0.01, 20)

adversarial_train_model(model, criterion, opt, train_loader, attack)

(0.90362, 2.3134284019470215)

In [None]:
def eval_model(model, loader, attack=None):
  """Function to evaluate your model on a specific loader"""
  accuracy = 0.
  n_inputs = 0.
  for n_batch, (imgs, labels) in enumerate(loader):
      if cuda:
        imgs, labels = imgs.cuda(), labels.cuda()
      if attack==None:
        outputs = model(imgs)
      else:
        outputs = model(imgs + attack.compute(imgs, labels))
      predicted = outputs.argmax(axis=1)
      n_inputs += outputs.size(0)
      accuracy += (predicted == labels).sum()
  accuracy = accuracy/n_inputs
  print("Accuracy: ", accuracy)

attack = ProjectedGradientDescent(model, 0.1, 0.01, 20)
eval_model(model, test_loader)
eval_model(model, test_loader, attack)

Accuracy:  tensor(0.0998, device='cuda:0')
Accuracy:  tensor(0.0975, device='cuda:0')
