<a href="https://colab.research.google.com/github/MariaBulychev/Thesis_Sparse_Adversarial_Attacks/blob/main/SparseAdversarialAttacks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import torchvision
import torch
import random

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter # TensorBoard support
import torchvision.datasets as datasets
import torch.utils.data as data
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable, grad
import numpy as np
import argparse
import copy
plt.figure(figsize = (3,3)) #define the image size
import scipy.io
import time

!pip install wandb
import wandb

import math

# Wandb

In [None]:
# Start the Weights & Biases run for this notebook under the given project
# and entity; the commented-out `config` argument is currently not passed.
wandb.init(
    #config  = defaults,     
    project = 'SparseAdv',   
    entity  = 'MariaBulychev',       
)

In [6]:
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# notebook also runs on CPU-only runtimes (kept as a plain string, as before).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Models


In [None]:
#@title FashionCNN { form-width: "200px" }
# Colab form cell: configuration flags and the Fashion-MNIST test loader used
# by the FashionCNN code below.
#@markdown Do not set shuffle if want to reproduce the results
shuffle = False #@param {type: "boolean"}
#@markdown Batch size
b_size =  32#@param {type: "integer"}
# Test split only (train=False); ToTensor is the only transform applied.
test_set = torchvision.datasets.FashionMNIST("./data", download=True, train=False, transform=
                                               transforms.Compose([transforms.ToTensor()])) 
# drop_last=True discards an incomplete final batch, so every batch holds
# exactly b_size samples (the code below reshapes with a fixed b_size).
test_loader = torch.utils.data.DataLoader(test_set,
                                          batch_size=b_size, shuffle=shuffle, drop_last=True)
# pretrained_model selects between training from scratch and loading a stored
# model; save_model / model_label are read when uploading a trained model.
pretrained_model = True #@param {type: "boolean"}
save_model = False #@param {type: "boolean"}
model_label = "v1" #@param {type: "string"}
# Build a CNN model
class FashionCNN(nn.Module):
    """Small CNN classifier for single-channel 28x28 images with 10 outputs.

    Two conv / batch-norm / ReLU / max-pool stages are followed by three
    linear layers. ``fc1`` expects 64*6*6 features, which is what the two
    stages produce for a 28x28 input.
    """

    def __init__(self):
        super(FashionCNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.fc1 = nn.Linear(in_features=64*6*6, out_features=600)
        # The dropout acts on the flattened (N, 600) activation. Dropout2d on a
        # 2-D input is deprecated by PyTorch, whose warning recommends plain
        # Dropout to retain the same behaviour.
        self.drop = nn.Dropout(0.25)
        self.fc2 = nn.Linear(in_features=600, out_features=120)
        self.fc3 = nn.Linear(in_features=120, out_features=10)

    def forward(self, x):
        """Return the class scores for a batch ``x``.

        ``x`` is either channel-first (N, 1, H, W) or channel-last
        (N, H, W, 1); the latter is detected by a trailing dimension of 1.
        """
        if x.dim() == 4 and x.shape[3] == 1:
            # (N, H, W, C) -> (N, C, H, W). The previous permute(0, 3, 2, 1)
            # produced (N, C, W, H), i.e. it also transposed the image.
            x = x.permute(0, 3, 1, 2)
        out = self.layer1(x)
        out = self.layer2(out)
        # reshape (rather than view) also works on non-contiguous activations
        out = out.reshape(out.size(0), -1)
        out = self.fc1(out)
        out = self.drop(out)
        out = self.fc2(out)
        out = self.fc3(out)
        return out
# Either train a FashionCNN from scratch (pretrained_model == False) or fetch
# a stored model from a Weights & Biases artifact.
if not pretrained_model:
    # Training split (train defaults to True for this dataset constructor call).
    train_set = torchvision.datasets.FashionMNIST("./data", download=True, transform=
                                                    transforms.Compose([transforms.ToTensor()]))
    train_loader = torch.utils.data.DataLoader(train_set, 
                                              batch_size=b_size, drop_last=True)
    # Make a model of CNN class
    model = FashionCNN()
    model.to(device)
    error = nn.CrossEntropyLoss()
    learning_rate = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    #print(model)
    num_epochs = 5
    # count is the global number of optimizer steps taken so far
    count = 0
    # Lists for visualization of loss and accuracy 
    loss_list = []
    iteration_list = []
    accuracy_list = []
    # Lists for knowing class-wise accuracy
    predictions_list = []
    labels_list = []
    for epoch in range(num_epochs):
        for images, labels in train_loader:
            # Transfering images and labels to GPU if available
            images, labels = images.to(device), labels.to(device)
            # fixed b_size is safe here because the loader uses drop_last=True
            train = Variable(images.view(b_size, 1, 28, 28))
            labels = Variable(labels)
            # Forward pass 
            outputs = model(train)
            loss = error(outputs, labels)
            # Initializing a gradient as 0 so there is no mixing of gradient among the batches
            optimizer.zero_grad()
            # Propagating the error backward
            loss.backward()
            # Optimizing the parameters
            optimizer.step()
            count += 1
        # Testing the model
            # NOTE(review): this periodic test pass runs without model.eval()
            # or torch.no_grad(), so batch-norm and dropout stay in training
            # mode while test batches are fed through - confirm this is intended.
            if not (count % 50):    # It's same as "if count % 50 == 0"
                total = 0
                correct = 0
                for images, labels in test_loader:
                    images, labels = images.to(device), labels.to(device)
                    # labels_list / predictions_list keep growing on every test pass
                    labels_list.append(labels)
                    test = Variable(images.view(b_size, 1, 28, 28))
                    outputs = model(test)
                    predictions = torch.max(outputs, 1)[1].to(device)
                    predictions_list.append(predictions)
                    correct += (predictions == labels).sum()
                    total += len(labels)
                accuracy = correct * 100 / total
                loss_list.append(loss.data)
                iteration_list.append(count)
                accuracy_list.append(accuracy)
            # every multiple of 500 is also a multiple of 50, so `accuracy`
            # has just been refreshed when this prints
            if not (count % 500):
                print("Iteration: {}, Loss: {}, Accuracy: {}%".format(count, loss.data, accuracy))
    if save_model:
      # The whole module is pickled (not just a state_dict) and uploaded as an artifact.
      torch.save(model, "model")
      run = wandb.init(job_type="model-creation")
      artifact = wandb.Artifact('pretrained-model'+model_label, type='model')
      artifact.add_file("model")
      run.log_artifact(artifact)
else:
    run = wandb.init(job_type="model-training")
    # NOTE(review): the save branch names the artifact 'pretrained-model'+model_label,
    # while this loads 'pretrained-model' - confirm the intended artifact name.
    artifact = run.use_artifact('pretrained-model:latest')
    artifact_dir = artifact.download()
    print(artifact_dir)
    # IF YOU ARE USING GPU THEN YOU NEED TO DISABLE THIS
    #CUDA_LAUNCH_BLOCKING=1
    # map_location remaps the stored tensors to the CPU when no GPU is available
    if torch.cuda.is_available():
      model = torch.load(artifact_dir+"/model")
    else:
      model = torch.load(artifact_dir+"/model", map_location=torch.device('cpu'))

# Final accuracy of the selected model on the test loader.
# eval() switches batch-norm to its running statistics and disables dropout
# (as the VGG test code further below does); no_grad() skips building autograd
# graphs that this loop never uses.
model.eval()
total = 0
correct = 0
labels_list = []
predictions_list = []
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        labels_list.append(labels)
        # -1 instead of a fixed b_size so an incomplete batch would also work
        test = images.view(-1, 1, 28, 28)
        outputs = model(test)
        predictions = torch.max(outputs, 1)[1].to(device)
        predictions_list.append(predictions)
        correct += (predictions == labels).sum()
        total += len(labels)
# accuracy in percent (a tensor, because `correct` accumulates tensors)
accuracy = correct * 100 / total
print(accuracy)


In [None]:
#@title LeNet { form-width: "200px" }

class LeNet(nn.Module):
  """LeNet-style classifier for single-channel inputs with 10 outputs.

  Three 5x5 convolutions (tanh activations, 2x2 average pooling after the
  first two) feed two linear layers. ``linear1`` expects 120 features, which
  is what the convolutional part yields for a 32x32 input.
  """

  def __init__(self):
    super().__init__()

    # Feature extractor: 1 -> 6 -> 16 -> 120 channels, no padding.
    self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)
    self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
    self.conv3 = nn.Conv2d(16, 120, kernel_size=5, stride=1, padding=0)
    # Classifier head.
    self.linear1 = nn.Linear(120, 84)
    self.linear2 = nn.Linear(84, 10)
    # Shared, parameter-free layers.
    self.tanh = nn.Tanh()
    self.avgpool = nn.AvgPool2d(kernel_size=2, stride=2)

  def forward(self, x):
    """Return the class scores for the batch ``x``."""
    feats = self.avgpool(self.tanh(self.conv1(x)))
    feats = self.avgpool(self.tanh(self.conv2(feats)))
    feats = self.tanh(self.conv3(feats))

    # flatten everything but the batch dimension
    flat = feats.reshape(feats.shape[0], -1)
    hidden = self.tanh(self.linear1(flat))
    return self.linear2(hidden)


model = LeNet().to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

pretrained_model = True #@param {type: "boolean"}
save_model = False #@param {type: "boolean"}
model_label = "v2" #@param {type: "string"}

# MNIST test data, resized to 32x32. It is created before the train/load
# branch because the training branch validates on it; previously that branch
# referred to loaders that were never defined.
transform = transforms.Compose([
          transforms.Resize((32, 32)),
          transforms.ToTensor()
          ])

test_set = datasets.MNIST('DATA_MNIST/', download=True, train=False, transform=transform)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=b_size, shuffle=True)


if not pretrained_model:
  # The training split is only needed when training from scratch.
  mnist_train_set = datasets.MNIST('DATA_MNIST/', download=True, train=True, transform=transform)
  trainloader = torch.utils.data.DataLoader(mnist_train_set, batch_size=b_size, shuffle=True)

  epochs = 20
  train_loss, val_loss = [], []

  for epoch in range(epochs):

      total_train_loss = 0
      total_val_loss = 0

      model.train()

      # training our model
      for idx, (image, label) in enumerate(trainloader):

          image, label = image.to(device), label.to(device)
          optimizer.zero_grad()
          pred = model(image)

          loss = criterion(pred, label)
          total_train_loss += loss.item()

          loss.backward()
          optimizer.step()

      # mean loss over the batches of this epoch
      total_train_loss = total_train_loss / (idx + 1)
      train_loss.append(total_train_loss)

      # validating our model (no gradients are needed here)
      model.eval()
      total = 0
      with torch.no_grad():
          for idx, (image, label) in enumerate(test_loader):
              image, label = image.to(device), label.to(device)
              pred = model(image)
              loss = criterion(pred, label)
              total_val_loss += loss.item()

              # softmax is monotonic, so the arg-max of the raw scores is the
              # predicted class; count the hits of the whole batch at once
              total += (pred.argmax(dim=1) == label).sum().item()

      accuracy = total / len(test_set)

      total_val_loss = total_val_loss / (idx + 1)
      val_loss.append(total_val_loss)

      if epoch % 5 == 0:
        print('\nEpoch: {}/{}, Train Loss: {:.4f}, Val Loss: {:.4f}, Val Acc: {:.4f}'.format(epoch, epochs, total_train_loss, total_val_loss, accuracy))

  if save_model:
    # The whole module is pickled (not just a state_dict) and uploaded as an artifact.
    torch.save(model, "model")
    run = wandb.init(project="SparseAdv", job_type="model-creation")
    artifact = wandb.Artifact('LeNet2-model'+model_label, type='model')
    artifact.add_file("model")
    run.log_artifact(artifact)

else:
    run = wandb.init(project="SparseAdv", job_type="model-training")
    artifact = run.use_artifact('LeNet2-modelv2:latest')
    artifact_dir = artifact.download()
    print(artifact_dir)
    # IF YOU ARE USING GPU THEN YOU NEED TO DISABLE THIS
    #CUDA_LAUNCH_BLOCKING=1
    # map_location remaps the stored tensors to the CPU when no GPU is available
    if torch.cuda.is_available():
      model = torch.load(artifact_dir+"/model")
    else:
      model = torch.load(artifact_dir+"/model", map_location=torch.device('cpu'))


# validating our model
# eval() + no_grad(): evaluate in inference mode without building autograd graphs
model.eval()
total = 0
correct = 0
labels_list = []
predictions_list = []

with torch.no_grad():
    for images, labels in test_loader: #test_loader
        images, labels = images.to(device), labels.to(device)
        labels_list.append(labels)
        outputs = model(images)
        predictions = torch.max(outputs, 1)[1].to(device)
        predictions_list.append(predictions)
        correct += (predictions == labels).sum()
        total += len(labels)
# accuracy in percent (a tensor, because `correct` accumulates tensors)
accuracy = correct * 100 / total
print(accuracy)


In [None]:
#@title VGG { form-width: "200px" }

# Layer specifications: an integer is the output width of a 3x3 convolution
# (followed by batch-norm and ReLU), 'M' is a 2x2 max-pooling step.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
    """VGG-style classifier assembled from one of the specifications in ``cfg``.

    Args:
        vgg_name: key of ``cfg`` selecting the layer specification.
        in_channels: number of channels of the input images (default 3).
        num_classes: number of output scores (default 10).

    The classifier is a single linear layer with 512 inputs, so the feature
    map has to flatten to 512 values; with the five 2x2 poolings of every
    specification above this holds for 32x32 inputs.
    """

    def __init__(self, vgg_name, in_channels=3, num_classes=10):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg[vgg_name], in_channels)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg, in_channels=3):
        """Translate a layer specification into an ``nn.Sequential``."""
        layers = []
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
                           nn.BatchNorm2d(x),
                           nn.ReLU(inplace=True)]
                # the next convolution consumes what this one produced
                in_channels = x
        # kernel 1 / stride 1 pooling leaves the values unchanged; it is kept
        # so the layer indices inside `features` stay as they were
        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
        return nn.Sequential(*layers)

# Configuration flags, hyper-parameters and data loaders for the VGG cell.
shuffle = False #@param {type: "boolean"}
#@markdown Batch size
b_size =  100#@param {type: "integer"}
# NOTE(review): this Fashion-MNIST test_loader is rebound to the CIFAR-10
# loader further down in this cell; test_set itself is not rebound here.
test_set = torchvision.datasets.FashionMNIST("./data", download=True, train=False, transform=
                                               transforms.Compose([transforms.ToTensor()])) 
test_loader = torch.utils.data.DataLoader(test_set,
                                          batch_size=b_size, shuffle=shuffle, drop_last=True)
pretrained_model = True #@param {type: "boolean"}
save_model = False #@param {type: "boolean"}
model_label = "v0" #@param {type: "string"}


# Device configuration
# (from here on `device` is a torch.device and falls back to the CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
num_epochs = 80
learning_rate = 0.01




# Image preprocessing modules
# NOTE(review): `transform` is not passed to any dataset created in this cell;
# the training set below uses `transform_train`.
transform = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor()])

#######
# Training-time augmentation: random 32x32 crop after padding by 4, random
# horizontal flip, then conversion to a tensor.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

########

# CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
                                             train=True, 
                                             transform=transform_train,
                                             download=True)

# No download flag here: the files are expected under the same root that the
# training set above downloads to.
test_dataset = torchvision.datasets.CIFAR10(root='../../data/',
                                            train=False, 
                                            transform=transforms.ToTensor())

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=100, 
                                           shuffle=True)

# Rebinds test_loader: from here on it yields CIFAR-10 test batches.
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=100, 
                                          shuffle=False)

# 3x3 convolution
def conv3x3(in_channels, out_channels, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_channels, out_channels, **conv_kwargs)

# Residual block
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by a skip connection.

    ``downsample`` is an optional module applied to the block input before it
    is added to the convolutional branch.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        # only the first convolution may change the resolution (stride)
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(branch(x) + shortcut(x))``."""
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))
        # the input itself is the shortcut unless a downsample module is set
        shortcut = self.downsample(x) if self.downsample else x
        branch += shortcut
        return self.relu(branch)

# ResNet
class ResNet(nn.Module):
    """Residual network with a 3-channel stem and three stages (16, 32, 64 channels).

    ``block`` is the residual block class, ``layers`` gives the number of
    blocks per stage and ``num_classes`` the size of the final linear layer.
    """

    def __init__(self, block, layers, num_classes=10):
        super().__init__()
        # running channel count, updated by make_layer as stages are added
        self.in_channels = 16
        self.conv = conv3x3(3, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, 16, layers[0])
        self.layer2 = self.make_layer(block, 32, layers[1], 2)
        self.layer3 = self.make_layer(block, 64, layers[2], 2)
        self.avg_pool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``."""
        # The first block needs a projection on the shortcut whenever it
        # changes the resolution or the channel count.
        needs_projection = (stride != 1) or (self.in_channels != out_channels)
        downsample = None
        if needs_projection:
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))
        stage = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels
        # the remaining blocks keep both resolution and width
        for _ in range(1, blocks):
            stage.append(block(out_channels, out_channels))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        out = self.relu(self.bn(self.conv(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        out = self.avg_pool(out)
        out = out.view(out.size(0), -1)
        return self.fc(out)


# Instantiate the VGG19 variant on the selected device.
model = VGG('VGG19').to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                      momentum=0.9, weight_decay=5e-4)
# NOTE(review): no scheduler.step() call is visible in this part of the file;
# the training loop below adjusts the learning rate via update_lr instead -
# confirm whether this scheduler is still needed.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

# For updating learning rate
def update_lr(optimizer, lr):
    """Set the learning rate of every parameter group of ``optimizer`` to ``lr``."""
    for group in optimizer.param_groups:
        group.update(lr=lr)
# Either train the model created above or replace it by a stored one that is
# downloaded from a Weights & Biases artifact; afterwards report test accuracy.
if not pretrained_model:
    # Train the model
    total_step = len(train_loader)
    curr_lr = learning_rate
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # progress report every 100 steps
            if (i+1) % 100 == 0:
                print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
                      .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

        # Decay learning rate
        # (divided by 3 after every 20th epoch)
        if (epoch+1) % 20 == 0:
            curr_lr /= 3
            update_lr(optimizer, curr_lr)

    if save_model:
        # The whole module is pickled (not just a state_dict) and uploaded as an artifact.
        torch.save(model, "model")
        run = wandb.init(job_type="model-creation")
        artifact = wandb.Artifact('VGG_new-model2'+model_label, type='model')
        artifact.add_file("model")
        run.log_artifact(artifact)

else:
    run = wandb.init(job_type="model-training")
    artifact = run.use_artifact('VGG_new-model2v0:latest')
    artifact_dir = artifact.download()
    print(artifact_dir)
    # IF YOU ARE USING GPU THEN YOU NEED TO DISABLE THIS
    #CUDA_LAUNCH_BLOCKING=1
    # map_location remaps the stored tensors to the CPU when no GPU is available
    if torch.cuda.is_available():
      model = torch.load(artifact_dir+"/model")
    else:
      model = torch.load(artifact_dir+"/model", map_location=torch.device('cpu'))

# Test the model
# eval() puts batch-norm into inference mode; no_grad() disables autograd here
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # index of the largest score per sample is the predicted class
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))

# ------ Sparse Adversarial Attacks ------

# SparseFool
Code from the SparseFool (SF) paper.

In [10]:
#@title DF subroutine { form-width: "250px" }

def deepfool(
    im,                   # image
    net,                  # target network
    lambda_fac=3.,        # control parameter lambda
    num_classes=10,       # number of classes
    overshoot=0.02,       # "eta" in thesis
    max_iter=50,          # maximum number of iterations
    device='cuda'
):
    '''
    DeepFool subroutine used by SparseFool to linearise the decision boundary.

    Starting from `im`, the image is moved step by step towards the closest
    linearised class boundary until the prediction of `net` changes (checked
    with the accumulated perturbation scaled by 1 + overshoot) or `max_iter`
    steps were taken.

    returns:
    grad:        normalised gradient of (score of the final class - score of
                 the original class) at the perturbed image, i.e. the
                 approximated boundary normal
    pert_image:  im + lambda_fac * accumulated perturbation
    '''

    image = copy.deepcopy(im)
    shape = image.size()

    # Rank the classes by their score on the clean image; the top one is the
    # label that has to be flipped.
    scores = net.forward(Variable(image, requires_grad=True)).data.cpu().numpy().flatten()
    ranked = (np.array(scores)).flatten().argsort()[::-1][0:num_classes]
    label = ranked[0]

    pert_image = copy.deepcopy(image)
    r_tot = torch.zeros(shape).to(device)

    k_i = label
    loop_i = 0

    while loop_i < max_iter and k_i == label:

        x = Variable(pert_image, requires_grad=True)
        fs = net.forward(x)

        # closest boundary found so far: its distance and its direction
        best_dist = torch.Tensor([np.inf])[0].to(device)
        best_w = torch.zeros(shape).to(device)

        fs[0, ranked[0]].backward(retain_graph=True)
        grad_orig = copy.deepcopy(x.grad.data)

        for k in range(1, num_classes):
            # gradients accumulate in x.grad, so reset before each class
            x.grad.zero_()

            fs[0, ranked[k]].backward(retain_graph=True)
            cur_grad = copy.deepcopy(x.grad.data)

            # linearised boundary between class k and the original class
            w_k = cur_grad - grad_orig
            f_k = (fs[0, ranked[k]] - fs[0, ranked[0]]).data

            dist_k = torch.abs(f_k) / w_k.norm()

            if dist_k < best_dist:
                best_dist = dist_k + 0.
                best_w = w_k + 0.

        # step onto the closest boundary (at least 1e-4 long)
        step = torch.clamp(best_dist, min=1e-4) * best_w / best_w.norm()
        r_tot = r_tot + step

        pert_image = pert_image + step

        # the label is checked slightly beyond the accumulated perturbation
        check_fool = image + (1 + overshoot) * r_tot
        k_i = torch.argmax(net.forward(Variable(check_fool, requires_grad=True)).data).item()

        loop_i += 1

    # Boundary normal: gradient of the score difference at the final point.
    x = Variable(pert_image, requires_grad=True)
    fs = net.forward(x)
    (fs[0, k_i] - fs[0, label]).backward(retain_graph=True)
    normal = copy.deepcopy(x.grad.data)
    normal = normal / normal.norm()

    r_tot = lambda_fac * r_tot
    boundary_point = image + r_tot

    return normal, boundary_point

In [11]:
#@title Linear Solver subroutine { form-width: "250px" }
def linear_solver(
    x_0,              # initial image 
    normal,           # normal computed by DeepFool previously
    boundary_point,   # boundary point approximated by DeepFool
    lb,               # lower bound on pixel values
    ub                # upper bound on pixel values
):
    '''
    Linear Solver subroutine of SparseFool: solves the linearised,
    box-constrained problem and returns the next iterate.

    The hyperplane is given by `normal` and `boundary_point`. Starting from
    `x_0`, one coordinate at a time (largest remaining |normal| entry first)
    is changed and the result clipped to [lb, ub], until the iterate lies on
    the other side of the hyperplane or every coordinate has been used.

    All tensors stay on the device of the inputs; no module-level `device`
    is consulted. The inputs themselves are not modified.

    returns: x_i, the next iterate for SparseFool (same shape as x_0)
    '''

    input_shape = x_0.size()

    # coord_vec holds the normal entries that are still available
    coord_vec = copy.deepcopy(normal)
    plane_normal = copy.deepcopy(coord_vec).view(-1)
    plane_point = copy.deepcopy(boundary_point).view(-1)

    x_i = copy.deepcopy(x_0)

    # side of the hyperplane the start point lies on
    f_k = torch.dot(plane_normal, x_0.view(-1) - plane_point)
    sign_true = f_k.sign().item()

    # small offset so that the iterate ends up slightly beyond the plane
    beta = 0.001 * sign_true
    current_sign = sign_true

    while current_sign == sign_true and coord_vec.nonzero().size()[0] > 0:

        f_k = torch.dot(plane_normal, x_i.view(-1) - plane_point) + beta

        pert = f_k.abs() / coord_vec.abs().max()

        # Select the coordinate with the largest remaining |normal| entry.
        # The arg-max is taken on a CPU copy (as before), but coord_vec itself
        # stays where it is; it used to be moved to the global `device`, which
        # broke whenever the inputs lived on another device.
        flat_idx = torch.argmax(coord_vec.abs().cpu()).item()
        mask = torch.zeros_like(coord_vec)
        mask[np.unravel_index(flat_idx, input_shape)] = 1.

        r_i = torch.clamp(pert, min=1e-4) * mask * coord_vec.sign()
        x_i = x_i + r_i
        # keep the iterate inside the box (same operation as clip_image_values)
        x_i = torch.clamp(x_i, lb, ub)

        f_k = torch.dot(plane_normal, x_i.view(-1) - plane_point)
        current_sign = f_k.sign().item()

        # a coordinate is used at most once
        coord_vec[r_i != 0] = 0

    return x_i

In [12]:
#@title SparseFool code { form-width: "250px" }

import torch as torch
import copy
from torch.autograd import Variable

def sparsefool(
    x_0,                # image
    net,                # target network
    lb,                 # lower bound on pixel values
    ub,                 # upper bound on pixel values
    lambda_=3.,         # control parameter lambda
    max_iter=20,        # maximum number of iterations
    epsilon=0.02,       # "eta" in thesis (overshoot)
    device='cuda'
):
    '''
    SparseFool attack: alternates the DeepFool linearisation and the linear
    solver until the prediction of `net` changes or `max_iter` is reached.

    outputs:
    fool_im:    adversarial example (clipped to [lb, ub])
    r:          perturbation, fool_im - x_0
    pred_label: prediction of the model on x_0 (result of torch.argmax)
    fool_label: prediction of the model on fool_im
    loops:      number of iterations that was needed
    '''

    pred_label = torch.argmax(net(x_0))

    x_i = copy.deepcopy(x_0)
    fool_im = copy.deepcopy(x_i)

    fool_label = pred_label
    loops = 0

    while loops < max_iter and fool_label == pred_label:

        # linearise the boundary around the current iterate ...
        normal, x_adv = deepfool(x_i, net, lambda_, device=device)

        # ... and move a few coordinates across that linearised boundary
        x_i = linear_solver(x_i, normal, x_adv, lb, ub)

        # overshoot the accumulated perturbation a little and re-classify
        overshot = x_0 + (1 + epsilon) * (x_i - x_0)
        fool_im = clip_image_values(overshot, lb, ub)
        logits = net.forward(Variable(fool_im, requires_grad=True)).data
        fool_label = torch.argmax(logits).item()

        loops += 1

    return fool_im, fool_im - x_0, pred_label, fool_label, loops

In [13]:
#@title Utils { form-width: "250px" }

def clip_image_values(x, minv, maxv): 
    """Return ``x`` with every entry clamped to the interval [minv, maxv]."""
    return torch.clamp(x, minv, maxv)

# CornerSearch
Original code from the CornerSearch (CS) paper.

In [14]:
#@title CS utils { form-width: "250px" }

def get_logits(model, x_nat):
    """Return the scores of ``model`` for a channel-last numpy batch.

    ``x_nat`` is a numpy array of shape (N, H, W, C); it is converted to a
    float tensor in (N, C, H, W) layout and evaluated without gradients. The
    batch is sent to the device the model's parameters live on (instead of
    unconditionally to CUDA), so this also works on CPU-only machines.

    Returns the model output as a numpy array.
    """
    x = torch.from_numpy(x_nat).permute(0, 3, 1, 2).float()

    # Follow the model's device; fall back to "CUDA if available" for models
    # that expose no parameters.
    params = getattr(model, 'parameters', None)
    first_param = next(iter(params()), None) if callable(params) else None
    if first_param is not None:
        target = first_param.device
    else:
        target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        output = model(x.to(target))

    return output.cpu().numpy()

def get_predictions(model, x_nat, y_nat):
    """Return a boolean numpy array: True where ``model`` predicts ``y_nat``.

    ``x_nat`` is a numpy batch of shape (N, H, W, C) and ``y_nat`` a numpy
    array with one label per sample. The batch is sent to the device the
    model's parameters live on (instead of unconditionally to CUDA), so this
    also works on CPU-only machines.
    """
    x = torch.from_numpy(x_nat).permute(0, 3, 1, 2).float()
    y = torch.from_numpy(y_nat)

    # Follow the model's device; fall back to "CUDA if available" for models
    # that expose no parameters.
    params = getattr(model, 'parameters', None)
    first_param = next(iter(params()), None) if callable(params) else None
    if first_param is not None:
        target = first_param.device
    else:
        target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        output = model(x.to(target))

    # predicted class = index of the largest score, compared on the CPU
    return (output.cpu().max(dim=-1)[1] == y).numpy()

In [15]:
#@title CornerSearch code { form-width: "250px" }

def onepixel_perturbation(attack, orig_x, pos, sigma):
  ''' 
  returns a batch with the possible perturbations of the pixel in position pos 

  attack:  object providing type_attack ('L0', 'L0+Linf' or 'L0+sigma') and,
           depending on the type, epsilon or kappa
  orig_x:  image of shape (rows, cols, channels) with 1 or 3 channels
  pos:     [row, col] of the pixel to perturb
  sigma:   sigma-map of the image, only read for 'L0+sigma'

  The batch holds one copy of orig_x per candidate value of the pixel:
  8 (RGB) or 2 (grayscale) for 'L0' and 'L0+Linf', 2 for 'L0+sigma'.
  Raises ValueError for an unknown attack type.
  '''
    
  if attack.type_attack == 'L0':
    if orig_x.shape[-1] == 3:
      batch_x = np.tile(orig_x,(8,1,1,1))
      t = np.zeros([3])
      for counter in range(8):
        # the three bits of counter give the 0/1 value of each channel
        t2 = counter + 0
        for c in range(3):
          t[c] = t2 % 2
          t2 = (t2 - t[c])/2
        batch_x[counter,pos[0],pos[1]] = t.astype(np.float32)
    elif orig_x.shape[-1] == 1:
      batch_x = np.tile(orig_x,(2,1,1,1))
      batch_x[0,pos[0],pos[1],0] = 0.0
      batch_x[1,pos[0],pos[1],0] = 1.0
  
  elif attack.type_attack == 'L0+Linf':
    if orig_x.shape[-1] == 3:
      batch_x = np.tile(orig_x,(8,1,1,1))
      t = np.zeros([3])
      for counter in range(8):
        # the three bits of counter give the sign (-/+ epsilon) per channel
        t2 = counter + 0
        for c in range(3):
          t3 = t2 % 2
          t[c] = (t3*2.0 - 1.0)*attack.epsilon
          t2 = (t2 - t3)/2
        batch_x[counter,pos[0],pos[1]] = np.clip(t.astype(np.float32) + orig_x[pos[0],pos[1]], 0.0, 1.0)
    elif orig_x.shape[-1] == 1:
      batch_x = np.tile(orig_x,(2,1,1,1))
      batch_x[0,pos[0],pos[1],0] = np.clip(batch_x[0,pos[0],pos[1],0] - attack.epsilon, 0.0, 1.0)
      batch_x[1,pos[0],pos[1],0] = np.clip(batch_x[1,pos[0],pos[1],0] + attack.epsilon, 0.0, 1.0)
  
  elif attack.type_attack == 'L0+sigma':
    batch_x = np.tile(orig_x,(2,1,1,1))
    # Entry 0 is the decreased and entry 1 the increased pixel. Each one is
    # computed from its own, still unmodified copy of the pixel; entry 1 used
    # to start from the already decreased entry 0, which made the "+"
    # candidate (nearly) identical to the original pixel.
    if orig_x.shape[-1] == 3:
      batch_x[0,pos[0],pos[1]] = np.clip(batch_x[0,pos[0],pos[1]]*(1.0 - attack.kappa*sigma[pos[0],pos[1]]), 0.0, 1.0)
      batch_x[1,pos[0],pos[1]] = np.clip(batch_x[1,pos[0],pos[1]]*(1.0 + attack.kappa*sigma[pos[0],pos[1]]), 0.0, 1.0)
    
    elif orig_x.shape[-1] == 1:
      batch_x[0,pos[0],pos[1]] = np.clip(batch_x[0,pos[0],pos[1]] - attack.kappa*sigma[pos[0],pos[1]], 0.0, 1.0)
      batch_x[1,pos[0],pos[1]] = np.clip(batch_x[1,pos[0],pos[1]] + attack.kappa*sigma[pos[0],pos[1]], 0.0, 1.0)
    
  else:
    raise ValueError('unknown attack')
  
  return batch_x
    
def onepixel_perturbation_image(attack, orig_x, sigma):
  ''' returns a batch with all the possible perturbations of the image orig_x

  orig_x has shape (rows, cols, channels) with 1 or 3 channels. The batch
  holds n_corners candidates per pixel, stored pixel after pixel in row-major
  order: the candidates of pixel (row, col) start at index
  (row*cols + col)*n_corners.
  '''
  
  n_rows, n_cols = orig_x.shape[0], orig_x.shape[1]
  n_channels = orig_x.shape[-1]
  assert n_channels in [1, 3]
  n_corners = 2**n_channels if attack.type_attack in ['L0', 'L0+Linf'] else 2
  
  batch_x = np.zeros([n_corners*n_rows*n_cols, n_rows, n_cols, orig_x.shape[2]])
  for row in range(n_rows):
      for col in range(n_cols):
        # The row offset is the number of columns. The slice start used the
        # number of rows before, which only agrees for square images.
        start = (row*n_cols + col)*n_corners
        batch_x[start:start + n_corners] = np.clip(onepixel_perturbation(attack, orig_x, [row,col], sigma), 0.0, 1.0)
  
  return batch_x

def flat2square(attack, ind):
  ''' maps indices of the batch of all one-pixel perturbations back to the
      pixel position and the perturbation they encode

      returns (r, c, t): row, column and perturbation code. t is one 0/1
      value per channel for 3-channel 'L0' / 'L0+Linf' attacks and a single
      0/1 value otherwise. '''
  
  kind = attack.type_attack
  
  if kind in ['L0', 'L0+Linf']:
    if attack.shape_img[-1] == 3:
      # 8 candidates per pixel: split off the candidate number first
      corner = ind % 8
      pixel = (ind - corner)//8
      c = pixel % attack.shape_img[1]
      r = (pixel - c)//attack.shape_img[1]
      # decode the candidate number bit by bit, lowest bit = first channel
      t = np.zeros([ind.shape[0],3])
      for ch in range(3):
        t[:,ch] = corner % 2
        corner = (corner - t[:,ch])/2
    elif attack.shape_img[-1] == 1:
      # 2 candidates per pixel
      t = ind % 2
      pixel = (ind - t)//2
      c = pixel % attack.shape_img[1]
      r = (pixel - c)//attack.shape_img[1]
  
  elif kind == 'L0+sigma':
      # always 2 candidates per pixel, independent of the channel count
      t = ind % 2
      pixel = (ind - t)//2
      c = pixel % attack.shape_img[1]
      r = (pixel - c)//attack.shape_img[1]
    
  return r, c, t

def npixels_perturbation(attack, orig_x, ind, k, sigma):
  ''' creates n_iter images which differ from orig_x in at most k pixels

      ind lists one-pixel perturbation indices; for every image, k of its
      first n_max entries are drawn at random and applied to a copy of orig_x.
      sigma is only read for the 'L0+sigma' attack. '''
  
  # Draw k ranks in [0, n_max) per image. Taking the square root of a uniform
  # draw from [0, n_max**2) biases the result towards small ranks.
  draws = np.random.randint(0, attack.n_max**2, (attack.n_iter, k))
  ranks = attack.n_max - np.floor(draws**0.5).astype(int) - 1
  
  # every image starts as a copy of the original
  batch_x = np.tile(orig_x,(attack.n_iter,1,1,1))
  
  if attack.type_attack == 'L0':
    for i, picks in enumerate(ranks):
      rows, cols, vals = flat2square(attack, ind[picks])
      # pixels are overwritten with the selected 0/1 values
      batch_x[i,rows,cols] = vals + 0 if attack.shape_img[-1] == 3 else np.expand_dims(vals + 0, 1)
  
  elif attack.type_attack == 'L0+Linf':
    for i, picks in enumerate(ranks):
      rows, cols, vals = flat2square(attack, ind[picks])
      vals = vals + 0 if attack.shape_img[-1] == 3 else np.expand_dims(vals + 0, 1)
      # the 0/1 code becomes a -epsilon/+epsilon offset
      batch_x[i,rows,cols] = np.clip(batch_x[i,rows,cols]+(2.0*vals - 1.0)*attack.epsilon, 0.0, 1.0)
  
  elif attack.type_attack == 'L0+sigma':
    for i, picks in enumerate(ranks):
      rows, cols, vals = flat2square(attack, ind[picks])
      vals = np.expand_dims(vals,1)
      # 1- and 3-channel images share the same update:
      # code 0 subtracts kappa*sigma, code 1 adds it
      if attack.shape_img[-1] in (3, 1):
        batch_x[i,rows,cols] = np.clip(batch_x[i,rows,cols] - attack.kappa*sigma[rows,cols]*(1-vals) + attack.kappa*sigma[rows,cols]*vals, 0.0, 1.0)
      
  return batch_x

def sigma_map(x):
  ''' creates the sigma-map for the batch x

      For every entry, the standard deviation of the entry and its two
      neighbours is computed along axis 1 and along axis 2 (border entries
      are replicated); the result is the square root of the smaller of the
      two. The output has the shape of x and dtype float64. '''
  
  xf = x.astype(np.float64)
  
  # neighbours along axis 1, with the border entry repeated at the edge
  nxt1 = np.concatenate([xf[:,1:], xf[:,-1:]], axis=1)
  prv1 = np.concatenate([xf[:,:1], xf[:,:-1]], axis=1)
  # neighbours along axis 2
  nxt2 = np.concatenate([xf[:,:,1:], xf[:,:,-1:]], axis=2)
  prv2 = np.concatenate([xf[:,:,:1], xf[:,:,:-1]], axis=2)

  def three_point_sd(nxt, prv):
    # population standard deviation of (next, centre, previous)
    mean = (nxt + xf + prv)/3
    return np.sqrt(((nxt-mean)**2 + (xf-mean)**2 + (prv-mean)**2)/3)

  sd_axis1 = three_point_sd(nxt1, prv1)
  sd_axis2 = three_point_sd(nxt2, prv2)

  return np.sqrt(np.minimum(sd_axis1, sd_axis2))
  
class CSattack():
  ''' score-based sparse attack: first tries the one-pixel modifications produced
      by onepixel_perturbation_image, then samples multi-pixel modifications via
      npixels_perturbation, ordered by how much each one-pixel change moved the logits.

      args is a dict providing the keys read in __init__; the optional key
      'n_classes' (default 10) sets the number of output classes of the model '''

  def __init__(self, model, args):
    self.model = model
    self.type_attack = args['type_attack'] # 'L0', 'L0+Linf', 'L0+sigma'
    self.n_iter = args['n_iter']           # number of iterations (N_iter in the paper)
    self.n_max = args['n_max']             # the modifications for k-pixels perturbations are sampled among the best n_max (N in the paper)
    self.epsilon = args['epsilon']         # for L0+Linf, the bound on the Linf-norm of the perturbation
    self.kappa = args['kappa']             # for L0+sigma (see kappa in the paper), larger kappa means easier and more visible attacks
    self.k = args['sparsity']              # maximum number of pixels that can be modified (k_max in the paper)
    self.size_incr = args['size_incr']     # size of progressive increment of sparsity levels to check
    self.n_classes = args.get('n_classes', 10)  # number of logits returned by the model

  def perturb(self, x_nat, y_nat):
    ''' attacks every correctly classified image of the batch.

        x_nat: numpy batch indexed as [image, row, column, channel]
        y_nat: numpy array with one label per image

        returns
          adv:            copy of x_nat where attacked images are replaced by the adversarial found
          pixels_changed: per image, number of pixels that differ from x_nat by more than 1e-10
          fl_success:     per image, 0 if the image was correctly classified but no adversarial
                          was found, 1 otherwise (misclassified images keep 1) '''
    adv = np.copy(x_nat)
    fl_success = np.ones([x_nat.shape[0]])
    self.shape_img = x_nat.shape[1:]
    self.sigma = sigma_map(x_nat)
    # one chunk of one-pixel candidates per corner of the colour cube (2 for 'L0+sigma')
    self.n_corners = 2**self.shape_img[2] if self.type_attack in ['L0', 'L0+Linf'] else 2
    corr_pred = get_predictions(self.model, x_nat, y_nat)
    # number of pixels = size of one chunk of one-pixel candidates
    bs = self.shape_img[0]*self.shape_img[1]

    for c in range(x_nat.shape[0]):
      if not corr_pred[c]:
        print('Point {} - misclassified'.format(c))
        continue

      sigma = np.copy(self.sigma[c])
      batch_x = onepixel_perturbation_image(self, x_nat[c], sigma)
      batch_y = np.squeeze(y_nat[c])
      logit_2 = np.zeros([batch_x.shape[0], self.n_classes])
      found = False

      # checks one-pixels modifications (logits of all chunks are needed later for the orderings)
      for counter in range(self.n_corners):
        chunk = slice(counter*bs, (counter+1)*bs)
        logit_2[chunk] = get_logits(self.model, batch_x[chunk])
        pred = logit_2[chunk].argmax(axis=-1) == np.tile(batch_y,(bs))
        if not pred.all() and not found:
          ind_adv = np.where(pred.astype(int)==0)
          adv[c] = batch_x[counter*bs + ind_adv[0][0]]
          found = True
          print('Point {} - adversarial example found changing 1 pixel'.format(c))

      # creates the orderings
      t1 = np.copy(logit_2[:, batch_y])
      logit_2[:, batch_y] = -1000.0*np.ones(np.shape(logit_2[:, batch_y]))
      t2 = np.amax(logit_2, axis=1)
      t3 = t1 - t2
      logit_3 = np.tile(np.expand_dims(t1,axis=1),(1,self.n_classes))-logit_2
      logit_3[:, batch_y] = t3
      ind = np.argsort(logit_3, axis=0)

      # checks multiple-pixels modifications, stopping at the first success
      for n3 in range(1 + self.size_incr, self.k + 1, self.size_incr):
        if found:
          break
        for c2 in range(self.n_classes):
          ind_cl = np.copy(ind[:, c2])

          batch_x = npixels_perturbation(self, x_nat[c], ind_cl, n3, sigma)
          pred = get_predictions(self.model, batch_x, np.tile(batch_y,(batch_x.shape[0])))

          if np.sum(pred.astype(np.int32)) < self.n_iter:
            found = True
            ind_adv = np.where(pred.astype(int)==0)
            adv[c] = batch_x[ind_adv[0][0]]
            print('Point {} - adversarial example found changing {} pixels'.format(c, np.sum(np.amax(np.abs(adv[c] - x_nat[c]) > 1e-10, axis=-1), axis=(0,1))))
            break

      if not found:
        fl_success[c] = 0
        print('Point {} - adversarial example not found'.format(c))

    pixels_changed = np.sum(np.amax(np.abs(adv - x_nat) > 1e-10, axis=-1), axis=(1,2))
    corr_pred = get_predictions(self.model, adv, y_nat)
    print('Robust accuracy at {} pixels: {:.2f}%'.format(self.k, np.sum(corr_pred)/x_nat.shape[0]*100.0))
    print('Maximum perturbation size: {:.5f}'.format(np.amax(np.abs(adv - x_nat))))

    return adv, pixels_changed, fl_success

# JSMA

In [16]:
#@title JSMA code { form-width: "150px" }


def compute_jacobian_batch(
    inputs,           # input image
    output            # the models' output
):
    """Return the forward derivative of ``output`` with respect to ``inputs``.

    ``output`` must have been computed from ``inputs`` and is indexed as
    [sample, class]. One backward pass is run per class, so the autograd graph
    is retained; as a side effect ``inputs.grad`` holds the gradient of the
    last class afterwards.

    Returns a tensor of shape (inputs.shape[0], num_classes, *inputs.shape[1:]),
    allocated on the device of ``inputs``.
    """
    assert inputs.requires_grad, "inputs must require grad to compute the Jacobian"

    num_classes = output.size()[1]

    # allocate directly where the data lives instead of on the CPU first
    jacobian = torch.zeros(num_classes, *inputs.size(), dtype=inputs.dtype, device=inputs.device)
    grad_output = torch.zeros_like(output)

    for i in range(num_classes):
        # backward() accumulates, so clear the gradient left by the previous class
        if inputs.grad is not None:
            inputs.grad.zero_()
        # one-hot selector: differentiate only the i-th output of every sample
        grad_output.zero_()
        grad_output[:, i] = 1
        output.backward(grad_output, retain_graph=True)
        jacobian[i] = inputs.grad.data
    return torch.transpose(jacobian, dim0=0, dim1=1)

def saliency_map_batch(
    jacobian,             # jacobian computed by compute_jacobian_batch
    search_space,         # "Gamma" in thesis
    target_index,         # target class 
    increasing=True       # computing increasing or decreasing saliency map 
):
    """Return the maximum of the pairwise saliency map and the pixel pair attaining it.

    ``jacobian`` is indexed as [channel, class, row, column]; only channel 0 is
    read for the target derivative, and the shapes only line up for
    single-channel images. ``search_space`` holds one entry per pixel; pixels
    whose entry is 0 are excluded from every pair.

    Returns ``(max_value, (p, q))`` where ``p`` and ``q`` are flat (row-major)
    pixel indices with ``p < q``; when no pair qualifies, ``max_value`` is 0.
    All work happens on the device of ``jacobian``.
    """
    num_pixels = jacobian.shape[2] * jacobian.shape[3]

    all_sum = torch.sum(jacobian, 1).squeeze()   # sum over all classes
    alpha = jacobian[0][target_index]            # derivative of the target class
    beta = torch.flatten(all_sum - alpha)        # summed derivative of all other classes
    alpha = torch.flatten(alpha)

    # entry [p, q] with p < q holds the summed contribution of the pixel pair;
    # the diagonal and lower triangle stay 0 so every pair is considered once
    alpha_pq = torch.triu(alpha.unsqueeze(1) + alpha.unsqueeze(0), diagonal=1)
    beta_pq = torch.triu(beta.unsqueeze(1) + beta.unsqueeze(0), diagonal=1)

    if increasing:
        # alpha > 0, beta < 0 for the increasing saliency map
        mask = torch.ge(alpha_pq, 0.0) & torch.le(beta_pq, 0.0)
    else:
        # alpha < 0, beta > 0 for the decreasing saliency map
        mask = torch.le(alpha_pq, 0.0) & torch.ge(beta_pq, 0.0)

    # drop rows and columns of pixels that were already modified
    excluded = torch.flatten(search_space).to(mask.device) == 0
    mask[excluded, :] = False
    mask[:, excluded] = False

    if increasing:
        saliency_map = torch.mul(torch.mul(alpha_pq, torch.abs(beta_pq)), mask.float())   # alpha * abs(beta)
    else:
        saliency_map = torch.mul(torch.mul(torch.abs(alpha_pq), beta_pq), mask.float())   # abs(alpha) * beta

    # search for the maximal value and decode its flat index into the pair (p, q)
    max_value = torch.max(saliency_map)
    idx = torch.argmax(saliency_map)
    p = torch.div(idx, num_pixels, rounding_mode='trunc')
    q = idx - (p * num_pixels)

    return max_value, (p, q)


def jsma_batch(
    model,                  # target model
    input_tensor,           # images
    true_labels,            # true labels
    target_class,           # target classes
    max_distortion=1        # maximum distortion allowed
):
    '''
    Targeted JSMA on a batch: in every round each unfinished image gets the
    pixel pair with the largest saliency set to 1 (increasing map) or to 0
    (decreasing map); only channel 0 of the image is modified.

    outputs:
    input_features: adversarial images (images that reached the maximum
                    distortion have channel 0 zeroed out)
    found:          number of images for which an adversarial was found
                    (prediction differs from the true label)
    dist:           one entry per image: number of modified pixels if an
                    adversarial was found, None otherwise
    '''

    # Work on a private copy since the pixel values are altered in place
    input_features = input_tensor.clone().detach().requires_grad_(True)
    batch_size = input_features.shape[0]
    n_rows = input_features.size(2)
    n_cols = input_features.size(3)
    num_pixels = n_rows * n_cols
    max_iter = math.floor(num_pixels * max_distortion)
    count = [0] * batch_size
    image_done = [False] * batch_size

    search_space = torch.ones(batch_size, n_rows, n_cols, dtype=torch.uint8,
                              device=input_features.device)

    output = model(input_features)
    _, source_class = torch.max(output.data, 1)

    no_pair_left = False

    while (not no_pair_left) and (not all(image_done)) and (max(count) < max_iter) and (search_space.sum() != 0):
        # Calculate Jacobian of the current predictions
        jacobian = compute_jacobian_batch(input_features, output)

        for i in range(batch_size):
            if image_done[i]:
                continue

            image_jacobian = jacobian[i].permute(1, 0, 2, 3)

            # Best pixel pair of the increasing and of the decreasing saliency map
            s_plus_value, s_plus_index = saliency_map_batch(image_jacobian, search_space[i], target_class[i], increasing=True)
            s_minus_value, s_minus_index = saliency_map_batch(image_jacobian, search_space[i], target_class[i], increasing=False)

            if s_plus_value == 0.0 and s_minus_value == 0.0:
                # Stop the whole search; results are still summarised below
                print("No pair of pixels fulfills the conditions.")
                no_pair_left = True
                break

            # pick the map with the bigger impact
            if s_plus_value > s_minus_value:
                pair, new_value = s_plus_index, 1
            else:
                pair, new_value = s_minus_index, 0

            for flat_index in pair:
                # flat indices are row-major, so they are decoded with the column count
                row = torch.div(flat_index, n_cols, rounding_mode='trunc')
                col = flat_index - (row * n_cols)
                input_features.data[i][0][row][col] = new_value
                # remove modified pixel from search space
                search_space[i][row][col] = 0

            count[i] += 2

        # Re-evaluate once per round, after the modifications, so that the
        # predictions always describe the images as they currently are
        output = model(input_features)
        _, source_class = torch.max(output.data, 1)

        for i in range(batch_size):
            if not image_done[i] and source_class[i] == target_class[i]:
                image_done[i] = True

    dist = []
    found = 0
    for i in range(batch_size):
        if count[i] >= max_iter:
            print("Reached max. distortion")
            with torch.no_grad():
                input_features[i][0] -= input_features[i][0]
            dist.append(None)
        elif source_class[i] != true_labels[i]:
            found += 1
            dist.append(count[i])
            print("Adversarial found changing "+str(count[i])+" pixels.")
        else:
            # keep dist aligned with the batch even when nothing was found
            dist.append(None)

    return input_features, found, dist

# Carlini Wagner

In [17]:
#@title L2 attack { form-width: "250px" }

"""The CarliniWagnerL2 attack."""     # new only_gray_scaled 

INF = float("inf")


def cleverhans_carlini_wagner_l2_binary(
    model_fn,                     # target network
    ox,                           # original images
    x,                            # last adversarial images that were found
    n_classes,                    # number of classes
    no_change,                    # search space (0 - the pixel is not allowed to be modified; 1 - pixel is allowed)
    step=0,                       # current L0 step
    y=None,                       # target labels
    lr=5e-3,                      # learning rate for Adam 
    confidence=0,                 # "kappa" in thesis
    clip_min=0,                   # minimum pixel value 
    clip_max=1,                   # maximum pixel value 
    initial_const=1e-2,           # initial value for constant c 
    c_max = 1e10,                  # maximum value for c 
    binary_search_steps=36,       # number of binary search steps
    max_iterations=1000,          # maximum number of iterations for Adam
    begin_binary_search = 5       # how often we try to double c before we start binary search
):
    """Carlini-Wagner L2 attack restricted to the pixels allowed by ``no_change``.

    The attack succeeds on an image when the prediction differs from ``y``
    (the model's own prediction on ``x`` is used when ``y`` is None). Pixels
    whose ``no_change`` entry is 0 keep the value they have in ``ox``; for
    multi-channel images channel 0 of ``no_change`` decides which original
    values are kept. All tensors are expected on the device of ``ox``.

    Returns ``(adversarial_images, const)``: the detached best images found
    (the original image where the attack failed) and the per-image constants.
    """
    work_device = ox.device
    x = x.to(work_device)
    no_change = no_change.to(work_device)

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        with torch.no_grad():
            y = torch.argmax(model_fn(x), 1)

    # Initialize some values needed for binary search on const
    lower_bound = [0.0] * len(x)
    upper_bound = [c_max] * len(x)
    const = x.new_ones(len(x), 1) * initial_const

    x = torch.clamp(x, clip_min, clip_max)

    o_bestattack = ox.clone().detach()

    # Values (in [0, 1] scale) that frozen pixels keep; zero where the pixel is free
    frozen = torch.ones_like(no_change[:, :1]) - no_change[:, :1]
    no_change_values = torch.mul((ox - clip_min) / (clip_max - clip_min), frozen)

    # Map images into the tanh-space
    x = (x - clip_min) / (clip_max - clip_min)
    x = torch.clamp(x, 0, 1)
    x = x * 2 - 1
    x = torch.arctanh(x * 0.999999)

    done = torch.zeros_like(y) # tracks for which image we have already found an adversarial or stopped the search
    for i in range(len(const)):
        if const[i] >= c_max:
            done[i] = 1

    # Prepare some variables
    modifier = torch.zeros_like(x, requires_grad=True)
    y_onehot = torch.nn.functional.one_hot(y, n_classes).to(torch.float)

    # Define loss functions and optimizer
    def f_fn(real, other):
        return torch.max((real - other) + confidence, real.new_zeros(()))

    def l2dist_fn(a, b):
        return torch.pow(a - b, 2).sum(list(range(len(a.size())))[1:])

    optimizer = torch.optim.Adam([modifier], lr=lr)

    binary_search = False
    doubled = [0 for index in range(ox.shape[0])]

    search_step = 0

    # Outer loop performing binary search on const
    while torch.min(const) <= c_max:

        # Inner loop performing attack iterations
        for i in range(max_iterations):
            # One attack step
            new_x = (torch.tanh(modifier + x) + 1) / 2

            # free pixels come from the optimisation variable, frozen ones from ox
            new_x = torch.mul(new_x, no_change)
            new_x = new_x + no_change_values

            new_x = new_x * (clip_max - clip_min) + clip_min
            logits = model_fn(new_x)

            real = torch.sum(y_onehot * logits, 1)
            other, _ = torch.max((1 - y_onehot) * logits - y_onehot * 1e4, 1)
            optimizer.zero_grad()

            f = f_fn(real, other)
            l2 = l2dist_fn(new_x, ox)
            # const is (batch, 1): flatten it so every image is weighted by its own constant
            loss = (const.reshape(-1) * f + l2).sum()
            # retain_graph: x may carry autograd history created before this loop
            # (e.g. when the caller enabled requires_grad on it); that shared part
            # of the graph has to survive the repeated backward passes
            loss.backward(retain_graph=True)
            optimizer.step()

            if step > 0:
                # if we are not in the initial step, save the adversarial images found so far 
                tmp = 0  # counts the number of images for which an adversarial was already found
                predictions = torch.argmax(logits, 1)  # logits already belong to new_x
                for j in range(len(y)):
                    if (done[j] == 0) and (y[j] != predictions[j]) and (not binary_search):
                        tmp += 1
                        o_bestattack[j] = new_x[j].detach()
                if tmp == (len(y)-torch.sum(done)) and (not binary_search):
                    return o_bestattack.detach(), const

        # predictions for the images of the last attack iteration
        last_pred = torch.argmax(logits, 1)

        # Binary search step
        if not binary_search:
            for n in range(len(x)):
                if last_pred[n] != y[n] and done[n] == 0:
                    # Success, save adversarial
                    o_bestattack[n] = new_x[n].detach()

                elif const[n] < c_max and done[n] == 0:
                    # Failure, multiply by 2 if no solution found yet
                    const[n] *= 2
                    doubled[n] += 1

                    if const[n] > c_max:
                        done[n] = 1

        else: # if we doubled c more than "begin_binary_search" times, we do binary search on c
            if search_step == 0:
                for n in range(len(x)):
                    if last_pred[n] == y[n]:
                        # Stop if L2 can't find a solution with c=c_max
                        doubled[n] = 0
                        done[n] = 1
                        if max(doubled) == 0:
                            # stop binary search if L2 can't find a solution for all with c=c_max
                            binary_search = False

            # binary search
            for n in range(len(x)):
                if last_pred[n] != y[n] and doubled[n] > begin_binary_search:
                    # Success, save adversarial and try a smaller constant
                    o_bestattack[n] = new_x[n].detach()
                    upper_bound[n] = const[n].item()
                    const[n] = (lower_bound[n] + upper_bound[n]) / 2

                elif search_step < binary_search_steps and doubled[n] > begin_binary_search:
                    # Failure, try a larger constant
                    lower_bound[n] = const[n].item()
                    const[n] = (lower_bound[n] + upper_bound[n]) / 2

            search_step += 1

            if search_step == binary_search_steps:
                binary_search = False

        # start binary search if doubling c didn't give a solution
        if (not binary_search) and (max(doubled) > begin_binary_search) and (search_step == 0) and (step > 0):
            binary_search = True
            for i in range(len(y)):
                if doubled[i] > begin_binary_search:
                    const[i] = c_max

        with torch.no_grad():
            predictions = torch.argmax(model_fn(o_bestattack), 1)
        for i in range(len(y)):
            if (y[i] != predictions[i]) and (not binary_search):
                done[i] = 1

        if torch.sum(done) == len(y):
            return o_bestattack.detach(), const

    # every constant exceeded c_max: hand back the best result instead of None
    return o_bestattack.detach(), const

In [37]:
#@title L0 attack { form-width: "250px" }

def cw_l0_cleverhans(      
    model_fn,                       # target network       
    x,                              # original image
    n_classes,                      # number of classes
    y=None,                         # target labels
    lr=5e-3,                        # learning rate for Adam
    confidence=0,                   # "kappa" in thesis
    clip_min=0,                     # minimum pixel value
    clip_max=1,                     # maximum pixel value
    initial_const=1e-4,             # initial value for constant c
    binary_search_steps=5,          # how often we try to double c before we start binary search
    max_iterations=1000,            # maximum number of iterations for Adam
):
    '''
    Carlini-Wagner L0 attack: repeatedly runs the L2 attack on a shrinking set
    of modifiable pixels. After every successful L2 run the allowed pixel with
    the smallest value of gradient * perturbation is frozen; an image is
    finished as soon as the L2 attack no longer finds an adversarial for it.

    The attack hyperparameters are forwarded to
    cleverhans_carlini_wagner_l2_binary. If y is None the model's own
    predictions on x are used as labels.

    outputs: 
    best_adv: adversarial images (all zeros for images where no adversarial
              was ever found)
    '''

    # run on the device of the model when it can be determined
    try:
        target_device = next(model_fn.parameters()).device
    except (AttributeError, StopIteration):
        target_device = x.device
    x = x.to(target_device)

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        with torch.no_grad():
            y = torch.argmax(model_fn(x), 1)
    else:
        y = y.to(target_device)

    # Margin between the logit of y and the best other logit
    def f(images):
        logits = model_fn(images)
        y_onehot = torch.nn.functional.one_hot(y, n_classes).to(torch.float)
        real = torch.sum(y_onehot * logits, 1)
        other, _ = torch.max((1 - y_onehot) * logits - y_onehot * 1e4, 1)
        return (real - other) + confidence

    n_cols = x.shape[3]
    # in the beginning all pixels are in the search space
    no_change = torch.ones(x.shape[0], 1, x.shape[2], x.shape[3], device=target_device)

    step = 0
    adversarial_images = x.clone()
    best_adv = torch.zeros_like(x)

    done = torch.zeros_like(y)
    constant_c = x.new_ones(len(x), 1) * initial_const

    while torch.sum(done) != len(y):

        adversarial_images, constant_c = cleverhans_carlini_wagner_l2_binary(
            model_fn, x, adversarial_images, n_classes, no_change, step, y=y,
            lr=lr, confidence=confidence, clip_min=clip_min, clip_max=clip_max,
            initial_const=constant_c.clone().detach(),
            max_iterations=max_iterations,
            begin_binary_search=binary_search_steps)
        adversarial_images = adversarial_images.detach()

        # if l2 doesnt find an adversarial it returns the original image
        with torch.no_grad():
            predictions = torch.argmax(model_fn(adversarial_images), 1)
        active = done == 0
        fooled = predictions != y
        # if success, save adversarial
        best_adv[active & fooled] = adversarial_images[active & fooled]
        # else we are done with this image since L2 doesn't find an adversarial anymore
        done[active & ~fooled] = 1

        if torch.sum(done) == len(y):
            return best_adv

        # gradient of the margin w.r.t. the current adversarial images
        probe = adversarial_images.clone().requires_grad_(True)
        g, = torch.autograd.grad(torch.sum(f(probe)), probe)

        delta = x - adversarial_images
        # per-pixel score, summed over channels; the batch dimension is always kept
        pixel_score = torch.mul(g, delta).sum(dim=1)

        # among the pixels still allowed, pick the one with the smallest score
        # (first one in row-major order on ties)
        allowed = no_change[:, 0] == 1
        masked_score = pixel_score.masked_fill(~allowed, float("inf"))
        flat_index = masked_score.flatten(1).argmin(dim=1)
        rows = torch.div(flat_index, n_cols, rounding_mode='trunc')
        cols = flat_index - rows * n_cols

        # freeze that pixel for every image that is still being attacked
        for i in range(no_change.shape[0]):
            if done[i] == 0:
                no_change[i][0][rows[i]][cols[i]] = 0
        step += 1

    return best_adv

# ------------------------ Runner ------------------------

In [None]:
#@title Define hyperparameters { form-width: "250px" }
#@markdown Load model and dataset 

if __name__ == '__main__':

  # Hyperparameters are edited through the Colab form fields below; they become
  # the argparse defaults, and parse_args([]) ignores the real command line.
  
  #@markdown Dataset (f_mnist, mnist, cifar10)
  dataset = "cifar10" #@param {type: "string"} 
  #@markdown Attack (CS, JSMA, SF, DF, CW_L0, CW_L2)
  attack = "CW_L0" #@param {type: "string"} 
  #@markdown Batch size
  batch_size =  2#@param {type: "integer"}  

  parser = argparse.ArgumentParser(description='Define hyperparameters.')
  parser.add_argument('--dataset', type=str, default=dataset, help='cifar10, mnist, f_mnist')
  parser.add_argument('--attack', type=str, default=attack, help='CS, JSMA, SF, DF, CW_L0, CW_L2')
  parser.add_argument('--data_dir', type=str, default= './data')

  hps = parser.parse_args([])

  # load the test set and download the matching pretrained model artifact

  if hps.dataset == 'f_mnist':
    test_set = torchvision.datasets.FashionMNIST("./data", download=True, train=False, transform=transforms.Compose([transforms.ToTensor()])) 
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False, drop_last=True)
    
    run = wandb.init(job_type="model-training")
    artifact = run.use_artifact('pretrained-model:latest')
    artifact_dir = artifact.download()

  elif hps.dataset == 'mnist':
    # MNIST images are resized to 32x32 before being converted to tensors
    transform = transforms.Compose([transforms.Resize((32, 32)),transforms.ToTensor()])
    test_set = datasets.MNIST('DATA_MNIST/', download=True, train=False, transform=transform)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    run = wandb.init(project="SparseAdv", job_type="model-training")
    artifact = run.use_artifact('LeNet2-modelv2:latest')
    artifact_dir = artifact.download()

  elif hps.dataset == 'cifar10':
    test_set = torchvision.datasets.CIFAR10(root='../../data/', download=True, train=False, transform=transforms.ToTensor())

    test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)

    run = wandb.init(job_type="model-training")
    artifact = run.use_artifact('VGG_new-model2v0:latest')
    artifact_dir = artifact.download()

  else:
    # fail here with a clear message instead of a NameError on artifact_dir below
    raise ValueError("Unsupported dataset: " + str(hps.dataset))
    
  # without a GPU the stored tensors have to be remapped to the CPU while loading
  map_location = None if torch.cuda.is_available() else torch.device('cpu')
  model = torch.load(artifact_dir+"/model", map_location=map_location)
  model.eval()


In [None]:
#@title Runner { form-width: "250px" }

# Runs the attack selected in hps.attack on every batch of test_loader, prints
# the per-batch distortions and shows the adversarial images as a grid.
if hps.attack == 'CS':
  
  args = {'type_attack': 'L0',      
          'n_iter': 1000,           
          'n_max': 100,             
          'kappa': -1,              
          'epsilon': -1,            
          'sparsity': 1024,         
          'size_incr': 1}
  
  attack = CSattack(model, args)

  for images, labels in test_loader:
      # the attack works on numpy batches indexed [image, row, column, channel]
      images = torch.permute(images,(0,2,3,1)) 
      adv, pixels_changed, fl_success = attack.perturb(images.numpy(), labels.numpy())

      print("Distortions (pixels): "+str(pixels_changed))

      batch_tensor=torch.permute(torch.tensor(adv),(0,3,1,2)).cpu()
      grid_img = torchvision.utils.make_grid(batch_tensor, nrow=batch_size)      
      plt.imshow(grid_img.permute(1, 2, 0))


elif hps.attack == "SF":

  pixel_nr_list = []

  for images, labels in test_loader:
      adv_batch = torch.zeros_like(images)

      pixel_nr_list_batch = []
      for i in range(len(labels)):
          x_0 = images[i].to(device)
          fool_im, r, pred_label, fool_label, loops = sparsefool(x_0[None,:,:,:], model, lb=0, ub=1, lambda_=1., max_iter=20, epsilon=0.02, device=device)  # shape of fool_im [1,1,32,32]

          # compute distortion: a pixel counts as changed if any of its channels changed
          # (a signed sum over channels could cancel out a real perturbation)
          if r.shape[1] in (1, 3):
              pert = torch.count_nonzero((r != 0).any(dim=1)).item()
              pixel_nr_list.append(pert)
              pixel_nr_list_batch.append(pert)
              adv_batch[i] = fool_im.squeeze(0)

      predictions = torch.argmax(model(adv_batch.to(device)),1)
      # iterate over the actual batch, whatever its size
      for i in range(len(labels)):
          if predictions[i] == labels[i]:
              print("Failed to find adversarial for image "+str(i))

      print("Distortions (pixels): "+str(pixel_nr_list_batch))

      batch_tensor=adv_batch
      grid_img = torchvision.utils.make_grid(batch_tensor, nrow=batch_size)
      plt.imshow(grid_img.permute(1,2,0))


elif hps.attack == 'JSMA':

  for images, labels in test_loader:
      # one random target class per image of this batch (the last batch may be smaller)
      target_tensor = torch.randint(0,10,[len(labels)])

      # re-draw targets that coincide with the true label
      for i in range(target_tensor.shape[0]):
          while target_tensor[i] == labels[i]:
                target_tensor[i] = random.randint(0, 9)

      adv, number, dist = jsma_batch(model, images.to(device), labels, target_tensor) 
      
      print("Distortions (pixels): "+str(dist))

      # the returned images require grad and must be detached before plotting
      batch_tensor=adv.detach().cpu() 
      grid_img = torchvision.utils.make_grid(batch_tensor, nrow=batch_size) 
      plt.imshow(grid_img.permute(1, 2, 0))


elif hps.attack == "CW_L2":    
  for images, labels in test_loader:
      # gray scaled, no restriction on search space
      no_change = torch.ones_like(images)   

      # colored, no restriction on search space
      #no_change = torch.ones(images.shape[0],1,images.shape[2], images.shape[3]) 

      # gray scaled, restriction to a random subset of pixels 
      #no_change = torch.randint(0,2,images.shape)  

      # colored, restriction to a random subset of pixels 
      #no_change = torch.randint(0,2,(images.shape[0],1,images.shape[2], images.shape[3]))        
       
      no_change = no_change.to(device)

      images = images.to(device)   
      out_img,_ = cleverhans_carlini_wagner_l2_binary(model, images, images, 10, no_change ) 

      # calculate distortion: number of changed values (over all channels), one entry per image
      Dist = (out_img != images).flatten(1).sum(1).tolist()
  
      images = images.cpu()

      predictions = torch.argmax(model(out_img),1)
      print("pred")
      print(predictions)
      print("labels")
      print(labels)

      batch_tensor=out_img.cpu()
      grid_img = torchvision.utils.make_grid(batch_tensor, nrow=batch_size)
      plt.imshow(grid_img.permute(1, 2, 0))


elif hps.attack == "CW_L0": 
  for images, labels in test_loader:
      images = images.to(device)
      out_img = cw_l0_cleverhans(model_fn = model, x = images, n_classes=10)
    
      # calculate distortion of this batch: a pixel counts as changed if any channel changed
      pixel_nr_list_batch = []
      for img in range(images.shape[0]):
          r = out_img[img].cpu() - images[img].cpu() 
          if r.shape[0] in (1, 3):
              pert = torch.count_nonzero((r != 0).any(dim=0)).item()
              pixel_nr_list_batch.append(pert)

      print("Distortions (pixels): " + str(pixel_nr_list_batch)) 
      
      batch_tensor=out_img.cpu()
      grid_img = torchvision.utils.make_grid(batch_tensor, nrow=batch_size)
      plt.imshow(grid_img.permute(1, 2, 0))
