# Installation and Repository cloning

In [2]:
!pip install adversarial-robustness-toolbox -U
!pip install multiprocess
!pip install importlib-metadata
!pip install advertorch
!pip install torch torchvision torchaudio
!pip install git+https://github.com/RobustBench/robustbench.git
!pip install numba
!pip install matplotlib
!pip install pillow

Collecting git+https://github.com/RobustBench/robustbench.git
  Cloning https://github.com/RobustBench/robustbench.git to c:\users\georg siedel\appdata\local\temp\pip-req-build-ryne21n6
  Resolved https://github.com/RobustBench/robustbench.git to commit 776bc95bb4167827fb102a32ac5aea62e46cfaab
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting autoattack@ git+https://github.com/fra31/auto-attack.git@a39220048b3c9f2cca9a4d3a54604793c68eca7e#egg=autoattack (from robustbench==1.1)
  Using cached autoattack-0.1-py3-none-any.whl


  Running command git clone --filter=blob:none --quiet https://github.com/RobustBench/robustbench.git 'C:\Users\Georg Siedel\AppData\Local\Temp\pip-req-build-ryne21n6'




## Import Libraries

In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models

import numba
numba.__version__

import importlib
import time
import matplotlib.pyplot as plt
import numpy as np

from art.estimators.classification import PyTorchClassifier
from PIL import Image, ImageDraw, ImageFont, ImageOps
import json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

## Load and Prepare CIFAR-10 Dataset

In [None]:
def load_dataset(dataset_split):
    """Download the CIFAR-10 test split and return it as two tensors.

    Args:
        dataset_split: If an ``int``, a random subset with that many test
            images is drawn using a generator seeded with 42. Any
            non-integer value leaves the test split untouched.

    Returns:
        Tuple ``(xtest, ytest)``: the images stacked along a new first
        dimension and a tensor with the corresponding labels.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])
    cifar_test = datasets.CIFAR10(root='./data/cifar', train=False, download=True, transform=to_tensor)

    # Optionally shrink the test set for quick experiments and ablations.
    if isinstance(dataset_split, int):
        seeded_generator = torch.Generator().manual_seed(42)
        subset_sizes = [dataset_split, len(cifar_test) - dataset_split]
        cifar_test = torch.utils.data.random_split(cifar_test, subset_sizes, generator=seeded_generator)[0]

    # Materialise images and labels from the (possibly truncated) dataset.
    images = [image for image, _ in cifar_test]
    labels = [label for _, label in cifar_test]

    return torch.stack(images), torch.tensor(labels)

## Load and Prepare WideResNet Model

In [None]:
#%cd /kaggle/working/adversarial-distance-estimation
import models.wideresnet as wideresnet
from robustbench.utils import load_model

modeltype = 'adversarial'

print(f'\nLoading {modeltype} Model...\n')
if modeltype == 'standard':
    net = wideresnet.WideResNet_28_4(10, 'CIFAR10', normalized=True, block=wideresnet.WideBasic, activation_function='relu')
    state_dict = "model_state_dict"
    net = torch.nn.DataParallel(net)
    PATH = f'./models/pretrained_models/{modeltype}.pth'
    model = torch.load(PATH)
    net.load_state_dict(model[state_dict], strict=False)
elif modeltype == 'robust':
    #self trained with massive random data augmentation and JSD consistency loss, but no adversarial objective
    net = wideresnet.WideResNet_28_4(10, 'CIFAR10', normalized=True, block=wideresnet.WideBasic, activation_function='silu')
    net = torch.nn.DataParallel(net)
    state_dict = "model_state_dict"
    PATH = f'./models/pretrained_models/{modeltype}.pth'
    model = torch.load(PATH)
    net.load_state_dict(model[state_dict], strict=False)
elif modeltype == 'adversarial':
    #from https://github.com/BorealisAI/mma_training/tree/master/trained_models/cifar10-Linf-MMA-20-sd0
    model_name = 'Ding2020MMA'
    net = load_model(model_name=model_name, dataset='cifar10', threat_model='Linf')
    net = torch.nn.DataParallel(net)

%cd
net.eval()




Loading adversarial Model...

C:\Users\Georg Siedel


  checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


DataParallel(
  (module): Ding2020MMANet(
    (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (block1): NetworkBlock(
      (layer): Sequential(
        (0): BasicBlock(
          (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu1): ReLU(inplace=True)
          (conv1): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu2): ReLU(inplace=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (convShortcut): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        )
        (1): BasicBlock(
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu1): ReLU(inplace=True)
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1

## Function to Test Model Accuracy

In [None]:
def test_accuracy(model, xtest, ytest):
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for i in range(len(xtest)):
            x = xtest[i].unsqueeze(0).to(device)
            y = ytest[i].unsqueeze(0).to(device)

            outputs = model(x)
            _, predicted = torch.max(outputs.data, 1)

            total += y.size(0)
            correct += (predicted==y).sum().item()

    accuracy = (correct / total) * 100
    print(f'\nAccuracy of the testset is: {accuracy:.3f}%\n')

In [None]:
# Loss function and optimizer that are handed to the ART wrapper below.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.01)

# Wrap the network in ART's PyTorchClassifier so that ART attacks can use it.
# NOTE(review): no clip_values are passed here — confirm that the attacks are meant to run
# without clipping adversarial inputs to a valid pixel range.
_art_settings = dict(
    model=net,
    loss=criterion,
    optimizer=optimizer,
    input_shape=(3, 32, 32),
    nb_classes=10,
)
classifier = PyTorchClassifier(**_art_settings)

In [None]:
from art.attacks.evasion import (FastGradientMethod,
                                 ProjectedGradientDescentNumpy,
                                 AutoAttack,
                                 AutoProjectedGradientDescent,
                                 CarliniL2Method,
                                 NewtonFool,
                                 DeepFool,
                                 ElasticNet,
                                 HopSkipJump)

## Adversarial Attack Class

In [None]:
class AdversarialAttacks:
    """Factory that builds ART evasion attacks from one shared set of hyperparameters."""

    def __init__(self, classifier, epsilon, eps_iter, norm, iterations, second_attack_iters):
        """Store the settings that ``init_attacker`` maps onto attack constructors.

        Args:
            classifier: Estimator handed to every attack constructor.
            epsilon: Passed as ``eps`` to FGM, PGD, AutoAttack and Auto-PGD.
            eps_iter: Passed as ``eps_step`` to FGM, PGD, AutoAttack and
                Auto-PGD, and as ``epsilon`` to DeepFool.
            norm: Passed as ``norm`` to FGM, PGD, AutoAttack, Auto-PGD and
                HopSkipJump.
            iterations: Passed as ``max_iter`` to PGD, Auto-PGD, NewtonFool
                and DeepFool.
            second_attack_iters: Passed as ``max_iter`` to Carlini-Wagner L2,
                ElasticNet and HopSkipJump.
        """
        self.classifier = classifier
        self.epsilon = epsilon
        self.eps_iter = eps_iter
        self.norm = norm
        self.iterations = iterations
        self.second_attack_iters = second_attack_iters

    def init_attacker(self, attack_type, **kwargs):
        """Instantiate the attack registered under ``attack_type``.

        Args:
            attack_type: One of ``'fast_gradient_method'``,
                ``'projected_gradient_descent'``, ``'auto_attack'``,
                ``'auto_projected_gradient_descent'``, ``'carlini_wagner_l2'``,
                ``'newton_fool'``, ``'deep_fool'``, ``'elastic_net'`` or
                ``'hop_skip_jump'``.
            **kwargs: Extra keyword arguments forwarded unchanged to the
                constructor of the selected attack (for every attack type).

        Returns:
            The constructed attack object.

        Raises:
            ValueError: If ``attack_type`` is not one of the supported names.
        """
        if attack_type == 'fast_gradient_method':
            return FastGradientMethod(self.classifier,
                                      eps=self.epsilon,
                                      eps_step=self.eps_iter,
                                      minimal=True,
                                      norm=self.norm,
                                      **kwargs)
        elif attack_type == 'projected_gradient_descent':
            return ProjectedGradientDescentNumpy(self.classifier,
                                                 eps=self.epsilon,
                                                 eps_step=self.eps_iter,
                                                 max_iter=self.iterations,
                                                 norm=self.norm,
                                                 **kwargs)
        elif attack_type == 'auto_attack':
            # kwargs are forwarded here as well; previously they were silently dropped.
            return AutoAttack(estimator=self.classifier,
                              eps=self.epsilon,
                              eps_step=self.eps_iter,
                              norm=self.norm,
                              **kwargs)
        elif attack_type == 'auto_projected_gradient_descent':
            return AutoProjectedGradientDescent(estimator=self.classifier,
                                                eps=self.epsilon,
                                                eps_step=self.eps_iter,
                                                norm=self.norm,
                                                max_iter=self.iterations,
                                                **kwargs)
        elif attack_type == 'carlini_wagner_l2':
            return CarliniL2Method(self.classifier,
                                   max_iter=self.second_attack_iters,
                                   **kwargs)
        elif attack_type == 'newton_fool':
            return NewtonFool(self.classifier,
                              max_iter=self.iterations,
                              **kwargs)
        elif attack_type == 'deep_fool':
            return DeepFool(self.classifier,
                            max_iter=self.iterations,
                            epsilon=self.eps_iter,
                            **kwargs)
        elif attack_type == 'elastic_net':
            # kwargs are forwarded here as well; previously they were silently dropped.
            return ElasticNet(self.classifier,
                              max_iter=self.second_attack_iters,
                              **kwargs)
        elif attack_type == 'hop_skip_jump':
            # kwargs are forwarded here as well; previously they were silently dropped.
            return HopSkipJump(self.classifier,
                               norm=self.norm,
                               max_iter=self.second_attack_iters,
                               **kwargs)
        else:
            raise ValueError(f'Attack type "{attack_type}" not supported!')

## Plug-in Function for Adversarial Attack with Early Stopping

In [None]:
def attack_with_early_stopping(classifier, x, y, max_iterations, attacker):
    """Apply ``attacker`` repeatedly to one input until the predicted label changes.

    Every round feeds the adversarial output of the previous round back into
    ``attacker.generate`` and stops as soon as ``classifier.predict`` no longer
    yields the label ``y``. Inputs that are already misclassified are not attacked.

    Args:
        classifier: Object whose ``predict(ndarray)`` returns per-class scores.
        x: Single input tensor; a leading batch dimension of size 1 is added.
        y: Tensor holding the single true label.
        max_iterations: Maximum number of ``attacker.generate`` rounds.
        attacker: Object whose ``generate(x, y)`` returns an adversarial ndarray.

    Returns:
        Tuple ``(adv_inputs, runtime, iteration)``: the last generated array
        (the unmodified input with batch dimension if no round was run), the
        elapsed wall-clock time in seconds, and the zero-based index of the
        last executed round (0 when no round was executed).
    """
    start_time = time.time()
    true_label = int(y.item())

    # ART works on numpy arrays, so keep the running example and label as arrays.
    current = x.unsqueeze(0).cpu().detach().numpy()
    labels = y.cpu().detach().numpy()

    def _predicted_label(batch):
        # Index of the highest score for the single example in `batch`.
        return int(torch.as_tensor(classifier.predict(batch)).argmax(dim=1).item())

    if _predicted_label(current) != true_label:
        print('Misclassified input. Not attacking.')
        return current, time.time() - start_time, 0

    # Pre-initialised so that max_iterations <= 0 returns the clean input
    # instead of failing on an unbound loop variable.
    last_round = 0
    for last_round in range(max_iterations):
        current = attacker.generate(current, labels)

        if _predicted_label(current) != true_label:
            print(f'\tIterations for successful iterative attack: {last_round+1}')
            break

    return current, time.time() - start_time, last_round

## Function for Adversarial Distance Calculation (Attack Methods)

In [None]:
def distance_calculation(classifier, xtest, ytest, epsilon, eps_iter, norm, max_iterations, attack_type, get_image: bool = False, verbose: bool = True):
    """Attack every test image and record distance, runtime and adversarial accuracy.

    Args:
        classifier: ART classifier; its ``model`` is moved to the selected device.
        xtest: Tensor of test images, iterated along its first dimension.
        ytest: Tensor of labels aligned with ``xtest``.
        epsilon: Forwarded to ``AdversarialAttacks`` as ``epsilon``.
        eps_iter: Forwarded to ``AdversarialAttacks`` as ``eps_iter``.
        norm: Norm order; used for the attack and (as ``float(norm)``) for the distance.
        max_iterations: Forwarded as ``iterations`` to ``AdversarialAttacks`` and as
            the round limit to ``attack_with_early_stopping``.
        attack_type: Name of the attack, resolved by ``AdversarialAttacks.init_attacker``.
        get_image: If true, pass the last adversarial and original image to
            ``get_example_image``.
        verbose: If true, print distance and runtime for every image.

    Returns:
        Tuple ``(distance_list, runtime_list, adversarial_accuracy)`` with one
        distance and one runtime per image and the accuracy in percent
        (``nan`` when ``xtest`` is empty).
    """
    distance_list, runtime_list = [], []
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    classifier.model.to(device)
    xtest = xtest.to(device)
    ytest = ytest.to(device)

    attacks = AdversarialAttacks(classifier=classifier,
                          epsilon=epsilon,
                          eps_iter=eps_iter,
                          norm=norm,
                          iterations=max_iterations,
                          second_attack_iters=40)
    attacker = attacks.init_attacker(attack_type)

    num_images = len(xtest)
    correct_prediction = 0

    for i, x in enumerate(xtest):
        y = ytest[i].unsqueeze(0)
        true_label = int(y.item())

        x_adversarial, runtime, _ = attack_with_early_stopping(classifier=classifier,
                                                            x=x,
                                                            y=y,
                                                            max_iterations=max_iterations,
                                                            attacker=attacker)

        x_adversarial_tensor = torch.from_numpy(x_adversarial).to(device)

        # Adversarial accuracy calculation
        output_adversarial = classifier.predict(x_adversarial)
        predicted_adversarial = int(torch.as_tensor(output_adversarial).argmax(dim=1).item())
        correct_prediction += int(predicted_adversarial == true_label)

        distance = torch.norm((x - x_adversarial_tensor), p=float(norm)).item()
        if distance == 0.0:
            # A zero distance means the returned image equals the input.
            print(f'\nMisclassified!!! dist: {distance}\n')
        distance_list.append(distance)
        runtime_list.append(runtime)

        if verbose:
            print(f'Image {i}\t\tAdversarial_distance: {distance:.5f}\t\tRuntime: {runtime:5f} seconds')

    if get_image and num_images > 0:
        # Show the last processed image. The label is taken from the one-element tensor `y`
        # (indexing it with the dataset index `i` went out of range), and the original image
        # is moved to the CPU before the numpy conversion.
        # NOTE(review): get_example_image is not defined in the visible part of this
        # notebook — confirm it is available before enabling get_image.
        get_example_image(x_adversarial, predicted_adversarial, attack_type=attack_type)
        get_example_image(x.unsqueeze(0).cpu().numpy(), true_label, attack_type='original')

    # Accuracy is undefined without images; avoid the division by zero.
    adversarial_accuracy = (correct_prediction / num_images) * 100 if num_images > 0 else float('nan')
    print(f'\nAdversarial accuracy: {adversarial_accuracy}%\n')

    return distance_list, runtime_list, adversarial_accuracy

## Parameters

In [None]:
# Attack norm: one of 1, 2, np.inf
norm = np.inf
# Upper bound on attack rounds per image
max_iterations = 500
# Step size per round, keyed by the string form of the norm
eps_iter_dict = dict(zip(('inf', '1', '2'), (0.0003, 0.2, 0.005)))
eps_iter = eps_iter_dict[f'{norm}']
# Total perturbation budget: the step size accumulated over all rounds
epsilon = eps_iter * max_iterations

## Load the dataset

In [None]:
# An int draws a random subset with that many test images; any non-int value (e.g. 'full') keeps the whole split.
splitsize = 20
xtest, ytest = load_dataset(splitsize)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar\cifar-10-python.tar.gz


100%|██████████| 170M/170M [03:26<00:00, 825kB/s]  


Extracting ./data/cifar\cifar-10-python.tar.gz to ./data/cifar


## Test Accuracy

In [None]:
# Clean accuracy of the loaded network on the selected test images.
test_accuracy(model=net, xtest=xtest, ytest=ytest)


Accuracy of the testset is: 90.000%



# Experiments

In [None]:
attack_types = [
                'fast_gradient_method',
                'projected_gradient_descent',
                'auto_projected_gradient_descent',
                'auto_conjugate_gradient',
                'newton_fool',
                'deep_fool',
                'elastic_net',
                'frame_saliency',
                'auto_attack',
                'carlini_wagner_linf',
                'carlini_wagner_l2',
                'hop_skip_jump'
                ]

# Per-attack results: adversarial distances, runtimes and adversarial accuracy.
results_dict = {}

for attack_type in attack_types:
    print(f'\t\t-------------------------- Processing Attack: {attack_type} --------------------------\n')
    try:
        distances, runtimes, adversarial_accuracy = distance_calculation(classifier=classifier,
                                                        xtest=xtest,
                                                        ytest=ytest,
                                                        epsilon=epsilon,
                                                        eps_iter=eps_iter,
                                                        norm=norm,
                                                        max_iterations=max_iterations,
                                                        attack_type=attack_type)
    except ValueError as error:
        # AdversarialAttacks.init_attacker raises ValueError for attack names it has no
        # branch for. Report and skip such entries instead of aborting the whole run.
        print(f'Skipping attack "{attack_type}": {error}\n')
        continue

    results_dict[attack_type] = {
        "adversarial_distance": distances,
        "runtime": runtimes,
        "adversarial_accuracy": adversarial_accuracy,
    }

    mean_value = np.mean([x for x in distances if x is not None])

    print(f'\nMean adversarial distance for {attack_type}: {mean_value:.5f} with total runtime: {sum(runtimes): .5f} seconds\n')

# Keep only the attacks that were actually evaluated, so that the following cells can
# index results_dict for every entry of attack_types.
attack_types = list(results_dict)

		-------------------------- Processing Attack: fast_gradient_method --------------------------

	Iterations for successful iterative attack: 1
Image 0		Adversarial_distance: 0.00210		Runtime: 0.523461 seconds
	Iterations for successful iterative attack: 2
Image 1		Adversarial_distance: 0.16620		Runtime: 16.282827 seconds
	Iterations for successful iterative attack: 2
Image 2		Adversarial_distance: 0.15780		Runtime: 14.328118 seconds
	Iterations for successful iterative attack: 1
Image 3		Adversarial_distance: 0.04740		Runtime: 4.404681 seconds
Misclassified input. Not attacking.

Misclassified!!! dist: 0.0

Image 4		Adversarial_distance: 0.00000		Runtime: 0.024187 seconds
	Iterations for successful iterative attack: 2
Image 5		Adversarial_distance: 0.15060		Runtime: 14.344605 seconds
	Iterations for successful iterative attack: 2
Image 6		Adversarial_distance: 0.16440		Runtime: 13.816709 seconds
	Iterations for successful iterative attack: 1
Image 7		Adversarial_distance: 0.06840		Run

## Save Results to JSON File

In [None]:
# Target file; its name encodes the evaluated model type and the attack norm.
json_file_path = f'./data/attack_comparison_{modeltype}_L{norm}.json'

with open(json_file_path, mode='w') as results_file:
    json.dump(results_dict, results_file, indent=4)

print(f'Evaluation results are saved under "{json_file_path}".')

# Plotting

## Distances

In [None]:
# Scatter plot of the adversarial distance of every image, one series per attack.
image_ids = list(range(len(xtest)))
fig, ax = plt.subplots(figsize=(16, 6))
for attack_name in attack_types:
    ax.scatter(image_ids, results_dict[attack_name]['adversarial_distance'], label=attack_name)
ax.set_xlabel('Image ID ----->')
ax.set_ylabel('Distance ----->')
ax.set_title(f'L{norm} Distance')
ax.legend()
fig.tight_layout()
ax.set_xticks(image_ids)
plt.show()

In [None]:
# Line plot of the adversarial distance of every image, one line per attack.
image_ids = list(range(len(xtest)))
fig, ax = plt.subplots(figsize=(15, 8))
for attack_name in attack_types:
    ax.plot(image_ids, results_dict[attack_name]['adversarial_distance'], label=attack_name)
ax.set_xlabel('Image ID')
ax.set_ylabel('Distance')
ax.set_title(f'L{norm} Distance')
ax.legend()
fig.tight_layout()
ax.set_xticks(image_ids)
plt.show()

## Runtime per image

In [None]:
# Line plot of the attack runtime of every image, one line per attack.
image_ids = list(range(len(xtest)))
fig, ax = plt.subplots(figsize=(15, 8))
for attack_name in attack_types:
    ax.plot(image_ids, results_dict[attack_name]['runtime'], label=attack_name)
ax.set_xlabel('Image ID')
ax.set_ylabel('Runtime [seconds]')
ax.set_title('Step Runtime')
ax.legend()
fig.tight_layout()
plt.show()

## Adversarial Accuracy

In [None]:
# Bar chart of the adversarial accuracy, one bar per attack.
fig, ax = plt.subplots(figsize=(15, 8))
for attack_name in attack_types:
    ax.bar(attack_name, results_dict[attack_name]['adversarial_accuracy'], label=attack_name)
ax.set_xlabel('Attacks')
ax.set_ylabel('Adversarial accuracy [%]')
plt.xticks(rotation=45)
ax.legend()
ax.set_title('Adversarial Accuracy')
fig.tight_layout()
plt.show()

## Total Runtime

In [None]:
# Bar chart of the summed runtime over all images, one bar per attack.
fig, ax = plt.subplots(figsize=(15, 8))
for attack_name in attack_types:
    total_runtime = sum(results_dict[attack_name]['runtime'])
    ax.bar(attack_name, total_runtime, label=attack_name)
ax.set_xlabel('Attacks')
plt.xticks(rotation=45)
ax.set_ylabel('Runtime [seconds]')
ax.set_title('Total Runtime')
ax.legend()
fig.tight_layout()
plt.show()