This notebook tries to reproduce the results from the adversarial spheres paper: https://arxiv.org/pdf/1801.02774.pdf

The code structure follows the Keras guide at https://keras.io/examples/vision/mnist_convnet/.

### Import dependencies

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from art.estimators.classification import TensorFlowV2Classifier
from art.attacks.evasion import ProjectedGradientDescent

# Single seed reused by every cell that calls np.random.seed / tf.random.set_seed.
SEED = 42

# Keep this as the cell's final expression: Jupyter only displays the value of
# the last expression in a cell, so when this call sat above `SEED = 42` the
# device list was computed and then silently discarded.
tf.config.list_physical_devices(device_type=None)

### Data generation utils

In [2]:
def normalize_vectors(xs):
    """Rescale every vector in ``xs`` to unit Euclidean (L2) length.

    Args:
        xs: Array whose vectors are laid out along axis 1 (one vector per
            row for a 2-D input).

    Returns:
        Array of the same shape as ``xs`` with each vector divided by its
        L2 norm. Zero-norm vectors are not special-cased.
    """
    # keepdims=True keeps the norms as a column so the division broadcasts
    # across each row.
    row_norms = np.linalg.norm(xs, ord=2, axis=1, keepdims=True)
    return xs / row_norms

def sample_unit_sphere(num_samples, dim):
    """Draw ``num_samples`` random points on the unit sphere in ``dim`` dimensions.

    Standard-normal vectors are drawn from NumPy's global RNG and then
    rescaled to unit L2 norm with ``normalize_vectors``.

    Args:
        num_samples: Number of points to draw.
        dim: Dimensionality of each point.

    Returns:
        Array of shape ``(num_samples, dim)`` whose rows have unit L2 norm.
    """
    gaussian_points = np.random.normal(size=(num_samples, dim))
    return normalize_vectors(gaussian_points)

def generate_sphere_data(dim, rad, num_samples):
    """Build a shuffled two-class dataset of points on two concentric spheres.

    Half of the points lie on the unit sphere (label 0) and the other half on
    the sphere of radius ``rad`` (label 1). All randomness comes from NumPy's
    global RNG.

    Args:
        dim: Dimensionality of each point.
        rad: Radius of the sphere holding the label-1 points.
        num_samples: Total number of points; must be even.

    Returns:
        Tuple ``(xs, ys)`` where ``xs`` has shape ``(num_samples, dim)`` and
        ``ys`` holds the matching int64 labels with shape ``(num_samples,)``.

    Raises:
        AssertionError: If ``num_samples`` is odd.
    """
    assert num_samples % 2 == 0
    per_class = num_samples // 2

    # Draw the inner sphere first and the outer sphere second; keeping this
    # order fixed means a given seed always produces the same dataset.
    inner_points = sample_unit_sphere(per_class, dim)
    outer_points = rad * sample_unit_sphere(per_class, dim)
    points = np.vstack((inner_points, outer_points))

    # per_class zeros followed by per_class ones, aligned with `points`.
    labels = np.repeat(np.array([0, 1], dtype=np.int64), per_class)

    # Apply one shared permutation so points and labels stay paired.
    shuffle = np.random.permutation(num_samples)
    return points[shuffle], labels[shuffle]

# Quick sanity check on a tiny 2-D example: two points per class, outer radius 10.
# NOTE: NumPy's RNG is not seeded until a later cell, so the sampled values
# shown below change from run to run.
generate_sphere_data(dim=2, rad=10, num_samples=4)

(array([[-7.16985173,  6.97088417],
        [ 0.58914738,  0.80802559],
        [-0.91550599,  0.40230434],
        [ 1.4593279 ,  9.89294507]]),
 array([1, 0, 0, 1]))

### Construct data

In [3]:
%%time
# From https://arxiv.org/pdf/1801.02774.pdf
D = 500
R = 1.3

# CIFAR10 dataset size
NUM_TRAIN = 5 * 10 ** 4
NUM_TEST = 10 ** 4

np.random.seed(SEED)
tf.random.set_seed(SEED)

x_train, y_train = generate_sphere_data(
    dim=D,
    rad=R,
    num_samples=NUM_TRAIN,
)

x_test, y_test = generate_sphere_data(
    dim=D,
    rad=R,
    num_samples=NUM_TEST,
)

CPU times: user 655 ms, sys: 166 ms, total: 822 ms
Wall time: 841 ms


In [4]:
# Re-seed so weight initialisation is reproducible when this cell is re-run.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Fully connected classifier: two hidden ReLU layers of 1000 units each,
# followed by a 2-way softmax over the two sphere classes.
model = keras.Sequential([
    keras.Input(shape=x_train.shape[1:]),
    *(layers.Dense(1000, activation="relu") for _ in range(2)),
    layers.Dense(2, activation="softmax"),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              501000    
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 2002      
Total params: 1,504,002
Trainable params: 1,504,002
Non-trainable params: 0
_________________________________________________________________


In [5]:
%%time
np.random.seed(SEED)
tf.random.set_seed(SEED)

model.compile(
    loss="SparseCategoricalCrossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.fit(
    x_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_split=0.1
);

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 5min 17s, sys: 51.6 s, total: 6min 8s
Wall time: 28.1 s


<tensorflow.python.keras.callbacks.History at 0x7f559a8502b0>

In [6]:
# Natural (unperturbed) performance on the full training set.
# model.evaluate returns [loss, accuracy] for the metrics compiled above.
score = model.evaluate(x_train, y_train, verbose=0)
for metric_name, metric_value in zip(("Nat train loss:", "Nat train accuracy:"), score):
    print(metric_name, metric_value)

Nat train loss: 0.011119423434138298
Nat train accuracy: 0.9960799813270569


In [7]:
# Natural (unperturbed) performance on the held-out test set.
# model.evaluate returns [loss, accuracy] for the metrics compiled above.
score = model.evaluate(x_test, y_test, verbose=0)
for metric_name, metric_value in zip(("Nat test loss:", "Nat test accuracy:"), score):
    print(metric_name, metric_value)

Nat test loss: 0.026690993458032608
Nat test accuracy: 0.9905999898910522


In [8]:
# From https://github.com/Trusted-AI/adversarial-robustness-toolbox/blob/main/notebooks/adversarial_retraining.ipynb
# and https://github.com/Trusted-AI/adversarial-robustness-toolbox/issues/238
# Wrap the trained Keras model so ART can compute loss gradients through it.
# The loss object matches the one used for training (integer labels, softmax outputs).
art_model = TensorFlowV2Classifier(
    model=model,
    nb_classes=2,
    input_shape=x_test.shape[1:],
    loss_object=keras.losses.SparseCategoricalCrossentropy(),
    clip_values=[-R, R],
)

# L2 projected gradient descent: total perturbation budget eps=0.18,
# step size 0.01 per iteration, 40 iterations.
attack = ProjectedGradientDescent(
    art_model,
    norm=2,
    eps=0.18,
    eps_step=0.01,
    max_iter=40,
    batch_size=1024,
)

In [9]:
%%time
np.random.seed(SEED)
tf.random.set_seed(SEED)
x_train_adv = attack.generate(x_train[:NUM_TEST], y=y_train[:NUM_TEST])

score = model.evaluate(x_train_adv, y_train[:NUM_TEST], verbose=0)
print("Adv train loss:", score[0])
print("Adv train accuracy:", score[1])

PGD - Batches: 0it [00:00, ?it/s]

Adv train loss: 10.049074172973633
Adv train accuracy: 0.3400000035762787
CPU times: user 2min 44s, sys: 13.3 s, total: 2min 57s
Wall time: 12.4 s


In [10]:
%%time
np.random.seed(SEED)
tf.random.set_seed(SEED)
x_test_adv = attack.generate(x_test, y=y_test)

score = model.evaluate(x_test_adv, y_test, verbose=0)
print("Adv test loss:", score[0])
print("Adv test accuracy:", score[1])

PGD - Batches: 0it [00:00, ?it/s]

Adv test loss: 10.807324409484863
Adv test accuracy: 0.2883000075817108
CPU times: user 2min 45s, sys: 11.7 s, total: 2min 57s
Wall time: 12.4 s


### Conclusions

We apply a PGD attack with an L2 perturbation budget of 0.18 to inputs of norm 1.0 or 1.3.

However, in real life, images do not have norm 1.0 or 1.3.
If we assume pixel intensities in $[0, 1]$ with an average intensity of $0.5$,
real life images have a norm of approximately $0.5 \times \sqrt{D}$,
where $D$ is the number of pixels in the image.

Scaling the perturbation by the same factor, the equivalent L2 perturbation for real-life images is approximately
$0.18 \times 0.5 \sqrt{D} \approx 2$ for $D = 500$, the dimension used in this notebook.

Thus the real-life equivalent of the above experiment
is an L2 perturbation of norm 2 resulting in 28.8% adversarial test accuracy
(compared to 99% natural test accuracy).