In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import robustml
from obfuscated_gradients.thermometer.robustml_model import Thermometer

from obfuscated_gradients.thermometer.robustml_attack import Attack as Attack_Orig
from src.robustml_attack_ce import Attack as Attack_CE
from src.robustml_apgd import Attack as Attack_Apgd

In [None]:
# Create the TensorFlow (v1-compat) session shared by the model and every attack
sess = tf.compat.v1.Session()

# Initialize the Thermometer model inside that session
model = Thermometer(sess)

# Initialize a data provider for CIFAR-10 images.
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike (a backslash-separated path only works on Windows).
cifar_path = './cifar10_data/test_batch'
provider = robustml.provider.CIFAR10(cifar_path)

# Change these to run the evaluation on a different set of test images
eval_start_idx = 0
eval_end_idx = 100

# Initialize the attacks. Each one receives the session, the underlying network
# object and the epsilon taken from the model's threat model.
attack_orig = Attack_Orig(sess, model.model, epsilon=model.threat_model.epsilon)
attack_ce = Attack_CE(sess, model.model, epsilon=model.threat_model.epsilon)
# NOTE(review): this attack is handed `model._model` while the two above use
# `model.model` — presumably the same object; confirm against Thermometer.
attack_apgd = Attack_Apgd(sess, model._model, epsilon=model.threat_model.epsilon)

# Note: This will generate a few TensorFlow deprecation warnings, but everything should still work

Original Attack

In [None]:
# Announce which attack is about to be evaluated
print('Evaluating the Original Attack')

# Run the original attack against the model over the configured index range
success_rate = robustml.evaluate.evaluate(
    model, attack_orig, provider,
    start=eval_start_idx, end=eval_end_idx,
    deterministic=True, debug=False,
)

# Report the success rate as a percentage together with the number of samples
print(
    'attack success rate: %.2f%% (over %d data points)'
    % (success_rate * 100, eval_end_idx - eval_start_idx)
)

Original Attack with Cross Entropy Loss

In [None]:
# Announce which attack is about to be evaluated
print('Evaluating the Original Attack modified to use Cross Entropy Loss')

# Run the cross-entropy variant against the model over the configured index range
success_rate = robustml.evaluate.evaluate(
    model, attack_ce, provider,
    start=eval_start_idx, end=eval_end_idx,
    deterministic=True, debug=False,
)

# Report the success rate as a percentage together with the number of samples
print(
    'attack success rate: %.2f%% (over %d data points)'
    % (success_rate * 100, eval_end_idx - eval_start_idx)
)

Auto-PGD Attack

In [None]:
# Announce which attack is about to be evaluated
print('Evaluating the Auto-PGD Attack')

# Run the Auto-PGD attack against the model over the configured index range
success_rate = robustml.evaluate.evaluate(
    model, attack_apgd, provider,
    start=eval_start_idx, end=eval_end_idx,
    deterministic=True, debug=False,
)

# Report the success rate as a percentage together with the number of samples
print(
    'attack success rate: %.2f%% (over %d data points)'
    % (success_rate * 100, eval_end_idx - eval_start_idx)
)

Sample Adversarial Images

In [None]:
def get_logits(x):
    """Return the model's softmax output for a single input image.

    Despite the name, the values returned are the softmax of the network's
    pre-softmax tensor, not the raw logits.

    Parameters
    ----------
    x : numpy.ndarray
        One input image. It is multiplied by 255 before being encoded, so it
        is presumably expected in the [0, 1] range — TODO confirm against the
        data provider.

    Returns
    -------
    The first element of the evaluated softmax output (the scores for the
    single image that was fed in).
    """
    # Build the softmax op once per pre-softmax tensor and reuse it afterwards.
    # Creating a new op on every call keeps adding nodes to the TensorFlow graph.
    pre_softmax = model._model.pre_softmax
    cached = getattr(get_logits, '_softmax_cache', None)
    if cached is None or cached[0] is not pre_softmax:
        cached = (pre_softmax, tf.nn.softmax(pre_softmax))
        get_logits._softmax_cache = cached
    softmax_op = cached[1]

    # first encode the (rescaled) input, then classify it
    encoded = model.encode(x * 255.0)
    return model._sess.run(softmax_op, {model._model.x_input: encoded})[0]
    

# Class names used as x-axis tick labels, listed in class-index order
cifar_labels = [
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]

def display_image_logits(img, logits, label):
    """Show an image next to a bar chart of the classifier output for it.

    Parameters
    ----------
    img : array-like
        Image passed straight to ``imshow`` for the left-hand panel.
    logits : sequence of float
        One score per class, drawn as bars in the right-hand panel. The y-axis
        is fixed to [0, 1.1], so the scores are expected to be probabilities
        (``get_logits`` in this notebook returns softmax outputs).
    label : int
        Index of the correct class; its bar is drawn in green.

    Notes
    -----
    The highest-scoring bar is coloured red first and the correct class green
    afterwards, so a correctly classified image shows a single green bar.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Display image with tick marks and tick labels removed on both axes
    ax1.tick_params(
        axis='both',
        which='both',
        bottom=False,
        top=False,
        left=False,
        right=False,
        labelbottom=False,
        labelleft=False)

    ax1.set_title("Input Image")
    ax1.imshow(img)

    # Display logits as bar chart, one bar per entry in cifar_labels
    class_positions = range(len(cifar_labels))
    barlist = ax2.bar(class_positions, logits)

    # Color the classified class red
    top_idx = np.argmax(logits)
    barlist[top_idx].set_color('r')

    # Color the correct class green (overrides red when the prediction is correct)
    barlist[label].set_color('g')

    ax2.set_title("Classifier Output Logits")
    ax2.set_xlabel("CIFAR-10 Classes")
    ax2.set_ylabel("Probability Logit")
    ax2.set_ylim(0.0, 1.1)
    ax2.set_xticks(class_positions)
    ax2.set_xticklabels(cifar_labels)
    ax2.tick_params(
        axis='x',
        which='both',
        labelrotation=90.0)

    # Leave room below the plot for the rotated class names
    fig.subplots_adjust(bottom=0.2)
    plt.show()

Retrieve Clean Image from Provider

In [None]:
# Index of the sample image to visualise.
# Change this to use a different sample image.
img_idx = 0

# Fetch the image and its label from the data provider
img, label = provider[img_idx]

Display the Clean Image along with the Model's Classification

In [None]:
# Plot the unmodified image beside the model's output for it; copies are passed
# so that `img` itself is never handed to the helpers
display_image_logits(
    img=np.copy(img),
    logits=get_logits(np.copy(img)),
    label=label,
)

Run the Original Attack on the Image

In [None]:
# Run the original attack on a copy of the clean image
adv_orig = attack_orig.run(np.copy(img), label, None)

# Plot the attacked image beside the model's output for it
display_image_logits(
    img=adv_orig,
    logits=get_logits(adv_orig),
    label=label,
)

Run the Original Attack modified to use Cross Entropy Loss on the Image

In [None]:
# Run the cross-entropy variant of the attack on a copy of the clean image
adv_ce = attack_ce.run(np.copy(img), label, None)

# Plot the attacked image beside the model's output for it
display_image_logits(
    img=adv_ce,
    logits=get_logits(adv_ce),
    label=label,
)

Run the Auto-PGD Attack on the Image

In [None]:
# Run the Auto-PGD attack on a copy of the clean image
adv_apgd = attack_apgd.run(np.copy(img), label, None)

# Plot the Auto-PGD result beside the model's output for that same image.
# The image and the scores must both come from `adv_apgd`; showing another
# attack's image here would pair it with scores that do not belong to it.
display_image_logits(adv_apgd, get_logits(adv_apgd), label)