# Spatially transformed adversarial examples

In [1]:
import os
import ipdb
%matplotlib inline
os.environ["CUDA_VISIBLE_DEVICES"]="3"

## setup MNIST

In [2]:
# Directory the MNIST loader below reads the gzipped IDX files from.
MNIST_data = './MNIST-data/'

In [3]:
## setup_mnist.py -- mnist data and model loading code
##
## Copyright (C) 2016, Nicholas Carlini <nicholas@carlini.com>.
##
## This program is licenced under the BSD 2-Clause licence,
## contained in the LICENCE file in this directory.

import tensorflow as tf
import numpy as np
import os
import pickle
import gzip
import urllib

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras.models import load_model

def extract_data(filename, num_images):
    """
    Read `num_images` 28x28 single-channel images from a gzipped IDX file.

    The first 16 bytes of the stream are skipped, then one unsigned byte is
    read per pixel. Pixel values are rescaled from [0, 255] to [-0.5, 0.5].

    Returns a float32 array of shape (num_images, 28, 28, 1).
    """
    pixel_count = num_images*28*28
    with gzip.open(filename) as stream:
        stream.read(16)  # skip the header bytes that precede the pixel data
        raw = stream.read(pixel_count)
    pixels = np.frombuffer(raw, dtype=np.uint8).astype(np.float32)
    centered = (pixels / 255) - 0.5
    return centered.reshape(num_images, 28, 28, 1)

def extract_labels(filename, num_images):
    """
    Read `num_images` labels from a gzipped IDX label file as one-hot rows.

    The first 8 bytes of the stream are skipped, then one unsigned byte is
    read per label. Each label is expanded against the ten digit classes.

    Returns a float32 array of shape (num_images, 10).
    """
    with gzip.open(filename) as stream:
        stream.read(8)  # skip the header bytes that precede the labels
        raw = stream.read(1 * num_images)
    digits = np.frombuffer(raw, dtype=np.uint8)
    one_hot = digits[:, None] == np.arange(10)
    return one_hot.astype(np.float32)

class MNIST:
    """
    MNIST dataset loaded from gzipped IDX files in the `MNIST_data` directory.

    Any of the four archives missing from that directory is downloaded first.
    The first 5000 training examples are held out as a validation split.

    Attributes:
        train_data / train_labels: training images and one-hot labels
            (everything after the first 5000 training examples).
        validation_data / validation_labels: the first 5000 training examples.
        test_data / test_labels: the 10000 test images and one-hot labels.
    """
    def __init__(self):
        # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
        try:
            from urllib.request import urlretrieve
        except ImportError:
            from urllib import urlretrieve

        if not os.path.exists(MNIST_data):
            os.makedirs(MNIST_data)

        files = ["train-images-idx3-ubyte.gz",
                 "t10k-images-idx3-ubyte.gz",
                 "train-labels-idx1-ubyte.gz",
                 "t10k-labels-idx1-ubyte.gz"]
        for name in files:
            # Download into the same directory the files are read from below
            # (the previous code wrote to a hard-coded "MNIST_data/" path), and
            # fetch individual files that are missing even if the directory exists.
            target = os.path.join(MNIST_data, name)
            if not os.path.exists(target):
                urlretrieve('http://yann.lecun.com/exdb/mnist/' + name, target)

        train_data = extract_data(os.path.join(MNIST_data, "train-images-idx3-ubyte.gz"), 60000)
        train_labels = extract_labels(os.path.join(MNIST_data, "train-labels-idx1-ubyte.gz"), 60000)
        self.test_data = extract_data(os.path.join(MNIST_data, "t10k-images-idx3-ubyte.gz"), 10000)
        self.test_labels = extract_labels(os.path.join(MNIST_data, "t10k-labels-idx1-ubyte.gz"), 10000)

        VALIDATION_SIZE = 5000

        self.validation_data = train_data[:VALIDATION_SIZE, :, :, :]
        self.validation_labels = train_labels[:VALIDATION_SIZE]
        self.train_data = train_data[VALIDATION_SIZE:, :, :, :]
        self.train_labels = train_labels[VALIDATION_SIZE:]


class MNISTModel:
    """
    Keras classifier for 28x28x1 inputs that outputs ten raw logits.

    The network is two blocks of two 3x3 convolutions followed by 2x2 max
    pooling (32 then 64 filters), then two 200-unit dense layers and a final
    10-unit dense layer with no softmax. Weights are loaded from `restore`.
    """
    def __init__(self, restore, session=None):
        # `session` is accepted but not used by this constructor.
        self.num_channels = 1
        self.image_size = 28
        self.num_labels = 10

        layer_stack = [
            # first convolutional block
            Conv2D(32, (3, 3), input_shape=(28, 28, 1)),
            Activation('relu'),
            Conv2D(32, (3, 3)),
            Activation('relu'),
            MaxPooling2D(pool_size=(2, 2)),
            # second convolutional block
            Conv2D(64, (3, 3)),
            Activation('relu'),
            Conv2D(64, (3, 3)),
            Activation('relu'),
            MaxPooling2D(pool_size=(2, 2)),
            # classifier head; deliberately ends without a softmax
            Flatten(),
            Dense(200),
            Activation('relu'),
            Dense(200),
            Activation('relu'),
            Dense(10),
        ]

        network = Sequential()
        for layer in layer_stack:
            network.add(layer)
        network.load_weights(restore)

        self.model = network

    def predict(self, data):
        """Apply the network to `data` and return its (pre-softmax) output."""
        return self.model(data)

Using TensorFlow backend.


## Train models

In [4]:
## train_models.py -- train the neural network models for attacking
##
## Copyright (C) 2016, Nicholas Carlini <nicholas@carlini.com>.
##
## This program is licenced under the BSD 2-Clause licence,
## contained in the LICENCE file in this directory.


import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import SGD

import tensorflow as tf
# from setup_mnist import MNIST
# from setup_cifar import CIFAR
import os

def train(data, file_name, params, num_epochs=50, batch_size=128, train_temp=1, init=None):
    """
    Standard neural network training procedure.

    data: object exposing train_data, train_labels, validation_data and
          validation_labels.
    file_name: where to save the trained model; nothing is saved when None.
    params: six sizes, in order: filter counts of the four convolution layers,
          then the widths of the two hidden dense layers.
    num_epochs: number of training epochs.
    batch_size: mini-batch size.
    train_temp: temperature; the logits are divided by it inside the loss.
    init: optional path of weights to load before training.

    Returns the trained Keras model (its last layer outputs logits, no softmax).
    """
    model = Sequential()

    print(data.train_data.shape)

    model.add(Conv2D(params[0], (3, 3),
                            input_shape=data.train_data.shape[1:]))
    model.add(Activation('relu'))
    model.add(Conv2D(params[1], (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(params[2], (3, 3)))
    model.add(Activation('relu'))
    model.add(Conv2D(params[3], (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())
    model.add(Dense(params[4]))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(params[5]))
    model.add(Activation('relu'))
    model.add(Dense(10))
    # no softmax: the loss below applies it to the temperature-scaled logits
    if init is not None:
        model.load_weights(init)

    def fn(correct, predicted):
        # softmax cross-entropy on logits scaled by the training temperature
        return tf.nn.softmax_cross_entropy_with_logits(labels=correct,
                                                       logits=predicted/train_temp)

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

    model.compile(loss=fn,
                  optimizer=sgd,
                  metrics=['accuracy'])

    model.fit(data.train_data, data.train_labels,
              batch_size=batch_size,
              validation_data=(data.validation_data, data.validation_labels),
              epochs=num_epochs,
              shuffle=True)

    if file_name is not None:
        model.save(file_name)

    return model

def train_distillation(data, file_name, params, num_epochs=50, batch_size=128, train_temp=1):
    """
    Train a network using defensive distillation.
    Distillation as a Defense to Adversarial Perturbations against Deep Neural Networks
    Nicolas Papernot, Patrick McDaniel, Xi Wu, Somesh Jha, Ananthram Swami
    IEEE S&P, 2016.

    Side effect: data.train_labels is overwritten with the teacher's soft labels.
    Nothing is returned; the student's predictions are printed.
    """
    init_path = file_name+"_init"
    teacher_path = file_name+"_teacher"

    # A one-epoch run provides the shared starting weights for teacher and student.
    if not os.path.exists(init_path):
        train(data, init_path, params, 1, batch_size)

    # Teacher: trained at the requested temperature from the shared start.
    teacher = train(data, teacher_path, params, num_epochs, batch_size, train_temp,
                    init=init_path)

    # Soft labels: teacher outputs passed through a softmax at temperature t.
    teacher_logits = teacher.predict(data.train_data)
    with tf.Session() as sess:
        soft_labels = sess.run(tf.nn.softmax(teacher_logits/train_temp))
        print(soft_labels)
        data.train_labels = soft_labels

    # Student: same architecture and start, fitted to the soft labels.
    student = train(data, file_name, params, num_epochs, batch_size, train_temp,
                    init=init_path)

    # Final predictions are taken at temperature 1 (raw model output).
    student_logits = student.predict(data.train_data)

    print(student_logits)
    
# Make sure the 'models' output directory exists; the (commented-out) training
# calls below save their models under it.
if not os.path.isdir('models'):
    os.makedirs('models')

# train(MNIST(), "models/mnist_baseline", [32, 32, 64, 64, 200, 200], num_epochs=50)

# train_distillation(MNIST(), "models/mnist-distilled-100", [32, 32, 64, 64, 200, 200],
#                    num_epochs=50, train_temp=100)


## Spatially transformed adversarial attack

In [5]:
## stAdv_attack.py -- attack a network optimizing for spatial distance
##
##
## This program is licenced under the BSD 2-Clause licence,
## contained in the LICENCE file in this directory.

import sys
import tensorflow as tf
import numpy as np

BINARY_SEARCH_STEPS = 9  # number of times to adjust the trade-off constant with binary search
MAX_ITERATIONS = 10000   # iteration budget: passed to L-BFGS-B as 'maxiter' and used as the inner-loop count
ABORT_EARLY = True       # if we stop improving, abort the inner optimisation loop early
LEARNING_RATE = 1e-2     # only relevant to the commented-out Adam optimizer; larger values converge faster to less accurate results
TARGETED = True          # should we target one specific class? or just be wrong?
CONFIDENCE = 0.5           # margin required in the adversarial loss: how strong the adversarial example should be
INITIAL_CONST = 0.05     # first guess for the constant c, which weights the contribution of the spatial-regularization (distortion) loss

class STAdv:
    def __init__(self, sess, model, batch_size=1, confidence = CONFIDENCE,
                 targeted = TARGETED, learning_rate = LEARNING_RATE,
                 binary_search_steps = BINARY_SEARCH_STEPS, max_iterations = MAX_ITERATIONS,
                 abort_early = ABORT_EARLY, 
                 initial_const = INITIAL_CONST,
                 boxmin = -0.5, boxmax = 0.5):
        """
        The Spatial transformed adversarial attack. 
        This attack is based on Spatially transformed adversarial examples paper.
        Builds the TensorFlow graph (variables, losses, optimizer) used by
        attack() / attack_batch() to produce adversarial examples for the model.
        
        sess: TensorFlow session in which the graph is run.
        model: object exposing image_size, num_channels, num_labels and a
              predict(tensor) method returning pre-softmax outputs.
        confidence: Confidence of adversarial examples: higher produces examples
              that are farther away, but more strongly classified as adversarial.
        batch_size: Number of attacks to run simultaneously. All variables and
              placeholders are created with this fixed leading dimension.
        targeted: True if we should perform a targetted attack, False otherwise.
        learning_rate: Stored on the instance; in the active code it is only
              referenced by the commented-out Adam optimizer.
        binary_search_steps: The number of times we perform binary search to
              find the optimal tradeoff-constant between distance and confidence. 
              When >= 10, self.repeat is set and the last step reuses the
              upper bound as the constant.
        max_iterations: The maximum number of iterations. Passed to L-BFGS-B as
              'maxiter' and used as the inner loop count in attack_batch.
        abort_early: If true, allows early aborts if the optimisation gets stuck.
        initial_const: The initial tradeoff-constant to use to tune the relative
              importance of distance and confidence. If binary_search_steps is large,
              the initial constant is not important.
        boxmin, boxmax: Not used by the active code; only the commented-out
              tanh re-parameterisation below refers to them.
        """

        image_size, num_channels, num_labels = model.image_size, model.num_channels, model.num_labels
        self.sess = sess
        self.TARGETED = targeted
        self.LEARNING_RATE = learning_rate
        self.MAX_ITERATIONS = max_iterations
        self.BINARY_SEARCH_STEPS = binary_search_steps
        self.ABORT_EARLY = abort_early
        self.CONFIDENCE = confidence
        self.initial_const = initial_const
        self.batch_size = batch_size

        # with many search steps, attack_batch re-runs the final step at the upper bound
        self.repeat = binary_search_steps >= 10

        shape = (batch_size,image_size,image_size,num_channels)
        
        # the variable we're going to optimize over (added to the input image)
        modifier = tf.Variable(np.zeros(shape,dtype=np.float32))
        # these are variables to be more efficient in sending data to tf
        self.timg = tf.Variable(np.zeros(shape), dtype=tf.float32)
        self.tlab = tf.Variable(np.zeros((batch_size,num_labels)), dtype=tf.float32)
        self.const = tf.Variable(np.zeros(batch_size), dtype=tf.float32)

        # and here's what we use to assign them
        self.assign_timg = tf.placeholder(tf.float32, shape)
        self.assign_tlab = tf.placeholder(tf.float32, (batch_size,num_labels))
        self.assign_const = tf.placeholder(tf.float32, [batch_size])
        
        # Disabled alternative: tanh re-parameterisation keeping the image between
        # boxmin and boxmax. The active code simply adds the modifier to the image.
#         self.boxmul = (boxmax - boxmin) / 2.
#         self.boxplus = (boxmin + boxmax) / 2.
#         self.newimg = tf.tanh(modifier + self.timg) * self.boxmul + self.boxplus
        self.newimg = modifier + self.timg
        
        # Create the grid for bilinear interpolation: the flat meshgrid is tiled
        # once per batch element and reshaped to (batch_size, 3, height*width).
        indices_grid = self._meshgrid(image_size, image_size)
        print("meshgrid shape", indices_grid.shape)
        indices_grid = tf.tile(indices_grid, tf.stack([batch_size]))
        print("batch_tile grid shape", indices_grid.shape)
        indices_grid = tf.reshape(indices_grid, (batch_size, 3, -1))
        print("batch+3:", indices_grid.shape)
        # the grid is used as-is (no transformation is applied to it here)
        transformed_grid = indices_grid
        # row 0 of the grid holds the x coordinates, row 1 the y coordinates
        x_s = tf.slice(transformed_grid, [0, 0, 0], [-1, 1, -1])
        print("grid x_s shape:", x_s.shape)
        y_s = tf.slice(transformed_grid, [0, 1, 0], [-1, 1, -1])

        x_s_flatten = tf.reshape(x_s, [-1])
        y_s_flatten = tf.reshape(y_s, [-1])
        print("x_flatten_shape:", x_s_flatten.shape)
        # TODO: apply biliear interpolation after adding the modifer?
        # bilinear interpolation of the modified image at the grid coordinates
        output_size = (image_size, image_size)
        self.newimg = self._interpolate(self.newimg,
                                 x_s_flatten,
                                 y_s_flatten,
                                 output_size)
        self.newimg = tf.reshape(self.newimg, shape)
        
        self.newimg = tf.cast(self.newimg, dtype='float32')
        print(self.newimg.shape)
        
        # prediction BEFORE-SOFTMAX of the model
        self.output = model.predict(self.newimg)
        
        # distance to the input data: total-variation-style loss of the
        # difference between the resampled image and the original image
        ## TODO : figure out the exact total variation loss used in the paper
        # the following version works too
        self.stdist = self.total_variation_based_loss(self.newimg- self.timg)
#         self.stdist = self.total_variation_based_loss(self.newimg)
        self.stdist = tf.cast(self.stdist,  dtype='float32')
    
        # compute the output for the label class versus the maximum other;
        # the label class is pushed down by 10000 so it cannot be the maximum
        real = tf.reduce_sum((self.tlab)*self.output, 1)
        other = tf.reduce_max((1.-self.tlab)*self.output - (self.tlab*10000.), 1)
        
        #TODO: improve on the value of loss1 as its 0 most of the time.
        if self.TARGETED:
            # if targetted, optimize for making the other class most likely
            loss1 = tf.maximum(0.0, other-real+self.CONFIDENCE)
        else:
            # if untargeted, optimize for making this class least likely.
            loss1 = tf.maximum(0.0, real-other+self.CONFIDENCE)
        
        # sum up the losses: loss1 is the adversarial term, loss2 the
        # distortion term weighted per example by self.const
        self.loss2 = tf.reduce_sum(self.const*self.stdist)
        self.loss1 = tf.reduce_sum(loss1)
        self.loss = self.loss1 + self.loss2
        
        # Setup the L-BFGS-B optimizer (via SciPy) and keep track of variables we're creating
        start_vars = set(x.name for x in tf.global_variables())
        self.optimizer = optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.loss,
                method='L-BFGS-B',  var_list = [modifier], var_to_bounds={modifier:([0,1])}, # TODO: bound the new image not just the modifier
                options={'maxiter': max_iterations}, 
                )
        # Disabled alternative: Adam on the same loss.
#         optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
#         self.train = optimizer.minimize(self.loss, var_list=[modifier])
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        # ops that load the current batch (image, label, constant) into the graph
        self.setup = []
        self.setup.append(self.timg.assign(self.assign_timg))
        self.setup.append(self.tlab.assign(self.assign_tlab))
        self.setup.append(self.const.assign(self.assign_const))
        
        # re-initialises the modifier and any variables created by the optimizer
        self.init = tf.variables_initializer(var_list=[modifier]+new_vars)
    
    def total_variation_based_loss(self, images):
        """
          Sum of squared differences between neighbouring pixels.

          The input is shifted by one pixel along the height axis and along the
          width axis (by slicing); the two difference tensors are squared and
          summed. The structure follows `tf.image.total_variation`, except that
          squared differences are summed instead of absolute differences.

          Args:
            images: 4-D Tensor of shape `[batch, height, width, channels]` or
                    3-D Tensor of shape `[height, width, channels]`.
          Raises:
            ValueError: if images is neither 3-D nor 4-D.
          Returns:
            If `images` was 4-D, a 1-D Tensor of shape `[batch]` holding one
            value per image. If `images` was 3-D, a scalar for that image.
          """
        rank = images.get_shape().ndims
        if rank not in (3, 4):
            raise ValueError("'images' must be either 3 or 4-dimensional.")

        if rank == 4:
            # Batch of images: difference along height (axis 1) and width (axis 2),
            # then reduce over everything except the batch axis.
            height_diff = images[:, 1:, :, :] - images[:, :-1, :, :]
            width_diff = images[:, :, 1:, :] - images[:, :, :-1, :]
            reduce_axes = [1, 2, 3]
        else:
            # Single image: difference along height (axis 0) and width (axis 1),
            # then reduce over every axis (None means all axes).
            height_diff = images[1:, :, :] - images[:-1, :, :]
            width_diff = images[:, 1:, :] - images[:, :-1, :]
            reduce_axes = None

        height_term = tf.reduce_sum(tf.square(height_diff), axis=reduce_axes)
        width_term = tf.reduce_sum(tf.square(width_diff), axis=reduce_axes)
        return tf.add(height_term, width_term)


    # Sampling grid: returns a flat 1-D tensor of length 3*height*width (all x coordinates, then all y coordinates, then ones).
    def _meshgrid(self, height, width):
        """
        Build a flattened sampling grid with coordinates in [-1, 1].

        Returns a 1-D tensor: the flattened x coordinates, followed by the
        flattened y coordinates, followed by a vector of ones of equal length.
        """
        grid_x, grid_y = tf.meshgrid(tf.linspace(-1., 1., width),
                                     tf.linspace(-1., 1., height))
        flat_x = tf.reshape(grid_x, [-1])
        flat_y = tf.reshape(grid_y, [-1])
        homogeneous = tf.ones_like(flat_x)
        return tf.concat([flat_x, flat_y, homogeneous], 0)

    def _repeat(self, x, num_repeats):
        """
        Repeat every element of `x` `num_repeats` times, consecutively.

        Implemented as a (len, 1) x (1, num_repeats) matrix product with a row
        of int32 ones, flattened back to 1-D.
        """
        row_of_ones = tf.ones((1, num_repeats), dtype='int32')
        as_column = tf.reshape(x, shape=(-1,1))
        tiled = tf.matmul(as_column, row_of_ones)
        return tf.reshape(tiled, [-1])

    # bilinear interpolation
    def _interpolate(self, image, x, y, output_size):
        """
        Bilinearly sample `image` at the given coordinates.

        image: 4-D tensor indexed as [batch, height, width, channels].
        x, y: flat coordinate tensors in the [-1, 1] convention produced by
              _meshgrid, one entry per output pixel of every batch element.
        output_size: (output_height, output_width).

        Returns a 2-D tensor with one row of channel values per sampled point.
        """
        dims = tf.shape(image)
        n_images = dims[0]
        n_rows = dims[1]
        n_cols = dims[2]
        n_channels = dims[3]

        out_rows, out_cols = output_size[0], output_size[1]

        # Rescale the [-1, 1] coordinates to the pixel coordinate range of the image.
        col_pos = .5*(tf.cast(x , dtype='float32') + 1.0)*(tf.cast(n_cols, dtype='float32'))
        row_pos = .5*(tf.cast(y , dtype='float32') + 1.0)*(tf.cast(n_rows, dtype='float32'))

        # Limits used to keep the integer corner coordinates inside the image.
        lowest = tf.zeros([], dtype='int32')
        last_row = tf.cast(n_rows - 1, dtype='int32')
        last_col = tf.cast(n_cols - 1,  dtype='int32')

        # Lower corner is the floor of the position; upper corner is one above it.
        # The upper corners are derived from the unclipped lower corners, then
        # all four are clipped.
        col_lo = tf.cast(tf.floor(col_pos), 'int32')
        row_lo = tf.cast(tf.floor(row_pos), 'int32')
        col_hi = tf.clip_by_value(col_lo + 1, lowest, last_col)
        row_hi = tf.clip_by_value(row_lo + 1, lowest, last_row)
        col_lo = tf.clip_by_value(col_lo, lowest, last_col)
        row_lo = tf.clip_by_value(row_lo, lowest, last_row)

        # Offset of each batch element inside the flattened image, repeated once
        # per output pixel.
        image_offsets = tf.range(n_images)*(n_cols*n_rows)
        base = self._repeat(image_offsets, out_rows*out_cols)

        print("base shape:", base.shape)
        lo_row_start = base + row_lo*n_cols
        print("base_y0_shape", lo_row_start.shape)
        hi_row_start = base + row_hi*n_cols
        idx_a = lo_row_start + col_lo
        print("indices_shape", idx_a.shape)
        idx_b = hi_row_start + col_lo
        idx_c = lo_row_start + col_hi
        idx_d = hi_row_start + col_hi

        # Look up the four corner pixels in the flattened image.
        pixels = tf.cast(tf.reshape(image, shape=(-1, n_channels)), dtype='float32')
        corner_values = [tf.gather(pixels, idx) for idx in (idx_a, idx_b, idx_c, idx_d)]

        col_lo_f = tf.cast(col_lo, 'float32')
        col_hi_f = tf.cast(col_hi, 'float32')
        row_lo_f = tf.cast(row_lo, 'float32')
        row_hi_f = tf.cast(row_hi, 'float32')

        # Bilinear weights: each corner is weighted by the area of the rectangle
        # spanned by the sample position and the opposite corner.
        corner_weights = [
            tf.expand_dims(((col_hi_f - col_pos) * (row_hi_f - row_pos)), 1),
            tf.expand_dims(((col_hi_f - col_pos) * (row_pos - row_lo_f)), 1),
            tf.expand_dims(((col_pos - col_lo_f) * (row_hi_f - row_pos)), 1),
            tf.expand_dims(((col_pos - col_lo_f) * (row_pos - row_lo_f)), 1),
        ]
        return tf.add_n([weight*value
                         for weight, value in zip(corner_weights, corner_values)])

    def attack(self, imgs, targets):
        """
        Perform the stAdv attack on the given images for the given targets.
        If self.TARGETED is true, then the targets represents the target labels.
        If self.TARGETED is false, then targets are the original class labels.

        The inputs are processed in chunks of self.batch_size. The graph is
        built for exactly batch_size examples, so a final partial chunk is
        padded by repeating its last example and the padded results are
        dropped again.

        Returns a numpy array with one adversarial image per input image.
        """
        imgs = np.asarray(imgs)
        targets = np.asarray(targets)

        r = []
        print('go up to', len(imgs))
        for i in range(0,len(imgs),self.batch_size):
            print('attack for image', i)
            batch_imgs = imgs[i:i+self.batch_size]
            batch_targets = targets[i:i+self.batch_size]

            # Pad a short final chunk up to the fixed batch size of the graph.
            valid = len(batch_imgs)
            missing = self.batch_size - valid
            if missing > 0:
                batch_imgs = np.concatenate(
                    [batch_imgs, np.repeat(batch_imgs[-1:], missing, axis=0)])
                batch_targets = np.concatenate(
                    [batch_targets, np.repeat(batch_targets[-1:], missing, axis=0)])

            r.extend(self.attack_batch(batch_imgs, batch_targets)[:valid])
        return np.array(r)

    def attack_batch(self, imgs, labs):
        """
        Run the attack on a batch of images and labels.
        """
        def compare(x,y):
            if not isinstance(x, (float, int, np.int64)):
                x = np.copy(x)
                if self.TARGETED:
                    x[y] -= self.CONFIDENCE
                else:
                    x[y] += self.CONFIDENCE
                x = np.argmax(x)
            if self.TARGETED:
                return x == y
            else:
                return x != y

        batch_size = self.batch_size

        # convert to tanh-space
#         imgs = np.arctanh((imgs - self.boxplus) / self.boxmul * 0.999999)

        # set the lower and upper bounds accordingly
        lower_bound = np.zeros(batch_size)
        CONST = np.ones(batch_size)*self.initial_const
        upper_bound = np.ones(batch_size)*1e10

        # the best stAdv, score, and image attack
        o_best_stAdv = [1e10]*batch_size
        o_bestscore = [-1]*batch_size
        # best image for the attack
        o_bestattack = [np.zeros(imgs[0].shape)]*batch_size
        
        # binary seach for the correct c and also the best example.
        for outer_step in range(self.BINARY_SEARCH_STEPS):
            print("outer step:", outer_step)
#             print("o_best_attack_img:", o_bestattack)
            
            # completely reset adam's internal state.
            self.sess.run(self.init)
            batch = imgs[:batch_size]
            batchlab = labs[:batch_size]
            beststAdv = [1e10]*batch_size
            bestscore = [-1]*batch_size

            # The last iteration (if we run many steps) repeat the search once.
            if self.repeat == True and outer_step == self.BINARY_SEARCH_STEPS-1:
                CONST = upper_bound

            # set the variables so that we don't have to send them over again
            self.sess.run(self.setup, {self.assign_timg: batch,
                                       self.assign_tlab: batchlab,
                                       self.assign_const: CONST})
            
            prev = 1e6
            
            for iteration in range(self.MAX_ITERATIONS):
                # perform the attack 
                self.optimizer.minimize(self.sess)
                l, stAdvs, scores, nimg = self.sess.run([
#                                                        self.train,
                                                         self.loss, 
                                                         self.stdist,
                                                         self.output, 
                                                         self.newimg])

                # print out the losses every 10%
                if iteration%(self.MAX_ITERATIONS//10) == 0:
                    print("iter:", iteration, "total_loss, L_adv, L_flow:", self.sess.run((self.loss,self.loss1,self.loss2)))

                # check if we should abort search if we're getting nowhere.
                if self.ABORT_EARLY and iteration%(self.MAX_ITERATIONS//10) == 0:
                    if l > prev*.9999:
                        break
                    prev = l

                # adjust the best result found so far
                for e,(stAdv,sc,ii) in enumerate(zip(stAdvs,scores,nimg)):
                    if stAdv < beststAdv[e] and compare(sc, np.argmax(batchlab[e])):
                        beststAdv[e] = stAdv
                        bestscore[e] = np.argmax(sc)
                    if stAdv < o_best_stAdv[e] and compare(sc, np.argmax(batchlab[e])):
                        o_best_stAdv[e] = stAdv
                        o_bestscore[e] = np.argmax(sc)
                        o_bestattack[e] = ii

            # adjust the constant as needed through binary search
            for e in range(batch_size):
                if compare(bestscore[e], np.argmax(batchlab[e])) and bestscore[e] != -1:
                    # success, divide const by two
                    upper_bound[e] = min(upper_bound[e],CONST[e])
                    if upper_bound[e] < 1e9:
                        CONST[e] = (lower_bound[e] + upper_bound[e])/2
                else:
                    # failure, either multiply by 10 if no solution found yet
                    #          or do binary search with the known upper bound
                    lower_bound[e] = max(lower_bound[e],CONST[e])
                    if upper_bound[e] < 1e9:
                        CONST[e] = (lower_bound[e] + upper_bound[e])/2
                    else:
                        CONST[e] *= 10

        # return the best solution found
        o_best_stAdv = np.array(o_best_stAdv)
        return o_bestattack

## Run STAdv attack

In [6]:
## test_attack.py -- sample code to test attack procedure
##
## Copyright (C) 2016, Nicholas Carlini <nicholas@carlini.com>.
##
## This program is licenced under the BSD 2-Clause licence,
## contained in the LICENCE file in this directory.

import tensorflow as tf
import numpy as np
import time
import matplotlib.pyplot as plt

def show(img):
    """
    Print an MNIST digit to the console as 28 rows of ASCII characters.

    Pixel values are shifted by 0.5 and scaled by 3, then rounded to pick a
    glyph. Inputs that do not flatten to 784 values are silently ignored.
    """
    glyphs = "  .*#"+"#"*100
    levels = (img.flatten()+.5)*3
    if len(levels) != 784:
        return
    print("START")
    for row_start in range(0, 784, 28):
        row = levels[row_start:row_start+28]
        print("".join([glyphs[int(round(level))] for level in row]))

def plot(img, image_size, title):
    """
    Display `img` as a square grayscale picture of side `image_size`, titled
    `title`, with the axes hidden.
    """
    plt.figure(figsize=(7,7))
    square = img.reshape(image_size, image_size)
    plt.imshow(square, cmap='gray', interpolation='none')
    plt.title(title, fontsize=10)
    plt.axis('off')
    plt.show()

def generate_data(data, samples, targeted=True, start=0, inception=False):
    """
    Generate the input data to the attack algorithm.

    data: object exposing test_data and one-hot test_labels
    samples: number of test images to use
    targeted: if true, construct targeted attacks, otherwise untargeted attacks
    start: offset into data to use
    inception: if targeted and inception, randomly sample 10 target classes
               from 1..1000 instead of using every other class

    In targeted mode each image is repeated once per target class (every class
    except its true one, unless inception is set) and paired with a one-hot
    target. In untargeted mode each image is paired with its own label.

    Returns (inputs, targets) as numpy arrays.
    """
    # Imported locally so the function does not depend on `random` having been
    # imported by some other notebook cell before it is called.
    import random

    inputs = []
    targets = []
    # Build the one-hot lookup once instead of once per generated target.
    one_hot = np.eye(data.test_labels.shape[1]) if targeted else None

    # generating data from test_data inputs and then the targets
    for i in range(samples):
        image = data.test_data[start+i]
        label = data.test_labels[start+i]

        if not targeted:
            inputs.append(image)
            targets.append(label)
            continue

        if inception:
            seq = random.sample(range(1,1001), 10)
        else:
            # every class except the true one
            true_class = np.argmax(label)
            seq = [j for j in range(data.test_labels.shape[1]) if j != true_class]

        for j in seq:
            inputs.append(image)
            targets.append(one_hot[j])

    inputs = np.array(inputs)
    targets = np.array(targets)

    return inputs, targets

In [12]:
import pdb
import random 
# Pick a random test image to attack. The MNIST test set has 10000 images
# (indices 0..9999); randint(0, 10000) is inclusive at both ends and could
# return the out-of-range index 10000, so use randrange instead.
start = random.randrange(10000)
with tf.Session() as sess:
    data, model =  MNIST(), MNISTModel("models/mnist_baseline", sess)
    attack = STAdv(sess, model, batch_size=9, max_iterations=1000, confidence=0)

    # One source image targeted at each of the 9 other classes gives exactly
    # one batch of 9 attack inputs.
    inputs, targets = generate_data(data, samples=1, targeted=True,
                                    start=start, inception=False)
    timestart = time.time()
    adv = attack.attack(inputs, targets)
    timeend = time.time()

    print("Took",timeend-timestart,"seconds to run",len(inputs),"samples.")

    image_size = 28
    print("adv shape:", adv.shape)
    np.save("adv_examples", adv)
    num_pixels = model.image_size**2
    for i, (clean, adversarial) in enumerate(zip(inputs, adv)):
        print("adv[i]", adversarial)
        print("Valid:")
        show(clean)
#         plot(inputs[i], image_size, "valid")
        print("Adversarial:")
        show(adversarial)

#         plot(inputs[i],image_size, "adv")
        # keep the batch dimension when classifying a single adversarial image
        prediction = model.model.predict(adv[i:i+1])
        print("Classification:", prediction)

        print("classification:", np.argmax(prediction))
        # distortion: square root of the variation loss of the perturbation,
        # divided by the number of pixels
        print("Total distortion:", sess.run(STAdv.total_variation_based_loss(attack, tf.convert_to_tensor(adversarial-clean))) **.5/num_pixels)

('meshgrid shape', TensorShape([Dimension(2352)]))
('batch_tile grid shape', TensorShape([Dimension(21168)]))
('batch+3:', TensorShape([Dimension(9), Dimension(3), Dimension(784)]))
('grid x_s shape:', TensorShape([Dimension(9), Dimension(1), Dimension(784)]))
('x_flatten_shape:', TensorShape([Dimension(7056)]))
('base shape:', TensorShape([Dimension(None)]))
('base_y0_shape', TensorShape([Dimension(7056)]))
('indices_shape', TensorShape([Dimension(7056)]))
(9, 28, 28, 1)
('go up to', 9)
('attack for image', 0)
('outer step:', 0)
('iter:', 0, 'total_loss, L_adv, L_flow:', (10.244497, 1.4305115e-06, 10.244495))
('iter:', 100, 'total_loss, L_adv, L_flow:', (10.24416, 0.0, 10.24416))
('outer step:', 1)
('iter:', 0, 'total_loss, L_adv, L_flow:', (9.324176, 0.0, 9.324176))
('iter:', 100, 'total_loss, L_adv, L_flow:', (9.324174, 0.0, 9.324174))
('outer step:', 2)
('iter:', 0, 'total_loss, L_adv, L_flow:', (5.351032, 5.9127808e-05, 5.3509727))
('iter:', 100, 'total_loss, L_adv, L_flow:', (5.3

# Generate spatially transformed adversarial examples

In [64]:
import pdb
import random 
def generate_adv_train_data(data, samples, targeted=True, start=0, inception=False):
    """
    Generate the input data to the attack algorithm.

    data: dataset object exposing `test_data` (images) and `test_labels`
          (one row per image; the argmax of a row is taken as the true class)
    samples: number of consecutive test images to use
    targeted: if true, emit one (image, target) pair per candidate target
              class; otherwise emit each image once with its own label as
              the target
    start: offset into data.test_data / data.test_labels of the first image
    inception: if targeted and inception, use 10 target indices sampled at
               random (without replacement) from 1..1000 instead of every
               class; in this mode the true class is not excluded

    Returns a tuple (inputs, targets, labels) of numpy arrays with one row
    per generated pair: the image, the one-hot target (or the image's own
    label when untargeted) and the image's true label.
    """
    inputs = []
    targets = []
    labels = []

    if targeted:
        num_classes = data.test_labels.shape[1]
        # Built once instead of once per generated pair; row j is the
        # one-hot encoding of class j.
        one_hot = np.eye(num_classes)

    for i in range(samples):
        image = data.test_data[start + i]
        label = data.test_labels[start + i]

        if not targeted:
            inputs.append(image)
            targets.append(label)
            labels.append(label)
            continue

        if inception:
            candidates = random.sample(range(1, 1001), 10)
        else:
            # Every class except the true one is a valid attack target.
            true_class = np.argmax(label)
            candidates = [j for j in range(num_classes) if j != true_class]

        for j in candidates:
            inputs.append(image)
            targets.append(one_hot[j])
            labels.append(label)

    inputs = np.array(inputs)
    targets = np.array(targets)
    labels = np.array(labels)

    return inputs, targets, labels

# Attack the first NUM_SAMPLES MNIST test images (targeted, every wrong class)
# and collect the adversarial images together with their target and true
# labels.  Results are written to adv_inputs.npy / adv_targets.npy /
# true_labels.npy.
NUM_SAMPLES = 50

adv_inputs = np.array([])
adv_targets = np.array([])
true_labels = np.array([])

# Per-sample results are gathered in lists and concatenated once at the end
# instead of re-copying the growing arrays on every iteration.
adv_batches = []
target_batches = []
label_batches = []

with tf.Session() as sess:
    # The dataset and the classifier do not depend on the sample index, so
    # they are loaded once rather than on every loop iteration.
    data, model =  MNIST(), MNISTModel("models/mnist_baseline", sess)
    image_size = 28

    try:
        for i in range(NUM_SAMPLES):
            # NOTE(review): a fresh STAdv is still built for every sample,
            # which presumably adds its ops to the session's graph each time.
            # If one STAdv instance can safely serve repeated attack() calls,
            # construct it once before the loop -- confirm against STAdv.
            attack = STAdv(sess, model, batch_size=9, max_iterations=1000, confidence=0)

            # One sample with 10 classes yields 9 (image, target) pairs,
            # i.e. exactly one batch of size 9.
            inputs, targets, labels = generate_adv_train_data(data, samples=1, targeted=True,
                                            start=i, inception=False)
            timestart = time.time()
            adv = attack.attack(inputs, targets)
            timeend = time.time()

            adv_batches.append(adv)
            target_batches.append(targets)
            label_batches.append(labels)

            print("Took",timeend-timestart,"seconds to run",len(inputs),"samples.")
    finally:
        # Persist whatever has been produced so far, so that an interrupted
        # run (e.g. KeyboardInterrupt) does not throw away finished samples.
        if adv_batches:
            adv_inputs = np.concatenate(adv_batches)
            adv_targets = np.concatenate(target_batches)
            true_labels = np.concatenate(label_batches)

            print("adv shape:", adv_inputs.shape)
            print("adv targets:", adv_targets.shape)
            print("adv labels:", true_labels.shape)

            np.save("adv_inputs", adv_inputs)
            np.save("adv_targets", adv_targets)
            np.save("true_labels", true_labels)

('meshgrid shape', TensorShape([Dimension(2352)]))
('batch_tile grid shape', TensorShape([Dimension(21168)]))
('batch+3:', TensorShape([Dimension(9), Dimension(3), Dimension(784)]))
('grid x_s shape:', TensorShape([Dimension(9), Dimension(1), Dimension(784)]))
('x_flatten_shape:', TensorShape([Dimension(7056)]))
('base shape:', TensorShape([Dimension(None)]))
('base_y0_shape', TensorShape([Dimension(7056)]))
('indices_shape', TensorShape([Dimension(7056)]))
(9, 28, 28, 1)
('go up to', 9)
('attack for image', 0)
('outer step:', 0)
('iter:', 0, 'total_loss, L_adv, L_flow:', (9.395704, 0.0006556511, 9.395048))
('iter:', 100, 'total_loss, L_adv, L_flow:', (9.394525, 0.0, 9.394525))
('iter:', 200, 'total_loss, L_adv, L_flow:', (9.394525, 0.0, 9.394525))
('outer step:', 1)
('iter:', 0, 'total_loss, L_adv, L_flow:', (11.16351, 0.0, 11.16351))
('iter:', 100, 'total_loss, L_adv, L_flow:', (5.534014, 0.0, 5.534014))
('iter:', 200, 'total_loss, L_adv, L_flow:', (5.534014, 0.0, 5.534014))
('outer 

KeyboardInterrupt: 

In [None]:
import pdb
import random 
def generate_adv_train_data(data, samples, targeted=True, start=0, inception=False):
    """
    Generate the input data to the attack algorithm.

    data: dataset object exposing `test_data` (images) and `test_labels`
          (one row per image; the argmax of a row is taken as the true class)
    samples: number of consecutive test images to use
    targeted: if true, emit one (image, target) pair per candidate target
              class; otherwise emit each image once with its own label as
              the target
    start: offset into data.test_data / data.test_labels of the first image
    inception: if targeted and inception, use 10 target indices sampled at
               random (without replacement) from 1..1000 instead of every
               class; in this mode the true class is not excluded

    Returns a tuple (inputs, targets, labels) of numpy arrays with one row
    per generated pair: the image, the one-hot target (or the image's own
    label when untargeted) and the image's true label.
    """
    inputs = []
    targets = []
    labels = []

    if targeted:
        num_classes = data.test_labels.shape[1]
        # Built once instead of once per generated pair; row j is the
        # one-hot encoding of class j.
        one_hot = np.eye(num_classes)

    for i in range(samples):
        image = data.test_data[start + i]
        label = data.test_labels[start + i]

        if not targeted:
            inputs.append(image)
            targets.append(label)
            labels.append(label)
            continue

        if inception:
            candidates = random.sample(range(1, 1001), 10)
        else:
            # Every class except the true one is a valid attack target.
            true_class = np.argmax(label)
            candidates = [j for j in range(num_classes) if j != true_class]

        for j in candidates:
            inputs.append(image)
            targets.append(one_hot[j])
            labels.append(label)

    inputs = np.array(inputs)
    targets = np.array(targets)
    labels = np.array(labels)

    return inputs, targets, labels

# Resume the targeted attack from a previous partial run: load the arrays saved
# earlier, attack MNIST test images RESUME_FROM..NUM_SAMPLES-1 and write the
# extended arrays back to adv_inputs.npy / adv_targets.npy / true_labels.npy.
RESUME_FROM = 21
NUM_SAMPLES = 50

adv_inputs = np.load('./adv_inputs.npy')
adv_targets = np.load('./adv_targets.npy')
true_labels = np.load('./true_labels.npy')

# Per-sample results are gathered in lists and concatenated once at the end
# instead of re-copying the growing arrays on every iteration.  Previously
# saved results (if any) form the first chunk.
if adv_inputs.shape[0] == 0:
    adv_batches, target_batches, label_batches = [], [], []
else:
    adv_batches = [adv_inputs]
    target_batches = [adv_targets]
    label_batches = [true_labels]

with tf.Session() as sess:
    # The dataset and the classifier do not depend on the sample index, so
    # they are loaded once rather than on every loop iteration.
    data, model =  MNIST(), MNISTModel("models/mnist_baseline", sess)
    image_size = 28

    try:
        for i in range(RESUME_FROM, NUM_SAMPLES):
            # NOTE(review): a fresh STAdv is still built for every sample,
            # which presumably adds its ops to the session's graph each time.
            # If one STAdv instance can safely serve repeated attack() calls,
            # construct it once before the loop -- confirm against STAdv.
            attack = STAdv(sess, model, batch_size=9, max_iterations=1000, confidence=0)

            # One sample with 10 classes yields 9 (image, target) pairs,
            # i.e. exactly one batch of size 9.
            inputs, targets, labels = generate_adv_train_data(data, samples=1, targeted=True,
                                            start=i, inception=False)
            timestart = time.time()
            adv = attack.attack(inputs, targets)
            timeend = time.time()

            adv_batches.append(adv)
            target_batches.append(targets)
            label_batches.append(labels)

            print("Took",timeend-timestart,"seconds to run",len(inputs),"samples.")
    finally:
        # Persist whatever has been produced so far, so that an interrupted
        # run (e.g. KeyboardInterrupt) does not throw away finished samples.
        if adv_batches:
            adv_inputs = np.concatenate(adv_batches)
            adv_targets = np.concatenate(target_batches)
            true_labels = np.concatenate(label_batches)

            print("adv shape:", adv_inputs.shape)
            print("adv targets:", adv_targets.shape)
            print("adv labels:", true_labels.shape)

            np.save("adv_inputs", adv_inputs)
            np.save("adv_targets", adv_targets)
            np.save("true_labels", true_labels)

('meshgrid shape', TensorShape([Dimension(2352)]))
('batch_tile grid shape', TensorShape([Dimension(21168)]))
('batch+3:', TensorShape([Dimension(9), Dimension(3), Dimension(784)]))
('grid x_s shape:', TensorShape([Dimension(9), Dimension(1), Dimension(784)]))
('x_flatten_shape:', TensorShape([Dimension(7056)]))
('base shape:', TensorShape([Dimension(None)]))
('base_y0_shape', TensorShape([Dimension(7056)]))
('indices_shape', TensorShape([Dimension(7056)]))
(9, 28, 28, 1)
('go up to', 9)
('attack for image', 0)
('outer step:', 0)
('iter:', 0, 'total_loss, L_adv, L_flow:', (1.2053373, 0.14506906, 1.0602683))
