# Non-targeted and Targeted Adversarial Examples

In [1]:
%matplotlib inline
import network.network as Network
import network.mnist_loader as mnist_loader
import pickle
import matplotlib.pyplot as plt
import numpy as np

We load up the MNIST data. The network we unpickle has one hidden layer of 30 units, 784 input units and 10 output units.

In [2]:
# Load the pre-trained network. The 'latin1' encoding is kept from the original
# cell (presumably the pickle was written by Python 2 — TODO confirm).
# NOTE: unpickling executes code stored in the file; only load pickles you trust.
with open('network/trained_network.pkl', 'rb') as f:
    net = pickle.load(f, encoding='latin1')

# Training, validation and test splits of MNIST
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

Call `predict` with any index between 0 and 9999 to pick an image from the test set. The output of the network is a vector of 10 activations; the index of the largest one is the network's prediction:

In [None]:
def predict(n):
    """Print the network's output and predicted digit for test sample n, then draw the image.

    n : integer
        index of the sample in the test set
    """
    # Get the data from the test set
    # NOTE(review): if test_data is a one-shot iterator, list() exhausts it — confirm
    samples = list(test_data)
    x = samples[n][0]

    # Run the forward pass once and reuse the result for both printouts
    output = net.feedforward(x)

    # Print the prediction of the network
    print('Network output: \n' + str(np.round(output, 2)) + '\n')
    print('Network prediction: ' + str(np.argmax(output)) + '\n')
    print('Actual image: ')
    
    # Draw the image
    plt.imshow(x.reshape((28,28)), cmap='Greys')

#to complete

The goal is to generate an adversarial example: we want the network to believe our image is a different digit, i.e. to output a one-hot vector with a 1 in the $ i $-th position. To do so, we look for an input $ \vec x $ that minimizes the cost function $ C $. We initialize $ \vec x $ randomly and apply gradient descent on $ C $. At each step we update $ \vec x $ rather than the parameters: only the input of the network changes, while the weights and biases are kept constant. The cost function $ C $ is defined as:
$$ C = \frac{1}{2} \|\vec y_{goal} - \hat y(\vec x)\|^2_2 $$

- Implement the `sigmoid` function and its element-wise derivative `sigmoid_prime`:

$$
\mathrm{sigmoid}(x) = \frac{1}{1 + e^{-x}}
$$

$$
\mathrm{sigmoid}'(x) = \mathrm{sigmoid}(x) \cdot (1 - \mathrm{sigmoid}(x))
$$

In [4]:
def sigmoid(z):
    """The sigmoid function.

    Computes 1 / (1 + exp(-z)); applied element-wise when z is a numpy array.
    """
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    """Derivative of the sigmoid function.

    Computes sigmoid(z) * (1 - sigmoid(z)); applied element-wise when z is a numpy array.
    """
    # Evaluate the sigmoid once and reuse it for both factors
    s = sigmoid(z)
    return s * (1.0 - s)

We also need a function that computes the gradient of the cost function, $ \nabla_x C $, with respect to the input $ \vec x $, for a goal label $ \vec y_{goal} $.

In [5]:
def input_derivative(net, x, y):
    """ Calculate derivatives wrt the inputs

    net : network object
        must expose biases, weights, num_layers and cost_derivative
    x : numpy vector
        input fed to the network
    y : numpy vector
        goal output the cost is measured against

    Returns the transpose of the first weight matrix applied to the error
    term backpropagated to the first layer.
    """
    # feedforward, keeping the weighted input z of every layer
    activation = x
    zs = []
    for b, w in zip(net.biases, net.weights):
        z = np.dot(w, activation) + b
        zs.append(z)
        activation = sigmoid(z)

    # backward pass: error term of the output layer
    delta = net.cost_derivative(activation, y) * sigmoid_prime(zs[-1])

    # Propagate the error term back to the first layer. The weight and bias
    # gradients are not computed: only the input is being optimised here.
    for layer in range(2, net.num_layers):
        delta = np.dot(net.weights[-layer+1].transpose(), delta) * sigmoid_prime(zs[-layer])

    # Return derivatives WRT to input
    return net.weights[0].T.dot(delta)

The actual function that generates adversarial examples and a wrapper function:

In [6]:
def adversarial(net, n, steps, eta):
    """
    Generate an input that the network classifies as the goal label, by
    gradient descent on the input starting from random noise.

    net : network object
        neural network instance to use
    n : integer
        our goal label (just an int, the function transforms it into a one-hot vector)
    steps : integer
        number of steps for gradient descent
    eta : float
        step size for gradient descent

    Returns the optimised input as a column vector with one row per input unit.
    """
    # Set the goal output (output vector of the neural network):
    # a one-hot column with one row per output unit and the n-th entry set to 1
    goal = np.zeros((net.weights[-1].shape[0], 1))
    goal[n] = 1

    # Create a random image to initialize gradient descent with.
    # Mean .5 / std .3 assumes pixel intensities lie roughly in [0, 1] — TODO confirm
    x = np.random.normal(.5, .3, (net.weights[0].shape[1], 1))

    # Gradient descent on the input: weights and biases stay fixed,
    # only the image moves against the gradient of the cost
    for _ in range(steps):
        x -= eta * input_derivative(net, x, goal)

    return x

# Wrapper function
def generate(n):
    """
    Generate an adversarial example for goal label n, print the network's
    output and prediction for it, and draw it.

    n : integer
        goal label (not a one hot vector)
    """
    a = adversarial(net, n, 1000, 1)
    output = net.feedforward(a)
    
    print('Network Output: \n' + str(np.round(output, 2)) + '\n')
    
    # Take the argmax of the raw output: rounding first can create ties,
    # which argmax resolves towards the lowest index
    print('Network Prediction: ' + str(np.argmax(output)) + '\n')
    
    print('Adversarial Example: ')
    plt.imshow(a.reshape(28,28), cmap='Greys')

Now we can generate adversarial examples with `generate`. Call `generate` with a goal digit between 0 and 9.

In [None]:
#to complete

Now let's generate an image that looks like a given target image to a human, but that the neural network classifies as a different digit. We use a new cost function $ C $:
$$ C = \|\vec y_{goal} - \hat y(\vec x)\|^2_2 + \lambda \|\vec x - \vec x_{target}\|^2_2 $$

Minimizing $ C $ now both pushes the network output towards $ \vec y_{goal} $ and keeps $ \vec x $ close to $ \vec x_{target} $.

In [8]:
def sneaky_adversarial(net, n, x_target, steps, eta, lam=.05):
    """
    Generate an input that the network classifies as the goal label while
    staying close to a given target image.

    net : network object
        neural network instance to use
    n : integer
        our goal label (just an int, the function transforms it into a one-hot vector)
    x_target : numpy vector
        our goal image for the adversarial example
        (assumed to be a column vector with one row per input unit — TODO confirm)
    steps : integer
        number of steps for gradient descent
    eta : float
        step size for gradient descent
    lam : float
        lambda, our regularization parameter. Default is .05

    Returns the optimised input as a column vector with one row per input unit.
    """
    
    # Set the goal output (output vector of the neural network):
    # a one-hot column with one row per output unit and the n-th entry set to 1
    goal = np.zeros((net.weights[-1].shape[0], 1))
    goal[n] = 1

    # Create a random image to initialize gradient descent with.
    # Mean .5 / std .3 assumes pixel intensities lie roughly in [0, 1] — TODO confirm
    x = np.random.normal(.5, .3, (net.weights[0].shape[1], 1))
    
    # Gradient descent on the input, this time with an added penalty that
    # pulls x towards x_target; lam weights the penalty against the
    # classification term (constant factors are absorbed by eta)
    for _ in range(steps):
        d = input_derivative(net, x, goal)
        x -= eta * (d + lam * (x - x_target))

    return x

# Wrapper function
def sneaky_generate(n, m):
    """
    Build an adversarial example that looks like the digit m but targets
    label n, display it next to the original image and return it.

    n: int 0-9, the target number to match
    m: int 0-9, label of the test-set image the example should look like

    Raises ValueError if the test set is empty or holds no image labelled m.
    """
    
    samples = list(test_data)
    if not samples:
        raise ValueError('test set is empty')

    # Find random instance of m in test set: start at a random position and
    # scan forward, wrapping around at the end, so the search can neither run
    # off the list nor miss a match located before the starting point
    start = np.random.randint(len(samples))
    for offset in range(len(samples)):
        idx = (start + offset) % len(samples)
        if samples[idx][1] == m:
            break
    else:
        raise ValueError('no test image with label ' + str(m))

    x_target = samples[idx][0]
    
    # Hardcode the parameters for the wrapper function
    a = sneaky_adversarial(net, n, x_target, 100, 1)
    output = net.feedforward(a)
    
    print('\nWhat we want our adversarial example to look like: ')
    plt.imshow(x_target.reshape((28,28)), cmap='Greys')
    plt.show()
    
    print('\n')
    
    print('Adversarial Example: ')
    
    plt.imshow(a.reshape(28,28), cmap='Greys')
    plt.show()
    
    # Take the argmax of the raw output: rounding first can create ties,
    # which argmax resolves towards the lowest index
    print('Network Prediction: ' + str(np.argmax(output)) + '\n')
    
    print('Network Output: \n' + str(np.round(output, 2)) + '\n')
    
    return a

Now you can generate your own adversarial example. Try it with 0, 2, 3, 5, 6, or 8.

In [9]:
# Usage: sneaky_generate(target label, target digit)
# The data sets are loaded again before use (presumably because the loader
# returns one-shot iterators that earlier cells may have consumed — TODO confirm)
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
# Generate your own example

# to complete

We can now generate adversarial examples, so let's defend against them. One simple method is to use binary thresholding.

In [10]:
def binary_thresholding(n, m):
    """
    Generate an adversarial example, binarise it, and show how the network
    classifies the thresholded image.

    n: int 0-9, the target number to match
    m: int 0-9, digit the adversarial example should look like
       (passed straight to sneaky_generate)
    """
    
    x = sneaky_generate(n, m)

    # Remove noise from the image with a thresholding: every pixel becomes 0 or 1.
    # The .5 cut-off assumes pixel intensities lie roughly in [0, 1] — TODO confirm
    x = (x > .5).astype(float)
    
    print("With binary thresholding: ")
    
    plt.imshow(x.reshape(28,28), cmap="Greys")
    plt.show()
    
    output = net.feedforward(x)

    # Take the argmax of the raw output: rounding to integers first turns every
    # activation that rounds down into 0, so an unsure network was always
    # reported as predicting digit 0
    print("Prediction with binary thresholding: " + str(np.argmax(output)) + '\n')
    
    print("Network output: ")
    print(np.round(output, 2))

In [None]:
# Usage: binary_thresholding(target digit, actual digit)
# The data sets are loaded again before use (presumably because the loader
# returns one-shot iterators that earlier cells may have consumed — TODO confirm)
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
# to complete

Another simple defense is adversarial training: we generate adversarial examples and add them, with their correct labels, to the training data, so that the neural network learns to classify them correctly.

In [12]:
def augment_data(n, data, steps):
    """
    Build a list of (adversarial image, true label) pairs from a data set.

    n : integer
        number of adversarial examples to generate
    data : list of tuples
        data set to generate adversarial examples using; the label in each
        tuple is read with np.argmax
    steps : integer
        number of gradient descent steps used for each example
    """
    # Our augmented training set:
    augmented = []

    for count in range(n):
        # Progress "bar"
        if count % 500 == 0:
            print("Generated digits: " + str(count))

        # Digit the example should look like
        wanted_digit = np.random.randint(10)

        # Draw random samples until one carries the wanted digit
        while True:
            pick = np.random.randint(len(data))
            if np.argmax(data[pick][1]) == wanted_digit:
                break
        source_image = data[pick][0]

        # Label the attack tries to make the network output
        fake_digit = np.random.randint(10)

        forged = sneaky_adversarial(net, fake_digit, source_image, steps, 1)

        # Pair the forged image with the true label of its source image
        augmented.append((forged, data[pick][1]))

    return augmented

In [4]:
# This will take quite a while (~3 min for 10000, ~15 for 50000)
# Try 10000 examples first if you don't want to wait
# Arguments: number of examples, training data as a list (so it can be indexed),
# gradient descent steps per example
augmented = augment_data(10000, list(training_data), 100)

In [None]:
def check_augmented(i, augmented):
    """Display the i-th augmented sample, the original network's output for it, and its label.

    i : integer
        index into augmented
    augmented : list of tuples
        (image, label) pairs
    """
    # Picture first
    print('Image: \n')
    sample = augmented[i]
    plt.imshow(sample[0].reshape(28,28), cmap='Greys')
    plt.show()

    # What the original network makes of it
    print('Original network prediction: \n')
    prediction = net.feedforward(sample[0])
    print(np.round(prediction, 2))

    # The label stored with the sample
    print('\nLabel: \n')
    print(sample[1])

# Inspect one of the generated adversarial images
check_augmented(239, augmented)

We can now create a new neural network using the library Network and train it on our augmented dataset and the original training set, using the original test set to validate.

In [5]:
# Create a new network with the Network library
# to complete

# Reload the data sets (presumably because the loader returns one-shot
# iterators that earlier cells may have consumed — TODO confirm)
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

# Train on augmented + original training set with SGD
# to complete

We can make a test set of adversarial examples by using the following function call:

In [None]:
# The training and test data are packaged differently:
#   training data: list of tuples (image array, one-hot label array)
#   test data:     list of tuples (image array, integer label)
# Convert the test data to the training format so that augment_data,
# which reads labels with np.argmax, can consume it.
normal_test_data = []
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
l = list(test_data)
for image, ground_truth in l:
    # One-hot column vector with the true digit set to 1
    one_hot = np.zeros((10, 1))
    one_hot[ground_truth] = 1
    normal_test_data.append((image, one_hot))
    

# Using normal_test_data because of weird way data is packaged

#call augment_data
#to complete

Let's check out the accuracy of our newly trained network on adversarial examples from the new adversarial test set:

In [None]:
def accuracy(net, test_data):
    """
    Fraction of samples whose predicted digit matches the labelled digit.

    net : network object
    test_data: list
        list of 2-tuples of two arrays, one image and one label (one-hot)

    Returns a float between 0 and 1 (0.0 when test_data is empty).
    """
    total = 0
    correct = 0
    for x, y in test_data:
        total += 1
        # The prediction is the most active output unit; the label is one-hot
        if np.argmax(net.feedforward(x)) == np.argmax(y):
            correct += 1

    # Avoid dividing by zero on an empty data set
    return correct / total if total else 0.0
    
#print accuracy

And finally, a function that compares the original network to the new network on adversarial examples

In [None]:
def sample_test(net, net2, test_data, n):
    """Print both networks' predictions for sample n of test_data and draw the image.

    net : network object
        the original network
    net2 : network object
        the newly trained network
    test_data : list
        list of tuples whose first element is the image
    n : integer
        index of the sample to compare
    """
    image = test_data[n][0]

    # Take the argmax of the raw outputs: rounding to integers first turns every
    # activation that rounds down into 0, so an unsure network was always
    # reported as predicting digit 0
    print('Original network prediction: ' + str(np.argmax(net.feedforward(image))) + '\n')
    print('New network prediction: ' + str(np.argmax(net2.feedforward(image))) + '\n')
    print('Image: \n')
    plt.imshow(image.reshape(28,28), cmap='Greys')

#call sample test