In [1]:
import tensorflow as tf
import numpy as np
from cleverhans.dataset import MNIST, CIFAR10
from cleverhans.attacks import FastGradientMethod, CarliniWagnerL2
from model import MyModel, CNN
from perturbation import fixed_pattern, generator

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [3]:
NB_EPOCHS = 100
BATCH_SIZE = 128
nb_classes = 10
nb_filters = 64 # 没啥用

In [4]:
# Get MNIST data
train_start=0
train_end=60000
test_start=0
test_end=10000
mnist = MNIST(train_start=train_start, train_end=train_end,
                test_start=test_start, test_end=test_end)
xr_train, yr_train = mnist.get_set('train')
xr_test, yr_test = mnist.get_set('test')
#xp_train, yp_train = mnist.get_set('train')
#xp_test, yp_test = mnist.get_set('test')
xp_train = xr_train.copy()
xp_test = xr_test.copy()
#print(xp_train[0])

### add noise

In [5]:
sigma = [0.1, 0, -0.1]
probability = [0.2, 0.6, 0.2]
pattern = fixed_pattern(sigma, probability)
for i in range(0, train_end):
    xp_train[i] = xp_train[i] + pattern
for i in range(0, test_end):
    xp_test[i] = xp_test[i] + pattern
#print(xp_train[0])

### train

In [6]:
# variables
xr = tf.placeholder(tf.float32, [None, 28, 28, 1], name="xr")
xp = tf.placeholder(tf.float32, [None, 28, 28, 1], name="xp")
y = tf.placeholder(tf.float32, [None, 10])

In [7]:
model = MyModel(10)

In [8]:
output_logits_real, output_real = model.basic_cnn(xr)
output_logits_fake, output_fake = model.basic_cnn(xp, reuse=True)

In [9]:
# custom loss
alpha = 1.
beta = 1.
gama = 0.01
loss_r = alpha * tf.reduce_mean(tf.reduce_sum(y * output_real, -1))
loss_p = beta * tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output_logits_fake, labels=y))
loss_d = gama * tf.reduce_mean(tf.square(xr - xp))

total_loss = loss_r+loss_p+loss_d

In [10]:
global_step = tf.Variable(0, trainable=False)
lr_decayed = tf.train.exponential_decay(0.001, global_step, 10000, 0.1, staircase=False)
optimizer = tf.train.AdamOptimizer(learning_rate=lr_decayed).minimize(total_loss)
#print(tf.all_variables())

In [11]:
# calculate accuracy
correct_prediction1 = tf.equal(tf.argmax(output_fake,1), \
        tf.argmax(y,1))
accuracy1 = tf.reduce_mean(tf.cast(correct_prediction1, "float"))

correct_prediction2 = tf.equal(tf.argmax(output_real,1), \
        tf.argmax(y,1))
accuracy2 = tf.reduce_mean(tf.cast(correct_prediction2, "float"))

In [12]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [13]:
total_batch = int(xr_train.shape[0] / BATCH_SIZE)
for epoch in range(20):
    for i in range(total_batch):
        #batch_xr, batch_yr = mnist_raw.train.next_batch(batch_size)
        #batch_xp, batch_yp = mnist_process.train.next_batch(batch_size)
        #batch_xr = batch_xr.reshape(-1, 28, 28, 1)
        #batch_xp = batch_xp.reshape(-1, 28, 28, 1)
        bstart, bend = i*BATCH_SIZE, (i+1)*BATCH_SIZE
        batch_xr, batch_xp = xr_train[bstart:bend], xp_train[bstart:bend]
        batch_yp = yr_train[bstart:bend]

        _, loss_,temp_loss1, temp_loss2, temp_acc1, temp_acc2 = sess.run([optimizer,total_loss,loss_r,loss_p,accuracy2,accuracy1],
                                   feed_dict={xr: batch_xr,
                                              xp: batch_xp,
                                              y: batch_yp})
        if i % 1000 == 0:
            print(temp_loss1, temp_loss2)
            print(temp_acc1,temp_acc2)

0.10000262 2.302622
0.1953125 0.109375
0.07052729 0.14034794
0.0703125 0.96875
0.070377834 0.105591126
0.0703125 0.96875
0.070341155 0.06142211
0.0703125 0.984375
0.070333734 0.079415664
0.0703125 0.984375
0.07034134 0.026171274
0.0703125 0.9921875
0.07031831 0.022004003
0.0703125 0.9921875
0.070312954 0.015083114
0.0703125 0.9921875
0.07031362 0.0043362156
0.0703125 1.0
0.070328176 0.011266169
0.0703125 1.0
0.07031645 0.006569887
0.0703125 1.0
0.0703753 0.009243619
0.0703125 1.0
0.070314966 0.022100687
0.0703125 0.9921875
0.07080066 0.0021658177
0.0703125 1.0
0.0703125 0.009034006
0.0703125 0.9921875
0.070312515 0.0062605706
0.0703125 1.0
0.07031255 0.0062178895
0.0703125 1.0
0.0703125 0.017189678
0.0703125 0.9921875
0.0703125 0.0031528957
0.0703125 1.0
0.0703125 0.013271573
0.0703125 0.9921875


In [14]:
saver = tf.train.Saver()
saver.save(sess,"./savemodel/cnn/cnnmodel.ckpt")

'./savemodel/cnn/cnnmodel.ckpt'

In [15]:
print("raw input accuracy %g" %accuracy2.eval(session=sess,
                                         feed_dict={xr: xr_test[0:2000],
                                                    y: yr_test[0:2000]}))
print("processed input accuracy %g" %accuracy1.eval(session=sess,
                                         feed_dict={xp: xp_test[0:2000],
                                                    y: yr_test[0:2000]}))

raw input accuracy 0.096
processed input accuracy 0.9855


In [16]:
# sess.close()

# generate adversarial examples using CleverHans
note that the session is still open

## FGSM
we can modeified parameter "eps" to get different adversarial examples

In [17]:
x = tf.placeholder(tf.float32, [None, 28, 28, 1])
y = tf.placeholder(tf.float32, shape=(None, 10))
attack_model = CNN('cnn', 10)
NUM_CLASSES = 10
target_class = tf.reshape(tf.one_hot(2,NUM_CLASSES),[1,NUM_CLASSES])
fgsm_params = {
    'eps': 0.05,
    'clip_min': 0,
    'clip_max': 1.,
    'y_target': target_class
}
it = 10 # iterative FGSM

# init = tf.global_variables_initializer()
# sess = tf.Session()
# sess.run(init)

# with tf.Session() as sess:
#   sess.run(init)
fgsm = FastGradientMethod(attack_model, sess=sess)
x_adv = fgsm.generate(x, **fgsm_params)
#   saver = tf.train.Saver()
#   saver.restore(sess, './savemodel/cnn/cnnmodel.ckpt')
#   saver = tf.train.import_meta_graph('./savemodel/cnn/cnnmodel.meta')
#   saver.restore(sess,tf.train.latest_checkpoint('./savemodel/cnn/'))
adv_images = np.zeros((50000,28,28,1))
for j in range(50000): # np.shape(xr_train)[0]=60000
    adv_images[j] = xr_train[j].reshape(-1,28,28,1)
    if j%2000==0:
        print("Iteration "+str(j))
    for i in range(it):
        adv_images[j] = sess.run(x_adv, feed_dict={x: adv_images[j].reshape(-1,28,28,1)}) #xr_train[0:10]})



Iteration 0
Iteration 2000
Iteration 4000
Iteration 6000
Iteration 8000
Iteration 10000
Iteration 12000
Iteration 14000
Iteration 16000
Iteration 18000
Iteration 20000
Iteration 22000
Iteration 24000
Iteration 26000
Iteration 28000
Iteration 30000
Iteration 32000
Iteration 34000
Iteration 36000
Iteration 38000
Iteration 40000
Iteration 42000
Iteration 44000
Iteration 46000
Iteration 48000


In [18]:
import matplotlib.pyplot as plt
plt.imshow(adv_images[2].reshape(-1,28), cmap='gray')
plt.show()

<Figure size 640x480 with 1 Axes>

In [19]:
adv = tf.placeholder(tf.float32, [None, 28, 28, 1], name="adv")
output_logits_adv, output_adv = model.basic_cnn(adv, reuse=True)

correct_prediction2 = tf.equal(tf.argmax(output_adv, -1), \
        tf.argmax(target_class, -1))
accuracy2 = tf.reduce_mean(tf.cast(correct_prediction2, "float"))
print("test accuracy %g" %accuracy2.eval(session = sess,
      feed_dict = {
          adv:adv_images[0:8000]}))

test accuracy 0.999375


In [22]:
from PIL import Image

for i in range(50000):
 im = adv_images[i].reshape(28,28)
 img= Image.fromarray(im*255)
 img = img.convert('RGB')
 img.save('out/adversarial/fixed/3/adv_%s.png'%i,'png')

## Manual calculation
we can use FGSM method which written by ourselves to generate adversarial examples

In [None]:
def step_fgsm(x, eps, logits):
  label = tf.argmax(logits,1)
  one_hot_label = tf.one_hot(label, NUM_CLASSES)
  cross_entropy = tf.losses.softmax_cross_entropy(one_hot_label,
                                                  logits,
                                                  label_smoothing=0.1,
                                                  weights=1.0)
  x_adv = x + eps*tf.sign(tf.gradients(cross_entropy,x)[0])
  x_adv = tf.clip_by_value(x_adv,-1.0,1.0)
  return tf.stop_gradient(x_adv)
 
def step_targeted_attack(x, eps, one_hot_target_class, logits):
  #one_hot_target_class = tf.one_hot(target, NUM_CLASSES)
  #print(one_hot_target_class,"\n\n")
  cross_entropy = tf.losses.softmax_cross_entropy(one_hot_target_class,
                                                  logits,
                                                  label_smoothing=0.1,
                                                  weights=1.0)
  x_adv = x - eps * tf.sign(tf.gradients(cross_entropy, x)[0])
  x_adv = tf.clip_by_value(x_adv, -1.0, 1.0)
  return tf.stop_gradient(x_adv)

def step_ll_adversarial_images(x, eps, logits):
  least_likely_class = tf.argmin(logits, 1)
  one_hot_ll_class = tf.one_hot(least_likely_class, NUM_CLASSES)
  one_hot_ll_class = tf.reshape(one_hot_ll_class,[1,NUM_CLASSES])
  # This reuses the method described above
  return step_targeted_attack(x, eps, one_hot_ll_class, logits)

In [None]:
softmax_tensor = sess.graph.get_tensor_by_name('discriminator/fc2/add:0')
image_tensor = sess.graph.get_tensor_by_name('xr:0')
target_class = tf.reshape(tf.one_hot(2,NUM_CLASSES),[1,NUM_CLASSES])

adv_image_tensor = step_targeted_attack(image_tensor, fgsm_params['eps'], target_class, softmax_tensor)
adv_image = xr_train[0].reshape(-1,28,28,1)
t = adv_image.copy()
adv_noise = np.zeros(t.shape)
# for j in range(100):
j=0
adv_image = xr_train[j].reshape(-1,28,28,1)
if j%2000==0:
    print("Iteration "+str(j))
for i in range(it):
    #print("Iteration "+str(i))
    adv_image = sess.run(adv_image_tensor,{'xr:0': adv_image})
adv_noise = np.concatenate((adv_noise, adv_image))
#plt.imshow(adv_image.reshape(-1,28))

In [None]:
plt.imshow(adv_image[0].reshape(-1,28))

test accuracy

In [None]:
adv = tf.placeholder(tf.float32, [None, 28, 28, 1], name="adv")
output_logits_adv, output_adv = model.basic_cnn(adv, reuse=True)

correct_prediction2 = tf.equal(tf.argmax(output_adv, -1), \
        tf.argmax(target_class, -1))
accuracy2 = tf.reduce_mean(tf.cast(correct_prediction2, "float"))
print("test accuracy %g" %accuracy2.eval(session = sess,
      feed_dict = {
          adv:adv_images}))