In [1]:
from model import MyModel, CNN
from cleverhans.attacks import FastGradientMethod, CarliniWagnerL2
from cleverhans.dataset import MNIST, CIFAR10
from keras.datasets import fashion_mnist
from perturbation import generator
import numpy as np
import tensorflow as tf
import cv2

Using TensorFlow backend.


In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
dataset = "mnist"
network = "cnn"
BATCH_SIZE = 128

In [4]:
# Get MNIST data
train_start=0
train_end=60000
test_start=0
test_end=10000
mnist = MNIST(train_start=train_start, train_end=train_end,
                test_start=test_start, test_end=test_end)
xr_train, yr_train = mnist.get_set('train')
xr_test, yr_test = mnist.get_set('test')
xp_train = xr_train.copy()
xp_test = xr_test.copy()

In [5]:
# variables
xr = tf.placeholder(tf.float32, [None, 28, 28, 1], name="xr")
xp = tf.placeholder(tf.float32, [None, 28, 28, 1], name="xp")
y  = tf.placeholder(tf.float32, [None, 10])

In [6]:
model = MyModel(10)
# generate perturbation according to the input
_, G_sample = generator(xr)

In [7]:
if network == "cnn":
    output_logits_real, output_real = model.basic_cnn(xr)
    output_logits_fake, output_fake = model.basic_cnn(G_sample,reuse=True)
elif network == "resnet":
    output_logits_real, output_real = model.resnet20(xr)
    output_logits_fake, output_fake = model.resnet20(G_sample,reuse=True)

In [8]:
# loss
alpha = 1.
beta = 1.
gama = 0.01
loss_r = alpha * tf.reduce_mean(tf.reduce_sum(y * output_real, -1))
loss_p = beta * tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output_logits_fake, labels=y))
loss_d = gama * tf.reduce_mean(tf.square(xr - G_sample))
loss_p_d =  tf.add(loss_p, loss_d)
total_loss = loss_r+loss_p+loss_d

In [9]:
# learning rate
global_step = tf.Variable(0, trainable=False)   
lr_decayed = tf.train.exponential_decay(0.001, global_step, 2*10000, 0.1, staircase=False)

In [10]:
# variable list
all_var = tf.global_variables()
g_vars = [var for var in all_var if 'generator' in var.name]
d_vars = [var for var in all_var if 'discriminator' in var.name]

In [11]:
D_optimizer = tf.train.AdamOptimizer(learning_rate=lr_decayed).minimize(total_loss, var_list=[d_vars])
G_optimizer = tf.train.AdamOptimizer(learning_rate=lr_decayed).minimize(loss_p_d, var_list=[g_vars])

In [12]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [13]:
total_batch = int(xr_train.shape[0] / BATCH_SIZE)
D_loss = open('out/acc_loss/discriminator_loss.txt','w+')
G_loss = open('out/acc_loss/generator_loss.txt','w+')
for epoch in range(80):
    for i in range(total_batch):
        #batch_xr, batch_yr = mnist_raw.train.next_batch(batch_size)
        #batch_xp, batch_yp = mnist_process.train.next_batch(batch_size)
        #batch_xr = batch_xr.reshape(-1, 28, 28, 1)
        #batch_xp = batch_xp.reshape(-1, 28, 28, 1)
        bstart, bend = i*BATCH_SIZE, (i+1)*BATCH_SIZE
        batch_xr, batch_xp = xr_train[bstart:bend], xp_train[bstart:bend]
        batch_y = yr_train[bstart:bend]

        # train discriminator
        _, D_loss_curr = sess.run([D_optimizer, total_loss], feed_dict={xr: batch_xr, y: batch_y})
        _, G_loss_curr = sess.run([G_optimizer, loss_p_d],   feed_dict={xr: batch_xr, y: batch_y})
        if i % 2000 == 0:
            print('D_loss: {:.4}'.format(D_loss_curr))
            print('G_loss: {:.4}'.format(G_loss_curr))
    D_loss.write(str(D_loss_curr)+'\n')
    G_loss.write(str(G_loss_curr)+'\n')
D_loss.close()
G_loss.close()

D_loss: 2.402
G_loss: 2.299
D_loss: 0.2562
G_loss: 0.1029
D_loss: 0.222
G_loss: 0.04329
D_loss: 0.2351
G_loss: 0.07908
D_loss: 0.2124
G_loss: 0.04089
D_loss: 0.2151
G_loss: 0.06653
D_loss: 0.1836
G_loss: 0.03062
D_loss: 0.2743
G_loss: 0.1294
D_loss: 0.04135
G_loss: 0.03593
D_loss: 0.01113
G_loss: 0.01073
D_loss: 0.03821
G_loss: 0.03303
D_loss: 0.0154
G_loss: 0.01182
D_loss: 0.01241
G_loss: 0.02056
D_loss: 0.02344
G_loss: 0.02609
D_loss: 0.02715
G_loss: 0.02575
D_loss: 0.01349
G_loss: 0.01709
D_loss: 0.01578
G_loss: 0.01397
D_loss: 0.0187
G_loss: 0.02254
D_loss: 0.02843
G_loss: 0.01053
D_loss: 0.0115
G_loss: 0.01267
D_loss: 0.02713
G_loss: 0.0217
D_loss: 0.03096
G_loss: 0.0101
D_loss: 0.01694
G_loss: 0.01135
D_loss: 0.01706
G_loss: 0.01018
D_loss: 0.009472
G_loss: 0.009256
D_loss: 0.0151
G_loss: 0.01168
D_loss: 0.01021
G_loss: 0.01331
D_loss: 0.01017
G_loss: 0.01011
D_loss: 0.01742
G_loss: 0.02894
D_loss: 0.008766
G_loss: 0.00872
D_loss: 0.01562
G_loss: 0.01065
D_loss: 0.02482
G_loss: 0

In [15]:
# calculate accuracy
correct_prediction = tf.equal(tf.argmax(output_fake, axis=-1), tf.argmax(y, axis=-1))
accuracy_fake = tf.reduce_mean(tf.cast(correct_prediction, "float"))

correct_prediction1 = tf.equal(tf.argmax(output_real, axis=-1),tf.argmax(y, axis=-1))
accuracy_real = tf.reduce_mean(tf.cast(correct_prediction1, "float"))

print("raw input accuracy %g"       %accuracy_real.eval(session=sess, feed_dict={xr: xr_test[0:2000], y: yr_test[0:2000]}))
print("processed input accuracy %g" %accuracy_fake.eval(session=sess, feed_dict={xr: xr_test[0:2000], 
                                                                                 xp: xp_test[0:2000], y: yr_test[0:2000]}))

raw input accuracy 0.0895
processed input accuracy 0.969


## Manual calculation

In [17]:
# NUM_CLASSES = 10
# def step_fgsm(x, eps, logits):
#   label = tf.argmax(logits,1)
#   one_hot_label = tf.one_hot(label, NUM_CLASSES)
#   cross_entropy = tf.losses.softmax_cross_entropy(one_hot_label,
#                                                   logits,
#                                                   label_smoothing=0.1,
#                                                   weights=1.0)
#   x_adv = x + eps*tf.sign(tf.gradients(cross_entropy,x)[0])
#   x_adv = tf.clip_by_value(x_adv,-1.0,1.0)
#   return tf.stop_gradient(x_adv)
 
# def step_targeted_attack(x, eps, one_hot_target_class, logits):
#   #one_hot_target_class = tf.one_hot(target, NUM_CLASSES)
#   #print(one_hot_target_class,"\n\n")
#   cross_entropy = tf.losses.softmax_cross_entropy(one_hot_target_class,
#                                                   logits,
#                                                   label_smoothing=0.1,
#                                                   weights=1.0)
#   x_adv = x - eps * tf.sign(tf.gradients(cross_entropy, x)[0])
#   x_adv = tf.clip_by_value(x_adv, -1.0, 1.0)
#   return tf.stop_gradient(x_adv)

# def step_ll_adversarial_images(x, eps, logits):
#   least_likely_class = tf.argmin(logits, 1)
#   one_hot_ll_class = tf.one_hot(least_likely_class, NUM_CLASSES)
#   one_hot_ll_class = tf.reshape(one_hot_ll_class,[1,NUM_CLASSES])
#   # This reuses the method described above
#   return step_targeted_attack(x, eps, one_hot_ll_class, logits)

'\nNUM_CLASSES = 10\ndef step_fgsm(x, eps, logits):\n  label = tf.argmax(logits,1)\n  one_hot_label = tf.one_hot(label, NUM_CLASSES)\n  cross_entropy = tf.losses.softmax_cross_entropy(one_hot_label,\n                                                  logits,\n                                                  label_smoothing=0.1,\n                                                  weights=1.0)\n  x_adv = x + eps*tf.sign(tf.gradients(cross_entropy,x)[0])\n  x_adv = tf.clip_by_value(x_adv,-1.0,1.0)\n  return tf.stop_gradient(x_adv)\n \ndef step_targeted_attack(x, eps, one_hot_target_class, logits):\n  #one_hot_target_class = tf.one_hot(target, NUM_CLASSES)\n  #print(one_hot_target_class,"\n\n")\n  cross_entropy = tf.losses.softmax_cross_entropy(one_hot_target_class,\n                                                  logits,\n                                                  label_smoothing=0.1,\n                                                  weights=1.0)\n  x_adv = x - eps * tf.sign(tf.g

In [None]:
# softmax_tensor = sess.graph.get_tensor_by_name('discriminator/fc2/add:0')
# image_tensor = sess.graph.get_tensor_by_name('xr:0')
# target_class = tf.reshape(tf.one_hot(2,NUM_CLASSES),[1,NUM_CLASSES])

# adv_image_tensor = step_targeted_attack(image_tensor, 0.05, target_class, softmax_tensor)
# # adv_image = mnist_raw.train.images[0].reshape(-1,28,28,1)
# # t = adv_image.copy()
# # adv_noise = np.zeros(t.shape)
# adv_image = np.zeros((50000,28,28,1))
# for j in range(50000):
#   adv_image = mnist_raw.train.images[j].reshape(-1,28,28,1)
#   if j%2000==0:
#     print("Iteration "+str(j))
#   for i in range(10):
#     #print("Iteration "+str(i))
#     adv_image[j] = sess.run(adv_image_tensor,{'xr:0': adv_images[j].reshape(-1,28,28,1)})
# #   adv_noise = np.concatenate((adv_noise, adv_image))
# #plt.imshow(adv_image.reshape(-1,28))

# FGSM

## targeted label

In [18]:
# with tf.Session() as session:
# #print(mnist_raw.train.images[0])
#   target_class = tf.reshape(tf.one_hot(2,NUM_CLASSES),[1,NUM_CLASSES])
#   out = session.run(target_class)
#   print(out)
##  out[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]

In [16]:
x = tf.placeholder(tf.float32, [None, 28, 28, 1])
y = tf.placeholder(tf.float32, shape=(None, 10))
attack_model = CNN('cnn', 10)
NUM_CLASSES = 10
target_class = tf.reshape(tf.one_hot(2,NUM_CLASSES),[1,NUM_CLASSES])
fgsm_params = {
    'eps': 0.05,
    'clip_min': 0,
    'clip_max': 1.,
    'y_target': target_class
}
it = 10 # iterative FGSM

fgsm = FastGradientMethod(attack_model, sess=sess)
x_adv = fgsm.generate(x, **fgsm_params)
adv_images = np.zeros((50000,28,28,1))
for j in range(50000): # np.shape(xr_train)[0]=60000
    adv_images[j] =xr_train[j].reshape(-1,28,28,1)
    if j%2000==0:
        print("Iteration "+str(j))
    for i in range(it):
        adv_images[j] = sess.run(x_adv, feed_dict={x: adv_images[j].reshape(-1,28,28,1)}) #xr_train[0:10]})



Iteration 0
Iteration 2000
Iteration 4000
Iteration 6000
Iteration 8000
Iteration 10000
Iteration 12000
Iteration 14000
Iteration 16000
Iteration 18000
Iteration 20000
Iteration 22000
Iteration 24000
Iteration 26000
Iteration 28000
Iteration 30000
Iteration 32000
Iteration 34000
Iteration 36000
Iteration 38000
Iteration 40000
Iteration 42000
Iteration 44000
Iteration 46000
Iteration 48000


In [17]:
adv = tf.placeholder(tf.float32, [None, 28, 28, 1], name="adv")
output_logits_adv, output_adv = model.basic_cnn(adv, reuse=True)

correct_prediction2 = tf.equal(tf.argmax(output_adv, -1), tf.argmax(target_class, -1))
accuracy2 = tf.reduce_mean(tf.cast(correct_prediction2, "float"))
print("test accuracy %g" %accuracy2.eval(session=sess, feed_dict={adv:adv_images}))

test accuracy 0.9785


## save data as tfrecord

In [None]:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

with tf.Session() as session:
    target_label = session.run(target_class)
    
filename = "./out/adv_generator_mnist.tfrecords"
writer = tf.python_io.TFRecordWriter(filename)
for i in range(50000):
    images_raw = adv_images[i].tostring()
    example = tf.train.Example(features=tf.train.Features(feature={
        'label': _int64_feature(np.argmax(target_label)),
        'image': _bytes_feature(images_raw)}))
    writer.write(example.SerializeToString())
writer.close()

In [18]:
# from PIL import Image

# for i in range(50000):
#     im = adv_images[i].reshape(28,28)
#     img= Image.fromarray(im*255)
#     img = img.convert('RGB')
#     img.save('out/adversarial/generator/3/adv_%s.png'%i,'png')