In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Dense, Flatten, Activation
import numpy as np
import random
import matplotlib.pyplot as plt

# Image geometry and class count used throughout the script.
img_rows, img_cols, channels = 28, 28, 1
num_classes = 10
labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide by 255 (presumably mapping 0-255 pixel intensities to [0, 1] --
# confirm against the loader) and add an explicit trailing channel axis so the
# arrays match the Conv2D input_shape of (img_rows, img_cols, channels).
x_train = (x_train / 255).reshape((-1, img_rows, img_cols, channels))
x_test = (x_test / 255).reshape((-1, img_rows, img_cols, channels))

# One-hot encode the integer class labels to match the softmax output layer.
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)


def create_model():
    """Build and compile the convolutional digit classifier.

    The network is three 3x3 convolutions with stride 3, 'same' padding and
    ReLU (32, 64 and 64 filters), a 2x2 max-pool, and two 32-unit dense
    layers. Dropout of 0.2 follows the pooling layer and each 32-unit dense
    layer. The output layer is a softmax over ``num_classes`` units.

    Returns:
        A ``Sequential`` model compiled with the 'adam' optimizer, 'mse'
        loss and an 'accuracy' metric.
    """
    # Settings shared by all three convolution layers.
    conv_kwargs = dict(kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu')

    layers = [
        Conv2D(32, input_shape=(img_rows, img_cols, channels), **conv_kwargs),
        Conv2D(64, **conv_kwargs),
        Conv2D(64, **conv_kwargs),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),
        Flatten(),
        # NOTE(review): the two hidden Dense layers are created without an
        # activation argument -- confirm that is intentional.
        Dense(32),
        Dropout(0.2),
        Dense(32),
        Dropout(0.2),
        Dense(num_classes, activation='softmax'),
    ]

    network = Sequential(layers)
    network.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    return network

# Train the baseline classifier on the unmodified training images, using the
# held-out test split as validation data.
model = create_model()
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

# The model is compiled with one loss and one metric, so the printed value is
# the [loss, accuracy] pair rather than a bare accuracy.
print("accuracy on regular images:", model.evaluate(x_test, y_test, verbose=0))

def adversarial_pattern(image, label):
    """Return the sign of the loss gradient with respect to the input image.

    This is the perturbation direction of the fast gradient sign method: the
    per-pixel direction that increases the mean squared error between the
    module-level ``model``'s prediction and ``label``.

    Args:
        image: Input passed directly to ``model`` after being cast to
            float32. Callers in this file pass a single image reshaped to
            (1, img_rows, img_cols, channels).
        label: Target passed to the MSE loss. Callers in this file pass one
            one-hot row of ``y_train``.

    Returns:
        A tensor produced by ``tf.sign`` on the gradient of the loss with
        respect to the cast image.
    """
    pixels = tf.cast(image, tf.float32)
    with tf.GradientTape() as tape:
        # The input is not a trainable variable, so it must be watched
        # explicitly for the tape to record gradients with respect to it.
        tape.watch(pixels)
        loss = tf.keras.losses.MSE(label, model(pixels))
    return tf.sign(tape.gradient(loss, pixels))

# Build one adversarial example from the first training image: add a leading
# batch axis for the model, take the gradient-sign pattern, and step 0.15
# along it.
image, image_label = x_train[0], y_train[0]
res = adversarial_pattern(np.expand_dims(image, axis=0), image_label).numpy()
adversarial = image + 0.15 * res

def generate_adversarials(batch_size, epsilon=0.15):
    """Yield endless batches of adversarially perturbed training images.

    Each batch is built by drawing ``batch_size`` random samples (with
    replacement) from ``x_train``/``y_train`` and shifting every image by
    ``epsilon`` along the gradient-sign pattern returned by
    ``adversarial_pattern``.

    Args:
        batch_size: Number of adversarial examples per yielded batch.
        epsilon: Step size applied to the sign pattern. Defaults to 0.15.

    Yields:
        A tuple ``(x, y)`` where ``x`` is an array of shape
        (batch_size, img_rows, img_cols, channels) holding the perturbed
        images and ``y`` is the array of their original labels.
    """
    num_samples = len(x_train)
    while True:
        x = []
        y = []
        for _ in range(batch_size):
            # Sample from the entire training set. Restricting the index to a
            # fixed small range would make every batch reuse the same handful
            # of images, so separately generated "train" and "test" batches
            # would overlap almost completely.
            idx = random.randrange(num_samples)

            label = y_train[idx]
            image = x_train[idx]

            # The model expects a leading batch axis, so wrap the single image.
            perturbation = adversarial_pattern(
                image.reshape((1, img_rows, img_cols, channels)), label
            ).numpy()

            x.append(image + perturbation * epsilon)
            y.append(label)

        # Each perturbed image still carries the extra batch axis of size 1;
        # the reshape collapses it into one (batch_size, ...) array.
        x = np.asarray(x).reshape((batch_size, img_rows, img_cols, channels))
        y = np.asarray(y)

        yield x, y

# Generate 10,000 adversarial examples and score the undefended model on them.
# generate_adversarials draws its source images from x_train/y_train.
x_adv_test, y_adv_test = next(generate_adversarials(10000))
print("accuracy on adversarial images:", model.evaluate(x_adv_test, y_adv_test, verbose=0))

# Adversarial training: continue fitting the same model on a fresh batch of
# 20,000 adversarial examples, still validating on the clean test split.
x_adv_train, y_adv_train = next(generate_adversarials(20000))
model.fit(
    x=x_adv_train,
    y=y_adv_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

# Re-score the retrained model on both the adversarial and the clean images;
# each printed value is the [loss, accuracy] pair from evaluate().
print("Defended accuracy on adversarial images:", model.evaluate(x_adv_test, y_adv_test, verbose=0))
print("Defended accuracy on regular images:", model.evaluate(x_test, y_test, verbose=0))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy on regular images: [0.0031986036337912083, 0.9794999957084656]
accuracy on adversarial images: [0.12948882579803467, 0.1889999955892563]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Defended accuracy on adversarial images: [4.527693917352879e-16, 1.0]
Defended accuracy on regular images: [0.017679112032055855, 0.9018999934196472]
