In [1]:
# !pip install tensorflow numpy matplotlib

In [2]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import numpy as np
import matplotlib.pyplot as plt

(x_train, y_train), (x_test, y_test) = datasets.mnist.load_data()

x_train = x_train / 255.0
x_test = x_test / 255.0

x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)

y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
def create_model():
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

model = create_model()
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 13, 13, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 5, 5, 64)          0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 1600)              0         
                                                                 
 dense (Dense)               (None, 64)                1

In [4]:
def generate_adversarial_example(model, x, y, epsilon=0.1):
    x = tf.convert_to_tensor(x)
    y = tf.convert_to_tensor(y)

    with tf.GradientTape() as tape:
        tape.watch(x)
        prediction = model(x)
        loss = tf.keras.losses.categorical_crossentropy(y, prediction)

    gradient = tape.gradient(loss, x)

    signed_grad = tf.sign(gradient)

    adversarial_example = x + epsilon * signed_grad
    adversarial_example = tf.clip_by_value(adversarial_example, 0, 1)

    return adversarial_example


In [5]:
adv_model = create_model()
batch_size = 64
epochs = 5
epsilon = 0.1

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')

    for i in range(0, len(x_train), batch_size):
        x_batch = x_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        x_adv_batch = generate_adversarial_example(adv_model, x_batch, y_batch, epsilon)

        x_combined = np.concatenate([x_batch, x_adv_batch])
        y_combined = np.concatenate([y_batch, y_batch])

        adv_model.train_on_batch(x_combined, y_combined)

    test_loss, test_acc = adv_model.evaluate(x_test, y_test, verbose=2)
    print(f'Test accuracy after adversarial training: {test_acc:.4f}')


Epoch 1/5
313/313 - 1s - loss: 0.0726 - accuracy: 0.9780 - 1s/epoch - 3ms/step
Test accuracy after adversarial training: 0.9780
Epoch 2/5
313/313 - 1s - loss: 0.0481 - accuracy: 0.9843 - 890ms/epoch - 3ms/step
Test accuracy after adversarial training: 0.9843
Epoch 3/5
313/313 - 1s - loss: 0.0368 - accuracy: 0.9879 - 899ms/epoch - 3ms/step
Test accuracy after adversarial training: 0.9879
Epoch 4/5
313/313 - 1s - loss: 0.0310 - accuracy: 0.9896 - 868ms/epoch - 3ms/step
Test accuracy after adversarial training: 0.9896
Epoch 5/5
313/313 - 1s - loss: 0.0289 - accuracy: 0.9900 - 875ms/epoch - 3ms/step
Test accuracy after adversarial training: 0.9900


In [6]:
def detect_adversarial_input(model, x, threshold=0.6):
    predictions = model.predict(x)
    confidence = np.max(predictions, axis=1)

    adversarial_detected = confidence < threshold
    return adversarial_detected


In [7]:
clean_detection = detect_adversarial_input(model, x_test[:100])
print(f"Adversarial detected in clean data: {np.sum(clean_detection)}")

x_adv_test = generate_adversarial_example(adv_model, x_test[:100], y_test[:100], epsilon)

adv_detection = detect_adversarial_input(adv_model, x_adv_test)
print(f"Adversarial detected in adversarial data: {np.sum(adv_detection)}")


Adversarial detected in clean data: 100
Adversarial detected in adversarial data: 1
