## Attacking Apple's NeuralHash

This notebook describes a white-box attack on the neural network underlying Apple's NeuralHash.

### Loading the model:
Source and Credits: https://github.com/AsuharietYgvar/AppleNeuralHash2ONNX

In [82]:
import onnxruntime
import numpy as np
from PIL import Image

In [83]:
# File locations used throughout the notebook:
#   MODEL_PATH         - ONNX export of the network, loaded by the hashing/attack code
#   OUTPUT_MATRIX_PATH - binary file holding the float32 projection ("seed") matrix
MODEL_PATH = './model.onnx'
OUTPUT_MATRIX_PATH = './neuralhash_128x96_seed1.dat'

In [84]:
def neural_hash(image):
    """Compute the hash of an image and return it as a hex string.

    The image is resized to 360x360, scaled to [-1, 1], run through the ONNX
    model at MODEL_PATH, and the flattened model output is projected with the
    96x128 float32 matrix stored in OUTPUT_MATRIX_PATH. Each of the 96
    projected values contributes one bit ('1' if >= 0, else '0').

    Args:
        image: Anything accepted by ``PIL.Image.open`` (typically a file path).

    Returns:
        The 96-bit hash as a zero-padded, 24-character lowercase hex string.
    """
    # Load ONNX model
    session = onnxruntime.InferenceSession(MODEL_PATH)

    # Load output hash matrix. The first 128 bytes are skipped (presumably a
    # file header -- TODO confirm); the context manager guarantees the file
    # handle is closed instead of being left to the garbage collector.
    with open(OUTPUT_MATRIX_PATH, 'rb') as seed_file:
        seed1 = seed_file.read()[128:]
    seed1 = np.frombuffer(seed1, dtype=np.float32)
    seed1 = seed1.reshape([96, 128])

    # Preprocess image. convert() returns a new, fully loaded image, so the
    # source file can be closed as soon as the conversion is done.
    with Image.open(image) as source:
        image = source.convert('RGB')
    image = image.resize([360, 360])
    arr = np.array(image).astype(np.float32) / 255.0
    arr = arr * 2.0 - 1.0  # map [0, 1] -> [-1, 1]
    arr = arr.transpose(2, 0, 1).reshape([1, 3, 360, 360])  # HWC -> 1xCxHxW

    # Run model
    inputs = {session.get_inputs()[0].name: arr}
    outs = session.run(None, inputs)

    # Convert model output to hex hash
    hash_output = seed1.dot(outs[0].flatten())
    hash_bits = ''.join(['1' if it >= 0 else '0' for it in hash_output])
    hash_hex = '{:0{}x}'.format(int(hash_bits, 2), len(hash_bits) // 4)

    return hash_hex

In [85]:
# Hash the reference image; the returned hex string is shown as the cell result.
neural_hash("../shrek.png")

'55d07b72427978ac7a8fd1d9'

### Producing hash collisions
Source and Credits: https://github.com/anishathalye/neural-hash-collider

In [86]:
# util.py

import numpy as np
import onnx
from onnx_tf.backend import prepare
from PIL import Image


def load_model(path):
    """Read the ONNX file at ``path`` and wrap it with the onnx-tf backend.

    The backend representation is prepared with ``training_mode=True``.
    """
    return prepare(onnx.load(path), training_mode=True)


def load_seed(path):
    """Load the 96x128 float32 projection matrix from a binary seed file.

    Args:
        path: Path of the seed file. The first 128 bytes are skipped
            (presumably a header -- TODO confirm); the remainder must contain
            exactly 96 * 128 float32 values.

    Returns:
        A float32 numpy array of shape (96, 128).

    Raises:
        ValueError: If the payload cannot be reshaped to (96, 128).
    """
    # Use a context manager so the file handle is closed deterministically
    # rather than leaked until garbage collection.
    with open(path, 'rb') as seed_file:
        payload = seed_file.read()[128:]
    seed = np.frombuffer(payload, dtype=np.float32)
    seed = seed.reshape([96, 128])
    return seed


def load_image(path):
    """Load an image file as a model-ready float32 array.

    The image is converted to RGB, resized to 360x360, rescaled from
    [0, 255] to [-1, 1] and laid out with shape (1, 3, 360, 360).
    """
    rgb = Image.open(path).convert('RGB').resize([360, 360])
    pixels = np.array(rgb).astype(np.float32)
    scaled = pixels / 255.0 * 2.0 - 1.0
    # channels-last -> channels-first, plus a leading batch axis
    return scaled.transpose(2, 0, 1).reshape([1, 3, 360, 360])


def save_image(arr, path):
    """Write a [-1, 1] valued array holding 3*360*360 elements to ``path``.

    The array is reshaped to (3, 360, 360), moved to channels-last order,
    rescaled to [0, 255] and cast to uint8 (fractions are truncated) before
    being saved with PIL.
    """
    channels_last = arr.reshape([3, 360, 360]).transpose(1, 2, 0)
    pixels = ((channels_last + 1.0) * (255.0 / 2.0)).astype(np.uint8)
    Image.fromarray(pixels).save(path)


def hash_from_hex(hex_repr):
    """Expand a hex string into an array of 96 bits, most significant first.

    Returns a float64 numpy array of length 96 whose entries are 0.0 or 1.0;
    element 0 corresponds to bit 95 of the parsed integer.
    """
    value = int(hex_repr, 16)
    bits = [(value >> shift) & 1 for shift in range(95, -1, -1)]
    return np.array(bits, dtype=np.float64)


def hash_to_hex(h):
    """Threshold soft bits at 0.5 and render them as a zero-padded hex string.

    Entries >= 0.5 become '1', all others '0'; the hex string has
    ``len(h) // 4`` digits (at least as many as the value needs).
    """
    bit_chars = []
    for value in h:
        bit_chars.append('1' if value >= 0.5 else '0')
    bit_string = ''.join(bit_chars)
    width = len(bit_string) // 4
    return format(int(bit_string, 2), '0{}x'.format(width))

In [89]:
# collide.py

import tensorflow as tf
from scipy.ndimage.filters import gaussian_filter
import os


def collide(o_image,
            o_model=MODEL_PATH,
            o_seed=OUTPUT_MATRIX_PATH,
            o_target='59a34eabe31910abfb06f308',
            o_learning_rate=2.0,
            o_combined_threshold=2,
            o_k=10.0,
            o_l2_weight=2e-3,
            o_tv_weight=1e-4,
            o_hash_weight=0.8,
            o_clip_range=0.1,
            o_iterations=1000,
            o_save_directory='.',
            o_save_iterations=0,
            o_blur=0):
    """Perturb an image by gradient descent until its hash matches a target.

    Progress is printed every iteration. Whenever a new best
    (hash distance, image loss) pair is found -- or every
    ``o_save_iterations`` iterations if that is positive -- the current image
    is written to ``o_save_directory`` as a PNG. Nothing is returned.

    Note: this disables TensorFlow eager execution for the whole process.

    Args:
        o_image: Path of the image to perturb.
        o_model: Path of the ONNX model.
        o_seed: Path of the seed (projection matrix) file.
        o_target: Target hash as a hex string.
        o_learning_rate: Step size applied to the unit-normalised gradient.
        o_combined_threshold: After a collision has been found, the combined
            loss is used while the current distance is non-zero but at most
            this many bits.
        o_k: Scale applied to the normalised projection before the sigmoid.
        o_l2_weight: Weight of the L2 term of the image loss.
        o_tv_weight: Weight of the total-variation term of the image loss.
        o_hash_weight: Weight of the hash loss inside the combined loss (the
            image loss gets ``1 - o_hash_weight``).
        o_clip_range: Soft hash bits are clipped to
            ``(o_clip_range, 1 - o_clip_range)`` and rescaled to (0, 1).
        o_iterations: Number of optimisation steps.
        o_save_directory: Directory for the saved PNG files; created if it
            does not exist.
        o_save_iterations: If > 0, additionally save every this many steps.
        o_blur: If > 0, Gaussian sigma used to blur the perturbation after
            each step.
    """
    tf.compat.v1.disable_eager_execution()

    model = load_model(o_model)
    image = model.tensor_dict['image']
    logits = model.tensor_dict['leaf/logits']
    seed = load_seed(o_seed)

    original = load_image(o_image)
    h = hash_from_hex(o_target)

    # Fail early (before the expensive optimisation) rather than on the first
    # save; an empty string means "current directory" for os.path.join.
    if o_save_directory:
        os.makedirs(o_save_directory, exist_ok=True)

    with model.graph.as_default():
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())

            proj = tf.reshape(tf.linalg.matmul(seed, tf.reshape(logits, (128, 1))), (96,))
            # proj is in R^96; it's interpreted as a 96-bit hash by mapping
            # entries < 0 to the bit '0', and entries >= 0 to the bit '1'
            normalized, _ = tf.linalg.normalize(proj)
            hash_output = tf.sigmoid(normalized * o_k)
            # now, hash_output has entries in (0, 1); it's interpreted by
            # mapping entries < 0.5 to the bit '0' and entries >= 0.5 to the
            # bit '1'

            # we clip hash_output to (clip_range, 1-clip_range); this seems to
            # improve the search (we don't "waste" perturbation tweaking
            # "strong" bits); the sigmoid already does this to some degree, but
            # this seems to help
            hash_output = tf.clip_by_value(hash_output, o_clip_range, 1.0 - o_clip_range) - 0.5
            hash_output = hash_output * (0.5 / (0.5 - o_clip_range))
            hash_output = hash_output + 0.5

            # hash loss: how far away we are from the target hash
            hash_loss = tf.math.reduce_sum(tf.math.squared_difference(hash_output, h))

            perturbation = image - original
            # image loss: how big / noticeable is the perturbation?
            img_loss = o_l2_weight * tf.nn.l2_loss(perturbation) + o_tv_weight * tf.image.total_variation(perturbation)[0]

            # combined loss: try to minimize both at once
            combined_loss = o_hash_weight * hash_loss + (1 - o_hash_weight) * img_loss

            # gradients of all the losses
            g_hash_loss, = tf.gradients(hash_loss, image)
            g_img_loss, = tf.gradients(img_loss, image)
            g_combined_loss, = tf.gradients(combined_loss, image)

            # perform attack

            x = original
            best = (float('inf'), 0)  # (distance, image quality loss)
            dist = float('inf')

            for i in range(o_iterations):
                # we do an alternating projections style attack here; if we
                # haven't found a colliding image yet, only optimize for that;
                # if we have a colliding image, then minimize the size of the
                # perturbation; if we're close, then do both at once
                if dist == 0:
                    loss_name, loss, g = 'image', img_loss, g_img_loss
                elif best[0] == 0 and dist <= o_combined_threshold:
                    loss_name, loss, g = 'combined', combined_loss, g_combined_loss
                else:
                    loss_name, loss, g = 'hash', hash_loss, g_hash_loss

                # compute loss values and gradient
                xq = quantize(x)  # take derivatives wrt the quantized version of the image
                hash_output_v, img_loss_v, loss_v, g_v = sess.run([hash_output, img_loss, loss, g], feed_dict={image: xq})
                # Hamming distance between the current hard bits and the target
                dist = np.sum((hash_output_v >= 0.5) != (h >= 0.5))

                # if it's better than any image found so far, save it
                score = (dist, img_loss_v)
                if score < best or (o_save_iterations > 0 and (i+1) % o_save_iterations == 0):
                    save_image(x, os.path.join(o_save_directory, 'out_iter={:05d}_dist={:02d}_q={:.3f}.png'.format(i+1, dist, img_loss_v)))
                if score < best:
                    best = score

                # gradient descent step along the unit-normalised gradient;
                # a zero or non-finite norm (e.g. every soft bit saturated by
                # the clipping) would turn x into NaN/inf and corrupt all
                # following iterations, so the step is skipped in that case
                g_norm = np.linalg.norm(g_v)
                if np.isfinite(g_norm) and g_norm > 0:
                    x = x - o_learning_rate * (g_v / g_norm)
                if o_blur > 0:
                    x = blur_perturbation(original, x, o_blur)
                x = x.clip(-1, 1)
                print('iteration: {}/{}, best: ({}, {:.3f}), hash: {}, distance: {}, loss: {:.3f} ({})'.format(
                    i+1,
                    o_iterations,
                    best[0],
                    best[1],
                    hash_to_hex(hash_output_v),
                    dist,
                    loss_v,
                    loss_name
                ))

def quantize(x):
    """Snap a [-1, 1] valued array onto the 8-bit pixel grid.

    Values are mapped to [0, 255], cast to uint8 (fractions are truncated),
    and mapped back to [-1, 1] as float32.
    """
    half_range = 255.0 / 2.0
    levels = ((x + 1.0) * half_range).astype(np.uint8)
    return levels.astype(np.float32) / half_range - 1.0


def blur_perturbation(original, x, sigma):
    """Blur only the difference between ``x`` and ``original``.

    The perturbation ``x - original`` is Gaussian-filtered channel by channel
    with the given ``sigma`` and added back onto ``original``.
    """
    smoothed_delta = gaussian_filter_by_channel(x - original, sigma=sigma)
    return original + smoothed_delta


def gaussian_filter_by_channel(x, sigma):
    """Apply a 2-D Gaussian filter to each channel of the first batch item.

    ``x`` is indexed as ``x[0, ch, :, :]``; the filtered channels are stacked
    and returned with a leading axis of length 1.
    """
    blurred_channels = []
    for ch in range(x.shape[1]):
        blurred_channels.append(gaussian_filter(x[0, ch, :, :], sigma))
    return np.expand_dims(np.stack(blurred_channels), axis=0)

In [90]:
# Run the attack on the reference image with the default target hash and settings.
collide(o_image="../shrek.png")

iteration: 100/1000, best: (1, 6.297), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.786 (hash)
iteration: 200/1000, best: (1, 6.297), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.398 (hash)
iteration: 300/1000, best: (1, 6.297), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.329 (hash)
iteration: 400/1000, best: (0, 14.153), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.511 (combined)
iteration: 500/1000, best: (0, 14.153), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.370 (combined)
iteration: 600/1000, best: (0, 14.153), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.340 (combined)
iteration: 700/1000, best: (0, 14.153), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.317 (combined)
iteration: 800/1000, best: (0, 14.153), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.290 (combined)
iteration: 900/1000, best: (0, 14.153), hash: 1ec173f89d10be5300ac0206, distance: 1, loss: 6.280 (combined)
iteration: 1000/1000, best: (0, 14.153), ha

hash of doge: 4d3032644e122d8c7326cfc9
hash of titanic: 1ec173f89d10be5300ac0216
hash of perturbed-doge: 1ec173f89d10be5300ac0216
