# Hashing Autoencoders and RAMBO

We are attempting to improve the performance of RAMBO by training an autoencoder to act as a hash function. Our eventual goal is to make a library of PDFs easily searchable, but for now we test with the AOL dataset used in class for our homeworks.

## Create training and validation data

Here we simply download the AOL dataset from the appropriate website. We will later hash it and split it into training and validation datasets.

In [2]:
import urllib.request
from pathlib import Path

AOL_URL = "http://www.cim.mcgill.ca/~dudek/206/Logs/AOL-user-ct-collection/user-ct-test-collection-01.txt"

data_dir = Path("data")
data_file = Path("data/aol.txt")

if not data_file.is_file():
    if not data_dir.is_dir():
        data_dir.mkdir(parents=True, exist_ok=True)

    with urllib.request.urlopen(AOL_URL) as data_url, data_file.open(
        "w", encoding="utf-8"
    ) as fd:
        fd.write(data_url.read().decode("utf-8"))

Let's read it into a Pandas `DataFrame` and extract the queries from it.

In [3]:
import numpy as np
import pandas as pd

data = pd.read_csv(data_file, sep="\t")
phrases = data.Query.dropna().unique().tolist()

We convert the phrases to lists of ASCII numbers and pad them to 512 elements in length.

In [4]:
PAD_CONST = 512


def word_to_ascii(word):
    ascii_word = list(map(ord, word))
    padded_ascii = ascii_word + ([0] * (PAD_CONST - len(ascii_word)))
    return padded_ascii


phrases_ascii = np.array(list(map(word_to_ascii, phrases)))
phrases_ascii.shape

(1216652, 512)

Now we hash them with MurmurHash for our `y` variable

In [5]:
from sklearn.utils import murmurhash3_32


# Taken from: https://stackoverflow.com/a/47521145
def vec_bin_array(arr, m):
    """
    Arguments:
    arr: Numpy array of positive integers
    m: Number of bits of each integer to retain

    Returns a copy of arr with every element replaced with a bit vector.
    Bits encoded as int8's.
    """
    to_str_func = np.vectorize(lambda x: np.binary_repr(x).zfill(m))
    strs = to_str_func(arr)
    ret = np.zeros(list(arr.shape) + [m], dtype=np.int64)
    for bit_ix in range(0, m):
        fetch_bit_func = np.vectorize(lambda x: x[bit_ix] == "1")
        ret[..., bit_ix] = fetch_bit_func(strs).astype("int8")

    return ret


phrases_hashed = vec_bin_array(
    np.array(list(map(lambda x: murmurhash3_32(x, seed=2021, positive=True), phrases))),
    32,
)
phrases_hashed.shape

(1216652, 32)

## Building our model

Before we train our encoder, we split the dataset into training, testing, and validation sets.

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    phrases_ascii, phrases_ascii, test_size=0.2, random_state=2021
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=2021
)

Now we're finally ready to create `tf.Dataset` objects out of our data. This is an API provided by Tensorflow which allows for easy manipulation of data for training models.

In [9]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 96

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
val_dataset = val_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)

Next, we define our model. We focus on the encoding portion of the encoder-decoder pair, as that is what concerns us the most.

In [17]:
from tensorflow import keras
from tensorflow.keras import layers

latent_dim = 32


class Autoencoder(keras.models.Model):
    def __init__(self, latent_dim):
        super(Autoencoder, self).__init__()
        self.latent_dim = latent_dim
        self.encoder = keras.Sequential(
            [
                layers.Dense(256, activation="relu"),
                layers.Dense(128, activation="relu"),
                layers.Dense(64, activation="relu"),
                layers.Dense(self.latent_dim, activation="relu"),
            ]
        )

        self.decoder = keras.Sequential(
            [
                layers.Dense(64, activation="relu"),
                layers.Dense(128, activation="relu"),
                layers.Dense(256, activation="relu"),
                layers.Dense(512, activation="relu"),
            ]
        )

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def encode(self, x):
        return self.encoder(x)

    def decode(self, x):
        return self.decoder(x)


model = Autoencoder(latent_dim)
model.build((None, 512))
model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()

Model: "autoencoder_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_2 (Sequential)    (None, 32)                174560    
_________________________________________________________________
sequential_3 (Sequential)    (None, 512)               175040    
Total params: 349,600
Trainable params: 349,600
Non-trainable params: 0
_________________________________________________________________


In [18]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", mode="min", patience=10, verbose=1
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        factor=0.1, patience=5, min_lr=0.00001, verbose=1
    ),
    tf.keras.callbacks.ModelCheckpoint(
        "model-tgs-salt.h5", verbose=1, save_best_only=True, save_weights_only=True
    ),
]

history = model.fit(
    train_dataset, epochs=20, callbacks=callbacks, validation_data=val_dataset
)

Epoch 1/5

Epoch 00001: val_loss improved from inf to 24.82251, saving model to model-tgs-salt.h5
Epoch 2/5

Epoch 00002: val_loss improved from 24.82251 to 23.80871, saving model to model-tgs-salt.h5
Epoch 3/5

Epoch 00003: val_loss improved from 23.80871 to 21.05137, saving model to model-tgs-salt.h5
Epoch 4/5

Epoch 00004: val_loss improved from 21.05137 to 20.08895, saving model to model-tgs-salt.h5
Epoch 5/5

Epoch 00005: val_loss did not improve from 20.08895


### Plotting training results

Here we plot our training results, just to convince ourselves that our model is working.

In [9]:
import matplotlib.pyplot as plt

#  "Accuracy"
plt.plot(history.history["acc"])
plt.plot(history.history["val_acc"])
plt.title("model accuracy")
plt.ylabel("accuracy")
plt.xlabel("epoch")
plt.legend(["train", "validation"], loc="upper left")
plt.show()
# "Loss"
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "validation"], loc="upper left")
plt.show()

KeyError: 'acc'

### Testing our model
We test our model on the test dataset we set aside earlier.

In [19]:
results = model.evaluate(test_dataset)

print("Test loss, test accuracy: ", results)

Test loss, test accuracy:  [21.552907943725586, 0.3331264853477478]


In [31]:
print(model.encode(tf.convert_to_tensor([(word_to_ascii("test1"))])))
print(model.encode(tf.convert_to_tensor([(word_to_ascii("test2"))])))
print(model.encode(tf.convert_to_tensor([(word_to_ascii("test3"))])))
print(model.encode(tf.convert_to_tensor([(word_to_ascii("test4"))])))

tf.Tensor(
[[ 33.52078   34.702988   0.         0.        81.65477  117.56293
   38.503662  35.110764  68.977264  47.95541    0.        59.143414
    0.        70.062996  55.42965   35.76763    0.        32.363712
   28.937103  35.198166  92.18039   50.493496   0.        47.72702
    0.       120.11927   36.761654   0.        34.79291   70.11691
   78.18633    0.      ]], shape=(1, 32), dtype=float32)
tf.Tensor(
[[ 33.30903   34.671623   0.         0.        81.57011  117.166084
   38.776367  35.472603  68.5487    47.83555    0.        59.083603
    0.        69.72224   55.407513  36.247406   0.        32.132687
   28.953465  35.232616  92.30373   50.025177   0.        47.62036
    0.       120.35853   36.702152   0.        34.419674  69.945496
   78.15209    0.      ]], shape=(1, 32), dtype=float32)
tf.Tensor(
[[ 33.097713  34.643894   0.         0.        81.47064  116.74923
   39.068504  35.844177  68.110176  47.71743    0.        59.023647
    0.        69.39486   55.36061   36.722

### Saving the model

We saved the model in Tensorflow's own `SavedModel` format to use later.

In [None]:
model.save("dense_encoder")