# Hashing Autoencoders and RAMBO

We are attempting to improve the performance of RAMBO by training an autoencoder to act as a hash function. Our eventual goal is to make a library of PDFs easily searchable, but for now we test with the AOL dataset used in our class homework assignments.

## Create training and validation data

Here we simply download the AOL dataset from the appropriate website. We will later hash it and split it into training, validation, and test datasets.

In [1]:
import urllib.request
from pathlib import Path

AOL_URL = "http://www.cim.mcgill.ca/~dudek/206/Logs/AOL-user-ct-collection/user-ct-test-collection-01.txt"

data_dir = Path("data")
data_file = data_dir / "aol.txt"

# Download the dataset only once; later runs reuse the cached copy.
if not data_file.is_file():
    # exist_ok=True already covers the "directory is present" case.
    data_dir.mkdir(parents=True, exist_ok=True)

    # Fetch and decode the whole body before touching the disk, so a failed
    # request cannot leave an empty data file behind.
    with urllib.request.urlopen(AOL_URL) as data_url:
        aol_text = data_url.read().decode("utf-8")

    # Write to a sibling file and rename it into place: data_file only ever
    # exists in its complete form, so the is_file() check above stays a
    # reliable "already downloaded" signal.
    partial_file = data_file.with_name(data_file.name + ".part")
    partial_file.write_text(aol_text, encoding="utf-8")
    partial_file.replace(data_file)

Let's read it into a Pandas `DataFrame` and extract the queries from it.

In [2]:
import numpy as np
import pandas as pd

# The AOL log is tab-separated; keep each distinct, non-missing query once.
data = pd.read_csv(data_file, sep="\t")
queries = data["Query"].dropna()
phrases = queries.unique().tolist()

We split the phrases into unique words, convert each word to a list of character codes (ASCII values for ASCII text), and zero-pad each list to 512 elements in length.

In [27]:
import re

PAD_CONST = 512


def word_to_ascii(word):
    """
    Encode a word as a fixed-length list of character codes.

    Arguments:
    word: String to encode.

    Returns a list of exactly PAD_CONST integers: the `ord` value of each
    character, right-padded with zeros. Words longer than PAD_CONST characters
    are truncated so that every encoded word has the same length and the
    results can be stacked into a rectangular array.

    Note that `ord` yields Unicode code points, so non-ASCII characters
    produce values above 127.
    """
    codes = [ord(char) for char in word[:PAD_CONST]]
    codes.extend([0] * (PAD_CONST - len(codes)))
    return codes

# Breaks phrases into words and collects the unique ones.
def preprocess(queries=None):
    """
    Collect the distinct words that occur in a collection of phrases.

    Arguments:
    queries: Iterable of strings to split. When omitted, the module-level
        `phrases` list is used, which is what the original zero-argument
        call did.

    Returns a list of the unique `\\w+` tokens found, in no particular order.
    """
    if queries is None:
        queries = phrases

    keywords = set()
    for query in queries:
        keywords.update(re.findall(r"\w+", query))
    return list(keywords)

# Encode every unique word as one fixed-length row of character codes.
word_set = preprocess()

phrases_ascii = np.array([word_to_ascii(word) for word in word_set])
phrases_ascii.shape


(445286, 512)

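Now we hash the phrases with MurmurHash and expand each 32-bit hash into a bit vector, originally intended as our `y` variable. (The autoencoder below is trained to reconstruct its own input, so these hashes are not used as training targets.)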

In [4]:
from sklearn.utils import murmurhash3_32


# Adapted from: https://stackoverflow.com/a/47521145
def vec_bin_array(arr, m):
    """
    Arguments:
    arr: Numpy array (or array-like) of non-negative integers
    m: Number of bits of each integer to retain

    Returns a new int64 array of shape `arr.shape + (m,)` in which every
    element of arr is replaced with its bit vector, most significant bit
    first. Values that need fewer than m bits are zero-padded on the left;
    values that need more keep only their m least significant bits.
    Empty input yields an empty result with a trailing axis of length m.
    """
    values = np.asarray(arr)
    # Work in unsigned 64-bit so shifts are well defined for every input dtype.
    work = values.astype(np.uint64)

    ret = np.zeros(values.shape + (m,), dtype=np.int64)

    # uint64 holds at most 64 significant bits; any wider request is pure
    # left padding, which the zero-initialised array already provides.
    width = min(m, 64)
    if width > 0:
        # Shift amounts that move bit (width-1), ..., 1, 0 into position 0.
        shifts = np.arange(width - 1, -1, -1, dtype=np.uint64)
        ret[..., m - width:] = (work[..., np.newaxis] >> shifts) & np.uint64(1)

    return ret


# One non-negative 32-bit MurmurHash3 value per phrase, with a fixed seed so
# the hashes are reproducible.
hash_values = np.array(
    [murmurhash3_32(phrase, seed=2021, positive=True) for phrase in phrases]
)

# Expand every hash into its 32 individual bits.
phrases_hashed = vec_bin_array(hash_values, 32)
phrases_hashed.shape

(1216652, 32)

## Building our model

Before we train our encoder, we split the dataset into training, testing, and validation sets.

In [28]:
from sklearn.model_selection import train_test_split

# The encoded words serve as both input and target, since the model is trained
# to reconstruct its input. First hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    phrases_ascii,
    phrases_ascii,
    random_state=2021,
    test_size=0.2,
)

# Then carve a validation set out of the remaining rows (25% of them).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=2021,
    test_size=0.25,
)

Now we're finally ready to create `tf.data.Dataset` objects out of our data. This is an API provided by TensorFlow which allows for easy manipulation of data for training models.

In [29]:
import tensorflow as tf

BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 96


def _as_batches(features, targets):
    """Wrap paired arrays in a shuffled, batched `tf.data.Dataset`."""
    pairs = tf.data.Dataset.from_tensor_slices((features, targets))
    return pairs.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)


# All three splits go through the same shuffle-then-batch pipeline.
train_dataset = _as_batches(X_train, y_train)
val_dataset = _as_batches(X_val, y_val)
test_dataset = _as_batches(X_test, y_test)

Next, we define our model. We focus on the encoding portion of the encoder-decoder pair, as that is what concerns us the most.

In [47]:
from tensorflow import keras
from tensorflow.keras import layers, regularizers

latent_dim = 32


class Autoencoder(keras.models.Model):
    """
    Fully connected autoencoder whose bottleneck output is used as a code.

    The encoder narrows the input through 256, 128 and 64 units down to
    `latent_dim`; the decoder mirrors it back up to `feature_dim`. Every
    layer uses ReLU, so codes and reconstructions are non-negative.

    Arguments:
    latent_dim: Width of the bottleneck layer, i.e. the length of the code.
    feature_dim: Width of the reconstructed output. Defaults to 512, the
        length the notebook pads its inputs to.
    """

    def __init__(self, latent_dim, feature_dim=512):
        super().__init__()
        self.latent_dim = latent_dim
        self.feature_dim = feature_dim
        self.encoder = keras.Sequential(
            [
                layers.Dense(256, activation="relu"),
                layers.Dense(128, activation="relu"),
                layers.Dense(64, activation="relu"),
                # L1 activity penalty on the code; note that 10e-3 == 0.01.
                layers.Dense(
                    self.latent_dim,
                    activation="relu",
                    activity_regularizer=regularizers.l1(10e-3),
                ),
            ]
        )

        self.decoder = keras.Sequential(
            [
                layers.Dense(64, activation="relu"),
                layers.Dense(128, activation="relu"),
                layers.Dense(256, activation="relu"),
                layers.Dense(self.feature_dim, activation="relu"),
            ]
        )

    def call(self, x):
        """Return the reconstruction of x after an encode/decode round trip."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def encode(self, x):
        """Return the encoder output (the `latent_dim`-wide code) for x."""
        return self.encoder(x)

    def decode(self, x):
        """Return the decoder output for the code x."""
        return self.decoder(x)


# Instantiate the autoencoder and create its weights for 512-wide inputs
# (batch size left unspecified) so that summary() can report parameter counts.
model = Autoencoder(latent_dim)
model.build(input_shape=(None, 512))

# Train by minimising the mean squared reconstruction error with Adam.
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])
model.summary()

Model: "autoencoder_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_26 (Sequential)   (None, 32)                174560    
_________________________________________________________________
sequential_27 (Sequential)   (None, 512)               175040    
Total params: 349,600
Trainable params: 349,600
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Give up once the validation loss has gone 10 epochs without improving.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", mode="min", patience=10, verbose=1
)

# Cut the learning rate tenfold after 5 stagnant epochs, never below 1e-5.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    factor=0.1, patience=5, min_lr=0.00001, verbose=1
)

# Keep only the weights of the best epoch seen so far on disk.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "model-tgs-salt.h5", verbose=1, save_best_only=True, save_weights_only=True
)

callbacks = [early_stopping, reduce_lr, checkpoint]

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=40,
    callbacks=callbacks,
)

Epoch 1/40

Epoch 00001: val_loss improved from inf to 16.09947, saving model to model-tgs-salt.h5
Epoch 2/40

Epoch 00002: val_loss improved from 16.09947 to 10.07854, saving model to model-tgs-salt.h5
Epoch 3/40

Epoch 00003: val_loss improved from 10.07854 to 9.79567, saving model to model-tgs-salt.h5
Epoch 4/40

Epoch 00004: val_loss improved from 9.79567 to 8.85660, saving model to model-tgs-salt.h5
Epoch 5/40

Epoch 00005: val_loss improved from 8.85660 to 8.82855, saving model to model-tgs-salt.h5
Epoch 6/40

Epoch 00006: val_loss improved from 8.82855 to 8.75421, saving model to model-tgs-salt.h5
Epoch 7/40

Epoch 00007: val_loss did not improve from 8.75421
Epoch 8/40

Epoch 00008: val_loss improved from 8.75421 to 8.56468, saving model to model-tgs-salt.h5
Epoch 9/40

Epoch 00009: val_loss improved from 8.56468 to 8.54370, saving model to model-tgs-salt.h5
Epoch 10/40

Epoch 00010: val_loss did not improve from 8.54370
Epoch 11/40

Epoch 00011: val_loss did not improve from 8

Epoch 38/40

Epoch 00038: val_loss did not improve from 2.75405
Epoch 39/40

### Plotting training results

Here we plot our training results, just to convince ourselves that our model is working.

In [None]:
import matplotlib.pyplot as plt

# TF 2.x records metrics=["accuracy"] under "accuracy"/"val_accuracy", while
# older Keras releases used "acc"/"val_acc". Pick whichever key is present so
# the plot works on both instead of raising KeyError.
accuracy_key = "accuracy" if "accuracy" in history.history else "acc"

# One figure per curve: first accuracy, then loss, each with train/validation.
for metric_key, label in ((accuracy_key, "accuracy"), ("loss", "loss")):
    plt.plot(history.history[metric_key])
    plt.plot(history.history["val_" + metric_key])
    plt.title("model " + label)
    plt.ylabel(label)
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

### Testing our model
We test our model on the test dataset we set aside earlier.

In [None]:
# Score the trained model on the held-out test batches. The model was compiled
# with one loss and one metric, which is what the label in the printout
# below refers to.
results = model.evaluate(test_dataset)

print("Test loss, test accuracy: ", results)

In [46]:
# Print the latent code of a few hand-picked words: four near-identical
# strings followed by four unrelated ones.
for sample_word in (
    "test1",
    "test2",
    "test3",
    "test4",
    "apple",
    "pear",
    "shakespear",
    "beatles",
):
    sample_batch = tf.convert_to_tensor([word_to_ascii(sample_word)])
    print(model.encode(sample_batch))

tf.Tensor(
[[0.         0.         0.         1.6588633  0.         0.
  0.49578863 0.         0.         0.         1.1204555  0.
  0.         0.         1.3190415  0.         0.         0.
  0.         0.5487397  0.         0.         0.         0.
  0.         0.         3.9223642  0.         0.         2.0180912
  0.         0.        ]], shape=(1, 32), dtype=float32)
tf.Tensor(
[[0.         0.         0.         1.6487377  0.         0.
  0.49603695 0.         0.         0.         1.1241846  0.
  0.         0.         1.3250573  0.         0.         0.
  0.         0.5537174  0.         0.         0.         0.
  0.         0.         3.9255157  0.         0.         2.017427
  0.         0.        ]], shape=(1, 32), dtype=float32)
tf.Tensor(
[[0.         0.         0.         1.638612   0.         0.
  0.4962849  0.         0.         0.         1.1279136  0.
  0.         0.         1.3310716  0.         0.         0.
  0.         0.55869544 0.         0.         0.         0.


In [34]:
# Encode four sample words as single-row batches; the first and third word
# are both "test1".
code1, code2, code3, code4 = (
    model.encode(tf.convert_to_tensor([word_to_ascii(sample_word)]))
    for sample_word in ("test1", "apple", "test1", "pear")
)

In [39]:

def map_code_to_hash_val(code, range):
    """
    Unfinished helper that starts mapping an autoencoder code to a position.

    Arguments:
    code: Output of the autoencoder for one word. `.numpy()[0]` is applied to
        it, so it must be a tensor whose first row holds the code.
    range: Length of the Bloom filter. Note that this parameter name shadows
        the builtin `range` inside the function.

    Prints the first row of `code`. There is no return statement, so the
    function returns None.
    """
    # code is the output of autoencoder of a word
    # range is the length of the bloomfilter
    
    # convert tensor to array
    arr = code.numpy()[0]
    print(arr)
    # NOTE(review): this divides by the tensor `code` itself, not by a count;
    # presumably the number of code dimensions was intended - confirm. The
    # result is never used.
    sub_range_len = range/code
    # NOTE(review): assigned but never used; meaning of 128 is not shown here.
    max_val = 128
    #for i in range(32):
        
# Demo call with the code for "apple" and a filter length of 2**10.
map_code_to_hash_val(code2, pow(2,10))

[ 12.567999   0.         0.         0.        25.422573   0.
   0.         0.         0.        66.64754    0.         0.
   0.         0.        27.471172  38.67737    0.        70.55087
  51.201347  55.41217  115.450645   0.         0.         0.
   0.         0.        46.195705  16.857456   0.         0.
   0.         0.      ]


### Saving the model

We save the model in TensorFlow's own `SavedModel` format to use later.

In [None]:
# Persist the trained autoencoder under the path "dense_encoder".
# NOTE(review): a suffix-less path selects the SavedModel format only on
# TF 2.x Keras; newer Keras releases are reported to require a .keras or .h5
# suffix - confirm against the installed version.
model.save("dense_encoder")