# Init

In [30]:
import hnswlib
import numpy as np
import sqlite3
import pickle
import tqdm
import time

In [2]:
# Open the database.
# `conn` is the single module-level SQLite connection that the helper
# functions below obtain their cursors from. The path is relative, so it is
# resolved against the working directory the notebook is run from.
conn = sqlite3.connect('../steam_instructor-xl.db')

In [3]:
# Helper functions

def get_any_description_embedding():
    """Return the unpickled embedding of one arbitrary description row.

    Useful for inspecting what a stored embedding looks like without caring
    which app it belongs to.

    Returns:
        The object obtained by unpickling the ``embedding`` column of the
        first row SQLite returns from ``description_embeddings``.

    Raises:
        LookupError: if ``description_embeddings`` contains no rows.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT embedding FROM description_embeddings LIMIT 1
        ''')
        row = c.fetchone()
    finally:
        # Release the cursor even when the query itself fails.
        c.close()

    if row is None:
        # Without this guard the caller would see an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise LookupError('description_embeddings table is empty')

    # NOTE: pickle.loads can execute arbitrary code; only use this with a
    # database file from a trusted source.
    return pickle.loads(row[0])

def get_description_embeddings_for_appid(appid):
    """Return the unpickled description embedding stored for ``appid``.

    Args:
        appid: value matched against the ``appid`` column of
            ``description_embeddings``.

    Returns:
        The object obtained by unpickling the ``embedding`` column of the
        first matching row.

    Raises:
        LookupError: if no row exists for ``appid``.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT embedding FROM description_embeddings WHERE appid = ?
        ''', (appid,))
        row = c.fetchone()
    finally:
        # Release the cursor even when the query itself fails.
        c.close()

    if row is None:
        # Without this guard an unknown appid surfaces as an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise LookupError(f'no description embedding stored for appid {appid!r}')

    # NOTE: pickle.loads can execute arbitrary code; only use this with a
    # database file from a trusted source.
    return pickle.loads(row[0])

def get_review_embeddings_for_appid(appid):
    """Return every stored review embedding for ``appid``.

    Args:
        appid: value matched against the ``appid`` column of
            ``review_embeddings``.

    Returns:
        dict mapping each ``recommendationid`` to its unpickled ``embedding``
        value. The dict is empty when no row matches.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT recommendationid, embedding
            FROM review_embeddings
            WHERE appid = ?
        ''', (appid,))
        results = c.fetchall()
    finally:
        # Release the cursor even when the query itself fails.
        c.close()

    # NOTE: pickle.loads can execute arbitrary code; only use this with a
    # database file from a trusted source.
    return {recommendationid: pickle.loads(embedding) for recommendationid, embedding in results}

def get_count_appids_with_description_embeddings():
    """Return the number of distinct appids in ``description_embeddings``."""
    c = conn.cursor()
    try:
        c.execute('''
            SELECT COUNT(DISTINCT appid) FROM description_embeddings
        ''')
        # An aggregate without GROUP BY always produces exactly one row.
        return c.fetchone()[0]
    finally:
        # Release the cursor even when the query itself fails.
        c.close()

def get_count_appids_with_review_embeddings():
    """Return the number of distinct appids in ``review_embeddings``."""
    c = conn.cursor()
    try:
        c.execute('''
            SELECT COUNT(DISTINCT appid) FROM review_embeddings
        ''')
        # An aggregate without GROUP BY always produces exactly one row.
        return c.fetchone()[0]
    finally:
        # Release the cursor even when the query itself fails.
        c.close()

def get_all_description_embeddings_generator(page_size):
    """Yield ``(appid, embedding)`` for every row of ``description_embeddings``.

    Rows are read from the cursor ``page_size`` at a time, but yielded one by
    one.

    Args:
        page_size: number of rows requested per ``fetchmany`` call.

    Yields:
        ``(appid, embedding)`` tuples, where ``embedding`` is the unpickled
        value of the ``embedding`` column.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT appid, embedding FROM description_embeddings
        ''')

        while True:
            results = c.fetchmany(page_size)
            if not results:
                break
            for appid, embedding in results:
                # NOTE: pickle.loads can execute arbitrary code; only use this
                # with a database file from a trusted source.
                yield appid, pickle.loads(embedding)
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer stops
        # iterating early (generator closed), so the cursor never stays open.
        c.close()

def get_batched_description_embeddings_generator(page_size):
    """Yield the rows of ``description_embeddings`` in lists of up to ``page_size``.

    Args:
        page_size: number of rows requested per ``fetchmany`` call, and
            therefore the maximum length of each yielded list.

    Yields:
        Lists of ``(appid, embedding)`` tuples, where ``embedding`` is the
        unpickled value of the ``embedding`` column.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT appid, embedding FROM description_embeddings
        ''')

        while True:
            results = c.fetchmany(page_size)
            if not results:
                break
            # NOTE: pickle.loads can execute arbitrary code; only use this
            # with a database file from a trusted source.
            yield [(appid, pickle.loads(embedding)) for appid, embedding in results]
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer stops
        # iterating early (generator closed), so the cursor never stays open.
        c.close()

def get_all_review_embeddings_generator(page_size):
    """Yield ``(appid, embedding)`` for every row of ``review_embeddings``.

    Rows are read from the cursor ``page_size`` at a time, but yielded one by
    one.

    Args:
        page_size: number of rows requested per ``fetchmany`` call.

    Yields:
        ``(appid, embedding)`` tuples, where ``embedding`` is the unpickled
        value of the ``embedding`` column.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT appid, embedding FROM review_embeddings
        ''')

        while True:
            results = c.fetchmany(page_size)
            if not results:
                break
            for appid, embedding in results:
                # NOTE: pickle.loads can execute arbitrary code; only use this
                # with a database file from a trusted source.
                yield appid, pickle.loads(embedding)
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer stops
        # iterating early (generator closed), so the cursor never stays open.
        c.close()

def get_batched_review_embeddings_generator(page_size):
    """Yield the rows of ``review_embeddings`` in lists of up to ``page_size``.

    Args:
        page_size: number of rows requested per ``fetchmany`` call, and
            therefore the maximum length of each yielded list.

    Yields:
        Lists of ``(appid, embedding)`` tuples, where ``embedding`` is the
        unpickled value of the ``embedding`` column.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT appid, embedding FROM review_embeddings
        ''')

        while True:
            results = c.fetchmany(page_size)
            if not results:
                break
            # NOTE: pickle.loads can execute arbitrary code; only use this
            # with a database file from a trusted source.
            yield [(appid, pickle.loads(embedding)) for appid, embedding in results]
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer stops
        # iterating early (generator closed), so the cursor never stays open.
        c.close()

def mean_pooling(embeddings):
    """Average a sequence of vectors element-wise along the first axis.

    Args:
        embeddings: sized sequence (list or array) of equally shaped vectors.

    Returns:
        The sum over axis 0 divided by ``len(embeddings)``.
    """
    vector_count = len(embeddings)
    summed = np.asarray(embeddings).sum(axis=0)
    return summed / vector_count

def get_pooled_description_embedding_for_appid(appid):
    """Mean-pool the stored description embedding of ``appid`` into one vector."""
    description_embeddings = get_description_embeddings_for_appid(appid)
    pooled = mean_pooling(description_embeddings)
    return pooled

def get_pooled_review_embeddings_for_appid(appid):
    """Mean-pool every review embedding stored for ``appid`` into one vector.

    The per-review entries are flattened into a single list first, so each
    individual embedding carries the same weight regardless of which review
    it came from.
    """
    embeddings_by_review = get_review_embeddings_for_appid(appid)

    flattened = []
    for review_vectors in embeddings_by_review.values():
        flattened.extend(review_vectors)

    return mean_pooling(flattened)



def get_index_dimension():
    """Return the length of the first vector of an arbitrary stored description embedding."""
    sample_embedding = get_any_description_embedding()
    first_vector = sample_embedding[0]
    return len(first_vector)

# Recall Testing

Testing the recall of the approximate (HNSW) index against a brute-force index, using description embeddings and a variety of distance metrics. Each run issues 2000 queries.

In [13]:
# Number of random query vectors generated for each recall / timing run.
num_queries = 2000

## `cosine` Distance

**Results Table**  

| k  | ef  | M  | recall | time to build | query time (batched) | query time (individual) |
|----|-----|----|--------|---------------|----------------------|-------------------------|
| 10 | 200 | 54 | 55-56% | N/A | N/A | N/A |
| 10 | 400 | 48 | 77%  | N/A | N/A | N/A |                                          <!-- 77.01%  76.665% 77.075% -->
| 10 | 800 | 48 | 88% | `4m15s` (?) | N/A | N/A |                                   <!-- 87.8%   87.985% 87.745% -->
| 10 | 2000 | 48 | 95% | `7m53s` | `11s` total / `5.5ms` per query | N/A |          <!-- 95.27%  95.18%  95.145% -->
| 10 | 2000 | 32 | 95% | `7m13s` | `10s` total / `4.75ms` per query | `25ms` per query |             <!-- 94.9%   94.96%  94.98%  -->
| 10 | 5000 | 48 | 98% | N/A | `20s` total / `10ms` per query | `50ms` per query |  <!-- 97.68%  97.745% 97.68% -->

<!-- not enough queries made -->
<!--| 10 | 200 | 48 | ~56%    |-->

(?) - This build time seems suspect.


In [46]:
# Index parameters. `dim` and `num_elements` are read from the database.
dim = get_index_dimension()
num_elements = get_count_appids_with_description_embeddings()
k = 10 # Number of nearest neighbors

# ef - the size of the dynamic list for the nearest neighbors.
# Higher = more accurate, but slower.
# NOTE: this notebook passes the same value both as `ef_construction` when the
# index is initialised and as the query-time `ef`.
ef = 2000

# M - the number of bi-directional links created for every new element during construction.
# 12-48 is good for most cases.
# Highly related to dimensionality of the data.
#   dim = 4 -> M = 6 - 12
#   dim = 512 -> M = 48 - 64
M = 32 

In [47]:
# Approximate nearest-neighbour (HNSW) index under test, using cosine distance.
hnsw_index = hnswlib.Index(space='cosine', dim=dim)
# `ef` is used as the construction-time setting here...
hnsw_index.init_index(max_elements=num_elements, ef_construction=ef, M=M)
# ...and as the query-time setting here.
hnsw_index.set_ef(ef)

In [6]:
# Brute-force index over the same space and dimension; its query results are
# what the HNSW results are compared against when computing recall.
bf_index = hnswlib.BFIndex(space='cosine', dim=dim)
bf_index.init_index(max_elements=num_elements)

# Read by the build cell so the brute-force index is only filled once, even if
# the HNSW index is rebuilt with different parameters. Re-running this cell
# resets the flag together with the index.
bf_index_initialized = False

In [48]:
# Build index
#
# Streams description embeddings from the database in batches of 1000,
# mean-pools each one into a single vector and adds it to the HNSW index under
# its appid. The brute-force index is filled in the same pass, but only the
# first time this cell completes.
#
# NOTE(review): if this loop is interrupted part-way, `bf_index` is left
# partially filled while `bf_index_initialized` is still False; re-create
# `bf_index` before re-running in that case.

# The context manager closes the progress bar even if the loop raises or is
# interrupted.
with tqdm.tqdm(total=num_elements) as bar:
    for batch in get_batched_description_embeddings_generator(1000):
        appids, raw_embeddings = zip(*batch)
        embeddings = [mean_pooling(raw_embedding) for raw_embedding in raw_embeddings]

        hnsw_index.add_items(embeddings, appids)

        if not bf_index_initialized:
            bf_index.add_items(embeddings, appids)

        bar.update(len(batch))

bf_index_initialized = True

100%|██████████| 100175/100175 [07:13<00:00, 231.32it/s]


In [51]:

# Time batched queries (num_queries queries at a time)

# Seeded generator so the recall figure is reproducible across kernel restarts.
# Values are uniform in [0, 1), as before.
query_rng = np.random.default_rng(0)
query_data = query_rng.random((num_queries, dim), dtype=np.float32)

print("Querying hnsw index...", end="")
hnsw_begin = time.perf_counter()
labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)
hnsw_end = time.perf_counter()
print("took", hnsw_end - hnsw_begin, "seconds")

print("Querying brute force index...")
labels_bf, distances_bf = bf_index.knn_query(query_data, k)

# Recall: fraction of the labels returned by the HNSW index that also appear
# in the brute-force result for the same query.
correct = sum(
    label in bf_row
    for hnsw_row, bf_row in zip(labels_hnsw, labels_bf)
    for label in hnsw_row
)

print("recall is :", correct / (k * num_queries))

Querying hnsw index...took 9.85190091698314 seconds
Querying brute force index...
recall is : 0.9498


In [52]:
# Time individual queries
# NOTE(review): the two figures below were recorded by hand, presumably from an
# earlier run/configuration - confirm before quoting them.
# One timer on outer loop: 0.056638814979000016 seconds
# Timer on inner loop:     0.05581076550566649 seconds

# Seeded generator so the timing run uses the same queries on every re-run.
# A different seed from the batched cell keeps the two query sets distinct.
query_rng = np.random.default_rng(1)
query_data = query_rng.random((num_queries, dim), dtype=np.float32)

total_time = 0.0

for i in tqdm.trange(num_queries):
    query = query_data[i]

    # Only the query itself is timed; the progress bar and indexing overhead
    # stay outside the measured interval.
    hnsw_begin = time.perf_counter()
    # Result is intentionally discarded. It is not assigned to `_`, because
    # IPython uses that name for the previous cell output.
    hnsw_index.knn_query(query, k)
    hnsw_end = time.perf_counter()

    total_time += hnsw_end - hnsw_begin

print("Average time per query:", total_time / num_queries, "seconds")

  0%|          | 0/2000 [00:00<?, ?it/s]

100%|██████████| 2000/2000 [00:50<00:00, 39.64it/s]

Average time per query: 0.025140569918847177 seconds



