# Init

In [1]:
import hnswlib
import numpy as np
import sqlite3
import pickle
import tqdm

In [2]:
# Open the SQLite database that the helper functions below query through
# the module-level `conn` handle.
db_path = '../steam_instructor-xl.db'
conn = sqlite3.connect(db_path)

In [3]:
# Helper functions

def get_any_description_embedding():
    """Return the unpickled embedding of one arbitrary description row.

    Reads a single row from ``description_embeddings`` through the
    module-level ``conn`` and unpickles its ``embedding`` column.

    Returns:
        Whatever object was pickled into the ``embedding`` column.

    Raises:
        TypeError: If the table is empty (``fetchone()`` returns ``None``).
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT embedding FROM description_embeddings LIMIT 1
        ''')
        row = c.fetchone()
    finally:
        # Release the cursor even when the query fails.
        c.close()

    # NOTE: pickle.loads can execute arbitrary code from a crafted payload;
    # only use this with a database file you trust.
    return pickle.loads(row[0])

def get_description_embeddings_for_appid(appid):
    """Return the unpickled description embedding stored for ``appid``.

    Args:
        appid: Value matched against the ``appid`` column of
            ``description_embeddings``.

    Returns:
        The object unpickled from the ``embedding`` column of the first
        matching row.

    Raises:
        TypeError: If no row matches (``fetchone()`` returns ``None``).
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT embedding FROM description_embeddings WHERE appid = ?
        ''', (appid,))
        row = c.fetchone()
    finally:
        # Release the cursor even when the lookup fails.
        c.close()

    # NOTE: pickle.loads can execute arbitrary code from a crafted payload;
    # only use this with a database file you trust.
    return pickle.loads(row[0])

def get_review_embeddings_for_appid(appid):
    """Return every review embedding stored for ``appid``.

    Args:
        appid: Value matched against the ``appid`` column of
            ``review_embeddings``.

    Returns:
        dict mapping each ``recommendationid`` to the object unpickled from
        its ``embedding`` column. Empty when no row matches. If a
        ``recommendationid`` occurs more than once, the last row wins.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT recommendationid, embedding
            FROM review_embeddings
            WHERE appid = ?
        ''', (appid,))
        rows = c.fetchall()
    finally:
        # Release the cursor even when the query fails.
        c.close()

    # NOTE: pickle.loads can execute arbitrary code from a crafted payload;
    # only use this with a database file you trust.
    return {recommendationid: pickle.loads(embedding) for recommendationid, embedding in rows}

def get_count_appids_with_description_embeddings():
    """Count the distinct ``appid`` values in ``description_embeddings``.

    Returns:
        The value of ``COUNT(DISTINCT appid)`` for the table.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT COUNT(DISTINCT appid) FROM description_embeddings
        ''')
        return c.fetchone()[0]
    finally:
        # Release the cursor even when the query fails.
        c.close()

def get_count_appids_with_review_embeddings():
    """Count the distinct ``appid`` values in ``review_embeddings``.

    Returns:
        The value of ``COUNT(DISTINCT appid)`` for the table.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT COUNT(DISTINCT appid) FROM review_embeddings
        ''')
        return c.fetchone()[0]
    finally:
        # Release the cursor even when the query fails.
        c.close()

def get_all_description_embeddings_generator(page_size):
    """Yield ``(appid, embedding)`` for every row of ``description_embeddings``.

    Rows are pulled from the cursor ``page_size`` at a time via
    ``fetchmany`` and yielded one by one, with the ``embedding`` column
    unpickled.

    Args:
        page_size: Number of rows requested from the cursor per fetch.

    Yields:
        Tuples of ``(appid, unpickled embedding)``.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT appid, embedding FROM description_embeddings
        ''')

        while True:
            results = c.fetchmany(page_size)
            if not results:
                break
            for appid, embedding in results:
                # NOTE: pickle.loads can execute arbitrary code from a crafted
                # payload; only use this with a database file you trust.
                yield appid, pickle.loads(embedding)
    finally:
        # Runs on exhaustion, on error, and when the consumer stops early
        # (generator close), so the cursor is always released.
        c.close()

def get_batched_description_embeddings_generator(page_size):
    """Yield the rows of ``description_embeddings`` in batches.

    Each batch corresponds to one ``fetchmany(page_size)`` call on the
    cursor, with the ``embedding`` column unpickled.

    Args:
        page_size: Number of rows requested from the cursor per batch.

    Yields:
        Lists of ``(appid, unpickled embedding)`` tuples, one list per fetch.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT appid, embedding FROM description_embeddings
        ''')

        while True:
            results = c.fetchmany(page_size)
            if not results:
                break
            # NOTE: pickle.loads can execute arbitrary code from a crafted
            # payload; only use this with a database file you trust.
            yield [(appid, pickle.loads(embedding)) for appid, embedding in results]
    finally:
        # Runs on exhaustion, on error, and when the consumer stops early
        # (generator close), so the cursor is always released.
        c.close()

def get_all_review_embeddings_generator(page_size):
    """Yield ``(appid, embedding)`` for every row of ``review_embeddings``.

    Rows are pulled from the cursor ``page_size`` at a time via
    ``fetchmany`` and yielded one by one, with the ``embedding`` column
    unpickled.

    Args:
        page_size: Number of rows requested from the cursor per fetch.

    Yields:
        Tuples of ``(appid, unpickled embedding)``.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT appid, embedding FROM review_embeddings
        ''')

        while True:
            results = c.fetchmany(page_size)
            if not results:
                break
            for appid, embedding in results:
                # NOTE: pickle.loads can execute arbitrary code from a crafted
                # payload; only use this with a database file you trust.
                yield appid, pickle.loads(embedding)
    finally:
        # Runs on exhaustion, on error, and when the consumer stops early
        # (generator close), so the cursor is always released.
        c.close()

def get_batched_review_embeddings_generator(page_size):
    """Yield the rows of ``review_embeddings`` in batches.

    Each batch corresponds to one ``fetchmany(page_size)`` call on the
    cursor, with the ``embedding`` column unpickled.

    Args:
        page_size: Number of rows requested from the cursor per batch.

    Yields:
        Lists of ``(appid, unpickled embedding)`` tuples, one list per fetch.
    """
    c = conn.cursor()
    try:
        c.execute('''
            SELECT appid, embedding FROM review_embeddings
        ''')

        while True:
            results = c.fetchmany(page_size)
            if not results:
                break
            # NOTE: pickle.loads can execute arbitrary code from a crafted
            # payload; only use this with a database file you trust.
            yield [(appid, pickle.loads(embedding)) for appid, embedding in results]
    finally:
        # Runs on exhaustion, on error, and when the consumer stops early
        # (generator close), so the cursor is always released.
        c.close()

def mean_pooling(embeddings):
    """Average a collection of vectors element-wise.

    Sums ``embeddings`` along axis 0 and divides by ``len(embeddings)``.
    """
    count = len(embeddings)
    total = np.sum(embeddings, axis=0)
    return total / count

def get_pooled_description_embedding_for_appid(appid):
    """Return the mean-pooled description embedding for ``appid``.

    Fetches the stored description embedding with
    ``get_description_embeddings_for_appid`` and reduces it with
    ``mean_pooling``.
    """
    description_embeddings = get_description_embeddings_for_appid(appid)
    return mean_pooling(description_embeddings)

def get_pooled_review_embeddings_for_appid(appid):
    """Return one mean-pooled vector over every review embedding of ``appid``.

    The per-review embeddings returned by ``get_review_embeddings_for_appid``
    are flattened into a single list (each review contributes all of its
    entries) and that list is reduced with ``mean_pooling``.
    """
    embeddings_by_review = get_review_embeddings_for_appid(appid)

    flat_review_embeddings = []
    for review_id in embeddings_by_review:
        flat_review_embeddings.extend(embeddings_by_review[review_id])

    return mean_pooling(flat_review_embeddings)



def get_index_dimension():
    """Return the length of the first entry of a sample description embedding.

    The sample comes from ``get_any_description_embedding``; the result is
    used as the vector dimension of the indices.
    """
    sample_embedding = get_any_description_embedding()
    first_vector = sample_embedding[0]
    return len(first_vector)

# Recall Testing

Measure the recall of the HNSW index against a brute-force baseline on the description embeddings, for a variety of distance metrics, using 2000 queries per run.

In [13]:
# Number of random query vectors issued per recall measurement.
num_queries = 2000

## `cosine` Distance

**Results Table**  

| k  | ef  | M  | recall |
|----|-----|----|--------|
| 10 | 200 | 54 | 55-56% |
| 10 | 400 | 48 | 77%  | <!-- 77.01%  76.665% 77.075% -->
| 10 | 800 | 48 | testing |

<!-- not enough queries made -->
<!--| 10 | 200 | 48 | ~56%    |-->



In [21]:
# Capacity and vector dimension both come from the description embeddings
# stored in the database.
num_elements = get_count_appids_with_description_embeddings()
dim = get_index_dimension()

# k - number of nearest neighbors requested per query.
k = 10

# ef - size of the dynamic candidate list kept for the nearest neighbors
# during the search. Larger values are more accurate but slower.
ef = 800

# M - number of bi-directional links created for every new element while
# the graph is constructed. 12-48 works for most cases; the right value is
# strongly tied to the dimensionality of the data:
#   dim = 4   -> M = 6 - 12
#   dim = 512 -> M = 48 - 64
M = 48

In [22]:
# Approximate nearest-neighbor index under cosine distance. The same `ef`
# value is used for graph construction (ef_construction) and for querying
# (set_ef).
hnsw_index = hnswlib.Index(space='cosine', dim=dim)
hnsw_index.init_index(
    max_elements=num_elements,
    ef_construction=ef,
    M=M,
)
hnsw_index.set_ef(ef)

In [6]:
# Brute-force index under the same cosine space; its query results are the
# reference that the HNSW results are compared against.
bf_index = hnswlib.BFIndex(space='cosine', dim=dim)
bf_index.init_index(max_elements=num_elements)

# Set to True by the build cell after bf_index has been filled, so that
# re-running the build for a new HNSW configuration does not add the items
# to bf_index again. Re-running this cell resets both the index and the flag.
bf_index_initialized = False

In [23]:
# Build index
#
# Streams the description embeddings from the database in batches of 1000,
# mean-pools each app's embedding into a single vector and adds it to the
# HNSW index under its appid. The brute-force index is filled only on the
# first build (guarded by `bf_index_initialized`).

# The context manager closes the progress bar even if the loop raises.
with tqdm.tqdm(total=num_elements) as bar:
    for batch in get_batched_description_embeddings_generator(1000):
        appids, embeddings = zip(*batch)
        embeddings = [mean_pooling(embedding) for embedding in embeddings]

        hnsw_index.add_items(embeddings, appids)

        if not bf_index_initialized:
            bf_index.add_items(embeddings, appids)

        bar.update(len(batch))

# Only reached when the whole build succeeded.
bf_index_initialized = True

100%|██████████| 100175/100175 [04:15<00:00, 391.78it/s]


In [20]:
# Uniform random float32 query vectors. No seed is set, so the measured
# recall varies slightly from run to run.
query_data = np.random.random((num_queries, dim)).astype(np.float32)

print("querying hnsw index...")
labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)

print("querying brute force index...")
labels_bf, distances_bf = bf_index.knn_query(query_data, k)

# Count every HNSW result label that also appears in the brute-force
# result for the same query.
correct = sum(
    1
    for found, truth in zip(labels_hnsw, labels_bf)
    for label in found
    if label in truth
)

print("recall is :", correct / float(k * num_queries))

querying hnsw index...
querying brute force index...
recall is : 0.77075
