In [1]:
import polars as pl
import openai
from babydragon.memory.indexes.numpy_index import NpIndex
from babydragon.models.embedders.ada2 import OpenAiEmbedder
import time


In [2]:
# Initialize an OpenAiEmbedder instance
openai_embedder = OpenAiEmbedder()
openai.api_key = ""
# Initialize a NpIndex instance using OpenAiEmbedder
# values = ['Hello, world!', 'This is a test sentence.', 'OpenAI is amazing!','cake','pie','ice cream', 'Buffer Errors']
# embeddings = openai_embedder.embed(values)


In [3]:
import numpy as np
import uuid

import random




def generate_data(rows):
    # Generate a list of unique strings
    # start = time.time()
    # unique_strings = [str(uuid.uuid4()) for _ in range(rows)]
    # end = time.time()
    # print(f"generated unique strings in {end-start} seconds")
    # Generate a numpy array of size (rows, 1508) with random values
    print("generating random array")
    start = time.time()
    random_array = np.random.rand(rows, 1536).astype(np.float32)
    end = time.time()
    query_vector = np.random.rand(1536).astype(np.float32)  
    print(f"generated random array in {end-start} seconds")
    start = time.time()
    # Normalize each vector in the numpy array
    norms = np.linalg.norm(random_array, axis=1, keepdims=True)
    normalized_array = random_array / norms
    # normalized_array = random_array.astype(np.float32)
    normalized_query_vector = query_vector / np.linalg.norm(query_vector)
    end = time.time()
    print(f"normalized random array in {end-start} seconds")
    # Convert numpy array to list of lists
    start = time.time()
    # list_of_lists = normalized_array.astype(np.float32).tolist()
    end = time.time()
    # query_vector_as_list = normalized_query_vector.astype(np.float32).tolist()
    return normalized_array,normalized_query_vector

def convert_to_numpy_array(embeddings_list,embedding_query):
    embeddings_array = np.array(embeddings_list)
    query_vector = np.array(embedding_query)
    return embeddings_array, query_vector

def convert_to_polars_array(embeddings_array,embedding_query):
    df = pl.DataFrame({'embeddings': embeddings_array}, schema={'embeddings':pl.List( inner=pl.Float32) }) #:pl.Array(width=1536, inner=pl.Float64)
    query_as_series = pl.Series(embedding_query, dtype=pl.Float32)
    return df,query_as_series


def numpy_search(query_vector, embeddings_array,  top_k):
    # Compute dot product of query vector with each row in the embeddings array
    dot_product_array = np.dot(embeddings_array, query_vector)
    # Sort dot product array in descending order and get the indices of the top_k rows
    top_k_indices = np.argsort(dot_product_array)[::-1][:top_k]
    # Get the values corresponding to the top_k_indices
    top_k_scores = [dot_product_array[i] for i in top_k_indices]
    return top_k_scores

def polar_search(query_as_series,df,top_k):
    dot_product_frame = df.with_columns(df["embeddings"].list.eval(pl.element().explode().dot(query_as_series),parallel=True).list.first().alias("dot_product"))
    # Sort by dot product and select top_k rows
    result = dot_product_frame.sort('dot_product', descending=True).slice(0, top_k)
    return result

In [4]:
def test_speed():
    row_sizes = [10, 100, 1000, 10000, 100000, 1000000]  # Add more if needed
    top_k = 10

    for rows in row_sizes:
        print(f"Testing with {rows} rows...")
        embeddings_array, query_vector = generate_data(rows)


       
        # Execute top-k search with numpy and measure time
        start_time = time.time()
        top_k_scores = numpy_search(query_vector, embeddings_array, top_k)
        numpy_search_time = time.time() - start_time
        print(f"Time to execute top-k search with numpy: {numpy_search_time} seconds")

         # Convert to polars dataframe and measure time
        start_time = time.time()
        polars_df, polars_query = convert_to_polars_array(embeddings_array, query_vector)
        polars_conversion_time = time.time() - start_time
        print(f"Time to convert to polars dataframe: {polars_conversion_time} seconds")

        del embeddings_array
        # Execute top-k search with polars and measure time
        start_time = time.time()
        result = polar_search(polars_query, polars_df, top_k)
        polars_search_time = time.time() - start_time
        del polars_df
        print(f"Time to execute top-k search with polars: {polars_search_time} seconds")

In [5]:
test_speed()

Testing with 10 rows...
generating random array
generated random array in 0.0015056133270263672 seconds
normalized random array in 0.0004944801330566406 seconds
Time to execute top-k search with numpy: 0.0010006427764892578 seconds
Time to convert to polars dataframe: 0.0009987354278564453 seconds
Time to execute top-k search with polars: 0.0040056705474853516 seconds
Testing with 100 rows...
generating random array
generated random array in 0.001997709274291992 seconds
normalized random array in 0.0005006790161132812 seconds
Time to execute top-k search with numpy: 0.0 seconds
Time to convert to polars dataframe: 0.0010018348693847656 seconds
Time to execute top-k search with polars: 0.002497434616088867 seconds
Testing with 1000 rows...
generating random array
generated random array in 0.009997129440307617 seconds
normalized random array in 0.002502918243408203 seconds
Time to execute top-k search with numpy: 0.0005006790161132812 seconds
Time to convert to polars dataframe: 0.009999

MemoryError: Unable to allocate 114. GiB for an array with shape (10000000, 1536) and data type float64