In [2]:
# Load IPython's autoreload extension so edits to imported modules can be
# picked up without restarting the kernel.
%load_ext autoreload

In [4]:
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [5]:
import os

# Walk up from the kernel's working directory until we reach the folder that
# contains `polars_vector_store`, so the relative paths used below resolve.
start_dir = os.getcwd()
while "polars_vector_store" not in os.listdir():
    current_dir = os.getcwd()
    os.chdir("..")
    if os.getcwd() == current_dir:
        # At the filesystem root ".." no longer moves up; without this guard
        # the loop would spin forever. Restore the original directory so the
        # kernel is not left stranded at the root.
        os.chdir(start_dir)
        raise FileNotFoundError(
            "Could not find a parent folder containing 'polars_vector_store' "
            f"starting from {start_dir}"
        )
    print(f"Current folder: {os.getcwd()}")

Current folder: c:\Users\giova\Desktop\Sides\polars-vector-store


# Imports

In [6]:
import polars as pl
import numpy as np
from dotenv import load_dotenv
from datetime import datetime
import requests
from polars_vector_store.loader.parquet import ParquetLoader

In [7]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Endpoint and credentials used by the embedding request helper below.
url = "https://api.openai.com/v1/embeddings"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [8]:
def compute_embeddings(
    texts: str | list[str],
    open_ai_model: str = "text-embedding-3-small",
    normalize: bool = True,
) -> np.ndarray:
    """
    Compute embeddings for a list of texts.

    Returns a numpy array of embeddings of shape
    (len(texts), embedding_size).
    """

    if isinstance(texts, str):
        texts = [texts]

    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    payload = {
        "input": texts,
        "model": open_ai_model,
        "encoding_format": "float",
    }

    # Make the POST request
    json_response = requests.post(url, headers=headers, json=payload).json()

    # Extract the embeddings

    embeddings = np.array(
        [embedding_json["embedding"] for embedding_json in json_response["data"]]
    )

    if normalize:
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    return embeddings

# Load Data

In [9]:
# Location of the benchmark dataset, relative to the current working directory.
DATA_FOLDER, DATASET_NAME = "data", "hacker_news"
FILE_NAME = f"{DATA_FOLDER}/{DATASET_NAME}.parquet"

In [10]:
# Lazily scan the parquet file; nothing is read until `.collect()` is called.
df = pl.scan_parquet(FILE_NAME)

In [11]:
# Execute the lazy scan and keep the whole dataset in memory.
materialized_df = df.collect()

In [12]:
# (rows, columns) of the in-memory frame.
materialized_df.shape

(28544, 5)

In [13]:
# Preview the first rows of the lazy frame.
df.head().collect()

id,time,title,url,embedding
str,i32,str,str,"array[f64, 1536]"
"""35515614""",1681151391,"""Text-Based Tetris""","""https://aino.agency/game""","[-0.041159, 0.038379, … 0.001997]"
"""35680911""",1682285922,"""Will the Internet Democratize …","""https://www.nytimes.com/2023/0…","[0.020964, -0.022481, … -0.008165]"
"""35806111""",1683139428,"""ChatGPT can now find you a hou…","""https://www.theverge.com/2023/…","[-0.03301, 0.025399, … -0.000259]"
"""35908618""",1683840510,"""Capsule captures first look in…","""https://www.ucdavis.edu/news/c…","[-0.004219, 0.024209, … -0.011713]"
"""35911041""",1683857335,"""Long popular in Asia, floating…","""https://apnews.com/article/flo…","[-0.0095, 0.006706, … -0.000616]"


In [14]:
# Embed the search query once; the benchmarks below all reuse this vector.
# A single string is wrapped into a one-element list by compute_embeddings,
# so the result has shape (1, embedding_size).
query = "Python"
query_embedding = compute_embeddings(query)

# ChromaDB

In [15]:
from polars_vector_store.chroma import ChromaDB

In [16]:
# Describe how the parquet columns map onto vector-store fields: the id,
# the text, the embedding vector, and the extra metadata columns.
parquet_loader = ParquetLoader(
    path_to_file=FILE_NAME,
    id_column_name="id",
    text_column_name="title",
    embedding_column_name="embedding",
    metadata_columns_names=["url", "time"],
)

In [17]:
# NOTE(review): `parquet_loader` defined above is not passed here, so this
# relies on the data already being loaded into ChromaDB; on a fresh setup the
# searches below may run against an empty collection — confirm.
chroma = ChromaDB()  # can just do this if data is already loaded in memory

In [18]:
%%timeit

# Baseline: unfiltered top-3 similarity search in ChromaDB.
chroma.similarity_search_by_vector(query_embedding, k=3)

3.84 ms ± 411 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit

# Top-3 search restricted to rows whose `time` is on or after 2023-05-01.
# `$gte` (not `$gt`) keeps the cutoff inclusive, matching the `>=` predicate
# used for the Polars stores, so every backend answers the same query.
# NOTE(review): `.timestamp()` returns a float while the `time` column is i32;
# confirm the ChromaDB wrapper compares the two as expected.
chroma.similarity_search_by_vector(
    query_embedding,
    k=3,
    filters={
        "time": {"$gte": datetime.strptime("2023-05-01", "%Y-%m-%d").timestamp()},
    },
)

200 ms ± 29.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


---

# Polars - Numpy Based

In [23]:
from polars_vector_store.polars.numpy_based import NumpyBasedPolarsVectorStore

In [24]:
# Build the NumPy-based store from the loader configuration defined earlier.
polars_numpy_vs = NumpyBasedPolarsVectorStore.from_parquet(parquet_loader)

In [25]:
# Display the store object to confirm it was created.
polars_numpy_vs


<polars_vector_store.polars.numpy_based.NumpyBasedPolarsVectorStore at 0x2391c0c4ce0>

The DB is not materialized: the loader has not been given a pre-collected DataFrame.

In [None]:
%%timeit

# Unfiltered top-3 search on the store built from the plain loader.
polars_numpy_vs.similarity_search_by_vector(query_embedding, k=3)

1.84 s ± 162 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit

# Top-3 search restricted to rows whose `time` is on or after 2023-05-01
# (naive datetime, so the cutoff is midnight in the local timezone).
# NOTE(review): this store is called with `filter=` while the other stores in
# this notebook are called with `filters=` — confirm the keyword is intended.
cutoff_timestamp = datetime.strptime("2023-05-01", "%Y-%m-%d").timestamp()
polars_numpy_vs.similarity_search_by_vector(
    query_embedding,
    k=3,
    filter=pl.col("time") >= cutoff_timestamp,
)

1.98 s ± 89.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Provide an already materialized DataFrame to the loader

In [None]:
# Loader with the same column mapping as `parquet_loader`.
parquet_loader_with_db = ParquetLoader(
    path_to_file=FILE_NAME,
    id_column_name="id",
    text_column_name="title",
    embedding_column_name="embedding",
    metadata_columns_names=["url", "time"],
)
# Hand the already collected DataFrame to the loader.
# NOTE(review): presumably this lets the store skip reading the parquet file
# on every query — confirm in ParquetLoader.
parquet_loader_with_db.materialized_df = materialized_df

polars_numpy_vs_with_materialized_db = NumpyBasedPolarsVectorStore.from_parquet(
    parquet_loader_with_db
)

In [None]:
%%timeit

# Unfiltered top-3 search on the store backed by the materialized DataFrame.
polars_numpy_vs_with_materialized_db.similarity_search_by_vector(query_embedding, k=3)

60.7 ms ± 6.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit

# Top-3 search restricted to rows whose `time` is on or after 2023-05-01
# (naive datetime, so the cutoff is midnight in the local timezone).
# NOTE(review): this store is called with `filter=` while the other stores in
# this notebook are called with `filters=` — confirm the keyword is intended.
cutoff_timestamp = datetime.strptime("2023-05-01", "%Y-%m-%d").timestamp()
polars_numpy_vs_with_materialized_db.similarity_search_by_vector(
    query_embedding,
    k=3,
    filter=pl.col("time") >= cutoff_timestamp,
)

65.3 ms ± 4.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Polars TopK

In [26]:
from polars_vector_store.polars.polars_top_k import PolarsTopKVectorStore

In [27]:
# Re-create the loader with the same arguments used in the ChromaDB section,
# so this section builds its store from its own loader instance.
parquet_loader = ParquetLoader(
    path_to_file=FILE_NAME,
    id_column_name="id",
    text_column_name="title",
    embedding_column_name="embedding",
    metadata_columns_names=["url", "time"],
)

In [28]:
# Build the top-k based store from the loader defined in this section.
polars_top_k = PolarsTopKVectorStore.from_parquet(parquet_loader)

In [None]:
%%timeit

# Unfiltered top-3 search on the top-k based store.
polars_top_k.similarity_search_by_vector(query_embedding, k=3)

1.77 s ± 533 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit

# Top-3 search restricted to rows whose `time` is on or after 2023-05-01
# (naive datetime, so the cutoff is midnight in the local timezone).
cutoff_timestamp = datetime.strptime("2023-05-01", "%Y-%m-%d").timestamp()
polars_top_k.similarity_search_by_vector(
    query_embedding,
    k=3,
    filters=pl.col("time") >= cutoff_timestamp,
)

1.35 s ± 91.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Polars Arg Partition

In [None]:
from polars_vector_store.polars.polars_argpartition import (
    PolarsArgPartitionVectorStore,
)

In [None]:
# Re-create the loader with the same arguments used in the earlier sections,
# so this section builds its store from its own loader instance.
parquet_loader = ParquetLoader(
    path_to_file=FILE_NAME,
    id_column_name="id",
    text_column_name="title",
    embedding_column_name="embedding",
    metadata_columns_names=["url", "time"],
)

In [None]:
# Build the arg-partition based store from the loader defined in this section.
polars_arg_part = PolarsArgPartitionVectorStore.from_parquet(parquet_loader)

In [None]:
%%timeit

# Unfiltered top-3 search on the arg-partition based store.
polars_arg_part.similarity_search_by_vector(query_embedding, k=3)

1.29 s ± 45.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit

# Top-3 search restricted to rows whose `time` is on or after 2023-05-01
# (naive datetime, so the cutoff is midnight in the local timezone).
cutoff_timestamp = datetime.strptime("2023-05-01", "%Y-%m-%d").timestamp()
polars_arg_part.similarity_search_by_vector(
    query_embedding,
    k=3,
    filters=pl.col("time") >= cutoff_timestamp,
)

1.31 s ± 52.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
