In [1]:
import os

while "src" not in os.listdir():
    os.chdir("..")
    print(f"Current folder: {os.getcwd()}")

Current folder: c:\Users\giova\Desktop\MLCube\Repositories\ml3\rag-with-polars


# Imports

In [27]:
import polars as pl
import numpy as np
from dotenv import load_dotenv
import requests

In [28]:
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
url = "https://api.openai.com/v1/embeddings"

In [29]:
def compute_embeddings(
    texts: str | list[str],
    open_ai_model: str = "text-embedding-3-small",
    normalize: bool = True,
) -> np.ndarray:
    """
    Compute embeddings for a list of texts.

    Returns a numpy array of embeddings of shape
    (len(texts), embedding_size).
    """

    if isinstance(texts, str):
        texts = [texts]

    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    payload = {
        "input": texts,
        "model": open_ai_model,
        "encoding_format": "float",
    }

    # Make the POST request
    json_response = requests.post(url, headers=headers, json=payload).json()

    # Extract the embeddings

    embeddings = np.array(
        [embedding_json["embedding"] for embedding_json in json_response["data"]]
    )

    if normalize:
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    return embeddings

# Load Data

In [21]:
DATA_FOLDER = "data"
DATASET_NAME = "hacker_news"
FILE_NAME = f"{DATA_FOLDER}/{DATASET_NAME}.parquet"

In [22]:
df = pl.scan_parquet(FILE_NAME)

In [80]:
materialized_df = df.collect()

In [23]:
df.head().collect()

id,time,title,url,embedding
i32,i32,str,str,"array[f64, 1536]"
35515614,1681151391,"""Text-Based Tetris""","""https://aino.agency/game""","[-0.04116, 0.038436, … 0.002007]"
35806111,1683139428,"""ChatGPT can now find you a hou…","""https://www.theverge.com/2023/…","[-0.033018, 0.025405, … -0.000245]"
35911041,1683857335,"""Long popular in Asia, floating…","""https://apnews.com/article/flo…","[-0.009536, 0.006712, … -0.000615]"
35582623,1681579623,"""Sound Money Bills Moving Forwa…","""https://www.soundmoneydefense.…","[0.02966, 0.003126, … 0.002368]"
35714042,1682518719,"""Edge AI generates choreographi…","""https://edge-dance.github.io/""","[0.004581, -0.044337, … 0.003402]"


df is our vector store. It has an id, the title, the url of the post, the time it was published and the embedding

In [None]:
query = "Python"
query_embedding = compute_embeddings(query)

### Method 1

Inspired by the blog post. Materialize dataframe, get embedding column, and convert to numpy array and compute cosine similarity with efficient
numpy tricks.

In [89]:
def get_closest_1(
    lazy_vector_store: pl.LazyFrame,
    query_embed: np.ndarray,
    materialized_vector_store: pl.DataFrame | None = None,
    k: int = 3,
) -> pl.DataFrame:
    """
    First method: extract embeddings column and compute
    cosine similarity with the query embedding.

    Returns the k rows with the smallest cosine similarity.

    The performance of this method is heavily impacted by having
    to materialize the dataframe. We keep that as an argument
    so that we can test its impact.
    Nonetheless assuming to having it already materialized is not
    optimal since in a real-world scenario the embeddings could be
    very large or we might want to load them on-demand (rather than
    having them always in memory).
    """

    # We need to materialize in order to access the embeddings column
    if materialized_vector_store is None:
        materialized_vector_store = lazy_vector_store.collect()

    vector_store_embeds = materialized_vector_store["embedding"].to_numpy()

    # Compute cosine similarity.
    # Since the embeddings are normalized, this is equivalent to the dot product.
    cosine_similarities = np.einsum("ij,ij->i", vector_store_embeds, query_embed)

    # Get the indices of the k smallest cosine similarities
    # Notice that argpartition gives no guarantee on the order
    # of the k smallest elements, which is why we need
    # an extra sorting step after the partitioning.
    closest_indices = np.argpartition(cosine_similarities, -k)[-k:]

    # Sort the k closest indices by cosine similarity
    idx = closest_indices[np.argsort(cosine_similarities[closest_indices])[::-1]]

    return (
        lazy_vector_store.with_row_index()
        .filter(pl.col("index").is_in(idx))
        .select(["id", "title", "url", "time"])
        .collect()
    )

In [90]:
get_closest_1(lazy_vector_store=df, query_embed=query_embedding, k=3)

id,title,url,time
i32,str,str,i32
35421096,"""Think Python 2e""","""https://greenteapress.com/wp/t…",1680517178
35684659,"""Python Dictionaries""","""https://medium.com/@j_ankit/py…",1682321434
35465484,"""Python Programming Exercises G…","""https://inventwithpython.com/p…",1680767014


In [91]:
get_closest_1(
    lazy_vector_store=df,
    query_embed=query_embedding,
    k=3,
    materialized_vector_store=materialized_df,
)

id,title,url,time
i32,str,str,i32
35421096,"""Think Python 2e""","""https://greenteapress.com/wp/t…",1680517178
35684659,"""Python Dictionaries""","""https://medium.com/@j_ankit/py…",1682321434
35465484,"""Python Programming Exercises G…","""https://inventwithpython.com/p…",1680767014


In [92]:
%%timeit

get_closest_1(lazy_vector_store=df, query_embed=query_embedding, k=3)

561 ms ± 36.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [93]:
%%timeit

get_closest_1(
    lazy_vector_store=df,
    query_embed=query_embedding,
    k=3,
    materialized_vector_store=materialized_df,
)

19 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Method 2



In [229]:
def get_closest_2(
    lazy_vector_store: pl.LazyFrame,
    query_embed: np.ndarray,  # noqa
    materialized_vector_store: pl.DataFrame | None = None,  # noqa
    k: int = 3,
) -> pl.DataFrame:
    """ """

    return (
        lazy_vector_store.with_columns(
            query=pl.lit(query_embedding.reshape(-1).tolist()).cast(
                pl.Array(pl.Float64, shape=query_embedding.shape[1])
            ),
        )
        .with_columns(
            # cosine_similarity=pl.col("embedding").mul(pl.col("query")).arr.sum()
            cosine_similarity=np.dot(
                pl.col("embedding"),
                pl.col("query"),
            ).arr.sum()
        )
        .sort("cosine_similarity", descending=True)
        .head(k)
        .select(["id", "title", "url", "time"])
        .collect()
    )

In [230]:
get_closest_2(
    lazy_vector_store=df,
    query_embed=query_embedding,
    k=3,
)

id,title,url,time
i32,str,str,i32
35465484,"""Python Programming Exercises G…","""https://inventwithpython.com/p…",1680767014
35421096,"""Think Python 2e""","""https://greenteapress.com/wp/t…",1680517178
35684659,"""Python Dictionaries""","""https://medium.com/@j_ankit/py…",1682321434


In [231]:
%%timeit

get_closest_2(lazy_vector_store=df, query_embed=query_embedding, k=3)

572 ms ± 29.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Method 3

Remove sorting as that's what's taking the most time.

In [234]:
def get_closest_3(
    lazy_vector_store: pl.LazyFrame,
    query_embed: np.ndarray,  # noqa
    materialized_vector_store: pl.DataFrame | None = None,  # noqa
    k: int = 3,
) -> pl.DataFrame:
    """ """

    return (
        lazy_vector_store.with_columns(
            query=pl.lit(query_embedding.reshape(-1).tolist()).cast(
                pl.Array(pl.Float64, shape=query_embedding.shape[1])
            ),
        )
        .with_columns(
            cosine_similarity=np.dot(
                pl.col("embedding"),
                pl.col("query"),
            ).arr.sum()
        )
        .top_k(k, by="cosine_similarity")
        .select(["id", "title", "url", "time"])
        .collect()
    )

In [236]:
get_closest_3(
    lazy_vector_store=df,
    query_embed=query_embedding,
    k=3,
)

id,title,url,time
i32,str,str,i32
35465484,"""Python Programming Exercises G…","""https://inventwithpython.com/p…",1680767014
35421096,"""Think Python 2e""","""https://greenteapress.com/wp/t…",1680517178
35684659,"""Python Dictionaries""","""https://medium.com/@j_ankit/py…",1682321434


In [237]:
%%timeit

get_closest_3(lazy_vector_store=df, query_embed=query_embedding, k=3)

572 ms ± 26.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
