In [1]:
import os

# Walk up the directory tree until the working directory is the folder that
# contains "src", so relative paths used later in the notebook resolve from there.
while "src" not in os.listdir():
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    # At the filesystem root the parent equals the current folder; without
    # this guard the loop would spin forever when no ancestor contains "src".
    if parent_dir == current_dir:
        raise FileNotFoundError(
            'Could not find a folder containing "src" in any parent directory.'
        )
    os.chdir(parent_dir)
    print(f"Current folder: {os.getcwd()}")

Current folder: c:\Users\giova\Desktop\MLCube\Repositories\ml3\rag-with-polars


In [2]:
import requests
from dotenv import load_dotenv
import polars as pl
import numpy as np

In [3]:
# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# API key read from the environment; os.getenv returns None when it is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# OpenAI embeddings endpoint that compute_embeddings posts to.
url = "https://api.openai.com/v1/embeddings"

In [4]:
def compute_embeddings(
    texts: str | list[str],
    open_ai_model: str = "text-embedding-3-small",
    normalize: bool = True,
) -> np.ndarray:
    """
    Compute embeddings for a list of texts.

    Returns a numpy array of embeddings of shape
    (len(texts), embedding_size).
    """

    if isinstance(texts, str):
        texts = [texts]

    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    payload = {
        "input": texts,
        "model": open_ai_model,
        "encoding_format": "float",
    }

    # Make the POST request
    json_response = requests.post(url, headers=headers, json=payload).json()

    # Extract the embeddings

    embeddings = np.array(
        [embedding_json["embedding"] for embedding_json in json_response["data"]]
    )

    if normalize:
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    return embeddings

# Download data

We'll download a dataset that contains the titles and URLs of Hacker News posts. See it [here](https://huggingface.co/datasets/julien040/hacker-news-posts).
We only load posts published on or after 1 April 2023, then keep one post out of every four (around 14,000 samples).

In [19]:
# Location of the Hacker News stories parquet file on the Hugging Face Hub.
HN_STORIES_PATH = "hf://datasets/julien040/hacker-news-posts/story.parquet"
# Unix timestamp (seconds) for 2023-04-01 00:00:00 UTC; older posts are dropped.
MIN_POST_TIMESTAMP = 1_680_307_200
# Keep one row out of every SAMPLE_EVERY to limit the number of embedding calls.
SAMPLE_EVERY = 4

# Lazy query: nothing is downloaded until the frame is collected.
df = (
    pl.scan_parquet(HN_STORIES_PATH)
    .select(["id", "time", "title", "url"])
    .filter(pl.col("time") >= MIN_POST_TIMESTAMP)
    .gather_every(SAMPLE_EVERY)
)

In [20]:
# Execute the lazy query defined above and load the result into memory.
materialized_df = df.collect()

In [21]:
# Sanity check: (number of rows, number of columns) of the collected frame.
materialized_df.shape

(14272, 4)

In [None]:
# Save data without embeddings as a checkpoint

# DATA_FOLDER = "data"
# DATASET_NAME = "hacker_news"
# FILE_NAME = f"{DATA_FOLDER}/{DATASET_NAME}_no_embeds.parquet"

# os.makedirs(DATA_FOLDER, exist_ok=True)
# materialized_df.write_parquet(FILE_NAME)

In [24]:
# Plain Python list of the post titles; these are the texts to embed.
titles = materialized_df["title"].to_list()

In [31]:
# Number of titles sent to the embeddings endpoint per request.
batch_size = 100

# One array of shape (titles_in_batch, embedding_size) per request.
embeddings = []

# Start offset of each consecutive slice of `titles`.
batch_starts = range(0, len(titles), batch_size)

for batch_number, start in enumerate(batch_starts, start=1):
    print(f"Getting embeddings for batch {batch_number}...")
    embeddings.append(compute_embeddings(titles[start : start + batch_size]))

# Stack the per-batch arrays row-wise into a single matrix, one row per title.
np_embeddings = np.vstack(embeddings)

Getting embeddings for batch 1...
Getting embeddings for batch 2...
Getting embeddings for batch 3...
Getting embeddings for batch 4...
Getting embeddings for batch 5...
Getting embeddings for batch 6...
Getting embeddings for batch 7...
Getting embeddings for batch 8...
Getting embeddings for batch 9...
Getting embeddings for batch 10...
Getting embeddings for batch 11...
Getting embeddings for batch 12...
Getting embeddings for batch 13...
Getting embeddings for batch 14...
Getting embeddings for batch 15...
Getting embeddings for batch 16...
Getting embeddings for batch 17...
Getting embeddings for batch 18...
Getting embeddings for batch 19...
Getting embeddings for batch 20...
Getting embeddings for batch 21...
Getting embeddings for batch 22...
Getting embeddings for batch 23...
Getting embeddings for batch 24...
Getting embeddings for batch 25...
Getting embeddings for batch 26...
Getting embeddings for batch 27...
Getting embeddings for batch 28...
Getting embeddings for batch 

In [32]:
# Attach the embedding matrix as a new "embedding" column, one row per post.
df_with_embeddings = materialized_df.with_columns(embedding=np_embeddings)

In [33]:
# Confirm the "embedding" column was added next to the original columns.
df_with_embeddings.columns

['id', 'time', 'title', 'url', 'embedding']

In [34]:
# Save data with embeddings

DATA_FOLDER = "data"
DATASET_NAME = "hacker_news"
FILE_NAME = f"{DATA_FOLDER}/{DATASET_NAME}.parquet"

# Create the output folder if needed; exist_ok makes re-running the cell safe.
os.makedirs(DATA_FOLDER, exist_ok=True)
# Write to FILE_NAME rather than rebuilding the same path inline, so the
# destination is defined in exactly one place.
df_with_embeddings.write_parquet(FILE_NAME)