In [2]:
import voyageai
import os
from dotenv import load_dotenv

load_dotenv()

# The key is read from VOYAGE_KEY (typically supplied through the .env file
# loaded above) and re-exported as VOYAGE_API_KEY before the client is built
# without an explicit key. An already-exported VOYAGE_API_KEY is accepted as a
# fallback so the cell also works when only that variable is set.
_voyage_key = os.getenv("VOYAGE_KEY") or os.getenv("VOYAGE_API_KEY")
if not _voyage_key:
    # Without this guard, os.environ[...] = None fails with an opaque
    # "TypeError: str expected, not NoneType".
    raise RuntimeError(
        "Voyage API key not found: set VOYAGE_KEY (or VOYAGE_API_KEY) "
        "in the environment or in your .env file."
    )
os.environ["VOYAGE_API_KEY"] = _voyage_key
client = voyageai.Client()

In [3]:
def embed_voyage(texts: list[str], batch_size: int = 64) -> list[list[float]]:
    """
    Embed `texts` with Voyage's `voyage-code-3` model via the module-level `client`.

    Args:
        texts: Strings to embed. A single string is treated as a one-item list.
        batch_size: Maximum number of strings sent per request. The default of
            64 is the batch size noted in the original code.
            NOTE(review): confirm against the current Voyage per-request limits
            (item count and total tokens).

    Returns:
        One embedding (list of floats) per input string, in input order.
        An empty input returns `[]` without calling the API.
        NOTE(review): the original notes describe the vectors as 1024-dim; the
        size is decided by the API, not by this function.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)  # allow any iterable and make slicing safe

    embeddings: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # The Python client takes the strings as its first argument (`texts`);
        # `input` is the field name of the REST API, not of Client.embed().
        resp = client.embed(batch, model="voyage-code-3")
        embeddings.extend(resp.embeddings)
    return embeddings

In [4]:
import tiktoken

# NOTE(review): cl100k_base is an OpenAI tokenizer; its token counts are only
# an approximation of what the Voyage embedding model will count — confirm the
# budget below leaves enough headroom.
ENC = tiktoken.get_encoding("cl100k_base")
MAX_TOKENS = 2048

def split_to_chunks(text: str, max_tokens: int = MAX_TOKENS) -> list[str]:
    """
    Split `text` into consecutive windows of at most `max_tokens` tokens.

    The text is tokenized with `ENC`, the token ids are cut into fixed-size,
    non-overlapping slices, and each slice is decoded back to a string.

    Args:
        text: Text to split. An empty string yields an empty list.
        max_tokens: Upper bound on tokens per window; must be at least 1.

    Returns:
        The decoded windows, in document order.

    Raises:
        ValueError: if `max_tokens` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    # disallowed_special=() makes strings such as "<|endoftext|>" tokenize as
    # ordinary text; with the default setting encode() raises on them, which
    # would abort chunking of any document that merely mentions such a token.
    token_ids = ENC.encode(text, disallowed_special=())

    # NOTE(review): a window edge can fall inside a multi-byte character, in
    # which case decode() emits a replacement character at that edge.
    return [
        ENC.decode(token_ids[offset:offset + max_tokens])
        for offset in range(0, len(token_ids), max_tokens)
    ]

In [None]:
import pathlib, ast
from docutils.core import publish_doctree

def rst_chunks(file_path: pathlib.Path) -> list[dict]:
    """
    Parse a reStructuredText file and return one chunk dict per relevant node.

    Literal blocks become a single chunk of type "code". Paragraph, title and
    section nodes become one or more chunks of type "doc", split into
    token-bounded windows by `split_to_chunks`.

    Args:
        file_path: Path of the UTF-8 encoded .rst file.

    Returns:
        A list of dicts with keys "text", "type", "file", "line_start" and
        "line_end". Both line fields are None when docutils recorded no line
        number for the node; otherwise "line_end" is derived from the number
        of newlines in the chunk text.
    """
    text = file_path.read_text(encoding="utf-8")
    doctree = publish_doctree(text)

    # findall() supersedes the deprecated traverse() in newer docutils
    # releases; fall back to traverse() where findall() does not exist.
    walk = getattr(doctree, "findall", None) or doctree.traverse

    def _span(first_line, body):
        """Return (line_start, line_end) for `body`, or (None, None) if the line is unknown."""
        if first_line is None:
            return None, None
        return first_line, first_line + body.count("\n")

    file_name = str(file_path)
    chunks = []
    for node in walk():
        tag = node.tagname
        if tag == "literal_block":                     # code block
            body = node.astext()
            line_start, line_end = _span(node.line, body)
            chunks.append({
                "text": body,
                "type": "code",
                "file": file_name,
                "line_start": line_start,
                "line_end": line_end,
            })
        elif tag in ("paragraph", "title", "section"):
            # NOTE(review): a section's astext() includes the text of all its
            # children, so section chunks repeat content that is also emitted
            # for the nested titles/paragraphs — confirm this is intended.
            next_line = node.line
            # Split large paragraphs into token-bounded windows
            for chunk_text in split_to_chunks(node.astext()):
                line_start, line_end = _span(next_line, chunk_text)
                chunks.append({
                    "text": chunk_text,
                    "type": "doc",
                    "file": file_name,
                    "line_start": line_start,
                    "line_end": line_end,
                })
                # The following window continues on the line where this one
                # stopped, instead of restarting at the node's first line.
                next_line = line_end
    return chunks

def py_chunks(file_path: pathlib.Path) -> list[dict]:
    """
    Return one "code" chunk per top-level function, async function or class.

    The file is parsed with `ast`; only direct children of the module are
    considered, so methods and nested functions are part of their enclosing
    chunk rather than chunks of their own.

    Args:
        file_path: Path of the UTF-8 encoded Python source file.

    Returns:
        A list of dicts with keys "text", "type", "file", "line_start" and
        "line_end" (1-based, inclusive). The span starts at the first
        decorator when the definition is decorated.

    Raises:
        SyntaxError: if the file cannot be parsed as Python source.
    """
    source = file_path.read_text(encoding="utf-8")
    tree = ast.parse(source, filename=str(file_path))

    # Split on "\n" only: str.splitlines() also breaks on form feeds and
    # Unicode line separators, which the parser does not count as line ends,
    # so its indices can drift away from ast's line numbers.
    lines = source.split("\n")
    file_name = str(file_path)

    chunks = []
    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            continue
        # node.lineno is the `def`/`class` line; decorators sit above it and
        # must be included for the chunk to be the complete definition.
        first_line = min([node.lineno] + [dec.lineno for dec in node.decorator_list])
        last_line = node.end_lineno
        chunks.append({
            "text": "\n".join(lines[first_line - 1:last_line]),
            "type": "code",
            "file": file_name,
            "line_start": first_line,
            "line_end": last_line,
        })
    return chunks

In [None]:
def collect_all_chunks(root_dir: pathlib.Path) -> list[dict]:
    """
    Recursively collect chunks from every .rst and .py file below `root_dir`.

    Args:
        root_dir: Directory to scan (a `pathlib.Path` or a path string).

    Returns:
        The concatenated output of `rst_chunks` (for .rst files) and
        `py_chunks` (for .py files), with files visited in sorted path order
        so repeated runs yield the same sequence.
    """
    chunks = []
    # sorted(): rglob() yields entries in filesystem order, which is not
    # stable across machines.
    for path in sorted(pathlib.Path(root_dir).rglob("*")):
        # Skip directories (or other non-files) whose name happens to end in
        # ".py"/".rst"; reading them as text would fail.
        if not path.is_file():
            continue
        if path.suffix == ".rst":
            chunks.extend(rst_chunks(path))
        elif path.suffix == ".py":
            chunks.extend(py_chunks(path))
    return chunks