14 changes: 0 additions & 14 deletions python/src/cairo_coder/dspy/document_retriever.py
@@ -534,7 +534,6 @@ def __init__(
vector_db: SourceFilteredPgVectorRM | None = None,
max_source_count: int = 5,
similarity_threshold: float = 0.4,
embedding_model: str = "gemini-embedding-001",
):
"""
Initialize the DocumentRetrieverProgram.
@@ -544,12 +543,8 @@ def __init__(
vector_db: Optional pre-initialized vector database instance
max_source_count: Maximum number of documents to retrieve
similarity_threshold: Minimum similarity score for document inclusion
embedding_model: Gemini embedding model to use for reranking
"""
super().__init__()
# TODO: These should not be literal constants like this.
# TODO: if the vector_db is setup upon startup, then this should not be done here.
self.embedder = dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072, batch_size=512)

self.vector_store_config = vector_store_config
if vector_db is None:
@@ -558,18 +553,15 @@ def __init__(
self.vector_db = SourceFilteredPgVectorRM(
db_url=db_url,
pg_table_name=pg_table_name,
embedding_func=self.embedder,
content_field="content",
fields=["id", "content", "metadata"],
k=max_source_count,
embedding_model='gemini-embedding-001',
include_similarity=True,
)
else:
self.vector_db = vector_db
self.max_source_count = max_source_count
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model

async def aforward(
self, processed_query: ProcessedQuery, sources: list[DocumentSource] | None = None
@@ -591,7 +583,6 @@ async def aforward(
# Step 1: Fetch documents from vector store
documents = await self._afetch_documents(processed_query, sources)

# TODO: No source found means no answer can be given!
if not documents:
return []

@@ -621,7 +612,6 @@ def forward(
sync_retriever = SourceFilteredPgVectorRM(
db_url=db_url,
pg_table_name=pg_table_name,
embedding_func=self.embedder,
content_field="content",
fields=["id", "content", "metadata"],
k=self.max_source_count,
@@ -665,7 +655,6 @@ async def _afetch_documents(

search_queries = processed_query.search_queries
if not search_queries or len(search_queries) == 0:
# TODO: revert
search_queries = [processed_query.original]


@@ -740,7 +729,6 @@ def create_document_retriever(
vector_db: SourceFilteredPgVectorRM | None = None,
max_source_count: int = 5,
similarity_threshold: float = 0.4,
embedding_model: str = "text-embedding-3-large",
) -> DocumentRetrieverProgram:
"""
Factory function to create a DocumentRetrieverProgram instance.
@@ -750,7 +738,6 @@ def create_document_retriever(
vector_db: Optional pre-initialized vector database instance
max_source_count: Maximum number of documents to retrieve
similarity_threshold: Minimum similarity score for document inclusion
embedding_model: OpenAI embedding model to use for reranking

Returns:
Configured DocumentRetrieverProgram instance
@@ -760,5 +747,4 @@ def create_document_retriever(
vector_db=vector_db,
max_source_count=max_source_count,
similarity_threshold=similarity_threshold,
embedding_model=embedding_model,
)
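Net effect of this file's change: callers stop passing embedding details entirely and instead configure an embedder once before constructing the retriever. A minimal sketch of the new call site, not taken from the PR (the `get_vector_store_config` import path is assumed):

```python
import dspy

from cairo_coder.core.config import get_vector_store_config  # import path assumed
from cairo_coder.dspy.document_retriever import create_document_retriever

# Configure the process-wide embedder once, at startup; the retriever and its
# SourceFilteredPgVectorRM now pick it up from dspy.settings instead of an
# embedding_model argument.
dspy.configure(
    embedder=dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072, batch_size=512)
)

retriever = create_document_retriever(
    vector_store_config=get_vector_store_config(),
    max_source_count=5,
    similarity_threshold=0.4,
)
```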
1 change: 0 additions & 1 deletion python/src/cairo_coder/dspy/generation_program.py
@@ -20,7 +20,6 @@
logger = structlog.get_logger(__name__)


# TODO: Find a way to properly "erase" common mistakes like PrintTrait imports.
class CairoCodeGeneration(Signature):
"""
Analyze a Cairo programming query and use the context to generate a high-quality Cairo code solution and explanations.
68 changes: 28 additions & 40 deletions python/src/cairo_coder/dspy/pgvector_rm.py
@@ -1,4 +1,3 @@
import warnings
from collections.abc import Callable
from typing import Optional

@@ -12,13 +11,6 @@
raise ImportError(
"The 'pgvector' extra is required to use PgVectorRM. Install it with `pip install dspy-ai[pgvector]`. Also, try `pip install pgvector psycopg2`.",
) from e
try:
import openai
except ImportError:
warnings.warn(
"`openai` is not installed. Install it with `pip install openai` to use OpenAI embedding models.",
stacklevel=2, category=ImportWarning,
)


class PgVectorRM(dspy.Retrieve):
@@ -33,69 +25,74 @@ class PgVectorRM(dspy.Retrieve):
Args:
db_url (str): A PostgreSQL database URL in psycopg2's DSN format
pg_table_name (Optional[str]): name of the table containing passages
openai_client (openai.OpenAI): OpenAI client to use for computing query embeddings. Either openai_client or embedding_func must be provided.
embedding_func (Callable): A function to use for computing query embeddings. Either openai_client or embedding_func must be provided.
embedding_func (Callable): A function to use for computing query embeddings. If not provided, uses dspy.settings.embedder.
content_field (str = "text"): Field containing the passage text. Defaults to "text"
k (Optional[int]): Default number of top passages to retrieve. Defaults to 20
embedding_field (str = "embedding"): Field containing passage embeddings. Defaults to "embedding"
fields (List[str] = ['text']): Fields to retrieve from the table. Defaults to "text"
embedding_model (str = "text-embedding-ada-002"): Field containing the OpenAI embedding model to use. Defaults to "text-embedding-ada-002"

Examples:
Below is a code snippet that shows how to use PgVector as the default retriever

```python
import dspy
import openai
import psycopg2

openai.api_key = os.environ.get("OPENAI_API_KEY", None)
openai_client = openai.OpenAI()
# Configure embedder at startup
embedder = dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072)
dspy.configure(embedder=embedder)

llm = dspy.OpenAI(model="gpt-3.5-turbo")
llm = dspy.LM("gemini/gemini-flash-latest")
dspy.configure(lm=llm)

DATABASE_URL should be in the format postgresql://user:password@host/database
db_url=os.getenv("DATABASE_URL")
# DATABASE_URL should be in the format postgresql://user:password@host/database
db_url = os.getenv("DATABASE_URL")

retriever_model = PgVectorRM(conn, openai_client=openai_client, "paragraphs", fields=["text", "document_id"], k=20)
dspy.settings.configure(lm=llm, rm=retriever_model)
# embedding_func will default to dspy.settings.embedder
retriever_model = PgVectorRM(db_url, "paragraphs", fields=["text", "document_id"], k=20)
dspy.configure(rm=retriever_model)
```

Below is a code snippet that shows how to use PgVector in the forward() function of a module
Below is a code snippet that shows how to use PgVector with a custom embedding function
```python
self.retrieve = PgVectorRM(db_url, openai_client=openai_client, "paragraphs", fields=["text", "document_id"], k=20)
def my_embedder(text: str) -> list[float]:
# Your custom embedding logic; must return a single vector
return [0.0] * 3072  # placeholder vector

self.retrieve = PgVectorRM(db_url, "paragraphs", embedding_func=my_embedder, fields=["text", "document_id"], k=20)
```
"""

def __init__(
self,
db_url: str,
pg_table_name: str,
openai_client: Optional[openai.OpenAI] = None,
embedding_func: Optional[Callable] = None,
k: int = 20,
embedding_field: str = "embedding",
fields: Optional[list[str]] = None,
content_field: str = "text",
embedding_model: str = "text-embedding-ada-002",
include_similarity: bool = False,
):
"""
k = 20 is the number of paragraphs to retrieve
"""
assert (
openai_client or embedding_func
), "Either openai_client or embedding_func must be provided."
self.openai_client = openai_client
self.embedding_func = embedding_func
# Use provided embedding_func or fall back to dspy.settings.embedder
if embedding_func is None:
if dspy.settings.embedder is None:
raise ValueError(
"No embedding_func provided and no embedder configured in dspy.settings. "
"Either pass embedding_func or configure with: dspy.configure(embedder=...)"
)
self.embedding_func = dspy.settings.embedder
else:
self.embedding_func = embedding_func

self.conn = psycopg2.connect(db_url)
register_vector(self.conn)
self.pg_table_name = pg_table_name
self.fields = fields or ["text"]
self.content_field = content_field
self.embedding_field = embedding_field
self.embedding_model = embedding_model
self.include_similarity = include_similarity

super().__init__(k=k)
@@ -144,14 +141,5 @@ def forward(self, query: str, k: int = None):
return retrieved_docs

def _get_embeddings(self, query: str) -> list[float]:
if self.openai_client is not None:
return (
self.openai_client.embeddings.create(
model=self.embedding_model,
input=query,
encoding_format="float",
)
.data[0]
.embedding
)
"""Get embeddings for a query using the configured embedding function."""
return self.embedding_func(query)
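The rewritten `_get_embeddings` makes the resolution order explicit: an explicitly passed `embedding_func` always wins; otherwise `__init__` snapshots `dspy.settings.embedder`; with neither, it raises `ValueError` before opening the database connection. A minimal sketch of the three paths, assuming a reachable Postgres DSN:

```python
import dspy
from cairo_coder.dspy.pgvector_rm import PgVectorRM

DB_URL = "postgresql://user:password@localhost/database"  # placeholder DSN

# 1. An explicit embedding_func takes precedence over any global embedder.
rm = PgVectorRM(DB_URL, "paragraphs", embedding_func=lambda q: [0.0] * 3072)

# 2. Omit it, and the constructor falls back to dspy.settings.embedder.
dspy.configure(embedder=dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072))
rm = PgVectorRM(DB_URL, "paragraphs")

# 3. With neither set, __init__ raises ValueError before psycopg2.connect runs.
```

Because the fallback is captured at construction time, `dspy.configure(embedder=...)` must run before the retriever is built; hence the reordering in the optimizer notebooks and the server changes below.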
1 change: 0 additions & 1 deletion python/src/cairo_coder/dspy/query_processor.py
@@ -187,7 +187,6 @@ def _validate_resources(self, resources: list[str]) -> list[DocumentSource]:
continue

# Return valid resources or default fallback
# TODO: Upon failure, this should return an error message to the user.
return valid_resources if valid_resources else list(DocumentSource)

def _is_contract_query(self, query: str) -> bool:
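The deleted TODO leaves the fallback behavior in place: invalid resource names broaden the search to every source rather than surfacing an error. A hypothetical illustration (the enum member names and `processor` instance are assumed, not taken from the diff):

```python
# Hypothetical illustration only: names are assumed, not from the diff.
processor._validate_resources(["cairo_book", "not_a_source"])
# -> [DocumentSource.CAIRO_BOOK]
processor._validate_resources(["not_a_source"])
# -> list(DocumentSource), i.e. fall back to searching every source
```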
@@ -31,23 +31,21 @@ def _():
# mlflow.set_experiment("DSPy")
# mlflow.dspy.autolog()

## Setup VectorDB for document retrieval
## Setup embedder and LM in dspy.configure
embedder = dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072, batch_size=512)
lm = dspy.LM("gemini/gemini-flash-latest", max_tokens=30000, cache=False)
dspy.configure(lm=lm, adapter=XMLAdapter(), embedder=embedder)

## Setup VectorDB for document retrieval - will use dspy.settings.embedder
vector_store_config = get_vector_store_config()
vector_db = SourceFilteredPgVectorRM(
db_url=vector_store_config.dsn,
pg_table_name=vector_store_config.table_name,
embedding_func=embedder,
content_field="content",
fields=["id", "content", "metadata"],
k=5, # Default k, will be overridden by retriever
embedding_model="gemini-embedding-001",
include_similarity=True,
)

# Programs to be optimized: QueryProcessing --> OptimizedQuery --> Document retrieval
lm = dspy.LM("gemini/gemini-flash-latest", max_tokens=30000, cache=False)
dspy.configure(lm=lm, adapter=XMLAdapter())
return XMLAdapter, dspy, os, vector_db, vector_store_config


12 changes: 5 additions & 7 deletions python/src/cairo_coder/optimizers/retrieval_optimizer.py
@@ -31,23 +31,21 @@ def _():
# mlflow.set_experiment("DSPy")
# mlflow.dspy.autolog()

## Setup VectorDB for document retrieval
## Setup embedder and LM in dspy.configure
embedder = dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072, batch_size=512)
lm = dspy.LM("gemini/gemini-flash-lite-latest", max_tokens=15000, cache=False)
dspy.configure(lm=lm, adapter=XMLAdapter(), embedder=embedder)

## Setup VectorDB for document retrieval - will use dspy.settings.embedder
vector_store_config = get_vector_store_config()
vector_db = SourceFilteredPgVectorRM(
db_url=vector_store_config.dsn,
pg_table_name=vector_store_config.table_name,
embedding_func=embedder,
content_field="content",
fields=["id", "content", "metadata"],
k=5, # Default k, will be overridden by retriever
embedding_model="text-embedding-3-large",
include_similarity=True,
)

# Programs to be optimized: QueryProcessing --> OptimizedQuery --> Document retrieval
lm = dspy.LM("gemini/gemini-flash-lite-latest", max_tokens=15000, cache=False)
dspy.configure(lm=lm, adapter=XMLAdapter())
return XMLAdapter, dspy, os, vector_db, vector_store_config


22 changes: 9 additions & 13 deletions python/src/cairo_coder/server/app.py
@@ -182,11 +182,14 @@ def __init__(
# Setup routes
self._setup_routes()

# TODO: This is the place where we should select the proper LLM configuration.
# TODO: For now we just Hard-code DSPY - GEMINI
dspy.configure(lm=dspy.LM("gemini/gemini-flash-latest", max_tokens=30000, cache=False), adapter=XMLAdapter())
dspy.configure(callbacks=[AgentLoggingCallback()])
dspy.configure(track_usage=True)
embedder = dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072, batch_size=512)
dspy.configure(
lm=dspy.LM("gemini/gemini-flash-latest", max_tokens=30000, cache=False),
adapter=XMLAdapter(),
embedder=embedder,
callbacks=[AgentLoggingCallback()],
track_usage=True,
)

def _setup_routes(self):
"""Setup FastAPI routes matching TypeScript backend."""
@@ -641,17 +644,13 @@ async def lifespan(app: FastAPI):
config = ConfigManager.load_config()
vector_store_config = config.vector_store

# TODO: These should not be literal constants like this.
embedder = dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072, batch_size=512)

# embedding_func will default to dspy.settings.embedder (configured in __init__)
_vector_db = SourceFilteredPgVectorRM(
db_url=vector_store_config.dsn,
pg_table_name=vector_store_config.table_name,
embedding_func=embedder,
content_field="content",
fields=["id", "content", "metadata"],
k=5, # Default k, will be overridden by retriever
embedding_model='gemini-embedding-001',
include_similarity=True,
)

@@ -686,9 +685,6 @@ def main():
parser.add_argument("--workers", type=int, default=5, help="Number of workers to run")
args = parser.parse_args()

# TODO: configure DSPy with the proper LM.
# TODO: Find a proper pattern for it?
# TODO: multi-model management?
uvicorn.run(
"cairo_coder.server.app:create_app_factory",
host="0.0.0.0",
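End to end, the server changes collapse configuration into a single point. A condensed sketch of the startup order the diff implies, with the `XMLAdapter` import path assumed and the `AgentLoggingCallback` entry from app.py omitted:

```python
import dspy
from dspy.adapters import XMLAdapter  # import path assumed

# In the server's __init__: one consolidated configure call replaces the three
# separate dspy.configure(...) invocations the diff removes.
dspy.configure(
    lm=dspy.LM("gemini/gemini-flash-latest", max_tokens=30000, cache=False),
    adapter=XMLAdapter(),
    embedder=dspy.Embedder("gemini/gemini-embedding-001", dimensions=3072, batch_size=512),
    track_usage=True,
)

# Later, in lifespan(), SourceFilteredPgVectorRM is constructed without an
# embedding_func and therefore inherits the embedder configured above.
```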