In [1]:
!pip install -q chromadb pypdf2 sentence-transformers litellm langchain


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
from typing import Any

import chromadb
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter 
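# NOTE: in recent LangChain releases this class lives in the separate
# `langchain_text_splitters` package; if the import above fails, use:
# from langchain_text_splitters import RecursiveCharacterTextSplitter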
from litellm import completion
from sentence_transformers import SentenceTransformer

* 'fields' has been removed


In [3]:
# --- Notebook configuration -------------------------------------------------
# Ask LiteLLM for debug-level logging through its environment switch.
os.environ["LITELLM_LOG"] = "DEBUG"

# Folder that holds the source PDFs.
# TODO: set this before running; the empty string is only a placeholder and
# the PDF extraction step cannot list a directory from it.
PDFS_FOLDER_PATH = ""

# The Gemini key is read from the environment so it never lands in the
# notebook; this is None when the variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

In [4]:
def extract_text_from_pdfs(pdfs_folder_path: str) -> str:
    """Concatenate the extracted text of every PDF found in a folder.

    Files are visited in sorted filename order so the returned text (and any
    chunk numbering derived from it) is reproducible between runs;
    ``os.listdir`` on its own gives no ordering guarantee.

    Args:
        pdfs_folder_path: Directory to scan (top level only). Every entry
            whose name ends in ``.pdf``, in any letter case, is read.

    Returns:
        The text of all pages of all PDFs joined without any separator, or
        an empty string when the folder contains no PDF files.

    Raises:
        OSError: If the folder cannot be listed or a PDF cannot be opened.
    """
    page_texts: list[str] = []
    for pdf_file in sorted(os.listdir(pdfs_folder_path)):
        if not pdf_file.lower().endswith(".pdf"):
            continue
        pdf_file_path = os.path.join(pdfs_folder_path, pdf_file)
        with open(pdf_file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # Guard against a falsy result (e.g. None for a page with no
                # extractable text) so one page cannot abort the whole run
                # with a TypeError during concatenation.
                page_texts.append(page.extract_text() or "")

    # Join once at the end instead of growing a string with += per page.
    return "".join(page_texts)

In [5]:
# Pull the raw text out of every PDF in the configured folder.
all_text = extract_text_from_pdfs(pdfs_folder_path=PDFS_FOLDER_PATH)

In [6]:
# Break the extracted text into overlapping chunks for embedding.
# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=50,
    chunk_size=500,
)

# `chunks` is the list of text pieces that gets embedded and stored below.
chunks = text_splitter.split_text(all_text)

## ChromaDB

In [7]:
# On-disk Chroma client (stored under ./chroma_db) and the sentence-embedding
# model that is used both for indexing here and for queries later on.
client = chromadb.PersistentClient(path="chroma_db")
text_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Always start from a fresh collection so that re-running this cell never
# mixes chunks from an earlier run with the current ones.
try:
    client.delete_collection(name="knowledge_base")
    print("Deleted existing collection: knowledge_base")
except Exception as e:
    # Best effort: on a first run there is nothing to delete.
    print(f"Collection does not exist or could not be deleted: {e}")
collection = client.create_collection(name="knowledge_base")

# Embed and insert in batches: one encode() call and one add() call per batch
# instead of one of each per chunk. Ids and metadata keep the original
# "chunk_<global index>" numbering. An empty `chunks` list is a no-op.
INGEST_BATCH_SIZE = 256
for batch_start in range(0, len(chunks), INGEST_BATCH_SIZE):
    batch_chunks = chunks[batch_start : batch_start + INGEST_BATCH_SIZE]
    batch_indices = range(batch_start, batch_start + len(batch_chunks))
    batch_embeddings = text_embedding_model.encode(batch_chunks)
    collection.add(
        ids=[f"chunk_{i}" for i in batch_indices],
        embeddings=batch_embeddings.tolist(),
        metadatas=[{"source": "pdf", "chunk_id": i} for i in batch_indices],
        documents=batch_chunks,
    )

Collection does not exist or could not be deleted: Collection knowledge_base does not exist.


In [8]:
def semantic_search(
    query: str, text_embedding_model: SentenceTransformer, top_k: int = 5
) -> dict[str, Any]:
    """Look up the stored chunks closest to a free-text query.

    The query is embedded with ``text_embedding_model`` and the resulting
    vector is sent to the notebook-level ``collection`` (not a parameter).

    Args:
        query: Text to search for.
        text_embedding_model: Model used to embed the query.
        top_k: Number of results requested from the collection.

    Returns:
        Whatever ``collection.query`` returns for a single query embedding;
        callers in this notebook read the hits from ``["documents"][0]``.
    """
    query_vector = text_embedding_model.encode(query).tolist()
    search_kwargs = {"query_embeddings": [query_vector], "n_results": top_k}
    return collection.query(**search_kwargs)

In [9]:
# Example question used for both the retrieval demo and the LLM call below.
query = "How to apply dual contouring of hermite data to a circle x^2+y^2=2.5^2"
results = semantic_search(query=query, text_embedding_model=text_embedding_model)

In [10]:
# Show the retrieved chunks, numbered from 1; results["documents"][0] holds
# the hits for the single query that was submitted.
for rank, document in enumerate(results["documents"][0], start=1):
    print(f"Result {rank}: {document}\n")

Result 1: thatcontain features. This Hermite approach cangenerate contours
thatcontain both sharp vertices andsharp edges. One drawback of
thismethod istheneed toexplicitly testforsuch features andto
then perform some type ofspecial processing inthese cases. As
analternati vetotheEMC method, wepropose thefollowing dual
contouring method forHermite data:
1.Foreach cube thatexhibits asign change, generate avertex
positioned attheminimizer ofthequadratic function ofequa-
tion1.

Result 2: whose edges aretagged byHermite data (i.e; exact intersection
points andnormals). This method avoidstheneed toexplicitly iden-
tifyandprocess “features” asrequired inprevious Hermite contour -
ingmethods. Using anew,numerically stable representation for
quadratic error functions, wedevelop anoctree-based method for
simplifying contours produced bythismethod. Wenextextend our
contouring method tothese simpliﬁed octrees. This newmethod

Result 3: contour generated bytheHermite data intheupper leftportion o

## LiteLLM with Gemini

In [11]:
def generate_response(
    query: str, context: str, model: str = "gemini/gemini-1.5-flash"
) -> str:
    """Ask an LLM to answer ``query`` using the retrieved ``context``.

    Args:
        query: The user's question.
        context: Retrieved text inserted into the prompt as grounding.
        model: LiteLLM model identifier to call. Defaults to the model this
            notebook was written against; pass another id to switch models
            without editing the function.

    Returns:
        The ``content`` field of the first choice in the completion response.
    """
    prompt = f"Query: {query}\nContext: {context}\nAnswer:"
    response = completion(
        model=model,
        messages=[{"content": prompt, "role": "user"}],
        # Key comes from the notebook-level GEMINI_API_KEY configuration value.
        api_key=GEMINI_API_KEY,
    )

    return response["choices"][0]["message"]["content"]

In [12]:
# Retrieve the chunks for the query and merge them, one per line, into the
# context string handed to the LLM.
search_results = semantic_search(query=query, text_embedding_model=text_embedding_model)
top_documents = search_results["documents"][0]
context = "\n".join(top_documents)

In [13]:
# Generate the grounded answer and display it.
response = generate_response(query=query, context=context)
print("Generated Response:\n", response)

Generated Response:
 The provided text describes a dual contouring method for Hermite data, but it doesn't offer a direct algorithm for applying it to a specific function like the circle x² + y² = 2.5².  However, we can outline the steps based on the description:

**1. Create a signed distance field:**

* **Discretize the space:** Create a grid covering the area containing the circle.  The resolution of the grid will determine the accuracy of the final contour.
* **Evaluate the function:** For each grid cell (cube), determine whether it's inside or outside the circle by evaluating the function x² + y² - 2.5² at the cell's center (or corners).  A positive value means outside, a negative value means inside.  Store this as a signed distance value (positive distance for outside points, negative for inside points).  Alternatively, you can simply store a sign (+1 or -1).

**2. Identify sign changes:**

* **Edge traversal:** For each edge of each grid cell, check if the signed distance values