In [148]:
from lancedb.pydantic import Vector, LanceModel
import lancedb
from llama_index.embeddings.ollama import OllamaEmbedding
from pathlib import Path
from procureme.models.contract_model import ParsedDocument
import json

In [149]:
# Embedding client pointed at an Ollama server on localhost; later cells use it
# to embed both the contract chunks and the search query.
CLIENT = OllamaEmbedding(
    model_name="nomic-embed-text:v1.5",
    base_url="http://localhost:11434",
    # NOTE(review): mirostat looks like a text-generation sampling option and
    # presumably has no effect on embedding requests - confirm it is needed.
    ollama_additional_kwargs={"mirostat": 0},
)

In [150]:
# Embed a throwaway string once to discover the model's vector length instead of
# hard-coding it; the table schema's Vector(...) field is sized from this value.
# NOTE(review): the name is a misspelling of "DIMENSION"; kept because other
# cells reference it.
DIMENTION: int = len(CLIENT.get_query_embedding("O Hey!"))

In [172]:
# Show the probed embedding length.
DIMENTION

768

In [151]:
class DocumentWithMetadata(LanceModel):
    """Row schema for the contracts table: one embedded contract chunk plus its metadata."""

    # "<file_name>-<part>" as built by convert_doc_to_lancedb; also the key used
    # by the merge_insert cell.
    chunk_id: str
    # Identifier of the source document; convert_doc_to_lancedb fills it with the
    # file name, so it currently duplicates file_name.
    doc_id: str
    file_name: str
    # Page count of the whole source document, not of this chunk.
    total_pages: int
    # Text of this chunk.
    content: str
    # Label of the chunk within its document (e.g. "part - 1").
    part: str
    # Embedding of `content`; its length is fixed to DIMENTION.
    vector: Vector(DIMENTION)

In [152]:
# Storage locations, resolved against the parent of the current working directory.
DBPATH = Path().absolute().parent / "vectordb"
FILEPATH = DBPATH / "contracts.db"
TABLE_NAME = "contracts_test"

In [153]:
# Connect to the local LanceDB database stored at FILEPATH.
db = lancedb.connect(FILEPATH)

In [154]:
# Create the table from the pydantic schema. mode="overwrite" is expected to
# replace any existing table of this name (see LanceDB's create_table docs), so
# re-running this cell starts from an empty table.
db.create_table(TABLE_NAME, schema=DocumentWithMetadata, mode="overwrite")

[90m[[0m2025-05-18T16:29:02Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /Users/datapsycho/PythonProjects/procure.me/vectordb/contracts.db/contracts_test.lance, it will be created


LanceTable(name='contracts_test', version=1, _conn=LanceDBConnection(uri='/Users/datapsycho/PythonProjects/procure.me/vectordb/contracts.db'))

In [155]:
# Reuse the table when it already exists, otherwise create it from the schema.
# Both the existence check and the open go through TABLE_NAME, so changing the
# constant cannot leave this cell opening a different, hard-coded table.
tbl = (
    db.open_table(TABLE_NAME)
    if TABLE_NAME in db.table_names()
    else db.create_table(TABLE_NAME, schema=DocumentWithMetadata)
)

In [156]:
def loader(file_path: Path) -> ParsedDocument:
    """Load a contract JSON file from disk and validate it into a ``ParsedDocument``.

    Args:
        file_path: Location of the UTF-8 encoded JSON file.

    Returns:
        The object produced by ``ParsedDocument.model_validate`` for the file's content.
    """
    with open(file_path, encoding="utf-8") as handle:
        payload = json.load(handle)
    return ParsedDocument.model_validate(payload)

In [157]:
# Load one sample contract from data/silver/contracts (resolved against the
# parent of the current working directory) to exercise the pipeline.
test_doc = loader(Path().absolute().parent.joinpath("data", "silver", "contracts", "CW0348.pdf.json"))

In [158]:
# Quick look at the parsed document's fields as plain text.
print(test_doc)

id_='37f12f5a-399a-4b3c-aadd-15d88b2fe8be' total_pages=2 file_name='CW0348.pdf' text=' \n PROCUREMENT CONTRACT  \nThis Procurement Contract (the "Contract") is entered into between C11, hereinafter referred to \nas the "Supplier," and Plasma Corporation, hereinafter referred to as the "Buyer."  \n1. TERM  \n1.1 Effective Date: This Contract shall become effective on November 2020.  \n1.2 Expiry Date: The initial term of this Contract shall be for a period of 5 years from the \nEffective Date unless terminated earlier as per the terms of this Contract.  \n2. SUPPLIER DETAILS  \nSupplier Name: C11 Address: 456 Oak Avenue, Cityville, USA Contact Person: Jane Doe \nEmail: jane@xyzsuppliers.com Phone: +1-987-654-3210  \n3. PURCHASE DETAILS  \n3.1 Product/Service Description: The Supplier shall provide the following products/services to \nthe Buyer:  \nProduct X011 \nProduct Y011 \nProduct Z011 \n \n3.2 Volume Discounts: The Buyer and the Supplier agree to the following volume discounts \nba

In [159]:
def convert_doc_to_lancedb(doc: ParsedDocument) -> list[DocumentWithMetadata]:
    """Turn every part of a parsed contract into an embedded LanceDB row.

    Args:
        doc: Parsed contract. Its ``parts`` are iterated; each part supplies a
            ``part`` label and the ``text`` to embed.

    Returns:
        One ``DocumentWithMetadata`` per part, in the order of ``doc.parts``
        (an empty list when the document has no parts).
    """
    return [
        DocumentWithMetadata(
            chunk_id=f"{doc.file_name}-{part.part}",
            doc_id=doc.file_name,
            part=part.part,
            file_name=doc.file_name,
            total_pages=doc.total_pages,
            content=part.text,
            # Stored chunks are documents, not queries: use the text-embedding
            # entry point so any document-side instruction the embedding client
            # is configured with is applied (queries keep get_query_embedding).
            vector=CLIENT.get_text_embedding(part.text),
        )
        for part in doc.parts
    ]

In [160]:
# Embed every part of the sample contract; the embedding client is called once per part.
sample_data = convert_doc_to_lancedb(test_doc)

In [161]:
# Add the embedded chunks to the table.
tbl.add(sample_data)

In [162]:
# Embed the question with the same client that embedded the chunks, so the
# query vector has the length the table's vector column expects.
query_text = "what is the Effective Date: or Contract shall become effective of the Contract?"
query_vector = CLIENT.get_query_embedding(query_text)

In [163]:
# Build the vector search against the table; limit() caps the number of hits.
top_n = 5  # Number of top results to retrieve
results = tbl.search(query_vector).limit(top_n)

In [164]:
# Show the hits as a DataFrame; the search adds a _distance column to the table's own columns.
results.to_pandas()

Unnamed: 0,chunk_id,doc_id,file_name,total_pages,content,part,vector,_distance
0,CW0348.pdf-part - 2,CW0348.pdf,CW0348.pdf,2,6. CONFIDENTIALITY \n6.1 Confidential Informa...,part - 2,"[0.7514758, 0.23749453, -3.101361, -0.24841878...",331.484802
1,CW0348.pdf-part - 1,CW0348.pdf,CW0348.pdf,2,\n PROCUREMENT CONTRACT \nThis Procurement C...,part - 1,"[0.1820217, 0.64779174, -2.8016148, -0.8422372...",385.538605


In [165]:
# Remove one chunk, then re-run the search built earlier. `results` is not
# rebuilt here, yet the output no longer contains the deleted chunk, so the
# query is evaluated when to_pandas() is called.
tbl.delete('chunk_id = "CW0348.pdf-part - 2"')
results.to_pandas()

Unnamed: 0,chunk_id,doc_id,file_name,total_pages,content,part,vector,_distance
0,CW0348.pdf-part - 1,CW0348.pdf,CW0348.pdf,2,\n PROCUREMENT CONTRACT \nThis Procurement C...,part - 1,"[0.1820217, 0.64779174, -2.8016148, -0.8422372...",385.538605


In [168]:
# Inspect the in-memory rows built by convert_doc_to_lancedb; this list is what
# the merge_insert cell feeds back into the table.
sample_data

[DocumentWithMetadata(chunk_id='CW0348.pdf-part - 1', doc_id='CW0348.pdf', file_name='CW0348.pdf', total_pages=2, content=' \n PROCUREMENT CONTRACT  \nThis Procurement Contract (the "Contract") is entered into between C11, hereinafter referred to \nas the "Supplier," and Plasma Corporation, hereinafter referred to as the "Buyer."  \n1. TERM  \n1.1 Effective Date: This Contract shall become effective on November 2020.  \n1.2 Expiry Date: The initial term of this Contract shall be for a period of 5 years from the \nEffective Date unless terminated earlier as per the terms of this Contract.  \n2. SUPPLIER DETAILS  \nSupplier Name: C11 Address: 456 Oak Avenue, Cityville, USA Contact Person: Jane Doe \nEmail: jane@xyzsuppliers.com Phone: +1-987-654-3210  \n3. PURCHASE DETAILS  \n3.1 Product/Service Description: The Supplier shall provide the following products/services to \nthe Buyer:  \nProduct X011 \nProduct Y011 \nProduct Z011 \n \n3.2 Volume Discounts: The Buyer and the Supplier agree t

In [166]:
# Rebuild the search with the same query vector and limit as the earlier search cell.
results = tbl.search(query_vector).limit(top_n)

In [169]:
# Merge keyed on chunk_id with only the "not matched -> insert" rule configured:
# rows from sample_data whose chunk_id is missing from the table are inserted,
# which adds back the chunk deleted earlier (the row count afterwards is 2).
# NOTE(review): `res` is not used in the visible cells.
res = (
    tbl.merge_insert("chunk_id")
    .when_not_matched_insert_all()
    .execute(sample_data)
)

In [170]:
# Confirm the row count after the merge (2: the surviving chunk plus the re-inserted one).
tbl.count_rows()

2