In [2]:
import os
import uuid
import requests
from pathlib import Path
from qdrant_client import QdrantClient, models  
from metadata_registry import FILE_METADATA, DEFAULT_METADATA

In [4]:
class LMStudioBgeM3Dense:
    """Minimal client for an OpenAI-style ``/embeddings`` HTTP endpoint.

    Exposes the two methods used by the rest of the notebook:
    ``embed_documents`` (many texts -> many vectors) and ``embed_query``
    (one text -> one vector).

    Args:
        base_url: Root URL of the API, e.g. ``http://127.0.0.1:1234/v1``.
            A trailing slash is tolerated.
        model: Model identifier sent in every request body.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled server would block the
            kernel forever; the default is generous because local inference
            can be slow.
    """

    def __init__(self, base_url, model, timeout=300):
        # Strip a trailing "/" so ".../v1/" does not become ".../v1//embeddings".
        self.url = f"{base_url.rstrip('/')}/embeddings"
        self.model = model
        self.timeout = timeout

    def embed_documents(self, texts):
        """Embed a batch of texts and return one vector per input, in order.

        Args:
            texts: Sequence of raw strings. A single ``str`` is treated as a
                one-element batch.

        Returns:
            List of embedding vectors, aligned with ``texts``. An empty input
            returns ``[]`` without contacting the server.

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.Timeout: No answer within ``self.timeout`` seconds.
            ValueError: The server returned a different number of embeddings
                than texts were sent.
        """
        if isinstance(texts, str):
            texts = [texts]
        if not texts:
            return []

        r = requests.post(
            self.url,
            json={
                "model": self.model,
                "input": texts  # MUST be raw strings
            },
            timeout=self.timeout,
        )
        r.raise_for_status()
        data = r.json()["data"]

        # Callers pair vectors with payloads by position, so a short or long
        # response must fail loudly instead of silently misaligning chunks.
        if len(data) != len(texts):
            raise ValueError(
                f"Embedding endpoint returned {len(data)} vectors for {len(texts)} inputs"
            )

        # Restore input order when the response carries an "index" field; the
        # sort is stable, so responses without it keep their original order.
        ordered = sorted(data, key=lambda d: d.get("index", 0))
        return [d["embedding"] for d in ordered]

    def embed_query(self, text):
        """Embed a single string and return its vector."""
        return self.embed_documents([text])[0]

In [5]:
# Shared embedding client used by the sanity check and by ingestion below.
# It targets a server on localhost port 1234 and requests the
# "text-embedding-bge-m3" model for every call.
embedder = LMStudioBgeM3Dense(
    base_url="http://127.0.0.1:1234/v1",
    model="text-embedding-bge-m3"
)

In [6]:
# Sanity check: the embedding size must be 1024, which is the vector size the
# Qdrant collection is created with. Fail here rather than at upsert time.
probe_dim = len(embedder.embed_query("test sentence"))
assert probe_dim == 1024, f"expected 1024-dimensional embeddings, got {probe_dim}"
print(probe_dim)

1024


In [7]:
def parse_markdown_unified(md_text):
    """Split Markdown into section-tagged text blocks and table rows.

    The text is processed line by line; blank lines are ignored.

    * A line starting with ``#`` sets the current section name (the text
      after the leading ``#`` characters). Content before the first heading
      belongs to the section ``"General"``.
    * A line containing ``|`` but not ``---`` is a table line. The first such
      line of a table supplies the column headers; each later line with the
      same number of non-empty cells becomes a ``table_row`` item. Rows with
      a different cell count are skipped.
    * A line containing both ``|`` and ``---`` is skipped while inside a
      table (it is the header separator) and is plain text otherwise.
    * Any other line is a ``text_block`` item. It also ends the current
      table, so prose after a table is kept and a later table in the same
      section gets its own headers.

    Args:
        md_text: Markdown source as a single string.

    Returns:
        List of dicts in document order, each either
        ``{"type": "table_row", "section": str, "data": {header: cell}}`` or
        ``{"type": "text_block", "section": str, "text": str}``.
    """
    parsed = []

    section = "General"
    headers = []
    in_table = False

    for line in md_text.split("\n"):
        s = line.strip()
        if not s:
            continue

        if s.startswith("#"):
            section = s.lstrip("#").strip()
            in_table = False
            continue

        if "|" in s:
            if "---" not in s:
                # Empty cells are dropped, which also removes the empty
                # strings produced by the leading and trailing pipe.
                cells = [c.strip() for c in s.split("|") if c.strip()]
                if not in_table:
                    headers = cells
                    in_table = True
                elif len(cells) == len(headers):
                    parsed.append({
                        "type": "table_row",
                        "section": section,
                        "data": dict(zip(headers, cells))
                    })
                continue
            if in_table:
                # Header separator such as "|---|---|": carries no content.
                continue

        # Ordinary prose. Leaving table mode here is what keeps text that
        # follows a table from being discarded until the next heading.
        in_table = False
        parsed.append({
            "type": "text_block",
            "section": section,
            "text": s
        })

    return parsed

In [8]:
from langchain_openai import ChatOpenAI

# Chat model client used to turn table rows into sentences.
# It talks to the same localhost:1234 base URL as the embedding client.
# NOTE(review): "not-needed" looks like a placeholder key that the local
# server ignores - confirm before pointing this at a hosted endpoint.
llm = ChatOpenAI(
    base_url="http://127.0.0.1:1234/v1",
    api_key="not-needed",
    model="meta-llama-3.1-8b-instruct",
    temperature=0.1
)

def describe_table_row(section, row):
    """Render one table row as a single factual sentence.

    The row is sent to the module-level ``llm``. This is deliberately
    best-effort: if the call fails, or the model returns only whitespace, a
    deterministic sentence built from the raw cells is returned instead so
    ingestion can continue.

    Args:
        section: Name of the section the table belongs to.
        row: Mapping of column header to cell text.

    Returns:
        The model's sentence with surrounding whitespace stripped, or the
        fallback ``"In <section>, <header>: <cell>, ..."``.
    """
    data = ", ".join(f"{k}: {v}" for k, v in row.items())
    fallback = f"In {section}, {data}"
    prompt = f"Convert this table row into one factual sentence.\nSection: {section}\nRow: {data}"
    try:
        sentence = llm.invoke(prompt).content.strip()
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must still
        # be able to stop a long ingestion run.
        return fallback
    # An empty reply would otherwise be embedded as an empty chunk.
    return sentence or fallback


In [9]:
# Smoke test: one trivial request to confirm the chat model is reachable
# before the ingestion cells depend on it.
resp = llm.invoke("Say only the word: OK")
print("LLM response:", resp.content)

LLM response: OK


In [10]:
# Qdrant client for the instance on localhost:6333, shared by the ingestion
# cells below.
client = QdrantClient(
    url="http://localhost:6333",
    timeout=120)
collection = "md_bge_m3_source"

# DESTRUCTIVE: an existing collection with this name is dropped, so running
# this cell discards every previously ingested point.
if client.collection_exists(collection):
    client.delete_collection(collection)

# Recreate the collection empty. size=1024 has to equal the embedding
# dimension (the check cell above prints 1024); vectors are compared by
# cosine distance.
client.create_collection(
    collection_name=collection,
    vectors_config=models.VectorParams(
        size=1024,
        distance=models.Distance.COSINE
    )
)

True

In [11]:
def batched(iterable, size):
    """Yield consecutive slices of ``iterable`` holding at most ``size`` items.

    ``iterable`` must support ``len()`` and slicing (list, tuple, str, ...).
    The final slice is shorter when the length is not a multiple of ``size``.
    """
    starts = range(0, len(iterable), size)
    for start in starts:
        stop = start + size
        yield iterable[start:stop]

In [14]:
def get_metadata_for_file(filename: str) -> dict:
    """Look up the metadata registered for ``filename``.

    Returns the ``FILE_METADATA`` entry when one exists, otherwise the shared
    ``DEFAULT_METADATA`` object (not a copy).
    """
    if filename in FILE_METADATA:
        return FILE_METADATA[filename]
    return DEFAULT_METADATA

In [15]:
def ingest_markdown_file(md_path: str, embed_batch_size=16, upsert_batch_size=64):
    """Parse one Markdown file, embed its chunks and upsert them into Qdrant.

    Each parsed item becomes one point. Table rows are embedded as a sentence
    produced by ``describe_table_row`` and carry their cells as payload
    fields; text blocks are embedded as ``"Context: <section>. Content:
    <text>"`` and carry the raw text under ``original_text``. Every payload
    also holds ``source_file``, ``section``, ``type``, ``chunk_id`` and the
    file-level metadata ``doc_type``, ``category``, ``jurisdiction`` and
    ``authority``.

    Point ids are derived from the file name and chunk index, so ingesting
    the same file again overwrites its points instead of duplicating them.

    Args:
        md_path: Path to the Markdown file.
        embed_batch_size: Number of texts sent per embedding request.
        upsert_batch_size: Number of points sent per Qdrant upsert.

    Raises:
        RuntimeError: The embedder returned a different number of vectors
            than there are chunks.
    """
    path = Path(md_path)
    source_file = path.name
    meta = get_metadata_for_file(source_file)

    content = path.read_text(encoding="utf-8")
    items = parse_markdown_unified(content)

    # dynamic metadata, identical for every chunk of this file
    file_metadata = {
        "doc_type": meta.get("doc_type"),
        "category": meta.get("category"),
        "jurisdiction": meta.get("jurisdiction"),
        "authority": meta.get("authority"),
    }

    texts, payloads = [], []

    for i, item in enumerate(items):
        bookkeeping = {
            "source_file": source_file,
            "section": item["section"],
            "chunk_id": f"{path.stem}_{i}",
        }

        if item["type"] == "table_row":
            text = describe_table_row(item["section"], item["data"])
            # Table cells are spread first so a column that happens to be
            # named "type", "section", "chunk_id", ... can never overwrite
            # the fields used to filter and identify chunks.
            payload = {
                **item["data"],
                **bookkeeping,
                "type": "table",
                **file_metadata,
            }
        else:
            text = f"Context: {item['section']}. Content: {item['text']}"
            payload = {
                **bookkeeping,
                "type": "text",
                "original_text": item["text"],
                **file_metadata,
            }

        texts.append(text)
        payloads.append(payload)

    # ---- EMBEDDING (BATCHED) ----
    all_vectors = []
    for batch in batched(texts, embed_batch_size):
        all_vectors.extend(embedder.embed_documents(batch))

    # Vectors and payloads are paired by position; a count mismatch would
    # attach vectors to the wrong chunks or silently drop some.
    if len(all_vectors) != len(payloads):
        raise RuntimeError(
            f"Got {len(all_vectors)} vectors for {len(payloads)} chunks of {md_path}"
        )

    # ---- UPSERT (BATCHED) ----
    points = [
        models.PointStruct(
            # Deterministic id: the same file and chunk index always map to
            # the same point.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source_file}/{payload['chunk_id']}")),
            vector=vector,
            payload=payload
        )
        for vector, payload in zip(all_vectors, payloads)
    ]

    for batch in batched(points, upsert_batch_size):
        client.upsert(
            collection_name=collection,
            points=batch
        )

    print(f"Ingested safely: {md_path}")


In [16]:
# Directory holding the Markdown sources; only files directly inside it are
# picked up (the glob below is not recursive).
MARKDOWN_DIR = "source_of_truth_markdowns"  # folder containing all your .md files

# Materialise the glob into a list so the count can be reported up front.
md_files = list(Path(MARKDOWN_DIR).glob("*.md"))

print(f"Found {len(md_files)} markdown files")

# Ingest the files one after another; each call prints its own confirmation.
for md_file in md_files:
    ingest_markdown_file(str(md_file))

Found 34 markdown files
Ingested safely: source_of_truth_markdowns\Bharatiya Sakshya Adhiniyam (BSA), 2023.md
Ingested safely: source_of_truth_markdowns\BNS.md
Ingested safely: source_of_truth_markdowns\BNSS.md
Ingested safely: source_of_truth_markdowns\budget_2025-2026.md
Ingested safely: source_of_truth_markdowns\Charter of Patients Rights (NHRC).md
Ingested safely: source_of_truth_markdowns\Code on Wages, 2019.md
Ingested safely: source_of_truth_markdowns\CONSTITUTION_OF_INDIA.md
Ingested safely: source_of_truth_markdowns\Consumer Protection Act, 2019.md
Ingested safely: source_of_truth_markdowns\DPDP Act, 2023.md
Ingested safely: source_of_truth_markdowns\GST Acts (CGST SGST).md
Ingested safely: source_of_truth_markdowns\Income Tax Act, 1961.md
Ingested safely: source_of_truth_markdowns\Industrial Disputes Act, 1947.md
Ingested safely: source_of_truth_markdowns\IT Act, 2000.md
Ingested safely: source_of_truth_markdowns\Lokpal and Lokayuktas Act, 2013.md
Ingested safely: source_of_t