In [None]:
!pip install docling

In [None]:
from docling.document_converter import DocumentConverter
import json

# Convert the source PDF with Docling and persist its Markdown rendering.
converter = DocumentConverter()

source = "Constitution.pdf"

print(f"Converting {source} ... this might take a minute as it analyzes layout.")

result = converter.convert(source)

# Markdown rendering of the converted document; written to disk below.
markdown_content = result.document.export_to_markdown()

# Dictionary export of the same document (presumably a plain dict rather than
# a JSON string, despite the variable name). It is not used in this cell.
json_content = result.document.export_to_dict()

# Save the Markdown output. The encoding is stated explicitly so that any
# non-ASCII characters are written the same way regardless of the platform's
# default text encoding.
with open("converted_law.md", "w", encoding="utf-8") as f:
    f.write(markdown_content)

print("--- CONVERSION SUCCESSFUL ---")
# Preview only the first 500 characters to keep the cell output short.
print(markdown_content[:500])

In [2]:
# Install LangChain, Qdrant Client, and Embedding models
!pip install -qU langchain-text-splitters langchain-qdrant sentence-transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/377.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m368.6/377.2 kB[0m [31m16.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m377.2/377.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.2.0-py3-none-any.whl (30 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-1.2.0


In [9]:
import re
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

# Load the converted Markdown. UTF-8 is requested explicitly so that decoding
# does not depend on the platform's default text encoding.
with open('converted_law.md', 'r', encoding='utf-8') as f:
    raw_markdown = f.read()

# Only top-level "#" headers are used as split points; the header text is
# recorded under the "Part" metadata key.
headers_to_split_on = [
    ("#", "Part"),
]


# Stage 1: split the document at the configured Markdown headers.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False
)
md_header_splits = markdown_splitter.split_text(raw_markdown)

# Stage 2: recursive splitting of each header section into the final chunks.
# Separators are tried in order: paragraph break, line break, sentence, word.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " "]
)
final_chunks = text_splitter.split_documents(md_header_splits)

print(f"✅ Created {len(final_chunks)} chunks.")

✅ Created 577 chunks.


In [5]:
!pip install -qU langchain-qdrant fastembed

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/108.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.5/108.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.8/324.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
from qdrant_client import QdrantClient
from langchain_huggingface import HuggingFaceEmbeddings

# Dense embedding model configuration.
model_name = "BAAI/bge-m3"
# Embeddings are computed on the CPU here; change to "cuda" on a GPU runtime.
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

# Sparse (BM25) embeddings used alongside the dense vectors for hybrid search.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


# Index every chunk into a local, on-disk Qdrant collection with both dense
# and sparse vectors.
vector_store = QdrantVectorStore.from_documents(
    documents=final_chunks,
    embedding=embeddings,               # dense model
    sparse_embedding=sparse_embeddings, # sparse model
    path="/content/qdrant_db",
    collection_name="nepal_law_hybrid",
    retrieval_mode=RetrievalMode.HYBRID,
    # Rebuild the collection on each run. The store is persisted on disk, so
    # without this a re-run of the cell would add the same chunks again and
    # retrieval would start returning duplicates.
    force_recreate=True,
)

print("✅ Hybrid Search is now active!")

✅ Hybrid Search is now active!


In [11]:
from sentence_transformers import CrossEncoder

# Load the BGE reranker model (a cross-encoder that scores query/passage pairs).
reranker = CrossEncoder(
    "BAAI/bge-reranker-large",
    device="cpu"  # runs on CPU; switch to "cuda" when a GPU is available
)

def rerank_with_bge_large(query, retrieved_docs):
    """Order retrieved documents by cross-encoder relevance to ``query``.

    Args:
        query: The user question as a string.
        retrieved_docs: Iterable of document objects exposing a
            ``page_content`` attribute. Any iterable is accepted, including
            one-shot iterators.

    Returns:
        A new list with the same documents, sorted from most to least
        relevant according to the module-level ``reranker``. Documents with
        equal scores keep their original relative order. An empty input
        yields an empty list.
    """
    # Materialise once: the documents are walked twice (to build the pairs and
    # to attach the scores), which would silently drop everything if a one-shot
    # iterator were passed in.
    docs = list(retrieved_docs)
    if not docs:
        # Nothing to score; skip the model call instead of sending it an
        # empty batch.
        return []

    # Prepare query/passage pairs for the cross-encoder.
    pairs = [(query, doc.page_content) for doc in docs]

    # Compute relevance scores — higher means more relevant.
    scores = reranker.predict(pairs)

    # Pair each document with its score and sort best-first.
    ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

    # Return just the sorted document objects.
    return [doc for doc, _ in ranked]


In [12]:
!pip install -U langchain-groq


Collecting langchain-groq
  Downloading langchain_groq-1.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting groq<1.0.0,>=0.30.0 (from langchain-groq)
  Downloading groq-0.37.1-py3-none-any.whl.metadata (16 kB)
Downloading langchain_groq-1.1.1-py3-none-any.whl (19 kB)
Downloading groq-0.37.1-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq, langchain-groq
Successfully installed groq-0.37.1 langchain-groq-1.1.1


In [13]:
import os
import getpass

# Ask for the key only when it is not already present in the environment, so
# the notebook can be re-run without prompting again. getpass is used so the
# key is not echoed into the cell output.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API Key: ")


Enter your Groq API Key: ··········


In [14]:
from langchain_groq import ChatGroq

# Chat model (served through Groq) that writes the final answer.
llm = ChatGroq(
    max_retries=2,                 # retry budget for failed requests
    max_tokens=512,                # cap on the length of each answer
    temperature=0.2,               # low temperature for a legal Q&A use case
    model="llama-3.1-8b-instant",
)

In [16]:
query = "what does article 273 tells?"

# Step 1: fetch a broad set of 20 candidate chunks from the vector store.
results = vector_store.similarity_search(query, k=20)

# Step 2: re-score the candidates against the query and keep the top nine.
reranked = rerank_with_bge_large(query, results)
print("--- Search Result ---")
results_en = reranked[0:9]


# Step 3: join the kept passages with newlines to form the prompt context.
context = "\n".join(doc.page_content for doc in results_en)
prompt = f"""
You are a constitutional law assistant for Nepal.

RULES:
- Use ONLY the provided context.
- Do NOT invent articles, clauses, or interpretations.
- If the answer is not found, say so explicitly.
- Use formal, neutral legal language.
- Reference article/section numbers when mentioned.

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
"""

# Step 4: send the grounded prompt to the LLM and show its answer.
response = llm.invoke(prompt)

print("Answer:", response.content)




--- Search Result ---
Answer: Article 273 states that if a grave emergency arises in regard to the sovereignty, territorial integrity of Nepal or the security of any part thereof, by war, external aggression, armed rebellion, extreme economic disarray, natural calamity or epidemic, the President may declare or order a state of emergency in respect of the whole of Nepal or of any specified part thereof.
