In [None]:
import hashlib
import os

from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredHTMLLoader,
    TextLoader
)
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# ---------- Loaders ----------
def load_document(path: str):
    """Load a single file with the loader that matches its extension.

    The extension is compared case-insensitively. Recognised extensions are
    ``.pdf``, ``.docx``, ``.html`` / ``.htm`` and ``.txt`` (read as UTF-8).

    Args:
        path: Location of the file to load.

    Returns:
        Whatever the selected loader's ``load()`` call returns.

    Raises:
        ValueError: If the extension is not one of the recognised ones.
    """
    suffix = os.path.splitext(path)[1].lower()

    # Factories are lazy so only the loader that is actually needed gets built.
    loader_factories = {
        ".pdf": lambda: PyMuPDFLoader(path),
        ".docx": lambda: UnstructuredWordDocumentLoader(path),
        ".html": lambda: UnstructuredHTMLLoader(path),
        ".htm": lambda: UnstructuredHTMLLoader(path),
        ".txt": lambda: TextLoader(path, encoding="utf-8"),
    }

    make_loader = loader_factories.get(suffix)
    if make_loader is None:
        raise ValueError(f"Unsupported file type: {suffix}")
    return make_loader().load()


# ---------- MAIN ----------
# Load every file found directly inside `data_folder`; files that cannot be
# loaded are reported and skipped so one bad file does not abort the run.
data_folder = "data"
all_docs = []

# os.listdir() returns entries in arbitrary order. Sorting (case-insensitively,
# with the raw name as a tie-breaker) makes the document order reproducible
# from run to run.
file_names = sorted(os.listdir(data_folder), key=lambda name: (name.casefold(), name))

for file_name in file_names:
    file_path = os.path.join(data_folder, file_name)

    # Keep the try body limited to the call that is expected to fail; the
    # bookkeeping lives in the else branch.
    try:
        docs = load_document(file_path)
    except Exception as e:
        # Best-effort boundary: report the reason and move on to the next file.
        print(f"⚠️ Skipped: {file_name} -> {e}")
    else:
        all_docs.extend(docs)
        print(f"✅ Loaded: {file_name} | docs: {len(docs)}")

print("\nTotal loaded docs:", len(all_docs))


# 1) Split
# Break the loaded documents into overlapping chunks for embedding.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunks = splitter.split_documents(all_docs)

print("✅ Total chunks created:", len(chunks))


# 2) Embeddings (Latest)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


# 3) Vector DB (Auto-persist)
persist_directory = "chroma_store"

# The store lives on disk, so with auto-generated ids every re-run would add
# another copy of each chunk and searches would return duplicates. Deriving the
# id from the chunk's source and text lets a re-run overwrite the entry it
# wrote before. Ids must be unique within one batch, so identical chunks from
# the same source are told apart by an occurrence counter.
chunk_ids = []
seen_digests = {}
for chunk in chunks:
    source = str(chunk.metadata.get("source", ""))
    digest = hashlib.sha256(
        f"{source}\x00{chunk.page_content}".encode("utf-8", errors="replace")
    ).hexdigest()
    occurrence = seen_digests.get(digest, 0)
    seen_digests[digest] = occurrence + 1
    chunk_ids.append(f"{digest}-{occurrence}")

if chunks:
    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        ids=chunk_ids,
        persist_directory=persist_directory,
    )
    print("✅ Stored chunks into ChromaDB")
else:
    # Nothing to add: open the persisted store as-is so `vectordb` is still
    # defined for the code that follows, instead of handing the store an
    # empty batch.
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_model,
    )
    print("⚠️ No chunks to store; opened the existing ChromaDB store unchanged")


# 4) Retrieval
# Ask for a question, fetch the closest chunks and print a preview of each.
top_k = 5          # maximum number of chunks to retrieve
preview_chars = 700  # how much of each chunk to print
separator = "=" * 60

query = input("\nEnter your question: ").strip()

if not query:
    # An empty question carries no meaning to search for; skip the lookup.
    print("\n⚠️ No question entered; nothing to search for.")
    results = []
else:
    print("\n🔍 Searching...")

    results = vectordb.similarity_search(query, k=top_k)

    # Report the number actually returned, which can be lower than top_k.
    print(f"\n✅ Top {len(results)} Relevant Chunks:\n")

for i, doc in enumerate(results, start=1):
    print(separator)
    print(f"Result {i}")
    print(separator)
    print(doc.page_content[:preview_chars])


✅ Loaded: India Code_ Section Details.html | docs: 1
✅ Loaded: legal document.txt | docs: 1
✅ Loaded: THE INDIAN PENAL CODE.docx | docs: 1
✅ Loaded: the_constitution_of_india.pdf | docs: 256

Total loaded docs: 259
✅ Total chunks created: 1719
