In [1]:
import os
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredHTMLLoader,
    TextLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter


# ---------- Loaders ----------
def load_pdf_document(path: str):
    loader = PyMuPDFLoader(path)
    return loader.load()


def load_docx_document(path: str):
    loader = UnstructuredWordDocumentLoader(path)
    return loader.load()


def load_html_document(path: str):
    loader = UnstructuredHTMLLoader(path)
    return loader.load()


def load_txt_document(path: str):
    loader = TextLoader(path, encoding="utf-8")
    return loader.load()


def load_document(path: str):
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        return load_pdf_document(path)
    elif ext == ".docx":
        return load_docx_document(path)
    elif ext in [".html", ".htm"]:
        return load_html_document(path)
    elif ext == ".txt":
        return load_txt_document(path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")


# ---------- Chunk Creator ----------
def create_chunks(documents, chunk_size=2000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)


# ---------- Main Program ----------
if __name__ == "__main__":
    data_folder = "data"   # put your 4 files inside this folder
    output_file = "output.txt"

    all_chunks_text = []

    for file_name in os.listdir(data_folder):
        file_path = os.path.join(data_folder, file_name)

        try:
            docs = load_document(file_path)
            print(f"✅ Loaded {file_name} | Documents: {len(docs)}")

            chunks = create_chunks(docs)
            print(f"   ➜ Split into {len(chunks)} chunks\n")

            # Store chunks with headings
            all_chunks_text.append(f"\n\n========== FILE: {file_name} ==========\n")
            for i, chunk in enumerate(chunks, start=1):
                all_chunks_text.append(f"\n--- Chunk {i} ---\n")
                all_chunks_text.append(chunk.page_content)

        except Exception as e:
            print(f"⚠️ Skipped {file_name} → {e}\n")

    # Write all chunks into output.txt
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(all_chunks_text))

    print(f"\n✅ DONE! All chunked output saved to: {output_file}")


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data'

In [3]:
import os
os.makedirs("data", exist_ok=True)
print("✅ data folder created")


✅ data folder created


In [5]:
import os
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredHTMLLoader,
    TextLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter


# ---------- Loaders ----------
def load_pdf_document(path: str):
    loader = PyMuPDFLoader(path)
    return loader.load()


def load_docx_document(path: str):
    loader = UnstructuredWordDocumentLoader(path)
    return loader.load()


def load_html_document(path: str):
    loader = UnstructuredHTMLLoader(path)
    return loader.load()


def load_txt_document(path: str):
    loader = TextLoader(path, encoding="utf-8")
    return loader.load()


def load_document(path: str):
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        return load_pdf_document(path)
    elif ext == ".docx":
        return load_docx_document(path)
    elif ext in [".html", ".htm"]:
        return load_html_document(path)
    elif ext == ".txt":
        return load_txt_document(path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")


# ---------- Chunk Creator ----------
def create_chunks(documents, chunk_size=2000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)


# ---------- Main Program ----------
if __name__ == "__main__":
    data_folder = "data"   # put your 4 files inside this folder
    output_file = "output.txt"

    all_chunks_text = []

    for file_name in os.listdir(data_folder):
        file_path = os.path.join(data_folder, file_name)

        try:
            docs = load_document(file_path)
            print(f"✅ Loaded {file_name} | Documents: {len(docs)}")

            chunks = create_chunks(docs)
            print(f"   ➜ Split into {len(chunks)} chunks\n")

            # Store chunks with headings
            all_chunks_text.append(f"\n\n========== FILE: {file_name} ==========\n")
            for i, chunk in enumerate(chunks, start=1):
                all_chunks_text.append(f"\n--- Chunk {i} ---\n")
                all_chunks_text.append(chunk.page_content)

        except Exception as e:
            print(f"⚠️ Skipped {file_name} → {e}\n")

    # Write all chunks into output.txt
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(all_chunks_text))

    print(f"\n✅ DONE! All chunked output saved to: {output_file}")


✅ Loaded India Code_ Section Details.html | Documents: 1
   ➜ Split into 1 chunks

✅ Loaded legal document.txt | Documents: 1
   ➜ Split into 32 chunks

✅ Loaded THE INDIAN PENAL CODE.docx | Documents: 1
   ➜ Split into 267 chunks

⚠️ Skipped the_constitution_of_india.pdf → pymupdf package not found, please install it with `pip install pymupdf`


✅ DONE! All chunked output saved to: output.txt


In [7]:
!pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-win_amd64.whl (18.4 MB)
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
   -- ------------------------------------- 1.0/18.4 MB 6.3 MB/s eta 0:00:03
   --- ------------------------------------ 1.8/18.4 MB 4.8 MB/s eta 0:00:04
   ---- ----------------------------------- 2.1/18.4 MB 4.7 MB/s eta 0:00:04
   ------ --------------------------------- 3.1/18.4 MB 4.3 MB/s eta 0:00:04
   --------- ------------------------------ 4.5/18.4 MB 4.2 MB/s eta 0:00:04
   ----------- ---------------------------- 5.2/18.4 MB 4.1 MB/s eta 0:00:04
   ------------- -------------------------- 6.0/18.4 MB 4.1 MB/s eta 0:00:04
   -------------- ------------------------- 6.8/18.4 MB 4.1 MB/s eta 0:00:03
   ---------------- ----------------------- 7.6/18.4 MB 4.1 MB/s eta 0:00:03
   ------------------ --------------------- 8.4/18.4 MB 4.0 MB/s eta 0:00:03
   ----

In [9]:
import os
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredHTMLLoader,
    TextLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter


# ---------- Loaders ----------
def load_pdf_document(path: str):
    loader = PyMuPDFLoader(path)
    return loader.load()


def load_docx_document(path: str):
    loader = UnstructuredWordDocumentLoader(path)
    return loader.load()


def load_html_document(path: str):
    loader = UnstructuredHTMLLoader(path)
    return loader.load()


def load_txt_document(path: str):
    loader = TextLoader(path, encoding="utf-8")
    return loader.load()


def load_document(path: str):
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        return load_pdf_document(path)
    elif ext == ".docx":
        return load_docx_document(path)
    elif ext in [".html", ".htm"]:
        return load_html_document(path)
    elif ext == ".txt":
        return load_txt_document(path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")


# ---------- Chunk Creator ----------
def create_chunks(documents, chunk_size=2000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)


# ---------- Main Program ----------
if __name__ == "__main__":
    data_folder = "data"   # put your 4 files inside this folder
    output_file = "output.txt"

    all_chunks_text = []

    for file_name in os.listdir(data_folder):
        file_path = os.path.join(data_folder, file_name)

        try:
            docs = load_document(file_path)
            print(f"✅ Loaded {file_name} | Documents: {len(docs)}")

            chunks = create_chunks(docs)
            print(f"   ➜ Split into {len(chunks)} chunks\n")

            # Store chunks with headings
            all_chunks_text.append(f"\n\n========== FILE: {file_name} ==========\n")
            for i, chunk in enumerate(chunks, start=1):
                all_chunks_text.append(f"\n--- Chunk {i} ---\n")
                all_chunks_text.append(chunk.page_content)

        except Exception as e:
            print(f"⚠️ Skipped {file_name} → {e}\n")

    # Write all chunks into output.txt
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(all_chunks_text))

    print(f"\n✅ DONE! All chunked output saved to: {output_file}")


✅ Loaded India Code_ Section Details.html | Documents: 1
   ➜ Split into 1 chunks

✅ Loaded legal document.txt | Documents: 1
   ➜ Split into 32 chunks

✅ Loaded THE INDIAN PENAL CODE.docx | Documents: 1
   ➜ Split into 267 chunks

✅ Loaded the_constitution_of_india.pdf | Documents: 256
   ➜ Split into 557 chunks


✅ DONE! All chunked output saved to: output.txt
