# In [None]:
import os
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredHTMLLoader,
    TextLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter


# ---------- Loaders ----------
def load_pdf_document(path: str):
    """Load the PDF at *path* with ``PyMuPDFLoader``.

    Args:
        path: Location of the PDF file to read.

    Returns:
        Whatever ``PyMuPDFLoader(path).load()`` returns (expected to be a
        list of LangChain document objects).
    """
    return PyMuPDFLoader(path).load()


def load_docx_document(path: str):
    """Load the Word document at *path* with ``UnstructuredWordDocumentLoader``.

    Args:
        path: Location of the Word file to read.

    Returns:
        Whatever ``UnstructuredWordDocumentLoader(path).load()`` returns
        (expected to be a list of LangChain document objects).
    """
    return UnstructuredWordDocumentLoader(path).load()


def load_html_document(path: str):
    """Load the HTML file at *path* with ``UnstructuredHTMLLoader``.

    Args:
        path: Location of the HTML file to read.

    Returns:
        Whatever ``UnstructuredHTMLLoader(path).load()`` returns (expected to
        be a list of LangChain document objects).
    """
    return UnstructuredHTMLLoader(path).load()


def load_txt_document(path: str):
    """Load the plain-text file at *path* with ``TextLoader``, read as UTF-8.

    Args:
        path: Location of the text file to read.

    Returns:
        Whatever ``TextLoader(path, encoding="utf-8").load()`` returns
        (expected to be a list of LangChain document objects).
    """
    return TextLoader(path, encoding="utf-8").load()


def load_document(path: str):
    """Load *path* with the loader that matches its file extension.

    The extension is compared case-insensitively. Supported extensions are
    ``.pdf``, ``.docx``, ``.html``/``.htm`` and ``.txt``.

    Args:
        path: Location of the file to read.

    Returns:
        The result of the format-specific loader function for *path*.

    Raises:
        ValueError: If the extension is not one of the supported ones
            (including when the path has no extension at all).
    """
    _, raw_suffix = os.path.splitext(path)
    suffix = raw_suffix.lower()

    # Guard-clause dispatch: return as soon as a matching format is found.
    if suffix == ".pdf":
        return load_pdf_document(path)
    if suffix == ".docx":
        return load_docx_document(path)
    if suffix in (".html", ".htm"):
        return load_html_document(path)
    if suffix == ".txt":
        return load_txt_document(path)

    raise ValueError(f"Unsupported file type: {suffix}")


# ---------- Chunk Creator ----------
def create_chunks(documents, chunk_size=2000, chunk_overlap=200):
    """Split *documents* into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Loaded documents to split; passed unchanged to the
            splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for *documents*.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)


# ---------- Main Program ----------
if __name__ == "__main__":
    # Script entry point: load every entry in `data_folder`, split each loaded
    # file into chunks, and write all chunk text into a single output file.
    data_folder = "data"   # put your 4 files inside this folder
    output_file = "output.txt"

    # Flat list of text fragments (file headings, chunk headings, chunk
    # bodies); joined with newlines when written out below.
    all_chunks_text = []

    # os.listdir() returns entries in arbitrary order, so sort the names to
    # keep the layout of the output file reproducible across runs/platforms.
    for file_name in sorted(os.listdir(data_folder)):
        file_path = os.path.join(data_folder, file_name)

        try:
            docs = load_document(file_path)
            print(f"✅ Loaded {file_name} | Documents: {len(docs)}")

            chunks = create_chunks(docs)
            print(f"   ➜ Split into {len(chunks)} chunks\n")

            # Store chunks with headings
            all_chunks_text.append(f"\n\n========== FILE: {file_name} ==========\n")
            for i, chunk in enumerate(chunks, start=1):
                all_chunks_text.append(f"\n--- Chunk {i} ---\n")
                all_chunks_text.append(chunk.page_content)

        except Exception as e:
            # Best-effort batch run: an unsupported or unreadable entry is
            # reported and skipped so the remaining files are still processed.
            print(f"⚠️ Skipped {file_name} → {e}\n")

    # Write all chunks into output.txt
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(all_chunks_text))

    print(f"\n✅ DONE! All chunked output saved to: {output_file}")
