In [3]:
import os
from dotenv import load_dotenv
import glob

In [1]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader

def load_pdfs_from_folder(folder_path: str, glob_pattern: str = "**/*.pdf"):
    """
    Load all PDF files from a folder using LangChain's DirectoryLoader.
    
    Args:
        folder_path: Path to the folder containing PDF files
        glob_pattern: Pattern to match PDF files (default: "**/*.pdf" for recursive search)
    
    Returns:
        List of loaded documents
    
    Example:
        docs = load_pdfs_from_folder("/path/to/pdfs")
    """
    # Configure the DirectoryLoader to use PyPDFLoader for PDF files
    loader = DirectoryLoader(
        path=folder_path,
        glob=glob_pattern,
        loader_cls=PyPDFLoader,
        show_progress=True,
        use_multithreading=True
    )
    
    # Load and return the documents
    documents = loader.load()
    return documents

In [38]:
folders = glob.glob('../medical-records/*')

documents = []

for folder in folders:
    doc_type = os.path.basename(folder)
    folder_docs = load_pdfs_from_folder(folder_path=folder, glob_pattern="**/*.pdf")
    print(folder_docs[0])
    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)

100%|██████████| 1/1 [00:00<00:00, 217.10it/s]


page_content='' metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../medical-records/OTHER/patho1.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}


100%|██████████| 7/7 [00:00<00:00, 246.62it/s]


page_content='' metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../medical-records/RADIOLOGY/Hospital7.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}


100%|██████████| 3/3 [00:00<00:00, 297.03it/s]


page_content='' metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../medical-records/ONCOLOGY/CAP2.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}


100%|██████████| 14/14 [00:00<00:00, 367.38it/s]

page_content='' metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../medical-records/HEMATOLOGY/HEMATOLOGY11.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}





In [18]:
len(documents)

61

In [19]:
documents[20]

Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../medical-records/ONCOLOGY/CAP1.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'doc_type': 'ONCOLOGY'}, page_content='')

In [20]:
from langchain.text_splitter import CharacterTextSplitter

In [21]:
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents=documents)

In [22]:
len(chunks)

0