# Process PDF Documents

In [1]:
import hashlib
import os
import pickle

from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Download files and process pdf files with marker converter (.pdf -> .md)

https://github.com/VikParuchuri/marker?tab=readme-ov-file

In [2]:
def parse_file_name(filename):
    # Remove space and dots on filename
    if filename.endswith(".pdf"):
        name_part = filename[:-4]
        name_part = name_part.replace(" ", "").replace(".", "")
        cleaned_filename = name_part + ".pdf"
    else:
        cleaned_filename = filename.replace(" ", "").replace(".", "")
    return cleaned_filename

In [3]:
def convert_pdf_to_markdown(fname, reference_folder, model_lst, md_out_path=None):

    md_filename = fname.rsplit(".", 1)[0] + ".md"

    pdf_filename = os.path.join(reference_folder, fname)

    print(pdf_filename)
    full_text, _, _ = convert_single_pdf(pdf_filename, model_lst, batch_multiplier=1)

    if md_out_path:
        with open(os.path.join(md_out_path, md_filename), "w+") as f:
            f.write(full_text)
    else:
        return full_text

In [4]:
def download_and_process_pdf_file(
    f_key, text_splitter, markdown_splitter, model_lst, reference_folder="../data/"
):

    temp_file_name = parse_file_name(f_key)
    temp_file_path = os.path.join(reference_folder, temp_file_name)
    
    mdfile = convert_pdf_to_markdown(temp_file_name, reference_folder, model_lst, None)

    md_header_split = markdown_splitter.split_text(mdfile)

    documents = []
    for split in md_header_split:

        split_texts = text_splitter.split_text(split.page_content)

        for i, split_text in enumerate(split_texts):

            document_id = f"{f_key}_part_{i}"
            hash_object = hashlib.md5(document_id.encode())
            hash_hex = hash_object.hexdigest()
            document_id = hash_hex[:10]

            metadata_dict = {
                "document_id": document_id,
                "pdf_name": f_key,
                "pdf_part": i,
            }

            metadata_dict.update(split.metadata)

            documents.append({"metadata": metadata_dict, "content": split_text})
            
    return documents

List files in folder

In [5]:
import os


def list_pdf_files(directory_path):
    """
    Lists all PDF files in the given directory and attempts to read them as binary data.

    Parameters:
        directory_path (str): The path to the directory containing PDF files.

    Returns:
        dict: A dictionary where keys are filenames and values are raw binary content or text content (if readable).
    """
    pdf_files_content = {}

    # Check if the directory exists
    if not os.path.isdir(directory_path):
        print(f"The directory '{directory_path}' does not exist.")
        return pdf_files_content

    filenames = [f for f in os.listdir(directory_path) if f.lower().endswith('.pdf')]

    return filenames


# Usage example
directory = "../data/"  # Replace with your directory path
filenames = list_pdf_files(directory)
print(f"Number of PDFs read: {len(filenames)}")

Number of PDFs read: 12


Setup marker class for converting pdf to markdown

In [6]:
configure_logging()
model_lst = load_all_models()

Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
Loaded detection model vikp/surya_layout3 on device cuda with dtype torch.float16
Loaded reading order model vikp/surya_order on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Loaded texify model to cuda with torch.float16 dtype


Setup process splitters

In [7]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
chunk_size = 500
chunk_overlap = 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [8]:
documents = []

for filename in tqdm(filenames):

    print("\n filename: {} \n".format(filename))

    splitted_doc = download_and_process_pdf_file(
        filename, text_splitter, markdown_splitter, model_lst, reference_folder="../data/"
    )
    
    documents.append(splitted_doc)
    

  0%|          | 0/12 [00:00<?, ?it/s]


 filename: Responsible_travel.pdf 

../data/Responsible_travel.pdf



Detecting bboxes:   0%|          | 0/2 [00:00<?, ?it/s][A
Detecting bboxes:  50%|█████     | 1/2 [00:00<00:00,  1.08it/s][A
Detecting bboxes: 100%|██████████| 2/2 [00:01<00:00,  1.60it/s][A

Detecting bboxes:   0%|          | 0/1 [00:00<?, ?it/s][A
Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it][A

Finding reading order:   0%|          | 0/1 [00:00<?, ?it/s][A
Finding reading order: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it][A
  8%|▊         | 1/12 [00:07<01:27,  7.91s/it]


 filename: Thailand.pdf 

../data/Thailand.pdf



Detecting bboxes:   0%|          | 0/17 [00:00<?, ?it/s][A
Detecting bboxes:   6%|▌         | 1/17 [00:00<00:11,  1.45it/s][A
Detecting bboxes:  12%|█▏        | 2/17 [00:01<00:09,  1.54it/s][A
Detecting bboxes:  18%|█▊        | 3/17 [00:01<00:09,  1.51it/s][A
Detecting bboxes:  24%|██▎       | 4/17 [00:02<00:08,  1.52it/s][A
Detecting bboxes:  29%|██▉       | 5/17 [00:03<00:07,  1.54it/s][A
Detecting bboxes:  35%|███▌      | 6/17 [00:03<00:07,  1.54it/s][A
Detecting bboxes:  41%|████      | 7/17 [00:04<00:06,  1.55it/s][A
Detecting bboxes:  47%|████▋     | 8/17 [00:05<00:05,  1.53it/s][A
Detecting bboxes:  53%|█████▎    | 9/17 [00:05<00:05,  1.54it/s][A
Detecting bboxes:  59%|█████▉    | 10/17 [00:06<00:04,  1.54it/s][A
Detecting bboxes:  65%|██████▍   | 11/17 [00:07<00:03,  1.54it/s][A
Detecting bboxes:  71%|███████   | 12/17 [00:07<00:03,  1.54it/s][A
Detecting bboxes:  76%|███████▋  | 13/17 [00:08<00:02,  1.54it/s][A
Detecting bboxes:  82%|████████▏ | 14/17 [00:09<00:


 filename: Laos.pdf 

../data/Laos.pdf



Detecting bboxes:   0%|          | 0/8 [00:00<?, ?it/s][A
Detecting bboxes:  12%|█▎        | 1/8 [00:00<00:04,  1.55it/s][A
Detecting bboxes:  25%|██▌       | 2/8 [00:01<00:03,  1.58it/s][A
Detecting bboxes:  38%|███▊      | 3/8 [00:01<00:03,  1.59it/s][A
Detecting bboxes:  50%|█████     | 4/8 [00:02<00:02,  1.56it/s][A
Detecting bboxes:  62%|██████▎   | 5/8 [00:03<00:01,  1.55it/s][A
Detecting bboxes:  75%|███████▌  | 6/8 [00:03<00:01,  1.54it/s][A
Detecting bboxes:  88%|████████▊ | 7/8 [00:04<00:00,  1.56it/s][A
Detecting bboxes: 100%|██████████| 8/8 [00:05<00:00,  1.55it/s][A

Detecting bboxes:   0%|          | 0/6 [00:00<?, ?it/s][A
Detecting bboxes:  17%|█▋        | 1/6 [00:01<00:06,  1.26s/it][A
Detecting bboxes:  33%|███▎      | 2/6 [00:02<00:04,  1.23s/it][A
Detecting bboxes:  50%|█████     | 3/6 [00:03<00:03,  1.22s/it][A
Detecting bboxes:  67%|██████▋   | 4/6 [00:04<00:02,  1.21s/it][A
Detecting bboxes:  83%|████████▎ | 5/6 [00:06<00:01,  1.22s/it][A
Detecting


 filename: Brunei.pdf 

../data/Brunei.pdf



Detecting bboxes:   0%|          | 0/4 [00:00<?, ?it/s][A
Detecting bboxes:  25%|██▌       | 1/4 [00:00<00:01,  1.67it/s][A
Detecting bboxes:  50%|█████     | 2/4 [00:01<00:01,  1.67it/s][A
Detecting bboxes:  75%|███████▌  | 3/4 [00:01<00:00,  1.64it/s][A
Detecting bboxes: 100%|██████████| 4/4 [00:02<00:00,  1.63it/s][A

Detecting bboxes:   0%|          | 0/3 [00:00<?, ?it/s][A
Detecting bboxes:  33%|███▎      | 1/3 [00:01<00:02,  1.19s/it][A
Detecting bboxes:  67%|██████▋   | 2/3 [00:02<00:01,  1.20s/it][A
Detecting bboxes: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it][A

Finding reading order:   0%|          | 0/3 [00:00<?, ?it/s][A
Finding reading order:  33%|███▎      | 1/3 [00:00<00:01,  1.10it/s][A
Finding reading order:  67%|██████▋   | 2/3 [00:01<00:00,  1.08it/s][A
Finding reading order: 100%|██████████| 3/3 [00:02<00:00,  1.22it/s][A
 33%|███▎      | 4/12 [02:06<04:03, 30.43s/it]


 filename: Malaysia.pdf 

../data/Malaysia.pdf



Detecting bboxes:   0%|          | 0/10 [00:00<?, ?it/s][A
Detecting bboxes:  10%|█         | 1/10 [00:00<00:05,  1.58it/s][A
Detecting bboxes:  20%|██        | 2/10 [00:01<00:05,  1.55it/s][A
Detecting bboxes:  30%|███       | 3/10 [00:01<00:04,  1.54it/s][A
Detecting bboxes:  40%|████      | 4/10 [00:02<00:03,  1.55it/s][A
Detecting bboxes:  50%|█████     | 5/10 [00:03<00:03,  1.55it/s][A
Detecting bboxes:  60%|██████    | 6/10 [00:03<00:02,  1.53it/s][A
Detecting bboxes:  70%|███████   | 7/10 [00:04<00:01,  1.53it/s][A
Detecting bboxes:  80%|████████  | 8/10 [00:05<00:01,  1.53it/s][A
Detecting bboxes:  90%|█████████ | 9/10 [00:05<00:00,  1.53it/s][A
Detecting bboxes: 100%|██████████| 10/10 [00:06<00:00,  1.66it/s][A

Detecting bboxes:   0%|          | 0/7 [00:00<?, ?it/s][A
Detecting bboxes:  14%|█▍        | 1/7 [00:01<00:07,  1.27s/it][A
Detecting bboxes:  29%|██▊       | 2/7 [00:02<00:06,  1.24s/it][A
Detecting bboxes:  43%|████▎     | 3/7 [00:03<00:04,  1.24s/it]


 filename: Vietnam.pdf 

../data/Vietnam.pdf



Detecting bboxes:   0%|          | 0/13 [00:00<?, ?it/s][A
Detecting bboxes:   8%|▊         | 1/13 [00:00<00:08,  1.50it/s][A
Detecting bboxes:  15%|█▌        | 2/13 [00:01<00:07,  1.52it/s][A
Detecting bboxes:  23%|██▎       | 3/13 [00:01<00:06,  1.53it/s][A
Detecting bboxes:  31%|███       | 4/13 [00:02<00:05,  1.54it/s][A
Detecting bboxes:  38%|███▊      | 5/13 [00:03<00:05,  1.55it/s][A
Detecting bboxes:  46%|████▌     | 6/13 [00:03<00:04,  1.55it/s][A
Detecting bboxes:  54%|█████▍    | 7/13 [00:04<00:03,  1.54it/s][A
Detecting bboxes:  62%|██████▏   | 8/13 [00:05<00:03,  1.54it/s][A
Detecting bboxes:  69%|██████▉   | 9/13 [00:05<00:02,  1.53it/s][A
Detecting bboxes:  77%|███████▋  | 10/13 [00:06<00:01,  1.53it/s][A
Detecting bboxes:  85%|████████▍ | 11/13 [00:07<00:01,  1.54it/s][A
Detecting bboxes:  92%|█████████▏| 12/13 [00:07<00:00,  1.53it/s][A
Detecting bboxes: 100%|██████████| 13/13 [00:08<00:00,  1.57it/s][A

Detecting bboxes:   0%|          | 0/9 [00:00<?, ?


 filename: Cambodia.pdf 

../data/Cambodia.pdf



Detecting bboxes:   0%|          | 0/9 [00:00<?, ?it/s][A
Detecting bboxes:  11%|█         | 1/9 [00:00<00:05,  1.54it/s][A
Detecting bboxes:  22%|██▏       | 2/9 [00:01<00:04,  1.56it/s][A
Detecting bboxes:  33%|███▎      | 3/9 [00:01<00:03,  1.55it/s][A
Detecting bboxes:  44%|████▍     | 4/9 [00:02<00:03,  1.55it/s][A
Detecting bboxes:  56%|█████▌    | 5/9 [00:03<00:02,  1.55it/s][A
Detecting bboxes:  67%|██████▋   | 6/9 [00:03<00:01,  1.55it/s][A
Detecting bboxes:  78%|███████▊  | 7/9 [00:04<00:01,  1.56it/s][A
Detecting bboxes:  89%|████████▉ | 8/9 [00:05<00:00,  1.55it/s][A
Detecting bboxes: 100%|██████████| 9/9 [00:05<00:00,  1.65it/s][A

Detecting bboxes:   0%|          | 0/6 [00:00<?, ?it/s][A
Detecting bboxes:  17%|█▋        | 1/6 [00:01<00:06,  1.28s/it][A
Detecting bboxes:  33%|███▎      | 2/6 [00:02<00:04,  1.24s/it][A
Detecting bboxes:  50%|█████     | 3/6 [00:03<00:03,  1.23s/it][A
Detecting bboxes:  67%|██████▋   | 4/6 [00:04<00:02,  1.23s/it][A
Detecting


 filename: Philippines.pdf 

../data/Philippines.pdf



Detecting bboxes:   0%|          | 0/19 [00:00<?, ?it/s][A
Detecting bboxes:   5%|▌         | 1/19 [00:00<00:12,  1.44it/s][A
Detecting bboxes:  11%|█         | 2/19 [00:01<00:11,  1.52it/s][A
Detecting bboxes:  16%|█▌        | 3/19 [00:01<00:10,  1.53it/s][A
Detecting bboxes:  21%|██        | 4/19 [00:02<00:09,  1.53it/s][A
Detecting bboxes:  26%|██▋       | 5/19 [00:03<00:09,  1.54it/s][A
Detecting bboxes:  32%|███▏      | 6/19 [00:03<00:08,  1.54it/s][A
Detecting bboxes:  37%|███▋      | 7/19 [00:04<00:07,  1.54it/s][A
Detecting bboxes:  42%|████▏     | 8/19 [00:05<00:07,  1.54it/s][A
Detecting bboxes:  47%|████▋     | 9/19 [00:05<00:06,  1.54it/s][A
Detecting bboxes:  53%|█████▎    | 10/19 [00:06<00:05,  1.55it/s][A
Detecting bboxes:  58%|█████▊    | 11/19 [00:07<00:05,  1.55it/s][A
Detecting bboxes:  63%|██████▎   | 12/19 [00:07<00:04,  1.54it/s][A
Detecting bboxes:  68%|██████▊   | 13/19 [00:08<00:03,  1.55it/s][A
Detecting bboxes:  74%|███████▎  | 14/19 [00:09<00:


 filename: Sustainable_travel.pdf 

../data/Sustainable_travel.pdf



Detecting bboxes:   0%|          | 0/3 [00:00<?, ?it/s][A
Detecting bboxes:  33%|███▎      | 1/3 [00:00<00:01,  1.65it/s][A
Detecting bboxes:  67%|██████▋   | 2/3 [00:01<00:00,  1.66it/s][A
Detecting bboxes: 100%|██████████| 3/3 [00:01<00:00,  1.99it/s][A

Detecting bboxes:   0%|          | 0/2 [00:00<?, ?it/s][A
Detecting bboxes:  50%|█████     | 1/2 [00:01<00:01,  1.22s/it][A
Detecting bboxes: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it][A

Finding reading order:   0%|          | 0/2 [00:00<?, ?it/s][A
Finding reading order:  50%|█████     | 1/2 [00:00<00:00,  1.06it/s][A
Finding reading order: 100%|██████████| 2/2 [00:01<00:00,  1.28it/s][A
 75%|███████▌  | 9/12 [05:48<01:57, 39.19s/it]


 filename: Myanmar.pdf 

../data/Myanmar.pdf



Detecting bboxes:   0%|          | 0/9 [00:00<?, ?it/s][A
Detecting bboxes:  11%|█         | 1/9 [00:00<00:05,  1.46it/s][A
Detecting bboxes:  22%|██▏       | 2/9 [00:01<00:04,  1.53it/s][A
Detecting bboxes:  33%|███▎      | 3/9 [00:01<00:03,  1.54it/s][A
Detecting bboxes:  44%|████▍     | 4/9 [00:02<00:03,  1.54it/s][A
Detecting bboxes:  56%|█████▌    | 5/9 [00:03<00:02,  1.54it/s][A
Detecting bboxes:  67%|██████▋   | 6/9 [00:03<00:01,  1.54it/s][A
Detecting bboxes:  78%|███████▊  | 7/9 [00:04<00:01,  1.54it/s][A
Detecting bboxes:  89%|████████▉ | 8/9 [00:05<00:00,  1.54it/s][A
Detecting bboxes: 100%|██████████| 9/9 [00:05<00:00,  1.63it/s][A

Detecting bboxes:   0%|          | 0/6 [00:00<?, ?it/s][A
Detecting bboxes:  17%|█▋        | 1/6 [00:01<00:06,  1.26s/it][A
Detecting bboxes:  33%|███▎      | 2/6 [00:02<00:05,  1.25s/it][A
Detecting bboxes:  50%|█████     | 3/6 [00:03<00:03,  1.25s/it][A
Detecting bboxes:  67%|██████▋   | 4/6 [00:04<00:02,  1.24s/it][A
Detecting


 filename: Singapore.pdf 

../data/Singapore.pdf



Detecting bboxes:   0%|          | 0/20 [00:00<?, ?it/s][A
Detecting bboxes:   5%|▌         | 1/20 [00:00<00:13,  1.43it/s][A
Detecting bboxes:  10%|█         | 2/20 [00:01<00:11,  1.51it/s][A
Detecting bboxes:  15%|█▌        | 3/20 [00:01<00:11,  1.52it/s][A
Detecting bboxes:  20%|██        | 4/20 [00:02<00:10,  1.52it/s][A
Detecting bboxes:  25%|██▌       | 5/20 [00:03<00:09,  1.54it/s][A
Detecting bboxes:  30%|███       | 6/20 [00:03<00:09,  1.55it/s][A
Detecting bboxes:  35%|███▌      | 7/20 [00:04<00:08,  1.55it/s][A
Detecting bboxes:  40%|████      | 8/20 [00:05<00:07,  1.54it/s][A
Detecting bboxes:  45%|████▌     | 9/20 [00:05<00:07,  1.55it/s][A
Detecting bboxes:  50%|█████     | 10/20 [00:06<00:06,  1.55it/s][A
Detecting bboxes:  55%|█████▌    | 11/20 [00:07<00:05,  1.55it/s][A
Detecting bboxes:  60%|██████    | 12/20 [00:07<00:05,  1.55it/s][A
Detecting bboxes:  65%|██████▌   | 13/20 [00:08<00:04,  1.55it/s][A
Detecting bboxes:  70%|███████   | 14/20 [00:09<00:


 filename: Indonesia.pdf 

../data/Indonesia.pdf



Detecting bboxes:   0%|          | 0/17 [00:00<?, ?it/s][A
Detecting bboxes:   6%|▌         | 1/17 [00:00<00:10,  1.52it/s][A
Detecting bboxes:  12%|█▏        | 2/17 [00:01<00:09,  1.57it/s][A
Detecting bboxes:  18%|█▊        | 3/17 [00:01<00:08,  1.58it/s][A
Detecting bboxes:  24%|██▎       | 4/17 [00:02<00:08,  1.57it/s][A
Detecting bboxes:  29%|██▉       | 5/17 [00:03<00:07,  1.56it/s][A
Detecting bboxes:  35%|███▌      | 6/17 [00:03<00:07,  1.54it/s][A
Detecting bboxes:  41%|████      | 7/17 [00:04<00:06,  1.53it/s][A
Detecting bboxes:  47%|████▋     | 8/17 [00:05<00:05,  1.55it/s][A
Detecting bboxes:  53%|█████▎    | 9/17 [00:05<00:05,  1.53it/s][A
Detecting bboxes:  59%|█████▉    | 10/17 [00:06<00:04,  1.52it/s][A
Detecting bboxes:  65%|██████▍   | 11/17 [00:07<00:03,  1.52it/s][A
Detecting bboxes:  71%|███████   | 12/17 [00:07<00:03,  1.52it/s][A
Detecting bboxes:  76%|███████▋  | 13/17 [00:08<00:02,  1.52it/s][A
Detecting bboxes:  82%|████████▏ | 14/17 [00:09<00:

In [9]:
# save the documents object into a pickle file to avoid computing it again
with open("../data/docs_processed.pickle", "wb") as f:
    pickle.dump(documents, f)

In [10]:
len(documents)

12

In [12]:
flattened_list = [item for sublist in documents for item in sublist]

In [13]:
flattened_list[0]

{'metadata': {'document_id': 'f392923b41',
  'pdf_name': 'Responsible_travel.pdf',
  'pdf_part': 0,
  'Header 2': 'Responsible Travel'},
 'content': 'See Sustainable travel for the ecological and appropriate technology dimension of travel sustainability.'}

In [14]:
# save the documents object into a pickle file to avoid computing it again
with open("../data/docs_processed.pickle", "wb") as f:
    pickle.dump(flattened_list, f)