In [None]:
# Install the third-party packages imported in the next cell (pymupdf, langdetect).
%pip install pymupdf
%pip install langdetect

In [None]:
from pathlib import Path
import pymupdf
import json
from langdetect import detect
from tqdm.auto import tqdm
from typing import List

In [47]:
# TODO:
# - count the number of embedded images
# - docling
# - pdfplumber
# - spaCy
# - NLTK

class Metadata:
    """Collect per-document metadata from PDFs and their OCR text files.

    For every document stem three groups of values are gathered: text
    statistics of the OCR output, the detected language of the OCR output,
    and information read from the PDF itself. ``process_metadata`` writes
    one JSON file per document.
    """

    def __init__(self):
        """Create a collector; it keeps no state."""
        pass

    @staticmethod
    def _read_ocr_text(file_name: str, ocr_folder: Path) -> str:
        """Return the content of ``<ocr_folder>/<file_name>.txt``."""
        # Decode explicitly instead of relying on the platform's locale
        # encoding. NOTE(review): assumes the OCR files are UTF-8 (the
        # encoding Tesseract writes) - confirm for other OCR sources.
        return (ocr_folder / f"{file_name}.txt").read_text(encoding="utf-8")

    @staticmethod
    def _load_existing(result_path: Path) -> dict:
        """Return previously stored metadata, or ``{}`` if none is usable."""
        if not result_path.exists():
            return {}

        try:
            with result_path.open(encoding="utf-8") as handle:
                stored = json.load(handle)
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError: a truncated
            # or otherwise unreadable result is recomputed from scratch
            # instead of aborting the whole batch.
            return {}

        return stored if isinstance(stored, dict) else {}

    def ocr_text_statistics(self, file_name: str, ocr_folder: Path) -> dict:
        """Count words, lines and characters of a document's OCR text.

        Args:
            file_name: Document stem (file name without extension).
            ocr_folder: Folder containing ``<file_name>.txt``.

        Returns:
            Dict with the keys ``num_words`` (whitespace-separated tokens),
            ``num_lines`` and ``num_chars``.
        """
        ocr_text = self._read_ocr_text(file_name, ocr_folder)

        return {
            "num_words": len(ocr_text.split()),
            "num_lines": len(ocr_text.splitlines()),
            "num_chars": len(ocr_text),
        }

    def language_metadata(self, file_name: str, ocr_folder: Path) -> dict:
        """Detect the language of a document's OCR text.

        Args:
            file_name: Document stem (file name without extension).
            ocr_folder: Folder containing ``<file_name>.txt``.

        Returns:
            Dict with the single key ``language``; its value is the result
            of ``langdetect.detect`` or ``"unknown"`` if detection fails.
        """
        ocr_text = self._read_ocr_text(file_name, ocr_folder)

        try:
            language = detect(ocr_text)
        except Exception:
            # Detection is best effort. Unlike a bare ``except`` this does
            # not swallow KeyboardInterrupt / SystemExit.
            language = "unknown"

        return {"language": language}

    def pdf_statistics(self, file_name: str, pdf_folder: Path) -> dict:
        """Read page count, form-field flag, outline and metadata of a PDF.

        Args:
            file_name: Document stem (file name without extension).
            pdf_folder: Folder containing ``<file_name>.pdf``.

        Returns:
            Dict with ``num_pages``, ``form_fields``, ``table_of_contents``
            and every entry of the document's own metadata dictionary.
        """
        # The context manager closes the document once the values are read.
        with pymupdf.open(pdf_folder / f"{file_name}.pdf") as doc:
            return {
                "num_pages": doc.page_count,
                "form_fields": doc.is_form_pdf,
                "table_of_contents": doc.get_toc(simple=True),
                # ``doc.metadata`` may be None; unpacking None would raise.
                **(doc.metadata or {}),
            }

    def process_file(self, file_name: str, ocr_folder: Path, pdf_folder: Path, result_path: Path) -> dict:
        """Build the metadata of one document, reusing stored results.

        Args:
            file_name: Document stem (file name without extension).
            ocr_folder: Folder containing the OCR ``.txt`` file.
            pdf_folder: Folder containing the ``.pdf`` file.
            result_path: JSON file holding earlier results, if it exists.

        Returns:
            Dict with the keys ``ocr_text_statistics``, ``pdf_statistics``
            and ``language_metadata`` plus anything else already stored.
        """
        metadata = self._load_existing(result_path)

        # Only add metadata that is not already present => new kinds of
        # metadata can be added later without recomputing the existing ones.
        if "ocr_text_statistics" not in metadata:
            metadata["ocr_text_statistics"] = self.ocr_text_statistics(file_name, ocr_folder)
        if "pdf_statistics" not in metadata:
            metadata["pdf_statistics"] = self.pdf_statistics(file_name, pdf_folder)
        if "language_metadata" not in metadata:
            metadata["language_metadata"] = self.language_metadata(file_name, ocr_folder)

        return metadata

    def process_metadata(self, pdf_folder: Path, ocr_folder: Path, results_folder: Path = Path("metadata")) -> None:
        """Compute and store metadata for every document with an OCR file.

        Args:
            pdf_folder: Folder containing the ``.pdf`` files.
            ocr_folder: Folder containing the OCR ``.txt`` files.
            results_folder: Output folder for the ``<stem>.json`` files;
                created if it does not exist.
        """
        # Use the OCR files as baseline as some PDFs were not able to be processed.
        file_stems = [ocr_file.stem for ocr_file in ocr_folder.glob("*.txt")]

        results_folder.mkdir(parents=True, exist_ok=True)

        for stem in tqdm(file_stems, desc="Processing metadata"):
            result_path = results_folder / f"{stem}.json"
            metadata = self.process_file(stem, ocr_folder, pdf_folder, result_path)
            # Serialise before touching the file so a serialisation error
            # cannot truncate an existing result. ``ensure_ascii=False``
            # emits non-ASCII characters, hence the explicit UTF-8 encoding.
            serialized = json.dumps(metadata, ensure_ascii=False, indent=4)
            result_path.write_text(serialized, encoding="utf-8")


In [48]:
# Create the metadata collector used by the following cell.
metadata = Metadata()

In [None]:
# Process every OCR text file in "tesseract" together with its PDF in "pdfs";
# results_folder is left at its default.
metadata.process_metadata(pdf_folder=Path("pdfs"), ocr_folder=Path("tesseract"))