In [None]:
import yaml
from pathlib import Path

# All paths are anchored at the parent of the current working directory.
project_root = Path.cwd().parent

# Decode the config explicitly as UTF-8 instead of relying on the platform's
# default text encoding, so non-ASCII values load the same on every machine.
with open(project_root / "config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Directory settings in the config are joined onto the project root.
PDFS_DIR = project_root / config["PDFS_DIR"]
EXTRACTED_DIR = project_root / config["EXTRACTED_DIR"]

In [None]:
import pymupdf
import pymupdf4llm
from tqdm import tqdm
from lingua import LanguageDetectorBuilder

In [None]:
def detect_language(text: str, detector):
    """Return the lowercase ISO 639-1 code of the language detected in ``text``.

    Args:
        text: The text to classify.
        detector: Object exposing ``detect_language_of(text)``; its result must
            provide ``iso_code_639_1.name``.

    Returns:
        The lowercased ``iso_code_639_1.name`` of the detection result, or
        ``None`` when the detector returns a falsy result.
    """
    detected = detector.detect_language_of(text)
    if not detected:
        return None
    iso_name = detected.iso_code_639_1.name
    return iso_name.lower()

In [None]:
# Convert every PDF of each configured language to markdown, keep only the
# documents whose detected language matches the folder's language code, and
# collect per-language statistics in `processing_stats`.
#
# NOTE: this cell is destructive. PDFs that are in the wrong language or that
# raise during processing are deleted from disk.
processing_stats = []
detector = LanguageDetectorBuilder.from_all_languages().build()

for lang_code, lang_config in config["LANGUAGES"].items():
    lang_pdf_dir = PDFS_DIR / lang_code
    lang_extracted_dir = EXTRACTED_DIR / lang_code
    lang_extracted_dir.mkdir(parents=True, exist_ok=True)

    # Languages without a PDF directory are skipped and get no stats entry.
    if not lang_pdf_dir.exists():
        continue

    # Sorted so the sequential numbering is reproducible between runs.
    pdf_files = sorted(lang_pdf_dir.glob("*.pdf"))
    kept_count = 0
    wrong_lang_count = 0
    error_count = 0

    for pdf_path in tqdm(pdf_files, desc=f"Processing {lang_config['name']}"):
        try:
            with pymupdf.open(str(pdf_path)) as doc:
                md_text = pymupdf4llm.to_markdown(doc)

            detected_code = detect_language(md_text, detector)
            if detected_code == lang_code:
                # Write the markdown first: if anything below fails, the PDF is
                # still at `pdf_path` and the error handler removes it.
                md_path = lang_extracted_dir / f"{kept_count}.md"
                md_path.write_text(md_text, encoding="utf-8")

                # Renaming straight to "<n>.pdf" could overwrite an input that
                # already has that name and has not been processed yet (e.g.
                # when this cell is re-run). Park the file under a name the
                # "*.pdf" glob cannot match; final names are assigned below.
                staged_path = lang_pdf_dir / f"{kept_count}.pdf.staged"
                pdf_path.replace(staged_path)
                kept_count += 1
            else:
                pdf_path.unlink()
                wrong_lang_count += 1

        except Exception as e:
            print(f"Error converting PDF to markdown: {str(e)}")
            pdf_path.unlink(missing_ok=True)
            error_count += 1

    # Every globbed input has now been deleted or staged, so the sequential
    # names cannot collide with a pending input. If the run is interrupted
    # before this point, kept PDFs remain on disk as "<n>.pdf.staged".
    for index in range(kept_count):
        staged_path = lang_pdf_dir / f"{index}.pdf.staged"
        staged_path.rename(lang_pdf_dir / f"{index}.pdf")

    processing_stats.append(
        {
            "Language": lang_config["name"],
            "Total PDFs": len(pdf_files),
            "Kept": kept_count,
            "Wrong Language": wrong_lang_count,
            "Errors": error_count,
            "Success Rate": (
                f"{kept_count / len(pdf_files) * 100:.1f}%" if pdf_files else "0%"
            ),
        }
    )
