판교어 사전 PDF의 표를 질문·답변 형식의 JSONL로 변환하는 코드

In [13]:
# !pip install pdfplumber

In [None]:
# !pip install kiwipiepy

Collecting kiwipiepy
  Downloading kiwipiepy-0.22.0-cp312-cp312-win_amd64.whl.metadata (1.3 kB)
Collecting kiwipiepy_model<0.23,>=0.22 (from kiwipiepy)
  Downloading kiwipiepy_model-0.22.0.tar.gz (79.5 MB)
     ---------------------------------------- 0.0/79.5 MB ? eta -:--:--
     --- ------------------------------------ 6.0/79.5 MB 30.7 MB/s eta 0:00:03
     ------ -------------------------------- 14.2/79.5 MB 35.5 MB/s eta 0:00:02
     ---------- ---------------------------- 22.0/79.5 MB 34.8 MB/s eta 0:00:02
     -------------- ------------------------ 29.1/79.5 MB 35.5 MB/s eta 0:00:02
     ------------------ -------------------- 38.0/79.5 MB 36.6 MB/s eta 0:00:02
     ---------------------- ---------------- 46.4/79.5 MB 37.4 MB/s eta 0:00:01
     -------------------------- ------------ 54.3/79.5 MB 37.6 MB/s eta 0:00:01
     ------------------------------ -------- 62.4/79.5 MB 37.5 MB/s eta 0:00:01
     --------------------------------- ----- 68.7/79.5 MB 36.8 MB/s eta 0:00:01
  

  DEPRECATION: Building 'kiwipiepy_model' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'kiwipiepy_model'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [19]:
import pdfplumber
import json
from kiwipiepy import Kiwi

In [20]:
# Input PDF passed to extract_entries_from_tables().
PDF_PATH = "판교어사전.pdf-20241003004619.pdf"
# Zero-based index of the first page scanned for tables: it is used as the
# start of the range() over pdf.pages, so 6 means the seventh page.
START_PAGE = 6
# Destination JSONL file written by entries_to_jsonl().
OUTPUT_PATH = "pangyo_dict.jsonl"

In [None]:
# Module-level Kiwi instance, created once; fix_spacing() calls its space() method.
kiwi = Kiwi()


def clean_cell(cell: str) -> str:
    """Flatten a raw table cell into one single-spaced line.

    Falsy cells (``None``, ``""``, ...) yield ``""``. Any other value is
    converted with ``str()``; every run of whitespace, line breaks included,
    is collapsed to one space and leading/trailing whitespace is removed.
    """
    if cell:
        # split() with no separator breaks on any whitespace run (newlines
        # too) and never yields empty pieces, so no separate strip is needed.
        return " ".join(str(cell).split())
    return ""


def fix_spacing(text: str) -> str:
    """Return ``text`` after running it through ``kiwi.space``.

    Falsy input (``None`` or ``""``) short-circuits to ``""`` without
    calling kiwi at all.
    """
    return kiwi.space(text) if text else ""


def extract_entries_from_tables(pdf_path, start_page=6):
    """Collect dictionary entries from the tables of a PDF.

    Each table row is read as ``(num, term, meaning, example)``. Rows with
    fewer than four cells are padded with empty strings; cells beyond the
    fourth are ignored. Header rows (number cell containing both "번" and
    "호") and rows whose number or term is blank are skipped.

    Args:
        pdf_path: Path of the PDF, handed to ``pdfplumber.open``.
        start_page: Zero-based index of the first page to scan, so the
            default of 6 starts at the seventh page.

    Returns:
        A list of dicts with the keys ``"num"``, ``"term"``, ``"meaning"``
        and ``"example"``; every value has been normalised by ``clean_cell``.
    """
    entries = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[start_page:]:
            tables = page.extract_tables()
            if not tables:
                continue

            for table in tables:
                for row in table:
                    if not row:
                        continue

                    # Pad short rows so the four-way unpack below always works
                    # (a negative repeat count simply adds nothing).
                    cells = list(row) + [""] * (4 - len(row))

                    # Normalise BEFORE filtering: a cell holding only
                    # whitespace or a line break is truthy when raw, and would
                    # otherwise slip through as an entry with an empty term.
                    num, term, meaning, example = (
                        clean_cell(cell) for cell in cells[:4]
                    )

                    # Header row of the table (number column label).
                    if "번" in num and "호" in num:
                        continue
                    # Without both a number and a term this is not an entry.
                    if not num or not term:
                        continue

                    entries.append({
                        "num": num,
                        "term": term,
                        "meaning": meaning,
                        "example": example,
                    })

    return entries


def entries_to_jsonl(entries, output_path):
    """Write entries to ``output_path`` as question/answer JSON Lines.

    Every entry produces one line of the form
    ``{"question": ..., "answer": ...}``, UTF-8 encoded with non-ASCII
    characters kept as-is. The question is built from the entry's term; the
    answer is the spacing-corrected meaning followed by "예: " and the
    spacing-corrected example. A part that is empty is left out, so an entry
    without an example does not end in a dangling "예:" label.

    Args:
        entries: Iterable of dicts with the keys ``"term"``, ``"meaning"``
            and ``"example"``.
        output_path: File to create or overwrite.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        for e in entries:
            term = e["term"]

            meaning = fix_spacing(e["meaning"])
            example = fix_spacing(e["example"])

            question = f"{term}(이)란 무엇인가?"

            # Join only the parts that exist; with both present this is
            # exactly "<meaning> 예: <example>".
            answer_parts = []
            if meaning:
                answer_parts.append(meaning)
            if example:
                answer_parts.append(f"예: {example}")
            answer = " ".join(answer_parts)

            obj = {
                "question": question,
                "answer": answer,
            }
            # ensure_ascii=False keeps the Korean text readable in the file.
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")


# Driver: run the whole PDF -> JSONL pipeline using the configured constants.
if __name__ == "__main__":
    # Extract the table rows from the PDF, starting at page index START_PAGE.
    entries = extract_entries_from_tables(PDF_PATH, START_PAGE)
    # Write them out as question/answer lines to OUTPUT_PATH.
    entries_to_jsonl(entries, OUTPUT_PATH)
    # Report the output file and how many entries were extracted.
    print(f"JSONL 파일 생성 완료: {OUTPUT_PATH}, 총 {len(entries)}개 항목")

JSONL 파일 생성 완료: pangyo_dict.jsonl, 총 300개 항목
