## PDFから指定されたtextを抽出して保存する

In [369]:
import fitz  # PyMuPDF
import re
from pathlib import Path
import pandas as pd
import requests
from io import BytesIO



In [370]:
def load_pdf_text(source: str) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        source: If it starts with ``http://`` or ``https://`` the PDF is
            downloaded from that URL (30-second timeout); otherwise it is
            treated as a local file path.

    Returns:
        The plain-text content of all pages, in page order, separated by a
        newline character. A document with no pages yields an empty string.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    if source.startswith(("http://", "https://")):
        # Remote PDF: download it fully, then open it from memory.
        resp = requests.get(source, timeout=30)
        resp.raise_for_status()
        doc = fitz.open(stream=BytesIO(resp.content), filetype="pdf")
    else:
        # Anything else is opened as a local file.
        doc = fitz.open(source)

    # Close the document even if text extraction fails part-way through,
    # so a single bad page does not leak the underlying handle.
    try:
        return "\n".join(page.get_text("text") for page in doc)
    finally:
        doc.close()

In [371]:
# URLs excluded from processing because the PDF uses a non-standard format.
except_file_list = [
    "https://www2.jica.go.jp/ja/evaluation/pdf/2010_VNXI-3_4_f.pdf",
]

In [373]:
# Collect the distinct, non-null PDF locations from the "file" column.
df = pd.read_csv("../df_check_99.csv")
target_urls = df["file"].dropna().unique().tolist()
# Debug toggles: uncomment one to restrict the run.
# target_urls = target_urls[0:200]  # for testing
# target_urls = ["https://www2.jica.go.jp/ja/evaluation/pdf/2010_C01-P160_4_f.pdf"]

# Create the output directory up front; otherwise the first write fails
# with FileNotFoundError after a PDF has already been downloaded.
out_dir = Path("../pdf_text")
out_dir.mkdir(parents=True, exist_ok=True)

for i, url in enumerate(target_urls):
    if url in except_file_list:
        print(f"=== {i} {url} SKIPPED ===")
        continue
    text = load_pdf_text(url)
    # Save the extracted text under the last URL path segment with
    # ".pdf" removed, e.g. ".../2010_0200600_4_f.pdf" -> "2010_0200600_4_f.txt".
    stem = url.split("/")[-1].replace(".pdf", "")
    fname = out_dir / f"{stem}.txt"
    fname.write_text(text, encoding="utf-8")
    print(f"=== {i} {url} ===")


=== 0 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0200600_4_f.pdf ===
=== 1 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0202100_4_f.pdf ===
=== 2 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0202700_4_f.pdf ===
=== 3 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0211100_4_f.pdf ===
=== 4 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0300500_4_f.pdf ===
=== 5 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0300700_4_f.pdf ===
=== 6 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0304000_4_f.pdf ===
=== 7 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0306300_4_f.pdf ===
=== 8 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0307400_4_f.pdf ===
=== 9 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0308100_4_f.pdf ===
=== 10 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0309700_4_f.pdf ===
=== 11 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0400200_4_f.pdf ===
=== 12 https://www2.jica.go.jp/ja/evaluation/pdf/2010_0400300_4_f.pdf ===
=== 13 https://www2.jica.go.jp/ja/evaluation/pdf