＜共テ現文変換＞

In [5]:
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTTextBox, LTTextLine, LTChar
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

def pdfminer_config(line_overlap, word_margin, char_margin, line_margin, detect_vertical):
    """Build the pdfminer objects needed to lay out PDF pages.

    All five arguments are forwarded unchanged to ``LAParams`` under the
    keyword of the same name.

    Returns:
        An ``(interpreter, device)`` tuple: a ``PDFPageInterpreter`` and the
        ``PDFPageAggregator`` it renders into. Both share a single
        ``PDFResourceManager``.
    """
    layout_settings = {
        "line_overlap": line_overlap,
        "word_margin": word_margin,
        "char_margin": char_margin,
        "line_margin": line_margin,
        "detect_vertical": detect_vertical,
    }
    layout_params = LAParams(**layout_settings)

    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=layout_params)
    return PDFPageInterpreter(manager, aggregator), aggregator

def find_textboxes(layout_obj):
    """Return every ``LTTextBox`` found at or below ``layout_obj``.

    A text box is returned as-is (it is not searched further). Any other
    ``LTContainer`` is searched recursively, child by child, in iteration
    order. Anything else yields an empty list.
    """
    if isinstance(layout_obj, LTTextBox):
        return [layout_obj]
    if not isinstance(layout_obj, LTContainer):
        return []
    return [box for child in layout_obj for box in find_textboxes(child)]

def find_textlines(layout_obj):
    """Return every ``LTTextLine`` found at or below ``layout_obj``.

    A text line is returned as-is. Only ``LTTextBox`` objects are searched
    recursively; any other object yields an empty list.
    """
    if isinstance(layout_obj, LTTextLine):
        return [layout_obj]
    if not isinstance(layout_obj, LTTextBox):
        return []
    return [line for child in layout_obj for line in find_textlines(child)]

def find_characters(layout_obj):
    """Return every ``LTChar`` found at or below ``layout_obj``.

    A character is returned as-is. Only ``LTTextLine`` objects are searched
    recursively; any other object yields an empty list.
    """
    if isinstance(layout_obj, LTChar):
        return [layout_obj]
    if not isinstance(layout_obj, LTTextLine):
        return []
    return [char for child in layout_obj for char in find_characters(child)]

def write_text(text_file, text):
    """Append ``text`` to ``text_file`` through its ``write`` method.

    The value returned by ``write`` is discarded; this function returns None.
    """
    emit = text_file.write
    emit(text)

def pdf2txt(filename):
    """Extract the text of ``data/<filename>.pdf`` into ``data/<filename>.txt``.

    Every page is laid out with pdfminer (vertical text detection enabled),
    and the stripped text of each text box is written to the output file.

    Args:
        filename: Base name (no directory, no extension) shared by the input
            PDF and the output text file under ``data/``.

    Raises:
        FileNotFoundError: If the PDF (or the ``data`` directory) is missing.
            The PDF is opened before the output file, so a missing input no
            longer creates or truncates ``data/<filename>.txt``.
    """
    pdf_path = f"data/{filename}.pdf"
    txt_path = f"data/{filename}.txt"

    # Open the input first: failing here must not touch the output file.
    with open(pdf_path, 'rb') as f:
        with open(txt_path, 'w', encoding='utf-8') as text_file:
            interpreter, device = pdfminer_config(line_overlap=0.5, word_margin=0.1, char_margin=2, line_margin=0.5, detect_vertical=True)
            for page in PDFPage.get_pages(f):
                interpreter.process_page(page)  # Process the page.
                layout = device.get_result()  # Fetch the resulting LTPage object.
                # NOTE(review): boxes are written back to back with no
                # separator between them — confirm this is intended.
                for box in find_textboxes(layout):
                    write_text(text_file, box.get_text().strip())

In [6]:
# Run pdf2txt for each base name (data/<name>.pdf -> data/<name>.txt).
files = ["2022gendai", "2023gendai", "2024gendai"]
for file in files:
    pdf2txt(file)

In [7]:
import re
# Replacement table: each key found in the text is replaced by its value.
# Entries are applied in insertion order.
conversion_dict = {
    # Vertical presentation forms of brackets -> ordinary brackets.
    # NOTE(review): the first two entries appear to map the closing vertical
    # lenticular bracket to "【" and the opening one to "】", unlike the other
    # pairs below — confirm against the extracted text.
    "︼": "【",
    "︻": "】",
    "︵": "（",
    "︶": "）",
    "﹁": "「",
    "﹂": "」",
    "﹃": "『",
    "﹄": "』",
    # Spaces are removed.
    " ": "",
    "(cid:7923)": "っ",
    # "(cid:2)" through "(cid:9)" are removed.
    **{f"(cid:{cid})": "" for cid in range(2, 10)},
    # Remaining "(cid:N)" placeholders -> the character they stand for.
    "(cid:7664)": "喰",
    "(cid:7766)": "謎",
    "(cid:7775)": "箸",
    "(cid:7891)": "ー",
    "(cid:7894)": "~",
    "(cid:7926)": "ょ",
    "(cid:7928)": "ァ",
    "(cid:7929)": "ィ",
    "(cid:7930)": "ゥ",
    "(cid:7931)": "ェ",
    "(cid:7932)": "ォ",
    "(cid:7933)": "ッ",
    "(cid:7934)": "ャ",
    "(cid:7935)": "ュ",
    "(cid:7936)": "ョ",
    # Markers "（２６０１―N）" and "（２１０１―N）" for N = １..９ are removed.
    **{
        f"（{prefix}―{digit}）": ""
        for prefix in ("２６０１", "２１０１")
        for digit in "１２３４５６７８９"
    },
    # Vertical presentation forms of punctuation -> ordinary punctuation.
    "︒": "。",
    "︑": "、",
}

# Read a text file, convert its contents, and write the result back in place.
def convert_text_file(filename, conversion_dict):
    """Clean up ``data/<filename>.txt`` in place.

    Steps, in order:
      1. Replace every key of ``conversion_dict`` with its value, in the
         dictionary's iteration order.
      2. Drop everything up to and including the first "（配点" marker,
         together with the "NN）" that completes that marker.
      3. Remove every remaining "（配点NN）" marker.
      4. Drop every line that starts with "（注）".

    Args:
        filename: Base name (no directory, no extension) of the text file
            under ``data/``.
        conversion_dict: Mapping of substring -> replacement.

    Raises:
        FileNotFoundError: If ``data/<filename>.txt`` does not exist.
    """
    path = f"data/{filename}.txt"

    # Load the whole file.
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Apply the replacements.
    for old, new in conversion_dict.items():
        text = text.replace(old, new)

    # (Quick hack) cut everything before the first "（配点" marker.
    _, marker, rest = text.partition("（配点")
    if marker:
        # The cut lands just after the marker's prefix, so the marker's own
        # tail ("NN）") would otherwise survive at the start of the text.
        text = re.sub(r'\A\d+）', '', rest)

    # Remove every remaining "（配点NN）" marker.
    text = re.sub(r'（配点\d+）', '', text)

    # Drop the lines that start with "（注）".
    text = "\n".join(
        line for line in text.splitlines() if not line.startswith("（注）")
    )

    # Write the converted text back to the same file.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(text)

# Run the in-place cleanup on each text file (data/<name>.txt).
files = ["2022gendai", "2023gendai", "2024gendai"]
for file in files:
    convert_text_file(file, conversion_dict)