In [62]:
from pdfminer.layout import LAParams, LTTextBoxHorizontal, LTChar
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

def process_chars(char_list, line_threshold=25, sentence_threshold=25):
    """Merge positioned characters into a list of text segments.

    Characters are consumed in the order given. The running segment is
    closed in two situations:

    * the second bbox value of the current character differs from that of
      the previous character by more than ``line_threshold`` (treated as a
      jump to a new line); the current character then starts the next
      segment;
    * the current character is the full stop '。' and the running segment,
      including that full stop, is longer than ``sentence_threshold``.

    Args:
        char_list: iterable of ``(text, bbox)`` pairs, where ``bbox`` is a
            4-item sequence; only its second item is used.
        line_threshold: largest difference between consecutive second bbox
            values that is still considered the same line.
        sentence_threshold: a segment ending in '。' is only closed when its
            length exceeds this value.

    Returns:
        list[str]: the non-empty segments, in input order.
    """
    segments = []
    current = ''
    last_y = None

    for glyph, bbox in char_list:
        _, y, _, _ = bbox
        line_jump = last_y is not None and abs(y - last_y) > line_threshold
        last_y = y

        if line_jump:
            # New line: flush what was collected and restart with this glyph.
            if current:
                segments.append(current)
            current = glyph
            continue

        current += glyph
        # A full stop only ends the segment once the segment is long enough.
        if glyph == '。' and len(current) > sentence_threshold:
            segments.append(current)
            current = ''

    # Whatever is left after the last character still belongs to the result.
    if current:
        segments.append(current)
    return segments

def read_pdf(file_path):
    """Print the text segments extracted from every page of a PDF.

    Each page is laid out with pdfminer; every ``LTChar`` found inside an
    ``LTTextBoxHorizontal`` is collected together with its bounding box. The
    characters of all pages are then passed to ``process_chars`` in one go
    and each resulting segment is printed on its own line.

    Args:
        file_path: path of the PDF file to read.

    Returns:
        None. The segments are written to standard output.
    """
    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    interp = PDFPageInterpreter(manager, aggregator)

    glyphs = []

    with open(file_path, 'rb') as handle:
        for page in PDFPage.get_pages(handle):
            interp.process_page(page)
            layout = aggregator.get_result()

            # Text box -> text line -> character, keeping layout order.
            glyphs.extend(
                (item.get_text(), item.bbox)
                for box in layout if isinstance(box, LTTextBoxHorizontal)
                for text_line in box
                for item in text_line if isinstance(item, LTChar)
            )

    # Turn the flat character list into segments and print them.
    for segment in process_chars(glyphs):
        print(segment)

# Read the PDF file; read_pdf prints the merged segments to the cell output.
read_pdf('ceshi3.pdf')

DOI:10.13813/j.cn11-5141/u.2008.06.014
■文章编号：1672-5328（2008）06-0057-07城市交通UrbanTransportofChina第6卷第6期2008年11月Vol.6No.6Nov.2008
TOD内涵分析及实施框架IntensionAnalysisandImplementationFrameworkforTOD
陈莎殷广涛叶敏（中国城市规划设计研究院，北京100037）ChenSha,YinGuangtaoandYeMin(ChinaAcademyofUrbanPlanning&Design,Beijing100037,China)
摘要：TOD是美国城市“精明增长”模式的重要内容，契合我国可持续发展的城市要求。
首先总结了国外TOD实践的主要策略和成功经验。通过对比分析国内外TOD实施目的、实施背景两方面的差异性，从规划技术、城市管理和公共政策多个层面揭示了TOD的本质内涵。
基于对我国城市规划、土地利用、建设开发的流程梳理，明确了我国TOD模式推广和应用的原则。
最后从政府职能、规划建设管理机制、实施重点以及配套政策4个方面提出了我国TOD实施框架和建议。
Abstract：TransitOrientedDevelopment(TOD),asamajorcomponentofSmartGrowthpatternforU.S.cities,seemstofitwithurbansustainabledevelopmentinChinaaswell.ThroughasummaryofmajorstrategiesandexperienceofTOD'simplemen-tationinforeigncountries,andbasedonacomparativeanalysisofimplementationpurposesandbackgroundsbothathomeandabroad,thispaperdiscussestheinherentnatureofTODfromvar-iousangles,suchasplanningtechnology,urbanmanagement,andpublicpolicies.Accordingtotherelationshipofurbanplan-nin

In [78]:
import os
import io
from pdfminer.layout import LAParams, LTTextBoxHorizontal, LTChar
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

def process_chars(char_list, line_threshold=25, sentence_threshold=25):
    """Merge positioned characters into a list of text segments.

    Characters are consumed in the order given. The running segment is
    closed in two situations:

    * the second bbox value of the current character differs from that of
      the previous character by more than ``line_threshold`` (treated as a
      jump to a new line); the current character then starts the next
      segment;
    * the current character is the full stop '。' and the running segment,
      including that full stop, is longer than ``sentence_threshold``.

    Args:
        char_list: iterable of ``(text, bbox)`` pairs, where ``bbox`` is a
            4-item sequence; only its second item is used.
        line_threshold: largest difference between consecutive second bbox
            values that is still considered the same line.
        sentence_threshold: a segment ending in '。' is only closed when its
            length exceeds this value.

    Returns:
        list[str]: the non-empty segments, in input order.
    """
    segments = []
    current = ''
    last_y = None

    for glyph, bbox in char_list:
        _, y, _, _ = bbox
        line_jump = last_y is not None and abs(y - last_y) > line_threshold
        last_y = y

        if line_jump:
            # New line: flush what was collected and restart with this glyph.
            if current:
                segments.append(current)
            current = glyph
            continue

        current += glyph
        # A full stop only ends the segment once the segment is long enough.
        if glyph == '。' and len(current) > sentence_threshold:
            segments.append(current)
            current = ''

    # Whatever is left after the last character still belongs to the result.
    if current:
        segments.append(current)
    return segments

def read_dual_columns(file_path):
    """Extract the text of a PDF page by page, split into two halves.

    For every page, each ``LTChar`` inside an ``LTTextBoxHorizontal`` is
    assigned to the left or right half depending on whether the first value
    of its bbox lies below the horizontal midpoint of the page layout. Each
    half is merged into segments with ``process_chars``.

    Args:
        file_path: path of the PDF file to read.

    Returns:
        str: one section per page, made of a "Page: N" line, a
        "Left Column:" line followed by the left segments, a
        "Right Column:" line followed by the right segments, and a line of
        twenty "=" characters.
    """
    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    interp = PDFPageInterpreter(manager, aggregator)

    # One (left_chars, right_chars) pair per page.
    pages = []

    with open(file_path, 'rb') as handle:
        for page in PDFPage.get_pages(handle):
            interp.process_page(page)
            layout = aggregator.get_result()

            # Horizontal midpoint of the page separates the two halves.
            split_x = (layout.x0 + layout.x1) / 2
            left, right = [], []

            glyphs = (
                item
                for box in layout if isinstance(box, LTTextBoxHorizontal)
                for text_line in box
                for item in text_line if isinstance(item, LTChar)
            )
            for item in glyphs:
                side = left if item.bbox[0] < split_x else right
                side.append((item.get_text(), item.bbox))

            pages.append((left, right))

    # Assemble the report text, one chunk per page.
    chunks = []
    for page_number, (left, right) in enumerate(pages, start=1):
        left_text = "\n".join(process_chars(left))
        right_text = "\n".join(process_chars(right))
        chunks.append(f"Page: {page_number}\n")
        chunks.append("Left Column:\n")
        chunks.append(left_text + "\n")
        chunks.append("Right Column:\n")
        chunks.append(right_text + "\n")
        chunks.append("=" * 20 + "\n")

    return "".join(chunks)
        

# 读取PDF文件，分别处理左右两列的内容
# read_dual_columns('ceshi2.pdf')

# Folder that is scanned for PDFs and folder that receives the .txt results.
# Raw string literals keep the Windows backslashes as-is: in a normal literal
# "\j", "\知" and "\p" are invalid escape sequences, which Python reports with a
# warning and plans to reject. The resulting path values are unchanged.
input_folder = r"D:\jupyter notebook\知识抽取\pdf"
output_folder = r"D:\jupyter notebook\知识抽取\pdf"

# Compare the extension case-insensitively so files named "*.PDF" are picked up too.
pdf_files = [file for file in os.listdir(input_folder) if file.lower().endswith(".pdf")]

for pdf_file in pdf_files:
    pdf_path = os.path.join(input_folder, pdf_file)
    text = read_dual_columns(pdf_path)

    # Output file: same base name as the PDF, with a .txt extension.
    output_text_filename = os.path.splitext(pdf_file)[0] + ".txt"
    output_text_path = os.path.join(output_folder, output_text_filename)

    # Write the left/right column report for this PDF into a single text file.
    with open(output_text_path, "w", encoding="utf-8") as text_file:
        text_file.write(text)

In [1]:
import os

import fitz
from paddleocr import PaddleOCR

from sklearn.cluster import KMeans
import numpy as np


# Shared PaddleOCR engine, created once when the cell runs and used by jpg_to_txt.
# NOTE(review): use_tensorrt=True is combined with use_gpu=False here — confirm
# that this combination is intended and supported by the installed PaddleOCR.
ocr = PaddleOCR(det=False, use_gpu=False, enable_mkldnn=True, use_tensorrt=True, use_angle_cls=True, lang='ch')


def calc_column_boundaries(lines, n_clusters=3):
    """Estimate column positions by clustering the left edges of layout items.

    The first bbox value of every item is clustered with k-means; each
    cluster centre is reported as one column position.

    Args:
        lines: sequence of mappings that each have a "bbox" entry whose first
            value is the left x coordinate of the item.
        n_clusters: upper bound on the number of columns to look for. The
            default of 3 is the value that used to be hard-coded. The number
            actually used is capped at the count of distinct left edges,
            because k-means cannot fit more clusters than samples.

    Returns:
        list[list[float]]: one ``[x, x]`` pair per cluster centre, sorted
        from left to right. Both entries of a pair hold the same centre
        value. An empty list is returned when ``lines`` is empty.
    """
    if len(lines) == 0:
        # Nothing to cluster; indexing an empty array below would raise.
        return []

    # Column vector of left edges, the shape KMeans expects for 1-D data.
    line_xs = np.array([line["bbox"][0] for line in lines], dtype=float)[:, np.newaxis]

    # Never ask for more clusters than there are distinct positions.
    k = min(n_clusters, len(np.unique(line_xs)))

    # Fixed seed and explicit n_init keep the result reproducible between runs.
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(line_xs)

    # Cluster order is arbitrary, so sort the centres into reading order.
    centers = sorted(float(center[0]) for center in km.cluster_centers_)
    return [[x, x] for x in centers]


def jpg_to_txt(file_name):
    """OCR a PDF column by column and return the recognised text lines.

    Every page is split into vertical strips based on the column positions
    returned by ``calc_column_boundaries``. Each strip is rendered to
    'temp.jpg' in the current working directory, passed to the module-level
    ``ocr`` engine, and every recognised line with a confidence above 0.5 is
    kept. The combined text is also printed.

    Args:
        file_name: path of the document, relative to the current working
            directory. Backslashes are converted to forward slashes.

    Returns:
        list[str]: the recognised lines in processing order, each ending
        with a newline, so ``"".join(result)`` yields the full text.
    """
    pro_path = os.path.join(os.getcwd(), file_name.replace("\\", "/"))
    full_text = []

    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(pro_path) as pdf_doc:
        for pg in range(pdf_doc.page_count):
            page = pdf_doc[pg]
            page_rect = page.rect

            # Layout blocks of the page; their left edges drive the column split.
            blocks = page.get_text("dict")["blocks"]
            if blocks:
                cols = sorted(calc_column_boundaries(blocks), key=lambda col: col[0])
            else:
                # No layout information: treat the whole page as one column.
                cols = [[page_rect.x0, page_rect.x1]]

            for idx, col in enumerate(cols):
                x0, x1 = col[0], col[1]
                if x1 <= x0:
                    # Only a left edge is known: the strip extends to the next
                    # column's left edge, or to the page edge for the last one.
                    x1 = cols[idx + 1][0] if idx + 1 < len(cols) else page_rect.x1
                if x1 <= x0:
                    # Zero-width strip (e.g. duplicate positions); nothing to render.
                    continue

                # Crop the strip with a clip rectangle. A Matrix only scales or
                # rotates the rendering, it does not select a region.
                clip = fitz.Rect(x0, page_rect.y0, x1, page_rect.y1)
                pix = page.get_pixmap(clip=clip)
                pix.save('temp.jpg')

                # Run OCR on the rendered strip.
                result = ocr.ocr('temp.jpg', cls=True)

                # NOTE(review): guards against a None result or None entries for
                # strips without text — presumably what PaddleOCR returns in
                # that case; confirm against the installed version.
                for detections in result or []:
                    for detection in detections or []:
                        text, score = detection[1][0], detection[1][1]
                        if score > 0.5:
                            full_text.append(text + '\n')

    print(''.join(full_text))
    return full_text



# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Document to convert, resolved by jpg_to_txt against the working directory.
    pdf_file = 'ceshi3'
    text = jpg_to_txt(pdf_file)

    # Save the recognised text pieces, one after another, to a UTF-8 text file.
    txt_file = 'example.txt'
    with open(txt_file, 'w', encoding='utf-8') as f:
        f.writelines(text)

    print('PDF转换完成,文本已保存到example.txt')

ModuleNotFoundError: No module named 'fitz'