In [1]:
# Extracting text from a PDF with Python
## 1. The simplest way to extract text from a PDF is "extract_text":
from pdfminer.high_level import extract_text

text = extract_text('test.pdf')
# Emit the repr first so control characters (newlines, the trailing form feed)
# are visible, followed by the text as it normally renders.
print(repr(text), text, sep='\n')

'PReFilter:   \nAn Efficient Privacy-preserving Relay Filtering Scheme for Delay Tolerant Networks \n\n\x0c'
PReFilter:   
An Efficient Privacy-preserving Relay Filtering Scheme for Delay Tolerant Networks 




In [3]:
## 2. To read the text of a PDF and print it on the command line:
from io import StringIO
from pdfminer.high_level import extract_text_to_fp

output_string = StringIO()
# The extracted text is written into the in-memory buffer; this call passes
# no laparams.
with open('test.pdf', 'rb') as pdf_file:
    extract_text_to_fp(pdf_file, output_string)

# Trim surrounding whitespace from the buffered text before printing it.
extracted = output_string.getvalue()
print(extracted.strip())

PReFilter:  An Efficient Privacy-preserving Relay Filtering Scheme for Delay Tolerant Networks


In [7]:
## 3. Alternatively, convert it to HTML and use layout analysis:
from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

output_string = StringIO()
# Options that switch the extractor to HTML output with layout analysis.
# NOTE(review): codec=None presumably makes the converter write str rather
# than encoded bytes into the StringIO -- confirm against the pdfminer docs.
html_options = dict(laparams=LAParams(), output_type='html', codec=None)
with open('test.pdf', 'rb') as pdf_file:
    extract_text_to_fp(pdf_file, output_string, **html_options)
# Bare final expression: the notebook echoes the stripped HTML string.
output_string.getvalue().strip()

'<html><head>\n<meta http-equiv="Content-Type" content="text/html">\n</head><body>\n<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:595px; height:841px;"></span>\n<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>\n<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:90px; top:124px; width:373px; height:26px;"><span style="font-family: F1; font-size:10px">PReFilter:   \n<br>An Efficient Privacy-preserving Relay Filtering Scheme for Delay Tolerant Networks \n<br></span></div><div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>\n</body></html>'

In [8]:
## 4. Extract the text of a PDF file and keep it in a Python variable:

from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

output_string = StringIO()
with open('test.pdf', 'rb') as pdf_file:
    # The converter device and the page interpreter share one resource
    # manager; the device is handed output_string as its output buffer.
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Build the document from a parser over the open file, then feed every
    # page through the interpreter.
    doc = PDFDocument(PDFParser(pdf_file))
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

print(output_string.getvalue())

PReFilter:   
An Efficient Privacy-preserving Relay Filtering Scheme for Delay Tolerant Networks 




In [27]:
# Extracting elements from a PDF with Python
## 1. High-level functions cover the common tasks. In this case we can use extract_pages:

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

for page_layout in extract_pages("test.pdf"):
    for element in page_layout:
        print(element)
## Each element will be an LTTextBox, LTFigure, LTLine, LTRect or an LTImage.
## Some of these can be iterated further: for example, iterating through an
## LTTextBox will give you an LTTextLine, and these in turn can be iterated
## through to get an LTChar. See the diagram in the pdfminer documentation:
## "Layout analysis algorithm".

## 2. Say we want to extract all of the text. We could do:

for page_layout in extract_pages("test.pdf"):
    for element in page_layout:
        # Skip anything that is not a text container.
        if not isinstance(element, LTTextContainer):
            continue
        print(element.get_text())
## 3. Alternatively, we could extract the font name or size of each character.
## The snippet below is disabled by wrapping it in a string literal; because
## that literal is the last expression of the cell, the notebook echoes its
## repr as the cell's output value.
'''
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar
for page_layout in extract_pages("test.pdf"):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            for text_line in element:
                for character in text_line:
                    if isinstance(character, LTChar):
                        print(character.fontname)
                        print(character.size)
'''

<LTTextBoxHorizontal(0) 90.000,740.961,463.699,767.121 'PReFilter:   \nAn Efficient Privacy-preserving Relay Filtering Scheme for Delay Tolerant Networks \n'>
PReFilter:   
An Efficient Privacy-preserving Relay Filtering Scheme for Delay Tolerant Networks 



'\nfrom pdfminer.high_level import extract_pages\nfrom pdfminer.layout import LTTextContainer, LTChar\nfor page_layout in extract_pages("test.pdf"):\n    for element in page_layout:\n        if isinstance(element, LTTextContainer):\n            for text_line in element:\n                for character in text_line:\n                    if isinstance(character, LTChar):\n                        print(character.fontname)\n                        print(character.size)\n'

In [33]:
# Extracting images from a PDF
# Optional: uncomment the next line to download the example PDF used below.
## !curl https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf --output example.pdf
# Run pdf2txt.py on example.pdf, passing cats-and-dogs as --output-dir.
# NOTE(review): `python3 pdf2txt.py` looks for the script in the current
# working directory -- confirm it is present there (or give its full path).
!python3 pdf2txt.py example.pdf --output-dir cats-and-dogs

In [41]:
# Show the command-line help of pdf2txt.py.
# NOTE(review): the recorded run of this cell failed with a SyntaxError on a
# type-annotated `def` inside pdf2txt.py. That suggests the script was
# launched by an interpreter that does not accept annotations (presumably
# Python 2 via the OS file association) -- confirm, and if so invoke the
# script explicitly with the environment's Python 3 interpreter.
!pdf2txt.py --help

  File "I:\TGY-being\project\python\pdfminer_env\Scripts\pdf2txt.py", line 18
    def float_or_disabled(x: str) -> Optional[float]:
                           ^
SyntaxError: invalid syntax


In [31]:
# Run dumppdf.py on test.pdf with the -a flag.
# NOTE(review): -a presumably dumps all objects of the PDF -- confirm with
# the tool's --help output.
!python3 dumppdf.py -a test.pdf