Skip to content

Commit

Permalink
Added --autofix-pdf option to help synthesize better (#35)
Browse files Browse the repository at this point in the history
  • Loading branch information
augustak committed Sep 21, 2023
1 parent 2bae1f4 commit b8142c0
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 47 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## Version 0.6.0 - 2023-09-21

- Added an option to PDF synthesizing that attempts to correct incomplete and/or bad PDFs before synthesizing. This
will result in more PDFs being synthesized and more of the PDFs' text being synthesized. The option requires you to
have libreoffice installed. If using the docker image, libreoffice comes pre-installed.

## Version 0.4.5 - 2022-12-07

- Basic Synthesizer: Do not synthesize zeros since prefixed and suffixed zeros are often stripped in amounts. E.g.
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
FROM python:3.10

RUN apt-get update && apt install -y --no-install-recommends \
ghostscript
ghostscript \
libreoffice

WORKDIR /root/synthetic/
COPY . .
Expand Down
11 changes: 8 additions & 3 deletions synthetic/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from .image.parser import parse_image
from .image.synthesizer import BasicSynthesizer as BasicImageSynthesizer
from .iterdata import parse_documents
from .pdf.parser import parse_pdf
from .pdf.parser import parse_pdf, DEFAULT_TIMEOUT
from .pdf.synthesizer import BasicSynthesizer as BasicPdfSynthesizer


Expand Down Expand Up @@ -59,10 +59,15 @@ def create_pdf_parser(subparsers):
'--timeout-in-seconds',
type=int,
help='Time to wait for a single pdf to be parsed',
default=5,
default=DEFAULT_TIMEOUT,
)
pdf_parser.add_argument(
'--autofix-pdf',
help='Fix potentially bad PDFs. Requires libreoffice to be installed, will be ignored otherwise',
action='store_true',
)
cmd = partial(parse_documents, accepted_document_types=[Pdf], parse_fn=parse_pdf)
pdf_parser.set_defaults(optionals=['max_fonts', 'max_pages', 'timeout_in_seconds'])
pdf_parser.set_defaults(optionals=['max_fonts', 'max_pages', 'timeout_in_seconds', 'autofix_pdf'])
pdf_parser.set_defaults(cmd=cmd)


Expand Down
2 changes: 1 addition & 1 deletion synthetic/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
__maintainer_email__ = 'august@lucidtech.ai'
__title__ = 'lucidtech-synthetic'
__url__ = 'https://github.com/LucidtechAI/synthetic'
__version__ = '0.5.0'
__version__ = '0.6.0'
2 changes: 1 addition & 1 deletion synthetic/iterdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,4 @@ def parse_documents(
futures.append(executor.submit(_parse_fn, *args))

for future in as_completed(futures):
logger.info(future.result())
print(future.result())
112 changes: 71 additions & 41 deletions synthetic/pdf/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import sys
from functools import partial
from io import BytesIO, StringIO
from os import getpid
from pathlib import Path
from typing import Type
from uuid import uuid4
Expand All @@ -21,6 +22,7 @@
from .utils import Font


DEFAULT_TIMEOUT = 30
logger = logging.getLogger(__name__)

if sys.platform != 'win32':
Expand Down Expand Up @@ -159,10 +161,6 @@ class NoTextException(Exception):
pass


class AlreadyProcessed(Exception):
pass


class TooManyFontsException(Exception):
    """Raised when a PDF contains more fonts than the configured limit (see the max_fonts option)."""
    pass

Expand All @@ -171,10 +169,6 @@ class TooManyPagesException(Exception):
pass


class TooManySingleChars(Exception):
pass


def has_form(qpdf_page, operands):
for operand in filter(_is_name_op, operands):
x_object = qpdf_page.Resources.XObject.get(str(operand))
Expand Down Expand Up @@ -205,8 +199,7 @@ def _parse_pdf_objects(qpdf_page: pikepdf.Page, font_map, new_content_stream):
current_font=last_used_font,
)

for text_id, text, font in text_block:
yield text_block, text_id, text, font
yield text_block
new_content_stream.extend(text_block.content_stream)
else:
new_content_stream.append((operands, operator))
Expand All @@ -215,35 +208,59 @@ def _parse_pdf_objects(qpdf_page: pikepdf.Page, font_map, new_content_stream):


def update_available_characters(qpdf_page: pikepdf.Page, font_map):
text_lengths = collections.Counter()
for text_block in _parse_pdf_objects(qpdf_page, font_map, []):
for text_id, text, font in text_block:
font.available_characters |= set(text)

for _, text_id, text, font in _parse_pdf_objects(qpdf_page, font_map, []):
font.available_characters |= set(text)
text_lengths[len(text)] += 1

single_chars = text_lengths[1] / sum(text_lengths.values())
if single_chars > 0.9:
raise TooManySingleChars(f'Too many single characters in document ({single_chars * 100:.2f}%)')
def split(s, template):
    """Yield consecutive slices of *s* whose lengths mirror the pieces of *template*.

    Used to re-distribute a modified text string back onto the original
    per-chunk layout: the i-th slice has exactly ``len(template[i])`` characters.
    Characters of *s* beyond the combined template length are not yielded.
    """
    offset = 0
    for piece in template:
        end = offset + len(piece)
        yield s[offset:end]
        offset = end


def parse_text(qpdf_page: pikepdf.Page, font_map, synthesizer: PdfSynthesizer):
new_content_stream = []

for text_block, text_id, text, font in _parse_pdf_objects(qpdf_page, font_map, new_content_stream):
modified_text = synthesizer.modify_text(text, font=font)
text_block.set_unicode_text(text_id, modified_text)
for text_block in _parse_pdf_objects(qpdf_page, font_map, new_content_stream):
texts = []
text_ids = []
for text_id, text, font in text_block:
texts.append(text)
text_ids.append(text_id)

modified_text = synthesizer.modify_text(''.join(texts))

for (text_id, text, font), new_text in zip(text_block, split(modified_text, texts)):
text_block.set_unicode_text(text_id, new_text)

return new_content_stream


def out_path(_i, suffix, dst_dir, file_stem):
    """Return the output path ``<dst_dir>/<file_stem>-<_i><suffix>`` for the _i-th synthesized copy."""
    filename = f'{file_stem}-{_i}{suffix}'
    return dst_dir / filename


def calculate_k_to_process(num_outputs_per_document, dst_dir, file_stem):
    """Return the output indices that still need synthesizing.

    An index k is included unless both of its output files
    (``<file_stem>-<k>.pdf`` and ``<file_stem>-<k>.json``) already exist in
    *dst_dir*, so previously completed outputs are skipped on re-runs.
    """
    return [
        k
        for k in range(num_outputs_per_document)
        if not (
            out_path(k, '.pdf', dst_dir, file_stem).exists()
            and out_path(k, '.json', dst_dir, file_stem).exists()
        )
    ]


def synthesize_pdf(
pdf_file,
json_file,
dst_dir,
max_fonts,
max_pages,
num_outputs_per_document,
synthesizer_class,
k_to_process,
):
ground_truth = json.loads(json_file.read_text())
pdf_io = BytesIO(pdf_file.read_bytes())
Expand All @@ -253,16 +270,7 @@ def synthesize_pdf(
interpreter = PDFPageInterpreter(rsrcmgr, device)
interpreter_fonts = {}

def _out_path(_i, suffix):
return dst_dir / f'{json_file.stem}-{_i}{suffix}'

k_to_process = []
for i in range(num_outputs_per_document):
if not (_out_path(i, '.pdf').exists() and _out_path(i, '.json').exists()):
k_to_process.append(i)

if not k_to_process:
raise AlreadyProcessed(f'Already processed {pdf_file} {json_file}')
_out_path = partial(out_path, dst_dir=dst_dir, file_stem=json_file.stem)

with pikepdf.Pdf.open(pdf_file) as pdf:
if max_pages and len(pdf.pages) > max_pages:
Expand Down Expand Up @@ -315,28 +323,37 @@ def parse_pdf(
tmp_dir: Path,
max_fonts: int = None,
max_pages: int = None,
timeout_in_seconds: int = 5,
timeout_in_seconds: int = DEFAULT_TIMEOUT,
autofix_pdf: bool = False,
):
logger.info(f'{name}: {pdf_file} {json_file}')
status = f'Error when synthesizing {name}'

k_to_process = calculate_k_to_process(num_outputs_per_document, dst_dir, json_file.stem)
if not k_to_process:
return f'Already processed {pdf_file} {json_file}'

synthesize_fn = partial(
synthesize_pdf,
json_file=json_file,
dst_dir=dst_dir,
max_fonts=max_fonts,
max_pages=max_pages,
num_outputs_per_document=num_outputs_per_document,
synthesizer_class=synthesizer_class,
k_to_process=k_to_process,
)

try:
timeout(synthesize_fn, args=(pdf_file,), timeout_in_seconds=timeout_in_seconds)
if autofix_pdf and (rewritten_pdf_file := rewrite_pdf(pdf_file, tmp_dir, timeout_in_seconds)):
timeout(synthesize_fn, args=(rewritten_pdf_file,), timeout_in_seconds=timeout_in_seconds)
else:
timeout(synthesize_fn, args=(pdf_file,), timeout_in_seconds=timeout_in_seconds)
status = f'Successfully synthesized {name}'
except HasFormException:
logger.info('Has form! Trying to flatten PDF')
if flattened_pdf_file := flatten(pdf_file, tmp_dir):
if flattened_pdf_file := flatten(pdf_file, tmp_dir, timeout_in_seconds):
try:
synthesize_fn(flattened_pdf_file)
timeout(synthesize_fn, args=(flattened_pdf_file,), timeout_in_seconds=timeout_in_seconds)
status = f'Successfully synthesized {name}'
except HasFormException:
logger.error('Failed to get rid of forms in flattened PDF')
Expand All @@ -345,9 +362,7 @@ def parse_pdf(
logger.error(f'Error when synthesizing {name}')
finally:
flattened_pdf_file.unlink(missing_ok=True)
except AlreadyProcessed as e:
logger.warning(e)
except (FileNotFoundError, NoTextException, TooManyFontsException, TooManyPagesException, TooManySingleChars) as e:
except (FileNotFoundError, NoTextException, TooManyFontsException, TooManyPagesException) as e:
logger.error(e)
except TimeoutError:
logger.error(f'Synthesizing timed out, took longer than {timeout_in_seconds}s')
Expand All @@ -359,10 +374,25 @@ def parse_pdf(
return status


def flatten(pdf_file, tmp_dir):
def run_in_shell(cmd, timeout_in_seconds):
    """Run *cmd* through the shell, raising on nonzero exit status or timeout.

    :param cmd: shell command line to execute
    :param timeout_in_seconds: kill the command and raise
        subprocess.TimeoutExpired if it runs longer than this
    :raises subprocess.CalledProcessError: if the command exits nonzero
    :raises subprocess.TimeoutExpired: if the command times out
    :return: the subprocess.CompletedProcess on success
    """
    # check=True makes CalledProcessError actually reachable: callers
    # (flatten, rewrite_pdf) catch it, but subprocess.run never raises it
    # without check=True, so failures were previously silently ignored.
    return subprocess.run(cmd, shell=True, timeout=timeout_in_seconds, check=True)


def flatten(pdf_file, tmp_dir, timeout_in_seconds):
    """Flatten *pdf_file* with ghostscript, writing the result into *tmp_dir*.

    Used to bake form fields into regular page content so the synthesizer can
    process them. Returns the flattened file's path on success, or None if
    ghostscript fails or times out (errors are logged, not raised).
    """
    import shlex  # local import: quote paths so spaces/shell metacharacters survive shell=True

    try:
        dst_path = tmp_dir / f'{pdf_file.stem}.flattened.pdf'
        cmd = f'gs -q -sDEVICE=pdfwrite -o {shlex.quote(str(dst_path))} {shlex.quote(str(pdf_file))}'
        run_in_shell(cmd, timeout_in_seconds)
        return dst_path
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        logger.error(e)


def rewrite_pdf(pdf_file, tmp_dir, timeout_in_seconds):
    """Rewrite a potentially broken PDF by round-tripping it through libreoffice.

    Backs the --autofix-pdf option: libreoffice often repairs incomplete or
    malformed PDFs, letting more of them (and more of their text) be
    synthesized. Returns the rewritten file's path in *tmp_dir* on success, or
    None if the conversion fails or times out (errors are logged, not raised).
    Requires libreoffice to be installed.
    """
    import shlex  # local import: quote paths so spaces/shell metacharacters survive shell=True

    try:
        dst = tmp_dir / f'{pdf_file.stem}.pdf'
        # Per-process user profile dir so concurrent libreoffice instances
        # don't fight over a shared profile lock.
        env = f'-env:UserInstallation=file:///tmp/{getpid()}'
        src = f'--convert-to pdf {shlex.quote(str(pdf_file))}'
        run_in_shell(f'libreoffice {env} --headless {src} --outdir {shlex.quote(str(tmp_dir))}', timeout_in_seconds)
        return dst
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        logger.error(e)

0 comments on commit b8142c0

Please sign in to comment.