In [1]:
import os
import sys

from pdfminer.pdfparser import PDFParser, PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage

In [22]:
def with_pdf(pdf, pdf_pwd, fn, *args):
    """Open a PDF file, parse it, and apply ``fn`` to the resulting document.

    Parameters
    ----------
    pdf : path of the PDF file to open (opened in binary mode).
    pdf_pwd : password handed to ``doc.initialize``.
    fn : callable invoked as ``fn(doc, *args)`` when the document reports
        ``is_extractable``.
    *args : extra positional arguments forwarded to ``fn``.

    Returns
    -------
    Whatever ``fn`` returns, or ``None`` when the file cannot be read
    (``IOError``) or the document is not extractable.
    """
    result = None
    try:
        # The context manager closes the file even when parsing or ``fn``
        # raises; a bare close() after the work would leak the handle on error.
        with open(pdf, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            # Parser and document must be linked in both directions before
            # the document is initialized.
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pdf_pwd)

            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # Deliberate best-effort: an unreadable file yields ``None`` rather
        # than an exception, so callers must be prepared for ``None``.
        pass
    return result

In [23]:
def _parse_toc(doc):
    toc = []
    try:
        outlines = doc.get_outlines()
        for(level, title, dest, a, se) in outlines:
            toc.append((level, title))
    except PDFNoOutlines:
        pass
    return toc

In [26]:
def get_toc (pdf_doc, pdf_pwd=''):
    """Return the table of contents of ``pdf_doc`` as produced by ``_parse_toc``.

    The file is opened through ``with_pdf`` using ``pdf_pwd`` as password, so
    the result is whatever ``with_pdf`` hands back for ``_parse_toc``.
    """
    toc = with_pdf(pdf_doc, pdf_pwd, _parse_toc)
    return toc

In [27]:
# Read the outline (table of contents) of the sample PDF; the recorded
# output of this cell is an empty list.
get_toc('Data/KMK-Liste_Splitterberufe_2015.pdf')

[]

In [32]:
def parse_lt_objs (lt_objs, page_number, images_folder, text=None):
    """Iterate through the list of LT* objects and capture the text or image data contained in each.

    Parameters
    ----------
    lt_objs : iterable of pdfminer layout objects (a page layout or an
        ``LTFigure``, which is itself iterated when recursing).
    page_number : page identifier forwarded to ``save_image`` and used in
        the error message.
    images_folder : folder forwarded to ``save_image`` and used to build the
        ``<img src=...>`` path.
    text : unused; kept only so existing callers that pass a fourth argument
        keep working. Defaults to ``None`` instead of a shared mutable list.

    Returns
    -------
    str
        Image tags and nested-figure content in encounter order, followed by
        the collected text grouped by bbox key, all joined with newlines.
    """
    text_content = []

    page_text = {} # k=(x0, x1) of the bbox, v=list of text strings within that bbox width (physical column)
    for lt_obj in lt_objs:
        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
            # text, so arrange it logically based on its column width
            page_text = update_page_text_hash(page_text, lt_obj)
        elif isinstance(lt_obj, LTImage):
            # an image, so save it to the designated folder, and note its place in the text
            saved_file = save_image(lt_obj, page_number, images_folder)
            if saved_file:
                # use html style <img /> tag to mark the position of the image within the text
                text_content.append('<img src="'+os.path.join(images_folder, saved_file)+'" />')
            else:
                # Write straight to stderr: the Python 2 ``print >>`` form
                # raises TypeError on Python 3, and the bare ``__repr__``
                # attribute printed a bound method instead of the object.
                sys.stderr.write("error saving image on page %s %r\n" % (page_number, lt_obj))
        elif isinstance(lt_obj, LTFigure):
            # LTFigure objects are containers for other LT* objects, so recurse through the children
            text_content.append(parse_lt_objs(lt_obj, page_number, images_folder, text_content))

    # Emit the text groups ordered by their (x0, x1) bbox key.
    for _bbox, strings in sorted(page_text.items()):
        text_content.append(''.join(strings))

    return '\n'.join(text_content)

In [37]:
def _parse_pages(doc, images_folder):
    """Walk every page of ``doc`` and save the images found at the top level of each layout.

    Parameters
    ----------
    doc : parsed document exposing ``get_pages()``.
    images_folder : folder forwarded to ``save_image`` and used to build the
        ``<img src=...>`` path.

    Returns
    -------
    list of str
        One ``<img src="..." />`` tag per image that ``save_image`` reported
        as saved, in page order.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    text_content = []
    for i, page in enumerate(doc.get_pages()):
        # ``page_number`` was previously undefined (NameError as soon as an
        # image was found); derive a 1-based number from the loop index.
        page_number = i + 1
        interpreter.process_page(page)
        layout = device.get_result()
        # NOTE(review): only direct children of the layout are inspected, so
        # text objects and anything nested inside an LTFigure are ignored;
        # parse_lt_objs looks like the intended per-page handler -- confirm.
        for lt_obj in layout:
            if isinstance(lt_obj, LTImage):
                saved_file = save_image(lt_obj, page_number, images_folder)
                if saved_file:
                    text_content.append('<img src="' + os.path.join(images_folder, saved_file)+'" />')

    return text_content

In [38]:
def get_pages(pdf_doc, pdf_pwd='', images_folder='/tmp'):
    """Process the pages of ``pdf_doc`` and print the collected content to stdout.

    Parameters
    ----------
    pdf_doc : path of the PDF file.
    pdf_pwd : password for the document (empty string by default).
    images_folder : folder handed to ``_parse_pages`` for extracted images.

    The per-page strings returned by ``_parse_pages`` are printed separated
    by blank lines. Nothing is returned.
    """
    pages = with_pdf(pdf_doc, pdf_pwd, _parse_pages, images_folder)
    if pages is None:
        # with_pdf yields None for unreadable or non-extractable files;
        # joining None would raise TypeError, so treat it as "no content".
        pages = []
    print('\n\n'.join(pages))

In [39]:
# Print the extracted page content of the sample PDF, using the default
# images folder; the recorded output of this cell is blank.
get_pages('Data/KMK-Liste_Splitterberufe_2015.pdf')




In [42]:
# Run _parse_pages on the first sample PDF directly through with_pdf, with an
# empty password and 'Data' as the images folder, keeping the result for inspection.
pathToPdf = 'Data/KMK-Liste_Splitterberufe_2015.pdf'
text_out = with_pdf(pathToPdf, '', _parse_pages, *tuple(['Data']))

In [43]:
# Display the result for the first PDF; the recorded output is an empty list.
text_out

[]

In [44]:
# Repeat the same extraction for a second PDF; pathToPdf and text_out are
# overwritten by this cell.
pathToPdf = 'Data/RVSplit00-84-01-26ALT328.pdf'
text_out = with_pdf(pathToPdf, '', _parse_pages, *tuple(['Data']))

In [45]:
# Display the result for the second PDF; the recorded output is again an empty list.
text_out

[]