# PDF text Extraction

Evaluate available pacakges for processing of PDF format files.
To prepare for ML6 use case.


### PDFminer

In [47]:
InputFile = 'data/23114.pdf'

In [48]:
# See https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
#     http://zevross.com/blog/2014/04/09/extracting-tabular-data-from-a-pdf-an-example-using-python-and-regular-expressions/

import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

def extract_text_from_pdf(pdf_path):
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, 
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
            
        text = fake_file_handle.getvalue()
    
    # close open handles
    converter.close()
    fake_file_handle.close()
    
    if text:
        return text


In [49]:
text = extract_text_from_pdf(InputFile)
text

'   Material Safety Data Sheet   LUPEROX® P      Product code: 051000   Version 2.0 Issued on: 07/11/2011 Page: 1 / 10     1. PRODUCT AND COMPANY IDENTIFICATION  Company  Arkema Inc. 900 First Avenue King of Prussia, Pennsylvania 19406  Functional Additives  Customer Service Telephone Number: (800) 331-7654 (Monday through Friday, 8:30 AM to 5:30 PM EST)  Emergency Information  Transportation: CHEMTREC: (800) 424-9300 (24 hrs., 7 days a week) Medical: Rocky Mountain Poison Center: (866) 767-5089 (24 hrs., 7 days a week)  Product Information  Product name: LUPEROX® P Synonyms: Peroxyester, t-butyl perbenzoate, tert-butyl peroxybenzoate Molecular formula: C11 H14 O3 Chemical family: Organic peroxide - peroxyesters Product use: Initiator  2. HAZARDS IDENTIFICATION  Emergency Overview  Color: Colourless to yellow. Physical state: liquid  Odor: unpleasant  CAUTION! ORGANIC PEROXIDE. HAZARDOUS DECOMPOSITION MAY OCCUR.  MAY CAUSE ALLERGIC SKIN REACTION.  MAY CAUSE SKIN IRRITATION.   Potential

In [5]:
# from: https://github.com/driscollis/reportlabbookcode/blob/master/chapter16_exporting_data/miner_text_generator.py

# miner_text_generator.py

import io

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

def extract_text_by_page(pdf_path):
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, 
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            page_interpreter.process_page(page)
            
            text = fake_file_handle.getvalue()
            yield text
    
            # close open handles
            converter.close()
            fake_file_handle.close()
    
def extract_text(pdf_path):
    for page in extract_text_by_page(pdf_path):
        print(page)
        print()

In [20]:
# json_exporter.py
import json
import os
# from miner_text_generator import extract_text_by_page # included above

def export_as_json(pdf_path, json_path):
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    data = {'Filename': filename}
    data['Pages'] = []
    
    counter = 1
    for page in extract_text_by_page(pdf_path):
        text = page[0:100]
        page = {'Page_{}'.format(counter): text}
        data['Pages'].append(page)
        counter += 1
    
    with open(json_path, 'w') as fh:
        json.dump(data, fh)

In [10]:
json_path = InputFile + '.json'

export_as_json(InputFile, json_path)

In [11]:
with open(json_path, 'r') as handle:
    parsed = json.load(handle)
    print(json.dumps(parsed, indent=4, sort_keys=True))

{
    "Filename": "23114",
    "Pages": [
        {
            "Page_1": "   Material Safety Data Sheet   LUPEROX\u00ae P      Product code: 051000   Version 2.0 Issued on: 07/11/"
        },
        {
            "Page_2": "   Material Safety Data Sheet   LUPEROX\u00ae P      Product code: 051000   Version 2.0 Issued on: 07/11/"
        },
        {
            "Page_3": "   Material Safety Data Sheet   LUPEROX\u00ae P      Product code: 051000   Version 2.0 Issued on: 07/11/"
        },
        {
            "Page_4": "   Material Safety Data Sheet   LUPEROX\u00ae P      Product code: 051000   Version 2.0 Issued on: 07/11/"
        },
        {
            "Page_5": "   Material Safety Data Sheet   LUPEROX\u00ae P      Product code: 051000   Version 2.0 Issued on: 07/11/"
        },
        {
            "Page_6": "   Material Safety Data Sheet   LUPEROX\u00ae P      Product code: 051000   Version 2.0 Issued on: 07/11/"
        },
        {
            "Page_7": "   Material Safety 

In [21]:
InputFile = 'data/gylcol-ether-dpm-sds.pdf'

json_path = InputFile + '.json'

export_as_json(InputFile, json_path)

with open(json_path, 'r') as handle:
    parsed = json.load(handle)
    print(json.dumps(parsed, indent=4, sort_keys=True))

{
    "Filename": "gylcol-ether-dpm-sds",
    "Pages": [
        {
            "Page_1": "SAFETY DATA SHEET:GYLCOL ETHER DPM1. IDENTIFICATIONPRODUCT NAME:  GLYCOL ETHER DPMCAS NO:34590-94-8P"
        },
        {
            "Page_2": "Dipropylene glycol monomethyl ether34590-94-8>99.0%4. FIRST AID MEASURESDescription of first aid mea"
        },
        {
            "Page_3": "Personal precautions, protective equipment and emergency procedures: Isolate area. Refer to Section "
        },
        {
            "Page_4": "conditions no respiratory protection should be needed; however, if discomfort is experienced, use an"
        },
        {
            "Page_5": "Molecular Weight   148.2 g/molHenry's Law Constant (H)   1.6E-07 atm*m3/mole; 25 \u00b0C Estimated.10. ST"
        },
        {
            "Page_6": "For similar material(s): In laboratory animal studies, effects on reproduction have been seen only a"
        },
        {
            "Page_7": "& UNCONTAMINATED PRODUCT, the

In [54]:
# See https://programtalk.com/python-examples/pdfminer.converter.XMLConverter/

import io
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

from io import StringIO
from pdfminer.layout import LAParams

def extract_pdf_page(filename, page_number_List):
    """Given the name of a PDF file and the pages to extract, use PDFMiner to extract those
    pages and return them as XML (in utf-8 bytes).
 
    The param page_number_or_numbers can be a single page number or an iterable thereof.
    """
    # This code adapted from pdf2txt.py which is part of PDFMiner.
    # Here's the command line version of the code below --
    #    pdf2txt.py -p 1 -o expected.xml sample.pdf
 
    #f_out = StringIO.StringIO()
    f_out = StringIO()
    laparams = LAParams()
    rsrcmgr = PDFResourceManager()
    #device = XMLConverter(rsrcmgr, f_out, codec='utf-8', laparams=laparams)
    device = XMLConverter(rsrcmgr, f_out, laparams=laparams)
 
    with open(filename, 'rb') as f_in:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(f_in, page_number_List):
            interpreter.process_page(page)
 
    device.close()
    

def extract_xml_from_pdf(pdf_path):
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = XMLConverter(resource_manager, fake_file_handle, codec='utf-8')
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, 
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
            
        text = fake_file_handle.getvalue()
    
    # close open handles
    converter.close()
    fake_file_handle.close()
    
    if text:
        return text

In [51]:
InputFile = 'data/23114.pdf'

In [58]:
# See: https://pydoc.net/pdfminer.six/20200124/pdfminer.high_level/

"""Functions that can be used for the most common use-cases for pdfminer.six"""
 
import logging
import sys
from io import StringIO
 
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
from pdfminer.image import ImageWriter
from pdfminer.layout import LAParams
from pdfminer.pdfdevice import TagExtractor
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
 
 
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
                       laparams=None, maxpages=0, page_numbers=None,
                       password="", scale=1.0, rotation=0, layoutmode='normal',
                       output_dir=None, strip_control=False, debug=False,
                       disable_caching=False, **kwargs):
    """Parses text from inf-file and writes to outfp file-like object.
 
    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
        None!
 
    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
        properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor
    :param rotation: Rotation factor
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Does what it says on the tin
    :param debug: Output more logging data
    :param disable_caching: Does what it says on the tin
    :param other:
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)
 
    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)
 
    rsrcmgr = PDFResourceManager(caching=not disable_caching)
 
    if output_type == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
 
    if outfp == sys.stdout:
        outfp = sys.stdout.buffer
 
    if output_type == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=strip_control)
    elif output_type == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif output_type == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
 
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(inf,
                                  page_numbers,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=not disable_caching,
                                  check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)
 
    device.close()
 
 
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """Parse and return the text contained in a PDF file.
 
    :param pdf_file: Path to the PDF file to be worked on
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    """
    if laparams is None:
        laparams = LAParams()
 
    with open(pdf_file, "rb") as fp, StringIO() as output_string:
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, codec=codec,
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
 
        for page in PDFPage.get_pages(
                fp,
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
        ):
            interpreter.process_page(page)
 
        return output_string.getvalue()
 
 
def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Extract and yield LTPage objects
 
    :param pdf_file: Path to the PDF file to be worked on
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return:
    """
    if laparams is None:
        laparams = LAParams()
 
    with open(pdf_file, "rb") as fp:
        resource_manager = PDFResourceManager()
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
                                      password=password, caching=caching):
            interpreter.process_page(page)
            layout = device.get_result()
            yield layout


In [67]:
InputFile = 'data/23114.pdf'

InFile = open(InputFile, "rb")

OutFile = open(InputFile + '.xml', "wb") # StringIO() as output_string:

In [68]:
extract_text_to_fp(InFile, OutFile, output_type='xml')

This does extract the PDF file in XML format, but each letter is a seperate item.

In [27]:
# Extract jpg's from pdf's. Quick and dirty.

import sys

#pdf = file(sys.argv[1], "rb").read()
f = open(InputFile, "rb")
pdf = f.read()

print(type(pdf))

#startmark = "\xff\xd8"
startmark = bytes("\xff\xd8", encoding="utf-8")
startfix = 0
#endmark = "\xff\xd9"
endmark = bytes("\xff\xd9", encoding="utf-8")
endfix = 2
i = 0
njpg = 0

while True:
    #istream = pdf.find("stream", i)
    #enc_word=bytes("stream", encoding="utf-8")
    #istream = pdf.find(enc_word)
    #if istream < 0:
    #   break
    #istart = pdf.find(startmark, istream, istream+20)
    istart = pdf.find(startmark)
    if istart < 0:
        i = istream+20
        continue
    iend = pdf.find("endstream", istart)
    if iend < 0:
        raise Exception("Didn't find end of stream!")
    iend = pdf.find(endmark, iend-20)
    if iend < 0:
        raise Exception("Didn't find end of JPG!")
    
    istart += startfix
    iend += endfix
    print("JPG %d from %d to %d" % (njpg, istart, iend))
    jpg = pdf[istart:iend]
    jpgfile = file("jpg%d.jpg" % njpg, "wb")
    jpgfile.write(jpg)
    jpgfile.close()
    
    njpg += 1
    i = iend

<class 'bytes'>


KeyboardInterrupt: 

#### PDFQuery

From: https://github.com/jcushman/pdfquery#installation

      https://medium.com/@vince.shields913/handling-data-stored-across-multiple-pdf-files-with-python-33c6c26425c8

In [7]:
pdf = pdfquery.PDFQuery(InputFile)
pdf.load()
label = pdf.pq('LTTextLineHorizontal:contains("Hazards identification")')
print(label)
label = pdf.pq('LTTextLineHorizontal:contains("HAZARDS IDENTIFICATION")')
label




[<LTTextLineHorizontal>]

In [9]:
import pdfquery

pdf = pdfquery.PDFQuery(InputFile)
pdf.load()
label = pdf.pq('LTTextLineHorizontal:contains("HAZARDS IDENTIFICATION")')

print(label, "\n")

left_corner = float(label.attr('x0'))
bottom_corner = float(label.attr('y0'))
name = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner, bottom_corner-30, left_corner+150, bottom_corner)).text()
name

<LTTextLineHorizontal y0="408.364" y1="421.87" x0="75.423" x1="211.569" width="136.146" height="13.506" bbox="[75.423, 408.364, 211.569, 421.87]" word_margin="0.1"><LTTextBoxHorizontal y0="408.364" y1="421.87" x0="75.423" x1="211.569" width="136.146" height="13.506" bbox="[75.423, 408.364, 211.569, 421.87]" index="9">. HAZARDS IDENTIFICATION </LTTextBoxHorizontal></LTTextLineHorizontal> 



'mergency Overview oiling point/boiling Reference number: Date of Revision:'

#### Slate
wrapper around PDFminer. works page based.

See: https://github.com/timClicks/slate

### PyPDF2

In [3]:
# From: https://www.geeksforgeeks.org/working-with-pdf-files-in-python/
#       https://github.com/Brett-Plemons/PDF-to-JSON-Converter/blob/master/IDTtoJSON.py

# importing required modules 
import PyPDF2 
  
# creating a pdf file object 
pdfFileObj = open(InputFile, 'rb') 
  
# creating a pdf reader object 
pdfReader = PyPDF2.PdfFileReader(pdfFileObj) 
  
# printing number of pages in pdf file 
print(pdfReader.numPages) 
  
# creating a page object 
pageObj = pdfReader.getPage(0) 
  
# extracting text from page 
print(pageObj.extractText()) 
  
# closing the pdf file object 
pdfFileObj.close() 

10
   Material Safety Data Sheet 
  LUPEROX® P
      Product code: 051000
   Version 2.0
 Issued on: 07/11/2011
 Page: 1 / 10
     1. PRODUCT AND COMPANY IDENTIFICATION
  Company
  
Arkema Inc.
 900 First Avenue 
King of Prussia, Pennsylvania 19406 

 Functional Additives
  
Customer Service Telephone Number: 
(800) 331-7654 
(Monday through Friday, 8:30 AM to 5:30 PM EST) 
 Emergency Information
  
Transportation:
 CHEMTREC: (800) 424-9300 
(24 hrs., 7 days a week) 
Medical: 
Rocky Mountain Poison Center:
 (866) 767-5089 
(24 hrs., 7 days a week) 
 
Product Information
  
Product name:
 LUPEROX® P
 Synonyms: 
Peroxyester, t-butyl perbenzoate, tert-butyl peroxybenzoate 
Molecular formula:
 C11 H14 O3
 Chemical family:
 Organic peroxide - peroxyesters
 Product use: 
Initiator
  2. HAZARDS IDENTIFICATION
  Emergency Overview
  Color:
 Colourless to yellow.
 Physical state: 
liquid 
 Odor: unpleasant 
 CAUTION! 
ORGANIC PEROXIDE. 
HAZARDOUS DECOMPOSITION MAY OCCUR.
  MAY CAUSE ALLERGIC SK

No json conversion in this one

### pdfx

In [7]:
# From: https://stackoverflow.com/questions/48846091/pdf-to-json-using-pdfx-python-library
#       https://pypi.org/project/pdfx/
#       https://www.metachris.com/pdfx/

import pdfx

pdf = pdfx.PDFx(InputFile)
metadata = pdf.get_metadata()
reference_list = pdf.get_references()
reference_dict = pdf.get_references_as_dict()
#pdf.download_pdfs("target-directory")

In [8]:
print(metadata, "\n")
print(reference_list, "\n")
print(reference_dict)

{'Author': 'A3000772', 'CreationDate': "D:20110711115446-04'00'", 'Creator': 'PScript5.dll Version 5.2', 'ModDate': "D:20120210094950-06'00'", 'Producer': 'Acrobat Distiller 6.0.1 (Windows)', 'Title': 'Microsoft Word - r5356702.rtf', 'pdf': {'Producer': 'Acrobat Distiller 6.0.1 (Windows)'}, 'xap': {'CreatorTool': 'PScript5.dll Version 5.2', 'ModifyDate': '2012-02-10T09:49:50-06:00', 'CreateDate': '2011-07-11T11:54:46-04:00', 'MetadataDate': '2012-02-10T09:49:50-06:00'}, 'xapmm': {'DocumentID': 'uuid:e2a20ffa-64dc-4da9-ad6a-9a9970591106', 'InstanceID': 'uuid:702f0e9e-7004-4733-b718-f96e4488cc71'}, 'dc': {'format': 'application/pdf', 'title': {'x-default': 'Microsoft Word - r5356702.rtf'}, 'creator': ['A3000772']}, 'Pages': 10} 

set() 

{}


In [9]:
print(pdf)

<pdfx.PDFx object at 0x7f2ac45e21c0>


### Tabula-py

https://tabula.technology/

In [71]:
InputFile = 'data/gylcol-ether-dpm-sds.pdf'

In [76]:
from tabula import read_pdf

ImportError: cannot import name 'read_pdf' from 'tabula' (unknown location)

In [74]:
import tabula
df = tabula.read_pdf("test.pdf", pages='all')

AttributeError: module 'tabula' has no attribute 'read_pdf'

In [70]:
#From: https://github.com/chezou/tabula-py
#      https://nbviewer.jupyter.org/github/chezou/tabula-py/blob/master/examples/tabula_example.ipynb
#      

import tabula
#tabula.environment_info()
environment_info()


dfs = tabula.read_pdf(InputFile, stream=True)
dfs

NameError: name 'environment_info' is not defined

needed to pip install tabula-py, but installed tabula. After uninstalling tabula, things seem confused

### pydf2json

###### From
              https://pypi.org/project/pydf2json/
              https://pypi.org/project/pydf2json/#files

ERROR: pydf2json-2.3.4.post6-py2-none-any.whl is not a supported wheel on this platform

Source code is available in the .whl file, approx.3400 lines of code

### PDF.co

In [6]:
# From https://pdf.co/samples/pdf-co-web-api-pdf-to-json-api-python-convert-pdf-to-json-from-uploaded-file-asynchronously

""" Cloud API asynchronous "PDF To Text" job example.
    Allows avoiding timeout errors when processing huge or scanned PDF documents.
"""
import os
import requests # pip install requests
import time
import datetime


def convertPdfToJson(uploadedFileUrl, destinationFile):
    """Converts PDF To Json using PDF.co Web API"""

    # Prepare requests params as JSON
    # See documentation: https://apidocs.pdf.co
    parameters = {}
    parameters["async"] = Async
    parameters["name"] = os.path.basename(destinationFile)
    parameters["password"] = Password
    parameters["pages"] = Pages
    parameters["url"] = uploadedFileUrl

    # Prepare URL for 'PDF To Json' API request
    url = "{}/pdf/convert/to/json".format(BASE_URL)

    # Execute request and get response as JSON
    response = requests.post(url, data=parameters, headers={ "x-api-key": API_KEY })
    if (response.status_code == 200):
        json = response.json()

        if json["error"] == False:
            # Asynchronous job ID
            jobId = json["jobId"]
            #  URL of the result file
            resultFileUrl = json["url"]
            
            # Check the job status in a loop. 
            # If you don't want to pause the main thread you can rework the code 
            # to use a separate thread for the status checking and completion.
            while True:
                status = checkJobStatus(jobId) # Possible statuses: "working", "failed", "aborted", "success".
                
                # Display timestamp and status (for demo purposes)
                print(datetime.datetime.now().strftime("%H:%M.%S") + ": " + status)
                
                if status == "success":
                    # Download result file
                    r = requests.get(resultFileUrl, stream=True)
                    if (r.status_code == 200):
                        with open(destinationFile, 'wb') as file:
                            for chunk in r:
                                file.write(chunk)
                        print(f"Result file saved as \"{destinationFile}\" file.")
                    else:
                        print(f"Request error: {response.status_code} {response.reason}")
                    break
                elif status == "working":
                    # Pause for a few seconds
                    time.sleep(3)
                else:
                    print(status)
                    break
        else:
            # Show service reported error
            print(json["message"])
    else:
        print(f"Request error: {response.status_code} {response.reason}")


def checkJobStatus(jobId):
    """Checks server job status"""

    url = f"{BASE_URL}/job/check?jobid={jobId}"
    
    response = requests.get(url, headers={ "x-api-key": API_KEY })
    if (response.status_code == 200):
        json = response.json()
        return json["status"]
    else:
        print(f"Request error: {response.status_code} {response.reason}")

    return None


def uploadFile(fileName):
    """Uploads file to the cloud"""
    
    # 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.

    # Prepare URL for 'Get Presigned URL' API request
    url = "{}/file/upload/get-presigned-url?contenttype=application/octet-stream&name={}".format(
        BASE_URL, os.path.basename(fileName))
    
    # Execute request and get response as JSON
    response = requests.get(url, headers={ "x-api-key": API_KEY })
    if (response.status_code == 200):
        json = response.json()
        
        if json["error"] == False:
            # URL to use for file upload
            uploadUrl = json["presignedUrl"]
            # URL for future reference
            uploadedFileUrl = json["url"]

            # 2. UPLOAD FILE TO CLOUD.
            with open(fileName, 'rb') as file:
                requests.put(uploadUrl, data=file, headers={ "x-api-key": API_KEY, "content-type": "application/octet-stream" })

            return uploadedFileUrl
        else:
            # Show service reported error
            print(json["message"])    
    else:
        print(f"Request error: {response.status_code} {response.reason}")

    return None

# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co/documentation/api
API_KEY = "******************************************"

# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"


# Comma-separated list of page indices (or ranges) to process. Leave empty for all pages. Example: '0,2-5,7-'.
Pages = ""
# PDF document password. Leave empty for unprotected documents.
Password = ""

# (!) Make asynchronous job
Async = True


DestinationFile = InputFile + '.json'

uploadedFileUrl = uploadFile(InputFile)
if (uploadedFileUrl != None):
        convertPdfToJson(uploadedFileUrl, DestinationFile)


Request error: 401 Unauthorized


This looks like an API, i.e. using a external service at http://pdf.co

### pdfrw

Mostly for manipulating pdf format

In [None]:
# From: https://github.com/pmaupin/pdfrw#pdfrw-04

### pdfreader

extraction of images

In [None]:
# From: https://pypi.org/project/pdfreader/

### PyMuPDF

from: https://pypi.org/project/PyMuPDF/

In [4]:
# See: https://www.quora.com/Is-there-any-way-to-Convert-PDF-to-Json

#PDF to JSON using Python 3+ 
 
# package to install  
# pip install Fitz  
# pip install pymupdf  
 
import fitz  
import json  

InputFile = 'data/gylcol-ether-dpm-sds.pdf'
document  = fitz.open(InputFile) 
page  = document.loadPage(7) #enter page 
text = page.getText('dict')  
print(text) 
 
with open('data.json', 'w') as f: 
    text_data = json.dump(text, f) 
    

{'width': 612.0, 'height': 792.0, 'blocks': [{'number': 0, 'type': 0, 'bbox': (39.0, 29.7216796875, 560.669921875, 64.7958984375), 'lines': [{'spans': [{'size': 10.0, 'flags': 4, 'font': 'TimesNewRomanPSMT', 'color': 0, 'ascender': 0.89111328125, 'descender': -0.21630859375, 'text': 'CEPA - Domestic Substances List (DSL)', 'origin': (39.0, 38.6328125), 'bbox': (39.0, 29.7216796875, 204.1200714111328, 40.7958984375)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (39.0, 29.7216796875, 204.1200714111328, 40.7958984375)}, {'spans': [{'size': 10.0, 'flags': 4, 'font': 'TimesNewRomanPSMT', 'color': 0, 'ascender': 0.89111328125, 'descender': -0.21630859375, 'text': 'This product contains one or more substances which are not listed on the Canadian Domestic Substances List (DSL). Contact your', 'origin': (39.0, 49.6328125), 'bbox': (39.0, 40.7216796875, 560.669921875, 51.7958984375)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (39.0, 40.7216796875, 560.669921875, 51.7958984375)}, {'spans': [{'size': 10

TypeError: Object of type bytes is not JSON serializable

### Tika

from: https://github.com/chrismattmann/tika-python

In [12]:
InputFile = 'data/gylcol-ether-dpm-sds.pdf'

In [23]:
import tika
from tika import parser
parsed = parser.from_file(InputFile)

In [24]:
# Returns keys applicable for given pdf. 
print(parsed.keys()) 

dict_keys(['metadata', 'content', 'status'])


In [25]:
print(parsed["metadata"])
print(parsed["content"])

{'Author': '', 'Content-Type': 'application/pdf', 'Creation-Date': '2016-02-26T16:35:00Z', 'DLI': '10.1.0.50', 'DLI_Copyright': 'Datalogics Interface (DLI) Copyright (C) 1998-2012 Datalogics, Inc. -- www.datalogics.com', 'Keywords': '', 'Last-Modified': '2016-02-26T16:35:00Z', 'Last-Save-Date': '2016-02-26T16:35:00Z', 'Platform': 'Macintosh', 'X-Parsed-By': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.pdf.PDFParser'], 'X-TIKA:content_handler': 'ToTextContentHandler', 'X-TIKA:embedded_depth': '0', 'X-TIKA:parse_time_millis': '366', 'access_permission:assemble_document': 'true', 'access_permission:can_modify': 'true', 'access_permission:can_print': 'true', 'access_permission:can_print_degraded': 'true', 'access_permission:extract_content': 'true', 'access_permission:extract_for_accessibility': 'true', 'access_permission:fill_in_form': 'true', 'access_permission:modify_annotations': 'true', 'cp:subject': '', 'created': '2016-02-26T16:35:00Z', 'creator': '', 'date': '20

In [18]:
import tika
# tika.TikaClientOnly = True

    # See: https://github.com/chrismattmann/tika-python/blob/master/tika/tika.py

from tika import runCommand

jsonOutput1 = runCommand('parse', 'all', InputFile)

    # See: https://github.com/chrismattmann/tika-python/blob/master/tika/tika.py

from tika import parse1

jsonOutput2 = parse1('all', InputFile)


### SDS Parser

See:

https://github.com/astepe/sds_parser

https://pypi.org/project/SDSParser/


In [22]:
from sdsparser import SDSParser

parser = SDSParser()
sds_data = parser.get_sds_data('data/23114.pdf')

sds_data

{'manufacturer': 'IDENTIFICATION  Company  Arkema Inc. 900 First Avenue King of Prussia, Pennsylv',
 'product_name': 'code: 051000   Version 2.0 Issued on: 07/11/2011 Page: 1 / 10     1. PRODUCT AND',
 'flash_point': 'The flashpoint of this produc',
 'specific_gravity': '(Relative density): 1.0428 (6',
 'nfpa_fire': '1',
 'nfpa_health': '3',
 'nfpa_reactivity': '1',
 'sara_311': 'Data not listed',
 'revision_date': 'Data not listed',
 'physical_state': 'United States – Federal Regul',
 'cas_number': '-No. Wt/Wt OSHA Hazardous Benz'}

In [23]:
request_keys = ['manufacturer', 'flash_point', 'specific_gravity', 'product_name', 'sara_311', 'nfpa_fire']

parser = SDSParser(request_keys=request_keys)
sds_data = parser.get_sds_data('data/23114.pdf')

sds_data

{'manufacturer': 'IDENTIFICATION  Company  Arkema Inc. 900 First Avenue King of Prussia, Pennsylv',
 'flash_point': 'The flashpoint of this produc',
 'specific_gravity': '(Relative density): 1.0428 (6',
 'product_name': 'code: 051000   Version 2.0 Issued on: 07/11/2011 Page: 1 / 10     1. PRODUCT AND',
 'sara_311': 'Data not listed',
 'nfpa_fire': '1'}

In [18]:
sdsparser.request_keys

['manufacturer',
 'product_name',
 'flash_point',
 'specific_gravity',
 'nfpa_fire',
 'nfpa_health',
 'nfpa_reactivity',
 'sara_311',
 'revision_date',
 'physical_state',
 'cas_number']

In [19]:
sdsparser.manufacturers

{'acros_organics',
 'alfa_aesar',
 'basf',
 'citrus_and_allied',
 'excellentia',
 'exxon_mobil',
 'firmenich',
 'fisher',
 'formosa_plastics',
 'frutarom',
 'givaudan',
 'iff',
 'indofine',
 'innophos',
 'kerry',
 'pepsico_inc',
 'pfizer',
 'reckitt_benckiser',
 'robertet',
 'sc_johnson',
 'sigma_aldrich',
 'symrise',
 'takasago',
 'the_clorox_company',
 'treatt',
 'ungerer'}

Not needed :

In [15]:
import sdsparser.configs

In [17]:
import sdsparser

SDSconf = sdsparser.Configs()

In [20]:
SDSconf.REQUEST_KEYS

['manufacturer',
 'product_name',
 'flash_point',
 'specific_gravity',
 'nfpa_fire',
 'nfpa_health',
 'nfpa_reactivity',
 'sara_311',
 'revision_date',
 'physical_state',
 'cas_number']

### PDFbox

from:

https://pdfbox.apache.org/

https://pypi.org/project/python-pdfbox/


In [1]:
InputFile = 'data/gylcol-ether-dpm-sds.pdf'

In [5]:
import pdfbox

p = pdfbox.PDFBox()

In [6]:
#text extraction, plain text:
p.extract_text(InputFile)   # writes text to /path/to/my_file.txt, i.e. strips the extention and replaces by .txt

In [9]:
#text extraction, in html format:
p.extract_text(InputFile, '-html')   # writes text to /path/to/my_file.txt, i.e. strips the extention and replaces by .html

In [26]:
#Doesn't work, doesn't know xml
#text extraction, in html format:
p.extract_text(InputFile, '-xml')   # writes text to /path/to/my_file.txt, i.e. strips the extention and replaces by .html

In [11]:
#Doesn't work on console. Probably needs some '% ...' instruction for jupyter notebook
p.extract_text(InputFile, '-html', '-console')   # writes text to /path/to/my_file.txt

In [7]:
# extracts the pages of the PDF file as .jpg images
p.pdf_to_images(InputFile)  # writes images to /path/to/my_file1.jpg, /path/to/my_file2.jpg, etc.

In [8]:
# extracts images embedded in the PDF file as .png files
p.extract_images(InputFile) # writes images to /path/to/my_file-1.png, /path/to/my_file-2.png, etc.

### Environment settings for Java

In [4]:
#To run java, its executable path needs to be in the $PATH environment variable

import os

print("Original PATH:", os.environ['PATH'])

# look for Java
print("\n", os.popen('whereis java').read())

Original PATH: /mnt/sda2/Users/Administrator/Documents/ICT/Miniconda3/envs/NLP/bin:/mnt/sda2/Users/Administrator/Documents/ICT/Miniconda3/bin:/mnt/sda2/Users/Administrator/Documents/ICT/Miniconda3/condabin:/usr/local/bin:/usr/bin:/usr/sbin:/bin:/sbin:/opt/qt5/bin

 java: /usr/bin/java /opt/jre1.8.0_25/bin/java

