In [1]:
import argparse
import json
import os
from collections import Counter
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from pdf2image import convert_from_path
from PIL import Image
# NOTE: keep this import after the PIL one. Both bind the name `Image`; the
# display cells call `Image(filename=...)`, which needs IPython's class.
from IPython.display import Image, display
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from unstructured.partition.pdf import partition_pdf

In [2]:
def extract_page_as_image(pdf_path, page_number, output_image_path, crop_coords=None):
    """Render one page of a PDF and save it as a PNG, optionally cropped.

    Args:
        pdf_path: Path of the PDF, passed to ``convert_from_path``.
        page_number: Page to render; used as both ``first_page`` and
            ``last_page`` so a single page is requested.
        output_image_path: Destination path of the PNG file.
        crop_coords: Optional collection of points, each indexable as
            ``point[0]`` (x) and ``point[1]`` (y). When given and non-empty,
            the page is cropped to the bounding box of these points.
            # assumes the points are in the pixel space of the rendered page — TODO confirm
    """
    rendered_pages = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page_image = rendered_pages[0]

    if crop_coords:
        # Bounding box of all points: (left, top, right, bottom).
        xs = [pt[0] for pt in crop_coords]
        ys = [pt[1] for pt in crop_coords]
        page_image = page_image.crop((min(xs), min(ys), max(xs), max(ys)))

    page_image.save(output_image_path, 'PNG')

In [3]:
# Shared Textractor client, built with the profile named "default".
# It is used by the table-extraction loop further down (`extractor.analyze_document`).
extractor = Textractor(profile_name="default")

In [4]:
def parse_command_line_arguments(argv=None):
    """Parse the command-line options of the PDF parsing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse falls back to ``sys.argv[1:]``, which is the
            previous behaviour. Passing an explicit list lets the function be
            called from a notebook or a test, where ``sys.argv`` does not hold
            this script's options.

    Returns:
        argparse.Namespace with the attributes ``document_path`` and
        ``save_path``; each is ``None`` when the option is not supplied.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--document_path', type=str, help='The path to the input PDF document.')
    parser.add_argument('--save_path', type=str, help='The path where to print the parsed json document.')

    return parser.parse_args(argv)

def prepare_counter(elements):
    """Count how often each text occurs among selected element categories.

    Only elements whose ``category`` is "Title", "NarrativeText" or "Text"
    contribute; every other category is ignored.

    Args:
        elements: Iterable of objects exposing ``.category`` and ``.text``.

    Returns:
        collections.Counter mapping each text to its number of occurrences.
    """
    occurrences = Counter()
    # Titles first, then narrative text, then plain text: the counts do not
    # depend on this order, but the Counter's insertion order does.
    for wanted in ("Title", "NarrativeText", "Text"):
        occurrences.update(item.text for item in elements if item.category == wanted)

    return occurrences

def create_parsed_dictionary(elements, all_counter):
    """Assemble a ``{'page_N': text}`` dictionary from partitioned elements.

    Elements are processed in order. A "PageBreak" starts the next page. A
    "Table" always contributes its HTML surrounded by newlines. "ListItem",
    "NarrativeText", "Text" and "Title" elements contribute their text only
    when ``all_counter`` holds a count for that text that is non-zero and
    below 10; titles on page 1 are always kept. Other categories are ignored.

    Args:
        elements: Iterable of objects exposing ``.category`` and ``.text``;
            table elements also expose ``.metadata.text_as_html``.
        all_counter: Mapping from text to its occurrence count. A missing text
            must yield 0 (as ``collections.Counter`` does).

    Returns:
        dict mapping "page_<n>" to the accumulated text of that page. Pages
        whose text is empty are omitted.
    """
    # Texts seen this many times or more are dropped.
    # presumably this removes repeated headers/footers — verify with the author
    max_occurrences = 10

    # Separator appended after the text of each count-filtered category.
    separators = {
        'ListItem': '\n',
        'NarrativeText': ' ',
        'Text': ' ',
        'Title': '\n',
    }

    def _is_kept(text):
        count = all_counter[text]
        # A count of 0 means the text is absent from all_counter; skipped too.
        return count < max_occurrences and count != 0

    parsed_doc = {'page_1': ''}
    curr_page = 'page_1'
    page_number = 1

    for el in elements:
        category = el.category

        if category == 'PageBreak':
            page_number += 1
            curr_page = f"page_{page_number}"
            parsed_doc[curr_page] = ''

        elif category == 'Table':
            # text_as_html can be missing or None; fall back to the plain
            # text instead of failing on `str += None`.
            table_html = getattr(el.metadata, 'text_as_html', None)
            if table_html is None:
                table_html = el.text or ''
            parsed_doc[curr_page] += '\n' + table_html + '\n'

        elif category in separators:
            # Titles on the first page bypass the frequency filter.
            if (category == 'Title' and page_number == 1) or _is_kept(el.text):
                parsed_doc[curr_page] += el.text + separators[category]

    # Drop pages that ended up with no content.
    return {page: text for page, text in parsed_doc.items() if text}

In [5]:
def save_parsed_document(parsed_doc, save_path):
    """
    Write ``parsed_doc`` to ``save_path`` as JSON.

    The file is opened in text write mode, so any existing content at
    ``save_path`` is replaced.
    """
    with open(save_path, 'w') as out_file:
        out_file.write(json.dumps(parsed_doc))

In [6]:
# Partition the PDF into elements, keeping page-break markers and asking for
# table structure so Table elements carry an HTML rendering.
elements = partition_pdf(
    filename='DPC0FQ-JD14-M1001-KB022-00.pdf',
    infer_table_structure=True,
    include_page_breaks=True,
    languages=['ita', 'eng'],
)
# Occurrence count of each Title / NarrativeText / Text string in the document.
all_counter = prepare_counter(elements)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Path of the PDF being processed; the table-extraction loop below uses it as
# the source when rendering table pages to images.
filename='DPC0FQ-JD14-M1001-KB022-00.pdf'

In [8]:
# Accumulates the HTML string of every table extracted by the loop below.
tablas = []

In [9]:
# Build the per-page text dictionary. Text elements are filtered by how often
# their text occurs (all_counter); each table is re-extracted with Textract
# from a cropped image of its page.
parsed_doc = {'page_1': ''}
curr_page = 'page_1'
page_number = 1
n = 0
# Reset here as well, so re-running this cell does not append duplicates.
tablas = []

for el in elements:
    if el.category == 'PageBreak':
        page_number += 1
        curr_page = f"page_{page_number}"
        parsed_doc[curr_page] = ''

    elif el.category == 'ListItem':
        if all_counter[el.text] < 10 and all_counter[el.text] != 0:
            parsed_doc[curr_page] += el.text
            parsed_doc[curr_page] += '\n'

    elif el.category == 'NarrativeText':
        if all_counter[el.text] < 10 and all_counter[el.text] != 0:
            parsed_doc[curr_page] += el.text
            parsed_doc[curr_page] += ' '

    elif el.category == 'Table':
        parsed_doc[curr_page] += '\n'
        pdf_path = filename
        # Keep the table's page in its own variable: assigning it to
        # `page_number` would overwrite the counter driven by PageBreak
        # elements and could mislabel every following page.
        table_page_number = el.metadata.page_number
        # NOTE(review): assumes these points are in the same pixel space as
        # the image rendered by extract_page_as_image — confirm.
        coords = el.metadata.coordinates.points
        output_image_path = f'output_{n}.png'
        extract_page_as_image(pdf_path, table_page_number, output_image_path, coords)
        document = extractor.analyze_document(file_source=output_image_path, features=[TextractFeatures.TABLES])
        if document.tables:
            # Only the first table detected in the crop is used.
            table_html = document.tables[0].to_pandas().to_html()
        else:
            # No table detected in the crop: fall back to the element's own
            # HTML (or its text) instead of raising IndexError. Still appended
            # to `tablas`, so tablas[i] keeps matching output_{i}.png.
            table_html = el.metadata.text_as_html or el.text
        tablas.append(table_html)
        parsed_doc[curr_page] += table_html
        parsed_doc[curr_page] += '\n'
        n += 1

    elif el.category == 'Text':
        if all_counter[el.text] < 10 and all_counter[el.text] != 0:
            parsed_doc[curr_page] += el.text
            parsed_doc[curr_page] += ' '

    elif el.category == 'Title':
        # Titles on the first page are always kept.
        if page_number == 1:
            parsed_doc[curr_page] += el.text
            parsed_doc[curr_page] += '\n'
        elif all_counter[el.text] < 10 and all_counter[el.text] != 0:
            parsed_doc[curr_page] += el.text
            parsed_doc[curr_page] += '\n'

# Remove all empty entries.
parsed_doc = {k: v for k, v in parsed_doc.items() if v}

In [10]:
# `document` is the result of the last Textract call made by the loop above,
# so this shows the HTML of the last table that was processed.
document.tables[0].to_pandas().to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Danieli Engineering</td>\n      <td>Since 1964</td>\n      <td>Turnkey Plants and Systems Engineering</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Danieli Automation</td>\n      <td>Since 1969</td>\n      <td>Process Control Systems</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Danieli Centro Metallics</td>\n      <td>Since 1987</td>\n      <td>Ore Processing and Direct Reduction Plants</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Danieli Centro Met</td>\n      <td>Since 1914</td>\n      <td>Steelmaking Plants</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Danieli Davy Distington</td>\n      <td>Since 1951</td>\n      <td>Slab Casters</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Danieli Wean United</td>\n      <td

In [11]:
# Report the index within `elements` of every Table element.
for index, el in enumerate(elements):
    if el.category != 'Table':
        continue
    print(f"Element at position {index} is a Table")

Element at position 12 is a Table
Element at position 404 is a Table
Element at position 608 is a Table
Element at position 734 is a Table
Element at position 738 is a Table
Element at position 764 is a Table


In [12]:
# Number of table HTML strings collected by the extraction loop.
len(tablas)

6

In [13]:
# Textract-derived HTML of the first collected table.
tablas[0]

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>visione</td>\n      <td>Data</td>\n      <td>Motivo</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td></td>\n      <td>23 Aprile 2020</td>\n      <td>Primo in</td>\n    </tr>\n  </tbody>\n</table>'

In [15]:
# Textract-derived HTML of the second collected table.
tablas[1]

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Anomalia</td>\n      <td>Causa</td>\n      <td>Soluzione</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Rumorosità</td>\n      <td>Allentamento dei bulloni di ancoraggio del basamento alle fondazioni e di fissaggio dei diversi componenti sulla macchina.</td>\n      <td>Serrare i bulloni e le viti che lo richiedono.</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td></td>\n      <td>Condizione dei cuscinetti.</td>\n      <td>Controllare la lubrificazione.</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td></td>\n      <td></td>\n      <td>Controllare il gioco dei cuscinetti.</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td></td>\n      <td></td>\n      <td>Controllare l\'allineamento delle parti.</td>\n    </tr>\n    <tr>\n      <th>5</th>

In [14]:
# NOTE(review): this name is not defined anywhere in view, so the cell raises
# NameError (see the recorded output). Presumably a deliberate marker to halt
# "Run All" before the comparison cells below — confirm, or remove the cell.
xxxxxxxxxxxx

NameError: name 'xxxxxxxxxxxx' is not defined

In [None]:
# Show the cropped page image saved for table 0 by the extraction loop.
image_path = 'output_0.png'
display(Image(filename=image_path))

In [None]:
# HTML produced by the partitioner itself for the Table element at position 12,
# shown as a DataFrame for comparison with the Textract result in `tablas`.
html_content = elements[12].metadata.text_as_html

# Parse the HTML with BeautifulSoup and take its first <table> tag.
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')

# Convert the HTML table to a pandas DataFrame. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

df

In [None]:
# Textract-derived HTML of table 0, for comparison with the DataFrame above.
tablas[0]

In [None]:
# Show the cropped page image saved for table 1 by the extraction loop.
image_path = 'output_1.png'
display(Image(filename=image_path))

In [None]:
# HTML produced by the partitioner itself for the Table element at position 404,
# shown as a DataFrame for comparison with the Textract result in `tablas`.
html_content = elements[404].metadata.text_as_html

# Parse the HTML with BeautifulSoup and take its first <table> tag.
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')

# Convert the HTML table to a pandas DataFrame. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

df

In [None]:
# Textract-derived HTML of table 1, for comparison with the DataFrame above.
tablas[1]

In [None]:
# Show the cropped page image saved for table 2 by the extraction loop.
image_path = 'output_2.png'
display(Image(filename=image_path))

In [None]:
# HTML produced by the partitioner itself for the Table element at position 608,
# shown as a DataFrame for comparison with the Textract result in `tablas`.
html_content = elements[608].metadata.text_as_html

# Parse the HTML with BeautifulSoup and take its first <table> tag.
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')

# Convert the HTML table to a pandas DataFrame. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

df

In [None]:
# Textract-derived HTML of table 2, for comparison with the DataFrame above.
tablas[2]

In [None]:
# Show the cropped page image saved for table 3 by the extraction loop.
image_path = 'output_3.png'
display(Image(filename=image_path))

In [None]:
# HTML produced by the partitioner itself for the Table element at position 734,
# shown as a DataFrame for comparison with the Textract result in `tablas`.
html_content = elements[734].metadata.text_as_html

# Parse the HTML with BeautifulSoup and take its first <table> tag.
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')

# Convert the HTML table to a pandas DataFrame. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

df

In [None]:
# Textract-derived HTML of table 3, for comparison with the DataFrame above.
tablas[3]

In [None]:
# Show the cropped page image saved for table 4 by the extraction loop.
image_path = 'output_4.png'
display(Image(filename=image_path))

In [None]:
# HTML produced by the partitioner itself for the Table element at position 738,
# shown as a DataFrame for comparison with the Textract result in `tablas`.
html_content = elements[738].metadata.text_as_html

# Parse the HTML with BeautifulSoup and take its first <table> tag.
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')

# Convert the HTML table to a pandas DataFrame. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

df

In [None]:
# Textract-derived HTML of table 4, for comparison with the DataFrame above.
tablas[4]

In [None]:
# Show the cropped page image saved for table 5 by the extraction loop.
image_path = 'output_5.png'
display(Image(filename=image_path))

In [None]:
# HTML produced by the partitioner itself for the Table element at position 764,
# shown as a DataFrame for comparison with the Textract result in `tablas`.
html_content = elements[764].metadata.text_as_html

# Parse the HTML with BeautifulSoup and take its first <table> tag.
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')

# Convert the HTML table to a pandas DataFrame. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

df

In [None]:
# Textract-derived HTML of table 5, for comparison with the DataFrame above.
tablas[5]

In [None]:
# NOTE(review): `xxxx` is not defined anywhere in view, so running this cell
# raises NameError. Presumably a deliberate marker to halt "Run All" before
# the command-line driver below — confirm, or remove the cell.
xxxx

In [None]:
# Script-style driver: parse the CLI options, partition the PDF, build the
# per-page dictionary and write it out as JSON.
args = parse_command_line_arguments()
elements = partition_pdf(filename=args.document_path, infer_table_structure=True, include_page_breaks=True, languages=['ita', 'eng'])
all_counter = prepare_counter(elements=elements)

parsed_doc = create_parsed_dictionary(elements, all_counter)

# Honour --save_path when it is given (it used to be parsed and then ignored).
# Otherwise swap the document's extension for ".json". splitext is used rather
# than str.replace so that a name without a lowercase ".pdf" suffix (e.g.
# "x.PDF") can no longer yield the input path itself and overwrite the PDF.
save_path = args.save_path or (os.path.splitext(args.document_path)[0] + '.json')
save_parsed_document(parsed_doc, save_path)