In [1]:
import argparse
import json
from collections import Counter
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from pdf2image import convert_from_path
from PIL import Image
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from unstructured.partition.pdf import partition_pdf

In [2]:
# Shared Textractor client, created with the profile named "default".
# It is reused by the table-extraction loop further down (analyze_document).
extractor = Textractor(profile_name="default")

In [3]:
def extract_page_as_image(pdf_path, page_number, output_image_path, crop_coords=None):
    """Render one PDF page to a PNG file, optionally cropped to a region.

    Args:
        pdf_path: path of the PDF to render.
        page_number: page passed to ``convert_from_path`` as both
            ``first_page`` and ``last_page``.
        output_image_path: where the PNG is written.
        crop_coords: optional collection of points (indexable as
            ``point[0]`` / ``point[1]``). When truthy, the page is cropped to
            the axis-aligned bounding box of these points before saving.
    """
    rendered_pages = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page_image = rendered_pages[0]

    if crop_coords:
        # Bounding box of all points: (min x, min y, max x, max y).
        xs = [point[0] for point in crop_coords]
        ys = [point[1] for point in crop_coords]
        bounding_box = (min(xs), min(ys), max(xs), max(ys))
        page_image = page_image.crop(bounding_box)

    page_image.save(output_image_path, 'PNG')
    
def parse_command_line_arguments(argv=None):
    """Parse the ``--document_path`` and ``--save_path`` options.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, exactly as before.
            Passing an explicit list lets the function be called from a
            notebook kernel or a test, where ``sys.argv`` holds unrelated
            arguments.

    Returns:
        argparse.Namespace with ``document_path`` and ``save_path``
        attributes (each ``None`` when the option is not supplied).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--document_path', type=str, help='The path to the input PDF document.')
    parser.add_argument('--save_path', type=str, help='The path where to print the parsed json document.')

    return parser.parse_args(argv)

def prepare_counter(elements):
    """Count how many times each text string occurs among text-bearing elements.

    Elements of category ``Title``, ``Text``, ``NarrativeText`` and
    ``ListItem`` are counted. ``ListItem`` is included because consumers of
    this counter skip any element whose text has a count of zero; leaving list
    items uncounted made them disappear from the parsed output.

    Args:
        elements: iterable of objects exposing ``category`` and ``text``.

    Returns:
        collections.Counter mapping element text to its number of occurrences.
        Texts that were not counted yield 0 on lookup.
    """
    counted_categories = {"Title", "Text", "NarrativeText", "ListItem"}

    return Counter(el.text for el in elements if el.category in counted_categories)

def create_parsed_dictionary(elements, all_counter, max_occurrences=10):
    """Assemble the document text page by page.

    Args:
        elements: iterable of parsed elements exposing ``category`` and
            ``text``; ``Table`` elements must also expose
            ``metadata.text_as_html``.
        all_counter: mapping from element text to its occurrence count
            (indexed as ``all_counter[el.text]``).
        max_occurrences: a text is kept only when its count is greater than 0
            and strictly below this value. Defaults to 10, the threshold that
            was previously hard-coded.

    Returns:
        dict mapping ``"page_<n>"`` to the text collected for that page.
        Pages that end up with no text are omitted.
    """
    # Separator appended after each kept element of the given category.
    separators = {'ListItem': '\n', 'NarrativeText': ' ', 'Text': ' ', 'Title': '\n'}

    page_number = 1
    curr_page = 'page_1'
    # Collect fragments per page and join once at the end.
    page_parts = {curr_page: []}

    for el in elements:
        category = el.category

        if category == 'PageBreak':
            page_number += 1
            curr_page = f"page_{page_number}"
            page_parts[curr_page] = []

        elif category == 'Table':
            html = el.metadata.text_as_html
            if html is None:
                # No HTML rendering available for this table: fall back to the
                # plain text instead of failing on str + None.
                html = el.text or ''
            page_parts[curr_page].extend(('\n', html, '\n'))

        elif category in separators:
            # Titles on the first page are always kept, whatever their count.
            always_keep = category == 'Title' and page_number == 1
            if always_keep or 0 < all_counter[el.text] < max_occurrences:
                page_parts[curr_page].extend((el.text, separators[category]))

        # Any other category is ignored.

    joined_pages = ((page, ''.join(parts)) for page, parts in page_parts.items())

    return {page: text for page, text in joined_pages if text}


def save_parsed_document(parsed_doc, save_path):
    """Write ``parsed_doc`` to ``save_path`` as JSON (file opened in text mode 'w')."""
    with open(save_path, 'w') as out_file:
        out_file.write(json.dumps(parsed_doc))

In [4]:
# Input PDF, given as a path relative to the notebook's working directory.
filename = (
    'valvole-di-controllo-rotative-vee-ball-modello-v150-v200-e-v300-da-1-a-12-in-fisher-vee-ball-v150-v200-v300-rotary-control-valves-nps-1-through-12-italian-it-134892.pdf'
)

# Partition the PDF into elements, keeping page breaks and table structure,
# then count how often each text occurs.
elements = partition_pdf(
    filename=filename,
    infer_table_structure=True,
    include_page_breaks=True,
    languages=['ita', 'eng'],
)
all_counter = prepare_counter(elements)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Build the per-page text of the document. Instead of using the HTML that
# unstructured attaches to Table elements, each table region is cropped out of
# its PDF page as a PNG and re-analyzed with Textractor; every table found is
# embedded in the page text as markdown and also kept as a DataFrame.
parsed_doc = {'page_1': ''}
curr_page = 'page_1'
page_number = 1  # advanced only by PageBreak elements
tables = []      # one DataFrame per table returned by Textractor, in document order
n = 0            # tables collected so far; also names the crop image (output_{n}.png)
pdf_path = filename

# Separator appended after each kept element of the given category.
separators = {'ListItem': '\n', 'NarrativeText': ' ', 'Text': ' ', 'Title': '\n'}

for el in elements:

    if el.category == 'PageBreak':
        page_number += 1
        curr_page = f"page_{page_number}"
        parsed_doc[curr_page] = ''

    elif el.category == 'Table':
        parsed_doc[curr_page] += '\n'
        # Keep the table's own page in a separate name. Assigning it to
        # `page_number` would overwrite the PageBreak-driven counter, so a
        # later PageBreak could recompute an existing page key and reset that
        # page's text (or fail on `None += 1` when the metadata has no page).
        table_page = el.metadata.page_number or page_number
        coords = el.metadata.coordinates.points
        output_image_path = f'output_{n}.png'
        extract_page_as_image(pdf_path, table_page, output_image_path, coords)
        document = extractor.analyze_document(file_source=output_image_path, features=[TextractFeatures.TABLES])
        # One crop may yield zero, one or several tables.
        for textract_table in document.tables:
            table_df = textract_table.to_pandas()  # converted once, used twice
            table_format = table_df.to_markdown()
            tables.append(table_df)
            parsed_doc[curr_page] += table_format
            parsed_doc[curr_page] += '\n'
            n += 1

    elif el.category in separators:
        # Titles on the first page are always kept; everything else is kept
        # only when its text was counted and occurs fewer than 10 times.
        keep_always = el.category == 'Title' and page_number == 1
        if keep_always or 0 < all_counter[el.text] < 10:
            parsed_doc[curr_page] += el.text + separators[el.category]

# Drop pages that ended up with no text.
parsed_doc = {k: v for k, v in parsed_doc.items() if v}

In [6]:
# Report the index of every Table element within `elements`.
for index, el in enumerate(elements):
    if el.category != 'Table':
        continue
    print(f"Element at position {index} is a Table")

Element at position 6 is a Table
Element at position 55 is a Table
Element at position 73 is a Table
Element at position 140 is a Table
Element at position 283 is a Table
Element at position 289 is a Table
Element at position 290 is a Table
Element at position 317 is a Table
Element at position 397 is a Table
Element at position 455 is a Table
Element at position 457 is a Table
Element at position 527 is a Table
Element at position 530 is a Table
Element at position 749 is a Table
Element at position 767 is a Table
Element at position 786 is a Table
Element at position 790 is a Table
Element at position 802 is a Table
Element at position 924 is a Table
Element at position 979 is a Table
Element at position 981 is a Table
Element at position 1044 is a Table
Element at position 1045 is a Table


In [7]:
# Number of DataFrames collected from Textractor by the parsing loop.
# This can exceed the number of Table elements, since the loop appends every
# table found in a crop.
len(tables)

26

In [8]:
# Load unstructured's HTML rendering of the Table element at index 6 into a
# DataFrame for inspection.
html_content = elements[6].metadata.text_as_html
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')
# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated since pandas 2.1.
df = pd.read_html(StringIO(str(table)))[0]

df

Unnamed: 0,Introduzione .......,Unnamed: 1,6.,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,cee,cece,Unnamed: 9,eee,eee.1,1
0,Scopo del manuale..............,,,,,,,0.0.0,e eee,eee,ee,eee,1
1,Descrizione,,,,,,,,,,,,2
2,Specifiche ...,,,,,,,,,,,,2
3,Servizi educativi,,..........,,0...,c,,eee,,eee,eee,eee,2
4,Installazione 0...,2...,,cece,,,,,,eee,,,eee 3
5,Manutenzione ..............,,,,,,,,,,wee,,8
6,Manutenzione della,,baderna,,......,,,,,,,,...8
7,Sostituzione della,tenuta,,della,sfera,.,,,,,,,11
8,Smontaggio Montaggio Lubrificazione,della,................ ..........,tenuta,cece della,,,cece sfera,,eee,,. eee,11 14
9,rinforzata Manutenzione del,,00.2... cuscinetto,,eee della,,,sfera,,eee,,eee,


In [9]:
# First table collected from Textractor, shown for comparison with the
# unstructured HTML rendering of elements[6].
tables[0]

Unnamed: 0,0,1
0,Introduzione 1,
1,Scopo del manuale 1,
2,Descrizione 2,
3,Specifiche 2,
4,Servizi educativi,2
5,Installazione,3
6,Manutenzione,8
7,Manutenzione della baderna 8,
8,Sostituzione della tenuta della sfera 11 [ ],
9,Smontaggio 11 [ ],


In [10]:
# Load unstructured's HTML rendering of the Table element at index 55 into a
# DataFrame for inspection.
html_content = elements[55].metadata.text_as_html

soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')
# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated since pandas 2.1.
df = pd.read_html(StringIO(str(table)))[0]

df

Unnamed: 0_level_0,DESIGN DELLA VALVOLA,MATERIALE DEL CORPO VALVOLA Wwcc,POLLICI/ DN,ASME | PN
Unnamed: 0_level_1,MATERIALE DEL CORPO VALVOLA Wwcc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Vv150,"1, 1-1/2, 2, 3, 4, 6, 8, 10, 12, 14, 16, 20, 2...",CL150,
1,Vv150,Wcc | 1.0619(1) .,"DN 80, 100, 150",PN 10-16
2,Vv150,Wcc | 1.0619(1) .,"DN 200, 250, 300",PN 100PN16
3,Vv150,tcc,"1, 1-1/2, 2, 3, 4, 6, 8, 10, 12 pollici",CL150
4,Vv150,tcc,"DN 80, 100, 150",PN 10-16
5,Vv150,tcc,"DN 200, 250, 300",PN 100PN16 CL150 PN 10-16 PN 100PN16
6,Vv150,CF3M(4),"1, 1-1/2, 2, 3, 4, 6, 8, 10, 12 pollici",PN 100PN16 CL150 PN 10-16 PN 100PN16
7,Vv150,CF3M/1.4409(),"DN 80, 100, 150",PN 100PN16 CL150 PN 10-16 PN 100PN16
8,Vv150,CF3M/1.4409(),"DN 200, 250, 300",PN 100PN16 CL150 PN 10-16 PN 100PN16
9,Vv150,CW2M M35-2 CD3MN\3) CD3MWCuNG) CK3MCuN,"1, 1-1/2, 2, 3, 4, 6, 8, 10, 12 pollici",CL150


In [11]:
# Second table collected from Textractor, shown for comparison with the
# unstructured HTML rendering of elements[55].
tables[1]

Unnamed: 0,0,1,2,3
0,DESIGN DELLA VALVOLA,MATERIALE DEL CORPO VALVOLA,DIMENSIONE,VALORI NOMINALI
1,,,POLLICI I DN,ASME / PN
2,V150,WCC,"3, 4, 6, 8, 10, 12, 14, 16, 20, 24x20 pollici(5)",CL150
3,,WCC /1.0619(),"DN 80, 100, 150",PN 10-16
4,,,"DN 200,250,300",PN 10 o PN 16
5,,LCC,"1, 1-1/2, 2, 3, 4, 6, 8, 10, 12 pollici",CL150
6,,,"DN 80, 100, 150",PN 10-16
7,,,"DN 200, 250, 300",PN 10 o PN 16
8,,CF3M2,"1, 1-1/2, 2, 3, 4, 6, 8, 10, 12 pollici",CL L150
9,,CF3M/1.4409(1),"DN 80, 100, 150",PN 10-16


In [12]:
# Load unstructured's HTML rendering of the Table element at index 73 into a
# DataFrame for inspection.
html_content = elements[73].metadata.text_as_html
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')
# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated since pandas 2.1.
df = pd.read_html(StringIO(str(table)))[0]

df

Unnamed: 0_level_0,DIMENSIONE DELLA VALVOLA,DIMENSIONE DELLA VALVOLA,eso,eso,eso,eso,v200(1) Misura da faccia a faccia ANSI/ISA S75.08.02,v200(1) Misura da faccia a faccia ANSI/ISA S75.08.02,v300() Misura da faccia a faccia ANSI/ISA S75.08.02,v300() Misura da faccia a faccia ANSI/ISA S75.08.02
Unnamed: 0_level_1,DIMENSIONE DELLA VALVOLA,DIMENSIONE DELLA VALVOLA,Misura da faccia a faccia ANSI/ISA S75.08.02,Misura da faccia a faccia ANSI/ISA S75.08.02,Misura da faccia a faccia ASME B16.10 corta,Misura da faccia a faccia ASME B16.10 corta,v200(1) Misura da faccia a faccia ANSI/ISA S75.08.02,v200(1) Misura da faccia a faccia ANSI/ISA S75.08.02,v300() Misura da faccia a faccia ANSI/ISA S75.08.02,v300() Misura da faccia a faccia ANSI/ISA S75.08.02
Unnamed: 0_level_2,DN,Pollici,mm,in.,mm,in.,mm,in.,mm,in.
0,250 300,10 12,133 140,5.25 5.50,165 159,6.50 6.25,,cee,171 184,6.75 7.25


In [13]:
# Third table collected from Textractor, shown for comparison with the
# unstructured HTML rendering of elements[73].
tables[2]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,DIMENSIONE DELLA VALVOLA,,V150(2),,,,V200(¹),,V300(2),
1,,,Misura da faccia a faccia ANSI/ISA S75.08.02,,Misura da faccia a faccia ASME B16.10 corta,,Misura da faccia a faccia ANSI/ISA S75.08.02,,Misura da faccia a faccia ANSI/ISA S75.08.02,
2,DN,Pollici,mm,in.,mm,in.,mm,in.,mm,in.
3,25,1,70,2.75 3.25 3.75 3.75 4.25 4.50 4.75 5.25 5.50,95 127 146 133 146 152 171 165 159,3.75 5.00 5.75 5.25 5.75 6.00 6.75 6.50 6.25,121 140 165 197 216,4.75 5.50 6.50 7.75 8.50,89 102 95 121 127 140 152 171 184,3.50 4.00 3.75 4.75 5.00 5.50 6.00 6.75 7.25
4,40,1-1/2,83,,,,,,,
5,50,2,95,,,,,,,
6,80,3,95,,,,,,,
7,100 150,4,108,,,,,,,
8,200,6,114,,,,,,,
9,250,8 10,121 133,,,,,,,


In [14]:
# Load unstructured's HTML rendering of the Table element at index 140 into a
# DataFrame for inspection.
html_content = elements[140].metadata.text_as_html
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')
# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated since pandas 2.1.
df = pd.read_html(StringIO(str(table)))[0]

df

Unnamed: 0_level_0,"DIMENSIONE VALVOLA 200, POLLICI",A,A,B,M,M,M,M
Unnamed: 0_level_1,"DIMENSIONE VALVOLA 200, POLLICI",Standard ANSI/ISA $75.08.02(1),L150 ASME B16.10(2) corta (opzionale),B,Standard CL150 ANSI/ISA $75.08.02(1),CL150 ASME B16.10(2) corta (opzionale),CL300,cL600
0,mm,mm,mm,mm,mm,mm,mm,mm
1,1 1-1/2 2,102 114 124,127 165 178,58 64 57,176 189 21,202 240 268,202 224 237,202 224 237
2,3 4 6,165 194 229,203 229 267,87 92 119,254 286 343,286 321 381,279 305 362,286 343 423
3,8 10,243 297,292 330,119 151,343 419,394 451,387 _,426 a
4,in.,in.,in.,in.,in.,in.,in.,in.
5,1 1-1/2 2,4.00 4.50 4.88,5.00 6.50 7.00,2.29 2.50 2.25,6.94 7.44 8.31,7.94 9.44 10.56,7.94 8.81 9.31,7.94 8.81 9.31
6,3 4 6,6.50 7.62 9.00,8.00 9.00 10.50,3.44 3.62 4.69,10.00 11.25 13.50,11.25 12.62 15.00,11.00 12.00 14.25,11.25 13.50 16.25
7,8 10,9.56 11.69,11.50 13.00,4.69 5.94,13.50 16.50,15.50 17.75,15.25 a,16.75 os


In [15]:
# Fourth table collected from Textractor, shown for comparison with the
# unstructured HTML rendering of elements[140].
tables[3]

Unnamed: 0,0,1,2,3,4,5,6,7
0,"DIMENSIONE VALVOLA V200, POLLICI",DIMENSIONE,,,,,,
1,,A,,B,M,,,
2,,Standard ANSI/ISA S75.08.02(1),CL150 ASME B16.10² corta (opzionale),,Standard CL150 ANSI/ISA S75.08.02¹,CL150 ASME B16.10² corta (opzionale),CL300,CL600
3,mm,,,,,,,
4,1 1-1/2 2,102 114 124,127 165 178,58 64 57,176 189 211,202 240 268,202 224 237,202 224 237
5,3 4 6,165 194 229,203 229 267,87 92 119,254 286 343,286 321 381,279 305 362,286 343 423
6,8 10,243 297,292 330,119 151,343 419,394 451,387,426
7,in.,,,,,,,
8,1 1-1/2 2,4.00 4.50 4.88,5.00 6.50 7.00,2.29 2.50 2.25,6.94 7.44 8.31,7.94 9.44 10.56,7.94 8.81 9.31,7.94 8.81 9.31
9,3 4 6,6.50 7.62 9.00,8.00 9.00 10.50,3.44 3.62 4.69,10.00 11.25 13.50,11.25 12.62 15.00,11.00 12.00 14.25,11.25 13.50 16.25
