In [1]:
!pip install huggingface_hub transformers timm

Collecting huggingface_hub
  Obtaining dependency information for huggingface_hub from https://files.pythonhosted.org/packages/7f/c4/adcbe9a696c135578cabcbdd7331332daad4d49b7c43688bc2d36b3a47d2/huggingface_hub-0.16.4-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.16.4-py3-none-any.whl.metadata (12 kB)
Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/21/02/ae8e595f45b6c8edee07913892b3b41f5f5f273962ad98851dc6a564bbb9/transformers-4.31.0-py3-none-any.whl.metadata
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting timm
  Obtaining dependency information for timm from https://files.pythonhosted.org/packages/29/90/94f5deb8d76e24a89813aef95e8809ca8fd7414490428480eda19b133d4a/timm-0.9.2-py3-none-any.whl.metadata
  Downloading timm-0.9.2-py3-none-any.whl.metad

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys

# Put the project's source directories on the import path (appended in this
# order, after all existing entries) so later cells can import from them.
sys.path.extend(
    [
        "../extraction_core/page_selection",
        "../extraction_core/extraction",
        "../extraction_core/",
    ]
)

In [4]:
import hvac
import os

# Vault client authenticated with the token found in the environment.
client = hvac.Client(
    url="https://vault.lab.sspcloud.fr",
    token=os.environ["VAULT_TOKEN"],
)

# The secret location is built as "<VAULT_MOUNT><VAULT_TOP_DIR>/s3_creds";
# everything before the first "/" is the mount point, the rest is the path.
secret = f'{os.environ["VAULT_MOUNT"]}{os.environ["VAULT_TOP_DIR"]}/s3_creds'
mount_point, secret_path = secret.split("/", 1)
secret_dict = client.secrets.kv.read_secret_version(
    path=secret_path,
    mount_point=mount_point,
)

# Export the stored key pair under the AWS_* variable names.
s3_creds = secret_dict["data"]["data"]
os.environ["AWS_ACCESS_KEY_ID"] = s3_creds["ACCESS_KEY"]
os.environ["AWS_SECRET_ACCESS_KEY"] = s3_creds["SECRET_KEY"]

# Drop any session token already present in the environment; a missing
# variable is not an error.
os.environ.pop("AWS_SESSION_TOKEN", None)

In [5]:
# `fs` is the filesystem object exposed by the project's extraction.data
# module (presumably an S3-compatible fsspec filesystem -- confirm there).
from extraction.data import fs

# List the objects stored under the remote "bmp" prefix.
fs.ls("projet-extraction-tableaux/orange/bmp")

['projet-extraction-tableaux/orange/bmp/306769688.bmp',
 'projet-extraction-tableaux/orange/bmp/307299248.bmp',
 'projet-extraction-tableaux/orange/bmp/345039416.bmp',
 'projet-extraction-tableaux/orange/bmp/379984891.bmp',
 'projet-extraction-tableaux/orange/bmp/380129866.bmp',
 'projet-extraction-tableaux/orange/bmp/384518114.bmp',
 'projet-extraction-tableaux/orange/bmp/409759156.bmp',
 'projet-extraction-tableaux/orange/bmp/430107359.bmp',
 'projet-extraction-tableaux/orange/bmp/432668432.bmp',
 'projet-extraction-tableaux/orange/bmp/440419240.bmp',
 'projet-extraction-tableaux/orange/bmp/500413505.bmp',
 'projet-extraction-tableaux/orange/bmp/500440813.bmp',
 'projet-extraction-tableaux/orange/bmp/501614572.bmp']

In [6]:
from pathlib import Path

# Fetch every object listed under the remote "bmp" prefix into
# ../data/orange/, naming each local copy "<stem>.bmp".
for remote_path in fs.ls("projet-extraction-tableaux/orange/bmp"):
    local_path = "../data/orange/{}.bmp".format(Path(remote_path).stem)
    fs.get(remote_path, local_path)

In [7]:
# Print the extension-less file name of every object under the remote prefix,
# one per line.
remote_stems = (
    Path(remote_path).stem
    for remote_path in fs.ls("projet-extraction-tableaux/orange/bmp")
)
for stem in remote_stems:
    print(stem)

306769688
307299248
345039416
379984891
380129866
384518114
409759156
430107359
432668432
440419240
500413505
500440813
501614572


In [8]:
from extraction.utils import get_root_path

In [22]:
from pathlib import Path
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import DetrFeatureExtractor, DetrImageProcessor
from transformers import TableTransformerForObjectDetection
import torch
import pytesseract
from pytesseract import Output
from extraction.table_transformer_utils import *
from io import StringIO
import pandas as pd
from extraction.utils import get_root_path

In [None]:
# Create the output folder (and any missing parents); a no-op if it already
# exists.
Path('../data/orange/tabletrans_output').mkdir(parents=True, exist_ok=True)

In [56]:
# Table extraction pipeline: for every selected image, detect tables, crop
# each one with some padding, recognise its structure, OCR the crop with
# Tesseract and store the reconstructed table as a DataFrame in `output`
# under the key "<siren>_<table index>".

# Identifiers of the images to process; each one is looked up as
# "<data_dir>/<id>.bmp". Uncomment entries to widen the run.
TEST_DATA = [
    #"306769688",
    "345039416",
    #"380129866",
    #"384518114",
    #"409759156",
    #"432668432",
    #"440419240",
    #"500413505",
    #"307299248",
    #"379984891",
    #"430107359",
    #"500440813",
    #"501614572",
]
data_dir = "../data/orange"
test_images = [
    Path(data_dir).joinpath(path + ".bmp") for path in TEST_DATA
]

feature_extractor = DetrImageProcessor()
detection_model = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-transformer-detection"
)
structure_model = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-transformer-structure-recognition"
)

# Minimum scores passed to post_process_object_detection for each stage.
DETECTION_THRESHOLD = 0.7
STRUCTURE_THRESHOLD = 0.6

# Paddings (in pixels) added around each detected table before cropping
left_padding = 20
top_padding = 50
right_padding = 20
bottom_padding = 50


def _ocr_tokens(table_image):
    """Run Tesseract on `table_image` and return one token dict per OCR row.

    Each token holds its bounding box as [xmin, ymin, xmax, ymax] in the
    coordinates of `table_image`, its text, and the ordering fields read by
    the structure post-processing. No reading order is available here, so
    `span_num` is simply the position in Tesseract's output and
    `line_num` / `block_num` are 0.

    NOTE(review): every row returned by image_to_data is kept, including the
    rows whose text is empty (visible in the recorded `check[0]` output).
    Consider keeping only non-empty word rows if they disturb cell assignment.
    """
    ocr = pytesseract.image_to_data(table_image, output_type=Output.DICT)
    rows = zip(
        ocr["left"], ocr["top"], ocr["width"], ocr["height"], ocr["text"]
    )
    return [
        {
            "bbox": [left, top, left + box_width, top + box_height],
            "text": text,
            "span_num": idx,
            "line_num": 0,
            "block_num": 0,
        }
        for idx, (left, top, box_width, box_height, text) in enumerate(rows)
    ]


# Results: `output` maps "<siren>_<table_idx>" to a DataFrame; `check` keeps
# the OCR tokens of every processed table for manual inspection.
output = {}
check = []

for siren, image_path in zip(TEST_DATA, test_images):
    print(f"--- Siren {siren} ---")
    image = Image.open(image_path)
    width, height = image.size

    # Table detection on the full page
    detection_encoding = feature_extractor(image, return_tensors="pt")
    with torch.no_grad():
        detection_outputs = detection_model(**detection_encoding)
    detection_results = feature_extractor.post_process_object_detection(
        detection_outputs,
        threshold=DETECTION_THRESHOLD,
        target_sizes=[(height, width)],
    )[0]
    table_boxes = detection_results["boxes"].tolist()

    for table_idx, table_box in enumerate(table_boxes):
        print(f"--- Table {table_idx} ---")
        xmin, ymin, xmax, ymax = table_box
        # Cropped image (only detected table). The left edge uses
        # left_padding; it previously used right_padding by mistake.
        resized_image = image.crop(
            (
                xmin - left_padding,
                ymin - top_padding,
                xmax + right_padding,
                ymax + bottom_padding,
            )
        )

        # Structure recognition on the cropped table
        encoding = feature_extractor(resized_image, return_tensors="pt")
        with torch.no_grad():
            outputs = structure_model(**encoding)

        # PIL sizes are (width, height); the post-processing wants the reverse.
        target_sizes = [resized_image.size[::-1]]
        results = feature_extractor.post_process_object_detection(
            outputs, threshold=STRUCTURE_THRESHOLD, target_sizes=target_sizes
        )[0]

        # OCR tokens of the crop (built in a helper so the table box
        # coordinates above are not overwritten).
        tokens = _ocr_tokens(resized_image)

        # Post-process detected objects, assign class labels
        objects = results_to_objects(
            results, resized_image.size, str_class_idx2name
        )

        check.append(tokens)

        # Further process the detected objects so they correspond to a consistent table
        tables_structure = objects_to_structures(
            objects, tokens, structure_class_thresholds
        )

        # Enumerate all table cells: grid cells and spanning cells
        table_cells = [
            structure_to_cells(structure, tokens)[0]
            for structure in tables_structure
        ]

        table_csvs = [cells_to_csv(cells) for cells in table_cells]

        # Nothing to store when no table structure came out of the crop;
        # indexing table_csvs[0] would raise IndexError.
        if not table_csvs:
            print(f"No table structure found for {siren}_{table_idx}, skipped")
            continue

        # Only the first recognised structure of the crop is kept.
        df = pd.read_csv(StringIO(table_csvs[0]), sep=",")

        output[f'{siren}_{table_idx}'] = df

--- Siren 345039416 ---
--- Table 0 ---
--- Table 1 ---


In [57]:
# Debug aid: display the OCR tokens collected for the first table processed
# by the extraction loop.
check[0]

[{'bbox': [0, 0, 2692, 533],
  'text': '',
  'span_num': 0,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [13, 17, 372, 369],
  'text': '',
  'span_num': 1,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [16, 17, 372, 80],
  'text': '',
  'span_num': 2,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [16, 17, 372, 45],
  'text': '',
  'span_num': 3,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [16, 19, 57, 45],
  'text': 'Log',
  'span_num': 4,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [68, 19, 87, 39],
  'text': 'In',
  'span_num': 5,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [98, 17, 247, 39],
  'text': 'Consultants',
  'span_num': 6,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [256, 19, 372, 45],
  'text': 'Germany',
  'span_num': 7,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [16, 58, 172, 80],
  'text': '',
  'span_num': 8,
  'line_num': 0,
  'block_num': 0},
 {'bbox': [16, 58, 132, 80],
  'text': 'Basefarm',
  'span_num': 9,
  'line_num': 0,
  'block_num': 0},
 {'bbox':

In [43]:
# Shallow copy of the results dict, so `result` keeps its entries when
# `output` is rebuilt by re-running the extraction cell.
# NOTE(review): this cell's execution count (43) is lower than that of the
# extraction cell (56), so `result` may hold tables from an earlier run --
# re-run in order before trusting the cells below.
result = output.copy()

In [58]:
# Display the first table (index 0) stored for identifier 345039416.
result['345039416_0']

Unnamed: 0,Basefarm AS,344 369 NOK,635 057 571 NOK,305 118 784,305 118 784.1,[oO,Unnamed: 6,864 128 535 NOK,36 158 453 NOK
0,Orange Cyberdéfense,54 727 929,13 445 521,63 948 662,63 948 662,,,368 998 871,724 428
1,SDH Holdo Limited,266 586 GBP,-11 445 544 GBP,104 577 118,104 577 118,,,,8 955 615 GBP
2,SL Bidco B.V,245 492 560,23 710 246,406 584 698,406 584 698,ooo0oo0o0c0.0,ooooooo0o;9o,35 700 783,-2 053 751
3,Business et Decision SA,551 808,25 120 403,61 063 439,61 063 439,,,22177021,1665 575
4,Enovacom,,,35 210 000,35 210 000,,,,
5,SCI,1829,,1814,1814,,,323 953,238 691
6,Telefact,2 804 748,960 672,1953 769,1953 769,oo,,3 984 158,675 961
7,"Participations (10 a 50% du capital,","Participations (10 a 50% du capital,","Participations (10 a 50% du capital,","Participations (10 a 50% du capital,","Participations (10 a 50% du capital,","Participations (10 a 50% du capital,","Participations (10 a 50% du capital,","Participations (10 a 50% du capital,","Participations (10 a 50% du capital,"
8,CNTP Docapost BPO,,,1520 474,1520 474,,,,


In [59]:
# Display the first table (index 0) stored for identifier 384518114.
# NOTE(review): "384518114" is commented out in TEST_DATA, so this key only
# exists if `result` was filled by an earlier run; on a fresh
# Restart & Run All this lookup presumably raises KeyError -- confirm.
result['384518114_0']

Unnamed: 0.1,Unnamed: 0,Copttal SoclaleRéserves et report’ nouveau avant affectation dew résultats,Quote-Part du capital détenue (en pourcentage),Chittre | affaires HT du damier exercice écouls,Résultat (bindfice ou porte du demier exercics clos)
0,A. Renseignements détalllés concernant les fil...,A. Renseignements détalllés concernant les fil...,A. Renseignements détalllés concernant les fil...,A. Renseignements détalllés concernant les fil...,A. Renseignements détalllés concernant les fil...
1,1. Fillates,1. Fillates,1. Fillates,1. Fillates,1. Fillates
2,(+ 50% du capital détenu par la société),,,,
3,"BD Holding France (153 rue de Courcelles, Pari...",,,,
4,"BO LIFESCIENCES France (153rue de Courcelles, ...",,,,
5,"BDU (153 re de Courcelles, Paris) Siren : 381 ...",,,,
6,"BD France (153 rue de Courcelles, Paris) Siren...",,,,
7,"EOLAS (8 rue Voitaire, Grenoble) Siren : 382 1...",,,,
8,"SCI MANGIN (Rue du général Mangin, Grenoble) S...",,,,
9,"METAPHORA (153 rue de Courcelles, Paris) Siren...",,,,
