In [15]:
!pip install huggingface_hub transformers timm

Collecting timm
  Obtaining dependency information for timm from https://files.pythonhosted.org/packages/29/90/94f5deb8d76e24a89813aef95e8809ca8fd7414490428480eda19b133d4a/timm-0.9.2-py3-none-any.whl.metadata
  Downloading timm-0.9.2-py3-none-any.whl.metadata (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.5/68.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Downloading timm-0.9.2-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: timm
Successfully installed timm-0.9.2


In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import sys

# Put the project's source folders (relative to the notebook's working
# directory) on the import path, in the same order as before.
sys.path.extend(
    [
        "../extraction_core/page_selection",
        "../extraction_core/extraction",
        "../extraction_core/",
    ]
)

In [8]:
import hvac
import os

# Open a Vault session using the token provided through the environment.
client = hvac.Client(
    url="https://vault.lab.sspcloud.fr", token=os.environ["VAULT_TOKEN"]
)

# The secret location is "<VAULT_MOUNT><VAULT_TOP_DIR>/s3_creds"; everything
# before the first "/" is the KV mount point, the remainder is the path.
secret = f'{os.environ["VAULT_MOUNT"]}{os.environ["VAULT_TOP_DIR"]}/s3_creds'
mount_point, secret_path = secret.split("/", 1)
secret_dict = client.secrets.kv.read_secret_version(
    path=secret_path, mount_point=mount_point
)

# Export the stored key pair under the standard AWS variable names.
s3_creds = secret_dict["data"]["data"]
os.environ["AWS_ACCESS_KEY_ID"] = s3_creds["ACCESS_KEY"]
os.environ["AWS_SECRET_ACCESS_KEY"] = s3_creds["SECRET_KEY"]

# Remove any session token left in the environment (no error if absent).
# NOTE(review): presumably so a stale temporary token is not combined with
# the key pair above - confirm.
os.environ.pop("AWS_SESSION_TOKEN", None)

In [4]:
from extraction.data import fs

# Display the entries stored under the project's "orange/bmp" prefix.
# NOTE(review): `fs` is presumably an fsspec-style filesystem object
# configured by the project for its S3 storage - confirm in extraction.data.
fs.ls("projet-extraction-tableaux/orange/bmp")

['projet-extraction-tableaux/orange/bmp/306769688.bmp',
 'projet-extraction-tableaux/orange/bmp/307299248.bmp',
 'projet-extraction-tableaux/orange/bmp/345039416.bmp',
 'projet-extraction-tableaux/orange/bmp/379984891.bmp',
 'projet-extraction-tableaux/orange/bmp/380129866.bmp',
 'projet-extraction-tableaux/orange/bmp/384518114.bmp',
 'projet-extraction-tableaux/orange/bmp/409759156.bmp',
 'projet-extraction-tableaux/orange/bmp/430107359.bmp',
 'projet-extraction-tableaux/orange/bmp/432668432.bmp',
 'projet-extraction-tableaux/orange/bmp/440419240.bmp',
 'projet-extraction-tableaux/orange/bmp/500413505.bmp',
 'projet-extraction-tableaux/orange/bmp/500440813.bmp',
 'projet-extraction-tableaux/orange/bmp/501614572.bmp']

In [5]:
from pathlib import Path

# Fetch every listed remote file into ../data/orange/, naming the local copy
# "<stem>.bmp" after the remote file's stem.
remote_bmp_paths = fs.ls("projet-extraction-tableaux/orange/bmp")
for path in remote_bmp_paths:
    file_name = Path(path).stem
    local_target = f"../data/orange/{file_name}.bmp"
    fs.get(path, local_target)

In [6]:
# Print the stem of each remote file; these stems are the identifiers used
# as Siren numbers in the extraction cell.
remote_listing = fs.ls("projet-extraction-tableaux/orange/bmp")
for path in remote_listing:
    file_name = Path(path).stem
    print(file_name)

306769688
307299248
345039416
379984891
380129866
384518114
409759156
430107359
432668432
440419240
500413505
500440813
501614572


In [4]:
from extraction.utils import get_root_path

In [5]:
from pathlib import Path
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import DetrFeatureExtractor
from transformers import TableTransformerForObjectDetection
import torch
import pytesseract
from pytesseract import Output
from extraction.table_transformer_utils import *
from io import StringIO
import pandas as pd
from extraction.utils import get_root_path

In [12]:
# Identifiers (file stems) of the pages to process; commented-out entries
# are skipped.
TEST_DATA = [
    "306769688",
    #"307299248",
    "345039416",
    #"379984891",
    "380129866",
    "384518114",
    "409759156",
    #"430107359",
    "432668432",
    "440419240",
    "500413505",
    #"500440813",
    #"501614572",
]
data_dir = "../data/orange"
test_images = [
    Path(data_dir).joinpath(path + ".bmp") for path in TEST_DATA
]

# One feature extractor is shared by both models: the first model locates
# tables on the page, the second recognises the structure inside a table crop.
feature_extractor = DetrFeatureExtractor()
detection_model = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-transformer-detection"
)
structure_model = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-transformer-structure-recognition"
)

# Paddings (in pixels) added around each detected table before cropping
left_padding = 20
top_padding = 50
right_padding = 20
bottom_padding = 50

# CSV files are written to <root>/output; create the folder up front so that
# saving does not fail on a fresh checkout.
output_dir = Path(get_root_path()) / "output"
output_dir.mkdir(parents=True, exist_ok=True)


def _ocr_tokens(table_image):
    """Run Tesseract on ``table_image`` and return its boxes as tokens.

    Each token is a dict with:
      - "bbox": [xmin, ymin, xmax, ymax] in the coordinates of ``table_image``
      - "text": the text Tesseract reported for the box
      - "span_num": the position of the box in Tesseract's output, used as
        the reading order since no other order is available
      - "line_num" / "block_num": always 0

    Every box returned by Tesseract is kept, whatever its text.
    NOTE(review): boxes with empty text are kept too - check whether they
    should be filtered out before the structure post-processing.
    """
    ocr = pytesseract.image_to_data(table_image, output_type=Output.DICT)
    tokens = []
    for idx in range(len(ocr["level"])):
        left, top = ocr["left"][idx], ocr["top"][idx]
        tokens.append(
            {
                "bbox": [
                    left,
                    top,
                    left + ocr["width"][idx],
                    top + ocr["height"][idx],
                ],
                "text": ocr["text"][idx],
                "span_num": idx,
                "line_num": 0,
                "block_num": 0,
            }
        )
    return tokens


for siren, image_path in zip(TEST_DATA, test_images):
    print(f"--- Siren {siren} ---")
    image = Image.open(image_path)
    width, height = image.size

    # Table detection on the full page
    detection_encoding = feature_extractor(image, return_tensors="pt")
    with torch.no_grad():
        detection_outputs = detection_model(**detection_encoding)
    detection_results = feature_extractor.post_process_object_detection(
        detection_outputs, threshold=0.7, target_sizes=[(height, width)]
    )[0]
    table_boxes = detection_results["boxes"].tolist()

    for table_idx, table_box in enumerate(table_boxes):
        print(f"--- Table {table_idx} ---")
        xmin, ymin, xmax, ymax = table_box
        # Cropped image (only the detected table plus padding). Each side
        # uses its own padding, and the box is clamped to the page so the
        # crop never extends past the image borders.
        resized_image = image.crop(
            (
                max(0, xmin - left_padding),
                max(0, ymin - top_padding),
                min(width, xmax + right_padding),
                min(height, ymax + bottom_padding),
            )
        )

        # Structure recognition on the cropped table
        encoding = feature_extractor(resized_image, return_tensors="pt")
        with torch.no_grad():
            outputs = structure_model(**encoding)

        # PIL sizes are (width, height); the post-processing is given the
        # reversed tuple, as for the detection step above.
        target_sizes = [resized_image.size[::-1]]
        results = feature_extractor.post_process_object_detection(
            outputs, threshold=0.6, target_sizes=target_sizes
        )[0]

        # OCR tokens of the crop, in Tesseract's output order
        tokens = _ocr_tokens(resized_image)

        # Post-process detected objects, assign class labels
        objects = results_to_objects(
            results, resized_image.size, str_class_idx2name
        )

        # Further process the detected objects so they correspond to a
        # consistent table
        tables_structure = objects_to_structures(
            objects, tokens, structure_class_thresholds
        )

        # Enumerate all table cells: grid cells and spanning cells
        table_cells = [
            structure_to_cells(structure, tokens)[0]
            for structure in tables_structure
        ]

        # Convert cells to CSV, including flattening multi-row column
        # headers to a single row
        table_csvs = [cells_to_csv(cells) for cells in table_cells]

        # Nothing usable was recognised in this crop: skip it instead of
        # failing on table_csvs[0] / on parsing an empty CSV.
        if not table_csvs or not table_csvs[0]:
            print(f"No table structure recognised for table {table_idx}")
            continue

        # Only the first recognised structure of the crop is exported.
        io = StringIO(table_csvs[0])
        df = pd.read_csv(io, sep=",")
        save_path = output_dir / f"tt_{siren}_table_{table_idx}.csv"
        df.to_csv(save_path, sep=";", index=False)



--- Siren 306769688 ---
--- Table 0 ---
--- Siren 307299248 ---
--- Siren 345039416 ---
--- Table 0 ---
--- Table 1 ---
--- Siren 380129866 ---
--- Table 0 ---
--- Siren 384518114 ---
--- Table 0 ---
--- Siren 409759156 ---
--- Table 0 ---
--- Siren 430107359 ---
--- Siren 432668432 ---
--- Table 0 ---
--- Siren 440419240 ---
--- Table 0 ---
--- Siren 500413505 ---
--- Table 0 ---
--- Siren 500440813 ---
--- Siren 501614572 ---
