# Import des librairies

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import lxml
from dotenv import load_dotenv
import os
import cv2
import pytesseract
import easyocr
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
blob_keys = os.getenv("AZURE_BLOB_KEYS")

# Fonction de requête  d'un url

In [3]:
def request_get(url):
  """
  Performs a GET request to the given URL and displays the response information.

  Args:
    str: url of the request
  """
  try:
    response = requests.get(url)
    response.raise_for_status()
    return response.text
  except requests.exceptions.RequestException as e:
    return f"Erreur lors de la requête: {e}"
  except ValueError as e:
    return f"Erreur lors du décodage JSON {e}"

# Fonction de récupération de la liste pour un url

In [4]:
def get_list_from_date(year, blob_keys):
    url = f"https://projetocrstorageacc.blob.core.windows.net/invoices-{year}?restype=container&comp=list{blob_keys}"
    content = request_get(url)
    tree = BeautifulSoup(content, features="xml")
    liste = [name.get_text() for name in tree.find_all("Name")]
    return liste
    

In [249]:
print(get_list_from_date("2018", blob_keys))

['FAC_2018_0001-654.png', 'FAC_2018_0002-114.png', 'FAC_2018_0003-025.png', 'FAC_2018_0004-759.png', 'FAC_2018_0005-281.png', 'FAC_2018_0006-250.png', 'FAC_2018_0007-228.png', 'FAC_2018_0008-142.png', 'FAC_2018_0009-754.png', 'FAC_2018_0010-104.png', 'FAC_2018_0011-692.png', 'FAC_2018_0012-758.png', 'FAC_2018_0013-913.png', 'FAC_2018_0014-558.png', 'FAC_2018_0015-089.png', 'FAC_2018_0016-604.png', 'FAC_2018_0017-432.png', 'FAC_2018_0018-032.png', 'FAC_2018_0019-030.png', 'FAC_2018_0020-095.png', 'FAC_2018_0021-223.png', 'FAC_2018_0022-238.png', 'FAC_2018_0023-517.png', 'FAC_2018_0024-616.png', 'FAC_2018_0025-027.png', 'FAC_2018_0026-574.png', 'FAC_2018_0027-203.png', 'FAC_2018_0028-733.png', 'FAC_2018_0029-665.png', 'FAC_2018_0030-718.png', 'FAC_2018_0031-558.png', 'FAC_2018_0032-429.png', 'FAC_2018_0033-225.png', 'FAC_2018_0034-459.png', 'FAC_2018_0035-603.png', 'FAC_2018_0036-284.png', 'FAC_2018_0037-828.png', 'FAC_2018_0038-890.png', 'FAC_2018_0039-006.png', 'FAC_2018_0040-777.png',

In [5]:
all_files = []
for year in range(2018, 2026):
    year_files = get_list_from_date(year, blob_keys)
    all_files.extend(year_files)
print(all_files)

['FAC_2018_0001-654.png', 'FAC_2018_0002-114.png', 'FAC_2018_0003-025.png', 'FAC_2018_0004-759.png', 'FAC_2018_0005-281.png', 'FAC_2018_0006-250.png', 'FAC_2018_0007-228.png', 'FAC_2018_0008-142.png', 'FAC_2018_0009-754.png', 'FAC_2018_0010-104.png', 'FAC_2018_0011-692.png', 'FAC_2018_0012-758.png', 'FAC_2018_0013-913.png', 'FAC_2018_0014-558.png', 'FAC_2018_0015-089.png', 'FAC_2018_0016-604.png', 'FAC_2018_0017-432.png', 'FAC_2018_0018-032.png', 'FAC_2018_0019-030.png', 'FAC_2018_0020-095.png', 'FAC_2018_0021-223.png', 'FAC_2018_0022-238.png', 'FAC_2018_0023-517.png', 'FAC_2018_0024-616.png', 'FAC_2018_0025-027.png', 'FAC_2018_0026-574.png', 'FAC_2018_0027-203.png', 'FAC_2018_0028-733.png', 'FAC_2018_0029-665.png', 'FAC_2018_0030-718.png', 'FAC_2018_0031-558.png', 'FAC_2018_0032-429.png', 'FAC_2018_0033-225.png', 'FAC_2018_0034-459.png', 'FAC_2018_0035-603.png', 'FAC_2018_0036-284.png', 'FAC_2018_0037-828.png', 'FAC_2018_0038-890.png', 'FAC_2018_0039-006.png', 'FAC_2018_0040-777.png',

In [206]:
len(all_files)

5123

# Téléchargement des fichiers

In [207]:
def download_file_requests_os(url, filename):
    """
    Télécharge un fichier à partir d'une URL en utilisant requests et os, et crée les dossiers nécessaires.

    Args:
        url: L'URL du fichier à télécharger.
        filename: Le nom du fichier sous lequel enregistrer le fichier.
    """
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        directory = os.path.dirname(filename)
        if directory and not os.path.exists(directory):
            os.makedirs(directory, exist_ok=True)
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors du téléchargement du fichier (méthode 3) : {e}")
    except IOError as e:
        print(f"Erreur lors de l'enregistrement du fichier (méthode 3): {e}")
    except Exception as e:
      print(f"Erreur (méthode 3): {e}")

In [None]:
# count = 0
# for file in all_files:
#     year = file.split("_")[1]
#     folder_path = f"../data/files/{year}"
#     os.makedirs(folder_path, exist_ok=True)
#     url = f"https://projetocrstorageacc.blob.core.windows.net/invoices-{year}/{file}?{blob_keys}"
#     download_file_requests_os(url, f"{folder_path}/{file}")
#     count += 1
#     if count % 100 == 0:
#         print(count)

# Test d'ocr

## Avec Tesseract

### Extraction du texte et affichage des blocs de texte

In [6]:
file = all_files[20]
chemin = f"../data/files/{file.split("_")[1]}/{file}"
image_file = cv2.imread(chemin)
text = pytesseract.image_to_data(image_file)
print(text)

level	page_num	block_num	par_num	line_num	word_num	left	top	width	height	conf	text
1	1	0	0	0	0	0	0	850	1100	-1	
2	1	1	0	0	0	22	23	274	44	-1	
3	1	1	1	0	0	22	23	274	44	-1	
4	1	1	1	1	0	22	23	274	18	-1	
5	1	1	1	1	1	22	23	92	17	93.216675	INVOICE
5	1	1	1	1	2	124	23	172	18	88.242401	FAC/2018/0021
4	1	1	1	2	0	22	53	197	14	-1	
5	1	1	1	2	1	22	53	40	14	94.697586	Issue
5	1	1	1	2	2	70	53	35	14	96.736229	date
5	1	1	1	2	3	113	53	106	14	95.864395	2018-12-14
2	1	2	0	0	0	20	83	174	61	-1	
3	1	2	1	0	0	20	83	174	61	-1	
4	1	2	1	1	0	22	83	172	14	-1	
5	1	2	1	1	1	22	83	22	14	93.648109	Bill
5	1	2	1	1	2	52	84	16	13	93.648109	to
5	1	2	1	1	3	77	83	47	14	96.147522	Mario
5	1	2	1	1	4	132	83	62	14	92.557602	Benson
4	1	2	1	2	0	21	102	160	12	-1	
5	1	2	1	2	1	21	102	30	10	65.787224	Email
5	1	2	1	2	2	57	102	124	12	65.787224	julie45@example.org
4	1	2	1	3	0	20	121	153	9	-1	
5	1	2	1	3	1	20	121	14	9	43.980045	de
5	1	2	1	3	2	104	122	12	8	76.113052	53,
5	1	2	1	3	3	124	122	17	8	85.171509	Box
5	1	2	1	3	4	147	122	26	8	93.627121	301

In [263]:
def draw_bounding_boxes(input_img_path, output_path):
   img = cv2.imread(input_img_path)

   # Extract data
   data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
   print(data)
   n_boxes = len(data["text"])

   for i in range(n_boxes):
    #    if data["conf"][i] == -1:
    #        continue
       # Coordinates
       x, y = data["left"][i], data["top"][i]
       w, h = data["width"][i], data["height"][i]

       # Corners
       top_left = (x, y)
       bottom_right = (x + w, y + h)

       # Box params
       green = (0, 255, 0)
       thickness = 1  # The function-version uses thinner lines

       cv2.rectangle(img, top_left, bottom_right, green, thickness)

   # Save the image with boxes
   cv2.imwrite(output_path, img)

In [264]:
file = all_files[6]
chemin = f"../data/files/{file.split("_")[1]}/{file}"
draw_bounding_boxes(chemin, "../data/test.png")


{'level': [1, 2, 3, 4, 5, 5, 4, 5, 5, 5, 2, 3, 4, 5, 5, 5, 5, 4, 5, 5, 5, 2, 3, 4, 5, 4, 5, 2, 3, 4, 5, 5, 5, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 5], 'page_num': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'block_num': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10], 'par_num': [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1], 'line_num': [0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1], 'word_num': [0, 0, 0, 0, 1,

In [211]:
file = all_files[300]
chemin = f"../data/files/{file.split('_')[1]}/{file}"
chemin

'../data/files/2019/FAC_2019_0253-123.png'

### Personnalisation des Blocs de texte

In [244]:
import cv2
import pytesseract

def process_image(input_img_path, output_img_path, regions):
    img = cv2.imread(input_img_path)
    extracted_texts = {}

    for region_name, (x, y, w, h) in regions.items():
        # Dessiner les rectangles et noms des régions
        top_left = (x, y)
        bottom_right = (x + w, y + h)
        cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)

        # Extraire le texte de la région
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config="--psm 6")
        extracted_texts[region_name] = text.strip()

    # Sauvegarde de l'image annotée
    cv2.imwrite(output_img_path, img)

    return extracted_texts

# Définition des blocs
predefined_regions = {
    "Adresse": (10, 116, 400, 60),
    "Nom": (70, 70, 250, 30),
    "Mail": (50, 100, 250, 20),
    "Date": (105, 45, 250, 30),
    "Products": (20, 180, 400, 350),
    "Quantities_and_prices": (540, 180, 250, 350)
}

file = all_files[60]
chemin = f"../data/files/{file.split('_')[1]}/{file}"
output = "../data/test.png"

# Traitement de l'image
extracted_texts = process_image(chemin, output, predefined_regions)

# Affichage des résultats
for region, text in extracted_texts.items():
    if text != "" and text != " ":
        print(f"{region}\n{text}\n")


Adresse
Address Unit 7432 Box 6173
DPOAA 35170

Nom
Michael Pittman

Mail
larryaguirre@exam ple.org,

Date
2019-01-06

Products
Different direction son somebody.
TOTAL

Quantities_and_prices
4x 37.51 Euro
150.04 Euro



## Avec EasyOCR

In [7]:
file = all_files[1]
chemin = f"../data/files/{file.split("_")[1]}/{file}"
reader = easyocr.Reader(['en'])
result = reader.readtext(chemin)
for detection in result:
    print(detection[1])

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


INVOICE FAC/2018/0002
Issue date 2018-10-17
Bill to Samuel Coleman
Email qmeyer@example.com
Address 64623 Wright Mills
Turnermouth; KS45555
Between everybody size conference:
45 . 70
Euro
TOTAL
182
80
Euro
Biilllling


Plus efficace mais bien plus lent

## Avec docTR

In [66]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)
file = all_files[2]
chemin = f"../data/files/{file.split("_")[1]}/{file}"
doc = DocumentFile.from_images(f"../data/files/{file.split("_")[1]}/{file}")
result = model(doc)

In [64]:
result

Document(
  (pages): [Page(
    dimensions=(1100, 850)
    (blocks): [Block(
      (lines): [
        Line(
          (words): [
            Word(value='INVOICE', confidence=0.99),
            Word(value='AC/2018/0002', confidence=1.0),
          ]
        ),
        Line(
          (words): [
            Word(value='Issue', confidence=1.0),
            Word(value='date', confidence=1.0),
            Word(value='2018-10-21', confidence=1.0),
          ]
        ),
        Line(
          (words): [
            Word(value='a', confidence=0.57),
            Word(value='S', confidence=0.71),
          ]
        ),
        Line(
          (words): [Word(value='Eo', confidence=0.31)]
        ),
        Line(
          (words): [
            Word(value='Bill', confidence=0.99),
            Word(value='to', confidence=0.94),
            Word(value='Kirsten', confidence=0.67),
            Word(value='Martin', confidence=1.0),
          ]
        ),
        Line(
          (words): [
          

Assez efficace et rapide mais avec du preprocessing

In [67]:
# Extraire le texte de chaque ligne
for page in result.pages:
    for block in page.blocks:
        for line in block.lines:
            line_text = ' '.join(word.value for word in line.words)
            print(line_text)

INVOICE AC/2018/0003
Issue date 2018-10-21
S
Bill to Lynn Brown
EIR
Balling
Address 33039 Wilson Crest
Coxmouth, FM 49096
Form yet pressure heart.
4 X 366.15 Euro
Successful unit member world.
1 X 63.75 Euro
Most detail game within.
1 X 99.26 Euro
After agency read hope.
4 X 15.93 Euro
Character evening quality movie.
3 X 28.26 Euro
TOTAL
1776.11 Euro


## Avec le modèle donut-base-finetunned-invoices de hugging face

In [48]:
pipe = pipeline("image-to-text", model="to-be/donut-base-finetuned-invoices")

Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    1920,
    1280
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "use_absolute_embeddings": false,
  "window_size": 10
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cr

In [57]:

file = all_files[2]
chemin = f"../data/files/{file.split("_")[1]}/{file}"
image = Image.open(chemin)
result = pipe(image)
print(result[0]["generated_text"])

<s_cord-v2><s_DocType> Invoice</s_DocType><s_Currency1> EUR</s_Currency1><s_DocumentDate> 2023-01-10</s_DocumentDate><s_GrossAmount> 1776.11</s_GrossAmount><s_InvoiceNumber> FAC/2018/0003</s_InvoiceNumber><s_NetAmount1> 1776.11</s_NetAmount1><s_TaxAmount1> 0.00</s_TaxAmount1>


In [58]:
import xml.etree.ElementTree as ET
text = "<s_cord-v2><s_DocType>Invoice</s_DocType><s_Currency1>EUR</s_Currency1><s_DocumentDate>2023-01-27</s_DocumentDate><s_GrossAmount>27.13</s_GrossAmount><s_InvoiceNumber>FAC/2018/0002</s_InvoiceNumber><s_NetAmount1>27.13</s_NetAmount1><s_TaxAmount1>0.00</s_TaxAmount1></s_cord-v2>"
root = ET.fromstring(text)
data = {child.tag: child.text for child in root}
print(data)


{'s_DocType': 'Invoice', 's_Currency1': 'EUR', 's_DocumentDate': '2023-01-27', 's_GrossAmount': '27.13', 's_InvoiceNumber': 'FAC/2018/0002', 's_NetAmount1': '27.13', 's_TaxAmount1': '0.00'}


Long et ne récupére que certaines données donc à compléter