In [4]:
import pandas as pd
from dotenv import load_dotenv # type: ignore
import os
import sys
sys.path.append("..")
from src.get_all_files import get_all_files
from dateparser import parse
import cv2
import pytesseract
import io
from PIL import Image


In [5]:
load_dotenv()
blob_keys = os.getenv("AZURE_BLOB_KEYS")
all_files = get_all_files(blob_keys)
print(len(all_files))


5123


In [96]:
import cv2
import pytesseract

def process_image(input_img_path, regions, scale_factor=2):
    img = cv2.imread(input_img_path)
    
    # Agrandir l'image pour améliorer la reconnaissance des caractères
    new_size = (int(img.shape[1] * scale_factor), int(img.shape[0] * scale_factor))
    img = cv2.resize(img, new_size, interpolation=cv2.INTER_LINEAR)
    
    extracted_texts = {}

    for region_name, (x, y, w, h) in regions.items():
        # Ajuster les coordonnées des régions à l'échelle
        x, y, w, h = int(x * scale_factor), int(y * scale_factor), int(w * scale_factor), int(h * scale_factor)
        
        roi = img[y:y+h, x:x+w]

        # Convertir en niveaux de gris
        gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

        # Amélioration du contraste (CLAHE)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(10,10))
        enhanced = clahe.apply(gray)

        # Binarisation (Otsu)
        _, thresholded = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        if region_name == "Quantities_and_prices":
            text =  pytesseract.image_to_string(thresholded, config="psm 6 --tessedit_char_whitelist 0123456789Eurox.")
        text = pytesseract.image_to_string(thresholded, config="--psm 6")
        extracted_texts[region_name] = text.strip()

    return extracted_texts

# Définition des blocs
predefined_regions = {
    "Adresse": (10, 116, 400, 60),
    "Nom": (70, 70, 250, 30),
    "Mail": (50, 100, 250, 20),
    "Date": (105, 45, 250, 30),
    "Products": (20, 180, 400, 350),
    "Quantities_and_prices": (540, 180, 250, 350)
}

In [9]:
df = pd.DataFrame(columns=["adress", "nom", "mail", "date", "products", "quantities", "prices", "total"])

In [97]:

file = all_files[7]
chemin = f"../data/files/{file.split('_')[1]}/{file}"
output = "../data/test.png"

# Traitement de l'image
extracted_texts = process_image(chemin, predefined_regions)

# Affichage des résultats
print(extracted_texts)
print(f"adresse: {extracted_texts["Adresse"].replace("\n", " ")}")
print(f"nom: {extracted_texts["Nom"]}")
print(f"mail: {extracted_texts["Mail"]}")
print(f"date: {parse(extracted_texts["Date"], languages=["fr","en"])}")
print(f"products: {[product for product in extracted_texts["Products"].split('\n') if product != "TOTAL"]}")
print(f"quantities: {[quantity.split("x")[0].strip() for quantity in extracted_texts["Quantities_and_prices"].split('\n')[:-1]]}")
print(f"prices: {[price.split("x")[1].strip().replace(" Euro", "") for price in extracted_texts["Quantities_and_prices"].replace("\n\n", "\n").split('\n')[:-1]]}")
print(f"total: {extracted_texts["Quantities_and_prices"].split('\n')[-1].replace(" Euro", "")}")


{'Adresse': 'Address 720 Norman Stravenue Apt. 861\nMaysfort, KY 71703', 'Nom': 'David Macdonald', 'Mail': '| zobnen@example.com', 'Date': '2018-12-03', 'Products': 'Statement sort where on.\nClearly treatment for up.\nBrother pretty local likely.\nBetter nor name physical.\nTOTAL', 'Quantities_and_prices': '1 x 21.15 Euro\n1 x 14.49 Euro\n3 x 78.32 Euro\n2x 14.91 Euro\n\n300.42 Euro'}
adresse: Address 720 Norman Stravenue Apt. 861 Maysfort, KY 71703
nom: David Macdonald
mail: | zobnen@example.com
date: 2018-12-03 00:00:00
products: ['Statement sort where on.', 'Clearly treatment for up.', 'Brother pretty local likely.', 'Better nor name physical.']
quantities: ['1', '1', '3', '2', '']
prices: ['21.15', '14.49', '78.32', '14.91']
total: 300.42


In [None]:
# Chargement des données extraites
file = "FAC_2019_0172-512.png"
chemin = f"../data/files/{file.split('_')[1]}/{file}"
output = "../data/test.png"

extracted_texts = process_image(chemin, predefined_regions)

# Nettoyage et formatage des données
adresse = extracted_texts["Adresse"].replace("\n", " ")
nom_client = extracted_texts["Nom"]
mail_client = extracted_texts["Mail"]
date_facturation = parse(extracted_texts["Date"], languages=["fr", "en"])

products = [product for product in extracted_texts["Products"].split('\n') if product != "TOTAL"]
quantities = [quantity.split("x")[0].strip() for quantity in extracted_texts["Quantities_and_prices"].split('\n')[:-1]]
prices = [price.split("x")[1].strip().replace(" Euro", "") for price in extracted_texts["Quantities_and_prices"].replace("\n\n", "\n").split('\n')[:-1]]
total = extracted_texts["Quantities_and_prices"].split('\n')[-1].replace(" Euro", "")

# Génération d'identifiants uniques
id_client = f"CLT_{hash(nom_client + mail_client) % 10**6}"
id_facture = file.replace(".png","")

# DataFrame Client
df_client = pd.DataFrame({
    "id_client": [id_client],
    "Nom": [nom_client],
    "mail": [mail_client],
    "birthday": [None]  # Date de naissance inconnue ici
})

# DataFrame Facture
df_facture = pd.DataFrame({
    "id_facture": [id_facture],
    "texte": [adresse],
    "date_facturation": [date_facturation]
})

products_filtre = [product for product in products if product.strip()]
# DataFrame Produit
df_produit = pd.DataFrame({
    "Id_produit": [f"PROD_{hash(p.strip().lower()) % 10**6}" for p in products_filtre],
    "Nom": products_filtre,
    "Prix": prices
})

# DataFrame Achat
df_achat = pd.DataFrame({
    "Id_produit": [f"PROD_{hash(p.strip().lower()) % 10**6}" for p in products_filtre],
    "id_client": [id_client] * len(products_filtre),
    "id_facture": [id_facture] * len(products_filtre),
    "quantité": [quantity for quantity in quantities if quantity.strip()]
})

# Affichage des DataFrames
print("Clients:\n") 
display(df_client)
print("Factures:\n")
display(df_facture)
print("Produits:\n")
display(df_produit)
print("Achats:\n")
display(df_achat)


Clients:



Unnamed: 0,id_client,Nom,mail,birthday
0,CLT_401470,Paul Perkins,| 2phillips@example.com,


Factures:



Unnamed: 0,id_facture,texte,date_facturation
0,FAC_2019_0172-512,"Address 1703 Justin Road Ayerston, MI 15139",2019-03-11


Produits:



Unnamed: 0,Id_produit,Nom,Prix
0,PROD_13982,Choose others including kitchen.,26.2
1,PROD_185359,Air million able memory.,333.65
2,PROD_205926,Fall style something those.,70.99
3,PROD_473054,Mr city sport herself.,63.75
4,PROD_185359,Air million able memory.,333.65


Achats:



Unnamed: 0,Id_produit,id_client,id_facture,quantité
0,PROD_13982,CLT_401470,FAC_2019_0172-512,1
1,PROD_185359,CLT_401470,FAC_2019_0172-512,1
2,PROD_205926,CLT_401470,FAC_2019_0172-512,3
3,PROD_473054,CLT_401470,FAC_2019_0172-512,1
4,PROD_185359,CLT_401470,FAC_2019_0172-512,2


In [None]:

df_clients = pd.DataFrame(columns=["id_client", "Nom", "mail", "birthday"])
df_factures = pd.DataFrame(columns=["id_facture", "texte", "date_facturation"])
df_produits = pd.DataFrame(columns=["Id_produit", "Nom", "Prix"])
df_achats = pd.DataFrame(columns=["Id_produit", "id_client", "id_facture", "quantité"])

def nettoyer_total(total):
    """Vérifie si le total est bien détecté (ne doit pas contenir 'x')"""
    if "x" in total:
        return None  # Erreur de détection
    return total.replace(" Euro", "")

def extraire_donnees(file):
    """Extrait et nettoie les données d'un fichier"""
    chemin = f"../data/files/{file.split('_')[1]}/{file}"
    output = "../data/test.png"

    try:
        extracted_texts = process_image(chemin, predefined_regions)
        
        # Nettoyage et formatage
        adresse = extracted_texts["Adresse"].replace("\n", " ")
        nom_client = extracted_texts["Nom"]
        mail_client = extracted_texts["Mail"]
        date_facturation = parse(extracted_texts["Date"], languages=["fr", "en"])

        products = [product for product in extracted_texts["Products"].split('\n') if product != "TOTAL"]
        quantities = [quantity.split("x")[0].strip() for quantity in extracted_texts["Quantities_and_prices"].split('\n')[:-1]]
        prices = [price.split("x")[1].strip().replace(" Euro", "") for price in extracted_texts["Quantities_and_prices"].replace("\n\n", "\n").split('\n')[:-1]]
        total = extracted_texts["Quantities_and_prices"].split('\n')[-1].replace(" Euro", "")

        # Vérification des erreurs
        erreurs = []
        if not nom_client: erreurs.append("Nom non détecté")
        if not mail_client: erreurs.append("Mail non détecté")
        if not date_facturation: erreurs.append("Date non détectée")
        if not products: erreurs.append("Produits non détectés")
        if not quantities: erreurs.append("Quantités non détectées")
        if not prices: erreurs.append("Prix non détectés")
        if total is None: erreurs.append("Total mal détecté")

        if erreurs:
            print(f"Erreur dans le fichier {file} : {', '.join(erreurs)}")
            return None  # On saute ce fichier en cas d'erreur

        # Génération d'identifiants uniques
        id_client = f"CLT_{hash(nom_client + mail_client) % 10**6}"
        id_facture = file

        # Création des DataFrames temporaires
        df_client = pd.DataFrame([{
            "id_client": id_client,
            "Nom": nom_client,
            "mail": mail_client,
            "birthday": None
        }])

        # DataFrame Facture
        df_facture = pd.DataFrame({
            "id_facture": [id_facture],
            "texte": [adresse],
            "date_facturation": [date_facturation]
        })

        products_filtre = [product for product in products if product.strip()]
        # DataFrame Produit
        df_produit = pd.DataFrame({
            "Id_produit": [f"PROD_{hash(p.strip().lower()) % 10**6}" for p in products_filtre],
            "Nom": products_filtre,
            "Prix": prices
        })

        # DataFrame Achat
        df_achat = pd.DataFrame({
            "Id_produit": [f"PROD_{hash(p.strip().lower()) % 10**6}" for p in products_filtre],
            "id_client": [id_client] * len(products_filtre),
            "id_facture": [id_facture] * len(products_filtre),
            "quantité": [quantity for quantity in quantities if quantity.strip()]
        })

        return df_client, df_facture, df_produit, df_achat

    except Exception as e:
        print(f"Échec pour le fichier {file} : {str(e)}")
        return None

# Boucle sur les 500 premiers fichiers
for file in all_files[:20]:
    data = extraire_donnees(file)
    
    if data:
        df_client, df_facture, df_produit, df_achat = data

        # Ajout aux DataFrames globaux
        df_clients = pd.concat([df_clients, df_client], ignore_index=True)
        df_factures = pd.concat([df_factures, df_facture], ignore_index=True)
        df_produits = pd.concat([df_produits, df_produit], ignore_index=True)
        df_achats = pd.concat([df_achats, df_achat], ignore_index=True)

# Affichage final des DataFrames après traitement
print("\nTraitement terminé. Voici les DataFrames finaux :")

print("Clients :")
display(df_clients)

print("Factures :")
display(df_factures)

print("Produits :")
display(df_produits)

print("Achats :")
display(df_achats)


  df_factures = pd.concat([df_factures, df_facture], ignore_index=True)



Traitement terminé. Voici les DataFrames finaux :
Clients :


Unnamed: 0,id_client,Nom,mail,birthday
0,CLT_787237,Carol Potter,| ashley38@example. org,
1,CLT_899167,Samuel Coleman,| qmever@example.com,
2,CLT_264425,Richard Dunn,| phughes@example.com,
3,CLT_650737,Mario Stout,| danie lledaniels@example.org,
4,CLT_849623,Rachel Ramirez,| patriciakelley@example.ore,
5,CLT_60997,Richard Green,| steevenmvyers@example.com,
6,CLT_143248,Bruce Pace,| alan48 @example.org,
7,CLT_484007,David Macdonald,| zobnen@example.com,
8,CLT_219277,Lisa Fisher,| rhondaO2@exampte.net,
9,CLT_339094,Tammy Solis,| stacey20@example.com,


Factures :


Unnamed: 0,id_facture,texte,date_facturation
0,FAC_2018_0001-654.png,"Address 405 Adrian Crest Suite 095 Jamesstad, ...",2018-10-13
1,FAC_2018_0002-114.png,"Address 64623 Wright Mils Turnermouth, KS 45555",2018-10-17
2,FAC_2018_0003-025.png,"Address 3305 Maureen Manors West Daniel, NY 27137",2018-11-03
3,FAC_2018_0004-759.png,Address 48010 Margaret Passage Suite 093 Walsh...,2018-11-14
4,FAC_2018_0005-281.png,"Address 7896 Jones Underpass Kennethborough, C...",2018-11-17
5,FAC_2018_0006-250.png,Address 72897 Snyder Viaduct Suite 726 Amandas...,2018-11-30
6,FAC_2018_0007-228.png,"Address 96374 Amanda Dam East Craigfort, AM 73373",2018-12-01
7,FAC_2018_0008-142.png,Address 720 Norman Stravenue Apt. 861 Maysfort...,2018-12-03
8,FAC_2018_0009-754.png,"Address 6673 Cook Skyway West Jessica, PR 17982",2018-12-06
9,FAC_2018_0010-104.png,"Address 9614 Cook Shores Suite 908 West Lisa, ...",2018-12-07


Produits :


Unnamed: 0,Id_produit,Nom,Prix
0,PROD_35179,Edge so crime share.,12.18
1,PROD_35188,Thank do article especially.,67.86
2,PROD_544548,Include dinner main friend.,287.99
3,PROD_548964,Capital hear morning people.,55.43
4,PROD_839326,Between everybody size conference.,45.70
...,...,...,...
59,PROD_451698,Especially environmental through spring.,12.72
60,PROD_24078,From south animal he.,6.79
61,PROD_172206,Social similar the people.,442.52
62,PROD_955183,Beautiful car different work.,4.64


Achats :


Unnamed: 0,Id_produit,id_client,id_facture,quantité
0,PROD_35179,CLT_787237,FAC_2018_0001-654.png,4
1,PROD_35188,CLT_787237,FAC_2018_0001-654.png,1
2,PROD_544548,CLT_787237,FAC_2018_0001-654.png,3
3,PROD_548964,CLT_787237,FAC_2018_0001-654.png,3
4,PROD_839326,CLT_899167,FAC_2018_0002-114.png,4
...,...,...,...,...
59,PROD_451698,CLT_248889,FAC_2018_0019-030.png,1
60,PROD_24078,CLT_967006,FAC_2018_0020-095.png,2
61,PROD_172206,CLT_967006,FAC_2018_0020-095.png,2
62,PROD_955183,CLT_967006,FAC_2018_0020-095.png,4


Beaucoup trop d'échec, seulement 140 / 500 correctement lus, en 9 minutes

## Extraire les données des qrcode

In [None]:
file = all_files[4]
chemin = f"../data/files/{file.split('_')[1]}/{file}"
img = cv2.imread(chemin)
(x, y, w, h) = (540, 8, 150, 150)
top_left = (x, y)
bottom_right = (x + w, y + h)
cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)
roi = img[y:y+h, x:x+w]
cv2.imwrite("../data/test.png", img)
detector = cv2.QRCodeDetector()
data, bbox, straight_qrcode = detector.detectAndDecode(img)
data = data.split("\n")
datetime = data[1].split("DATE:")[1]
birthday = data[2].split(", birth ")[1]
print(birthday)
print(datetime)




IndexError: string index out of range

# Avec Azure

In [None]:
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials

import time

# Configuration du client
endpoint = "endpoint"
key = "key"
computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(key))

# Chemin de l'image locale
file = all_files[4499]
chemin = f"../data/files/{file.split('_')[1]}/{file}"
local_image_path = chemin

# Ouvrir l'image
with open(local_image_path, "rb") as image_stream:
    # Appeler l'API OCR
    read_response = computervision_client.read_in_stream(image_stream, raw=True)

    # Récupérer l'ID de l'opération pour vérifier le résultat
    read_operation_location = read_response.headers["Operation-Location"]
    operation_id = read_operation_location.split("/")[-1]

    # Attendre que l'analyse soit terminée
    while True:
        read_result = computervision_client.get_read_result(operation_id)
        if read_result.status not in [OperationStatusCodes.running, OperationStatusCodes.not_started]:
            break
        time.sleep(1)

    # Afficher le texte extrait
    if read_result.status == OperationStatusCodes.succeeded:
        for text_result in read_result.analyze_result.read_results:
            for line in text_result.lines:
                print(line.text)


INVOICE FAC/2024/0220
Issue date 2024-04-11
Bill to Deborah Phillips
Email hardymaurice@example.net
Brilllling
Address 3931 Anthony Locks Apt. 747
Greerborough, WI 45662
Read create no office.
2 x 119.10 Euro
Themselves individual identify scene.
4 x 134.12 Euro
TOTAL
774.68 Euro


In [None]:
import io
import time
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
from PIL import Image
from PIL import Image, ImageOps
import io

from PIL import Image, ImageOps
import io

def add_white_border_to_region(region, img):
    x, y, w, h = region
    img_width, img_height = img.size  # Dimensions de l'image originale

    # Créer une image de fond blanc de la même taille que l'image d'origine
    new_img = Image.new("RGB", (img_width, img_height), (255, 255, 255))  # Image avec fond blanc

    # Recadrer la région d'origine
    region_img = img.crop((x, y, x + w, y + h))

    # Coller la région recadrée au centre de l'image avec fond blanc
    new_x = x
    new_y = y
    new_img.paste(region_img, (new_x, new_y))

    return new_img

def process_image_azure(input_img_path, regions, endpoint, key):
    computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(key))
    extracted_texts = {}

    try:
        with Image.open(input_img_path) as img:
            img_width, img_height = img.size  # Dimensions de l'image

            for region_name, (x, y, w, h) in regions.items():
                # Ajouter du blanc autour de la région sans changer ses dimensions
                new_img = add_white_border_to_region((x, y, w, h), img)

                # Convertir l'image modifiée en bytes pour l'API OCR
                img_byte_arr = io.BytesIO()
                new_img.save(img_byte_arr, format='PNG')
                img_byte_arr.seek(0)

                # Appel à l'API OCR Azure
                try:
                    read_response = computervision_client.read_in_stream(img_byte_arr, raw=True)
                    operation_location = read_response.headers["Operation-Location"]
                    operation_id = operation_location.split("/")[-1]

                    # Attendre que l'analyse soit terminée
                    while True:
                        read_result = computervision_client.get_read_result(operation_id)
                        if read_result.status not in [OperationStatusCodes.running, OperationStatusCodes.not_started]:
                            break
                        time.sleep(1)

                    # Extraire le texte si l'analyse est réussie
                    if read_result.status == OperationStatusCodes.succeeded:
                        text = "\n".join(line.text for text_result in read_result.analyze_result.read_results for line in text_result.lines)
                        extracted_texts[region_name] = text.strip()
                    else:
                        print(f"Erreur OCR pour la région '{region_name}': {read_result.status}")
                except Exception as e:
                    print(f"Erreur lors de l'appel à l'API OCR pour la région '{region_name}': {e}")

    except Exception as e:
        print(f"Erreur lors de l'ouverture ou du traitement de l'image {input_img_path}: {e}")

    return extracted_texts


# Définition des régions
predefined_regions = {
    "Adresse": (10, 116, 400, 60),
    "Nom": (70, 70, 250, 30),
    "Mail": (55, 100, 250, 20),
    "Date": (105, 45, 250, 30),
    "Products": (20, 180, 400, 350),
    "Quantities_and_prices": (540, 180, 250, 350)
}

# Configuration Azure
endpoint = "endpoint"
key = "key"

# Vérifier que la variable all_files existe
try:
    file = all_files[5000]
    chemin = f"../data/files/{file.split('_')[1]}/{file}"
    extracted_texts = process_image_azure(chemin, predefined_regions, endpoint, key)

    print(f"adresse: {extracted_texts.get('Adresse', '').replace('\n', ' ')}")
    print(f"nom: {extracted_texts.get('Nom', '')}")
    print(f"mail: {extracted_texts.get('Mail', '')}")
    print(f"date: {extracted_texts.get('Date', '')}")
    print(f"products: {[product for product in extracted_texts.get('Products', '').split('\n') if product != 'TOTAL']}")
    print(f"quantities_and_prices: {extracted_texts.get('Quantities_and_prices', '')}")
except NameError:
    print("Erreur : la variable all_files n'est pas définie.")


adresse: Address 3887 Haynes Circle Apt. 995 West Sallyville, WV 87676
nom: Gary Carrillo
mail: meredithturner@example.com
date: 2024-11-06
products: ['Source edge score their.', 'Teach themselves despite we.', 'Especially environmental through spring.', 'Phone interesting a look.', 'Cost from without stage.']
quantities_and_prices: 2 x
25.04 Euro
4 x
34.70 Euro
1 x
12.72 Euro
1 x
43.49 Euro
4 x
3.31 Euro
258.33 Euro


Plus efficace mais bien plus lent, trop lent pour utiliser la technique de faire par bout d'image