In [1]:
!pip install fastapi uvicorn transformers pytesseract Pillow python-multipart

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [2]:
!pip install fastapi uvicorn pyngrok nest-asyncio
!apt install tesseract-ocr -y

Collecting pyngrok
  Downloading pyngrok-7.2.9-py3-none-any.whl.metadata (9.3 kB)
Downloading pyngrok-7.2.9-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.9
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [3]:
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from PIL import Image
import pytesseract
import io
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
model_path = "/content/drive/MyDrive/ner-model"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

def extract_data_from_text(text: str):
    predictions = ner_pipeline(text)
    result = {}
    for entity in predictions:
        label = entity["entity_group"]
        word = entity["word"]
        result.setdefault(label, []).append(word)
    return result

Device set to use cuda:0


In [7]:
app = FastAPI()

@app.post("/extract")
async def extract_entities_from_image(file: UploadFile = File(...)):
    try:
        image_bytes = await file.read()
        image = Image.open(io.BytesIO(image_bytes))
        text = pytesseract.image_to_string(image, lang='eng+fra')
        extracted = extract_data_from_text(text)
        return JSONResponse(content={"success": True, "text": text, "entities": extracted})
    except Exception as e:
        return JSONResponse(status_code=500, content={"success": False, "error": str(e)})


In [8]:
from google.colab import userdata

token = userdata.get('ngrok')
!ngrok config add-authtoken {token}

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [9]:
from pyngrok import ngrok
import nest_asyncio
import uvicorn
import threading

nest_asyncio.apply()

public_url = ngrok.connect(8000)
print(f"🚀 Ton API est disponible ici : {public_url}/extract")

def run():
    uvicorn.run(app, host="0.0.0.0", port=8000)

thread = threading.Thread(target=run)
thread.start()


🚀 Ton API est disponible ici : NgrokTunnel: "https://f4a1-34-125-178-255.ngrok-free.app" -> "http://localhost:8000"/extract


In [14]:
!curl -F "file=@FACTU2017120061.jpg" https://f4a1-34-125-178-255.ngrok-free.app/extract


INFO:     34.125.178.255:0 - "POST /extract HTTP/1.1" 200 OK
{"success":true,"text":" \n\nFacture FA12/2017/086781\n\nDescription\n\nPoubelle a pédale\n\nBureau personnalisable\n\nBureau personnalisable\n\nLampe de bureau\n\nBureau personnalisable\n\nArchitecte Principal (Facturation sur Feuilles de Temps)\nBureau personnalisable\n\nChaise de bureau noire\n\nService Client (Heures Prépayées)\n\nBiotech\n215 Vine St\n\nScranton PA 18503\n\nEtats Unis\n\nQuantité\n88,00 Unités\n62,00 Unités\n\n8,00 Unités\n18,00 Unités\n84,00 Unités\n\n33,00 Heures\n55,00 Unités\n94,00 Unités\n38,00 Heures\n\nDate Echéacnce\n\nN° BC\n\n2017-12-28 2017-12-28 BCO2214\n\nPrix Taxes\n\n10,00 TVA 20%\n500,00 TVA 20%\n500,00 TVA 20%\n\n35,00 TVA 20%\n500,00 TVA 20%\n150,00 TVA 20%\n500,00 TVA 20%\n180,00 TVA 10%\n190,00 TVA 20%\n\nMontant HT\nTaxes\n\nMontant TTC\n\nSous total\n880,00 €\n31000,00€\n4000,00 €\n630,00 €\n42000,00€\n4 950,00 €\n27 500,00 €\n16 920,00€\n7 220,00 €\n\n€135 100,00\n€ 25 328,00\n€ 16

In [15]:
import requests

url = "https://f4a1-34-125-178-255.ngrok-free.app/extract"
file_path = "FACTU2017120061.jpg"

with open(file_path, "rb") as file:
    response = requests.post(
        url,
        files={"file": (file_path, file, "image/jpeg")}
    )


INFO:     34.125.178.255:0 - "POST /extract HTTP/1.1" 200 OK


In [16]:
print(response.text)

{"success":true,"text":" \n\nFacture FA12/2017/086781\n\nDescription\n\nPoubelle a pédale\n\nBureau personnalisable\n\nBureau personnalisable\n\nLampe de bureau\n\nBureau personnalisable\n\nArchitecte Principal (Facturation sur Feuilles de Temps)\nBureau personnalisable\n\nChaise de bureau noire\n\nService Client (Heures Prépayées)\n\nBiotech\n215 Vine St\n\nScranton PA 18503\n\nEtats Unis\n\nQuantité\n88,00 Unités\n62,00 Unités\n\n8,00 Unités\n18,00 Unités\n84,00 Unités\n\n33,00 Heures\n55,00 Unités\n94,00 Unités\n38,00 Heures\n\nDate Echéacnce\n\nN° BC\n\n2017-12-28 2017-12-28 BCO2214\n\nPrix Taxes\n\n10,00 TVA 20%\n500,00 TVA 20%\n500,00 TVA 20%\n\n35,00 TVA 20%\n500,00 TVA 20%\n150,00 TVA 20%\n500,00 TVA 20%\n180,00 TVA 10%\n190,00 TVA 20%\n\nMontant HT\nTaxes\n\nMontant TTC\n\nSous total\n880,00 €\n31000,00€\n4000,00 €\n630,00 €\n42000,00€\n4 950,00 €\n27 500,00 €\n16 920,00€\n7 220,00 €\n\n€135 100,00\n€ 25 328,00\n€ 160 428,00\n\n \n\f","entities":{"INVOICE_NO":["Facture FA12"],

In [17]:
import json
print(json.dumps(response.json(), indent=4, ensure_ascii=False))

{
    "success": true,
    "text": " \n\nFacture FA12/2017/086781\n\nDescription\n\nPoubelle a pédale\n\nBureau personnalisable\n\nBureau personnalisable\n\nLampe de bureau\n\nBureau personnalisable\n\nArchitecte Principal (Facturation sur Feuilles de Temps)\nBureau personnalisable\n\nChaise de bureau noire\n\nService Client (Heures Prépayées)\n\nBiotech\n215 Vine St\n\nScranton PA 18503\n\nEtats Unis\n\nQuantité\n88,00 Unités\n62,00 Unités\n\n8,00 Unités\n18,00 Unités\n84,00 Unités\n\n33,00 Heures\n55,00 Unités\n94,00 Unités\n38,00 Heures\n\nDate Echéacnce\n\nN° BC\n\n2017-12-28 2017-12-28 BCO2214\n\nPrix Taxes\n\n10,00 TVA 20%\n500,00 TVA 20%\n500,00 TVA 20%\n\n35,00 TVA 20%\n500,00 TVA 20%\n150,00 TVA 20%\n500,00 TVA 20%\n180,00 TVA 10%\n190,00 TVA 20%\n\nMontant HT\nTaxes\n\nMontant TTC\n\nSous total\n880,00 €\n31000,00€\n4000,00 €\n630,00 €\n42000,00€\n4 950,00 €\n27 500,00 €\n16 920,00€\n7 220,00 €\n\n€135 100,00\n€ 25 328,00\n€ 160 428,00\n\n \n\f",
    "entities": {
        "IN