In [1]:
import json
import datasets
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier shared by the tokenizer and the model so the two
# are always loaded from the same place.
MODEL_PATH = "mikhaelkrns/invoice-ner-v1"

# Tokenizer first, then the token-classification model (same order as before).
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
)
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
)

In [5]:
# Override the label maps stored on the loaded model's config.
# The position of each tag in the list is its class id (0..19).
# Note that INVOICE_DATE, INVOICE_NUMBER and QUANTITY only have a B- tag
# here; there is no matching I- tag for them.
model.config.id2label = dict(enumerate([
    "B-CLIENT_ADDRESS",
    "B-CLIENT_NAME",
    "B-INVOICE_DATE",
    "B-INVOICE_NUMBER",
    "B-ITEM_DESC",
    "B-NET_WORTH",
    "B-PRICE",
    "B-QUANTITY",
    "B-SELLER_NAME",
    "B-TOTAL",
    "B-VAT",
    "I-CLIENT_ADDRESS",
    "I-CLIENT_NAME",
    "I-ITEM_DESC",
    "I-NET_WORTH",
    "I-PRICE",
    "I-SELLER_NAME",
    "I-TOTAL",
    "I-VAT",
    "O",
]))

# Reverse lookup (tag -> class id), derived from the mapping just assigned so
# the two can never disagree.
model.config.label2id = {
    label: idx for idx, label in model.config.id2label.items()
}


# Wrap the already-loaded model and tokenizer in a token-classification
# pipeline. aggregation_strategy="max" makes the pipeline return grouped
# entities (dicts with an "entity_group" key) instead of raw per-token tags.
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="max",
)

# Sample invoice text that the cells below feed to `ner_pipeline`.
# The text is hard-wrapped, so some tokens are split across lines
# (e.g. "T" / "ax Id"), and the last text line ends with a literal
# double-quote character that is part of the string.
# Do not reflow or trim this literal: the character offsets reported by the
# pipeline depend on its exact contents.
testing_csv = """ 
Invoice no: 11580833 Date of issue: 11/24/2019 
Seller: Client: Wood, Simpson and Summers Tapia LLC 
7000 Graves Ville 53393 Peters Overpass Juliastad, 
OH 02764 Boothside, WY 76847 Tax Id: 968-86-1662 T
ax Id: 979-96-3992 IBAN: GB61VHVS67246807986062 ITEMS 
No. Description Qty UM Net price Net worth VAT [%] Gross worth 1. 
Dell Optiplex SFF Desktop 1,00 each 89,99 89,99 10% 98,99 
Computer Windows 10 4GB 160GB 2 Dell Desktop ComputerDIntel 
3,00 each 69,95 209,85 10% 230,84 Core 2 Duo 8GB 1TB HD 
[Windows 10 PC 22 LCD Wifi 3 HP 6200 Pro Core i7 3.4GHz
5,00 each 256,68 1 283,40 10% 1 411,74 Quad Core 16GB 500GB 
Computer Vintage Microsolutions Pentium 3,00 each 390,00 1
170,00 10% 1 287,00 133mhz Desktop Tower PC Windows 95 5.25 
Floppy 5 Dell OptiPlex 7060 SFF 4,00 each 202,50 810,00 10% 891,00 
Computer, 8th Gen Core i5, 8GB DDR4, New 240GB SSD DVD 6 Custom Gaming 
PC Computer 1,00 each 449,99 449,99 10% 494,99 FX-6300 3.SGHz 8GB 
RAM 128GB SSD 1 TB HDD RX 580 4GB Custom Build HP Desktop 2,00 each 
329,00 658,00 10% 723,80 Computer 16GB 2TB SSD Windows 10 
Windows PC WiFi SUMMARY VAT [%] Net worth VAT Gross worth 10% 
4 671,23 467,12 5 138,35 Total $ 4 671,23 $ 467,12 $ 5 138,35"

"""

# A second sample with largely the same invoice content as `testing_csv`,
# wrapped at different points (and with small differences, e.g. "891,\n00").
# NOTE(review): not referenced by any cell visible in this part of the
# notebook -- confirm whether it is used further down before removing it.
testing_ocr = """
Invoice no: 11580833 Date of issue: 11/24/2019 
Seller: Client: Wood, Simpson and Summers Tapia LLC 7000
Graves Ville 53393 Peters Overpass Juliastad, OH 02764 Boothside, 
WY 76847 Tax Id: 968-86-1662 Tax Id: 979-96-3992
IBAN: GB61VHVS67246807986062 
ITEMS No. Description Qty UM 
Net price Net worth VAT [%] Gross worth 1. 
Dell Optiplex SFF Desktop 1,00 each 89,99 89,99 10% 98,99 
Computer Windows 10 4GB 160GB 2 Dell Desktop ComputerDIntel
3,00 each 69,95 209,85 10% 230,84 Core 2 Duo 8GB 1TB HD
[Windows 10 PC 22 LCD Wifi 3 HP 6200 Pro Core i7 3.4GHz 5,00 
each 256,68 1 283,40 10% 1 411,74 Quad Core 16GB 500GB 
Computer Vintage Microsolutions Pentium 3,00 each 390,00 1 170,00
10% 1 287,00 133mhz Desktop Tower PC Windows 95 5.25 Floppy 
5 Dell OptiPlex 7060 SFF 4,00 each 202,50 810,00 10% 891,
00 Computer, 8th Gen Core i5, 8GB DDR4, New 240GB SSD DVD 6 Custom
Gaming PC Computer 1,00 each 449,99 10% 494,99 FX-6300 3.SGHz 8GB 
RAM 128GB SSD 1 TB HDD RX 580 4GB Custom Build HP Desktop 2,00 each
329,00 658,00 10% 723,80 Computer 16GB 2TB SSD Windows 10 
Windows PC WiFi SUMMARY VAT [%] Net worth VAT Gross worth 10% 4
671,23 467,12 5 138,35 Total $ 4 671,23 $ 467,12 $ 5 138,35
"""

# Text of a different invoice (number 436773), kept on a single line.
# The doubled double-quotes ("") are literal characters inside the string.
# NOTE(review): not referenced by any cell visible in this part of the
# notebook -- confirm whether it is used further down before removing it.
testing_batch2 ="""
Invoice acct 1N8CpQGmFzQxIIDx Invoice number 436773 Date of issue Oct. 6, 2023 Date due Nov. 17,2023 acct_1NBCpQGmFzQxIIDx Bill to Cristini North America Inc. S5809 USD due Nov. 17 , 2023 Pay online Description Quantity unit_price Amount 81120905-101 PPT SHAPE 813 S52 81120905-101 PPT SHAPE 3M SJ3572 Scotchmate Hook 1.5"" S215 S1505 3M SJ3572 Scotchmate Hook 1.5"" 3M 48 Thread Sealant Tape S6 S42 3M 48 Thread Sealant Tape Loctite 415-10z S12 S48 Loctite 415-1oz 3M 3762LM-PG Hot Melt TAN 1"" x 3"" 8 S13 S104 3M 3762LM-PG Hot Melt TAN 1""x3"" 3M 468MP Adh Transfer Tape S766 S3064 3M 468MP Adh: Transfer Tape Akzo 10p20-13SC Spray 250ml 836 $36 Akzo 10p20-13SC Spray 250ml SubTotal: S2096 Total: S2096 Amount due: S5809 USD
"""



# Run the NER pipeline over the `testing_csv` sample. `results` is kept as a
# notebook-level variable.
results = ner_pipeline(testing_csv)

# Print every returned entity dict on its own line, unfiltered (no score
# threshold is applied here).
for r in results:
    print(r)

Device set to use cpu


{'entity_group': 'INVOICE_NUMBER', 'score': np.float32(0.9969909), 'word': '11580833', 'start': 14, 'end': 22}
{'entity_group': 'INVOICE_DATE', 'score': np.float32(0.99620664), 'word': '11', 'start': 38, 'end': 40}
{'entity_group': 'INVOICE_DATE', 'score': np.float32(0.61783093), 'word': '/', 'start': 40, 'end': 41}
{'entity_group': 'INVOICE_DATE', 'score': np.float32(0.99297607), 'word': '24', 'start': 41, 'end': 43}
{'entity_group': 'SELLER_NAME', 'score': np.float32(0.9921085), 'word': 'Wood, Simpson and Summers', 'start': 66, 'end': 91}
{'entity_group': 'CLIENT_NAME', 'score': np.float32(0.9941952), 'word': 'Tapia LLC', 'start': 92, 'end': 101}
{'entity_group': 'QUANTITY', 'score': np.float32(0.99731135), 'word': '1', 'start': 354, 'end': 355}
{'entity_group': 'NET_WORTH', 'score': np.float32(0.24504663), 'word': ',', 'start': 355, 'end': 356}
{'entity_group': 'QUANTITY', 'score': np.float32(0.6150784), 'word': '00', 'start': 356, 'end': 358}
{'entity_group': 'PRICE', 'score': np.f

In [7]:
from collections import defaultdict

def extract_entities(text, pipeline, threshold=0.9, merge_adjacent=False):
    """Collect the entities a token-classification pipeline finds in ``text``.

    Args:
        text: Raw document text handed to ``pipeline``.
        pipeline: Callable taking ``text`` and returning an iterable of dicts
            that each carry ``"entity_group"``, ``"score"`` and ``"word"``.
            ``"start"`` / ``"end"`` character offsets are only read when
            ``merge_adjacent`` is true. Inside this function the name shadows
            any module-level ``pipeline`` import.
        threshold: Minimum score (inclusive) a prediction needs to be kept.
        merge_adjacent: When true, a kept prediction whose span starts exactly
            where the previously kept prediction of the same group ended is
            fused with it, and the fused value is sliced from ``text``. This
            re-joins values the model emits as several touching fragments
            (e.g. ``'11'``, ``'/'``, ``'24'`` -> ``'11/24'``). Predictions
            separated by any character, or lacking offsets, are never fused.
            Defaults to ``False``, which yields one list entry per prediction.

    Returns:
        A plain ``dict`` mapping each entity group to the list of kept values,
        in the order the pipeline returned them. Groups with no kept
        prediction are absent.
    """
    # Each item is [group, start, end, word]; a list so a span can be extended.
    kept = []
    for r in pipeline(text):
        if r["score"] >= threshold:
            group, word = r["entity_group"], r["word"]
            start, end = r.get("start"), r.get("end")
            previous = kept[-1] if kept else None

            # Fuse only with the immediately preceding kept prediction, and
            # only when both spans are known and share a boundary.
            touches_previous = (
                merge_adjacent
                and previous is not None
                and previous[0] == group
                and previous[1] is not None
                and start is not None
                and end is not None
                and previous[2] == start
            )
            if touches_previous:
                previous[2] = end
                # Slice the source text rather than concatenating "word"
                # values, which are decoded tokens and may differ in spacing.
                previous[3] = text[previous[1]:end]
            else:
                kept.append([group, start, end, word])

    entities = defaultdict(list)
    for group, _start, _end, word in kept:
        entities[group].append(word)
    return dict(entities)

# "hasil" = "result". Keep every prediction scoring at least 0.6 for the
# `testing_csv` sample and show the grouped values.
hasil = extract_entities(
    text=testing_csv,
    pipeline=ner_pipeline,
    threshold=0.6,
)
print(hasil)


{'INVOICE_NUMBER': ['11580833'], 'INVOICE_DATE': ['11', '/', '24'], 'SELLER_NAME': ['Wood, Simpson and Summers'], 'CLIENT_NAME': ['Tapia LLC'], 'QUANTITY': ['1', '00', '3', '5'], 'PRICE': [',', '98, 99', ',', ',', '230, 84', ', 40', '1 411, 74']}
