## Import Libraries

In [1]:
import pandas as pd
import json
import re

## Load Dataset

In [5]:
# Initial load of the batch CSV using pandas' default type inference.
# NOTE(review): a later cell re-reads this same file with dtype=str and
# keep_default_na=False and rebinds `df`, so this load looks redundant - confirm.
df = pd.read_csv('../data/batch_1/batch1_1.csv')

In [7]:
def normalize_number_variants(s: str):
    """Return spelling variants of a numeric string, original spelling first.

    The variants cover the two common decimal-separator conventions and a
    whitespace-free form, so a value can be looked up in text that formats
    the same number differently.

    Args:
        s: The numeric string to expand. Leading and trailing whitespace is
            ignored.

    Returns:
        A list of distinct strings in a fixed order: the stripped input,
        the input with every "," replaced by ".", the input with every "."
        replaced by ",", and the input with all whitespace removed.
        Variants that do not apply or that duplicate an earlier entry are
        omitted. An empty or whitespace-only input yields an empty list.
    """
    s = s.strip()
    if not s:
        return []

    candidates = [s]
    if "," in s:
        candidates.append(s.replace(",", "."))
    if "." in s:
        candidates.append(s.replace(".", ","))
    candidates.append(re.sub(r"\s+", "", s))

    # dict.fromkeys de-duplicates while keeping insertion order. A plain set
    # would make the order depend on string-hash randomisation, and callers
    # that stop at the first matching variant would then behave differently
    # from run to run.
    return list(dict.fromkeys(candidates))


In [8]:
def annotate_invoice(ocr_text, json_data):
    """Tag whitespace-separated OCR tokens with BIO labels for invoice fields.

    The values for INVOICE_NUMBER, INVOICE_DATE, CLIENT_NAME and SELLER_NAME
    are read from ``json_data["invoice"]`` and the value for TOTAL from
    ``json_data["subtotal"]["total"]``. Each value is split on whitespace
    and searched for as an exact, contiguous run of tokens. The first run
    that is not already tagged receives ``B-<LABEL>`` on its first token and
    ``I-<LABEL>`` on the rest. For TOTAL, every spelling produced by
    ``normalize_number_variants`` is tried in order until one matches.

    Args:
        ocr_text: OCR output as a single string; tokenised with ``str.split``.
        json_data: Parsed invoice JSON (a dict). Missing, ``None`` or
            non-dict "invoice"/"subtotal" sections are treated as empty.

    Returns:
        ``{"tokens": [...], "ner_tags": [...]}`` with one tag per token;
        tokens that belong to no entity are tagged ``"O"``.
    """
    tokens = ocr_text.split()
    labels = ["O"] * len(tokens)

    def section(name):
        # A section that is absent, null or not an object simply contributes
        # no entities, instead of raising for some sections and being
        # silently swallowed for others.
        found = json_data.get(name)
        return found if isinstance(found, dict) else {}

    invoice = section("invoice")
    entities = {
        "INVOICE_NUMBER": invoice.get("invoice_number"),
        "INVOICE_DATE": invoice.get("invoice_date"),
        "CLIENT_NAME": invoice.get("client_name"),
        "SELLER_NAME": invoice.get("seller_name"),
        "TOTAL": section("subtotal").get("total"),
    }

    def tag_first_free_match(label, value):
        """Tag the first untagged occurrence of ``value``; report success."""
        if not value:
            return False
        # JSON may carry numbers (e.g. an integer invoice number); compare
        # them by their text form rather than failing on ``.split``.
        text = str(value)
        if label == "TOTAL":
            candidates = normalize_number_variants(text)
        else:
            candidates = [text]

        for candidate in candidates:
            wanted = candidate.split()
            if not wanted:
                continue
            width = len(wanted)
            for start in range(len(tokens) - width + 1):
                end = start + width
                if tokens[start:end] != wanted:
                    continue
                # Never overwrite a span claimed by an earlier entity: doing
                # so would leave a broken BIO sequence (e.g. an I- tag with
                # no matching B- tag). Keep looking for a later occurrence.
                if any(tag != "O" for tag in labels[start:end]):
                    continue
                labels[start] = f"B-{label}"
                labels[start + 1:end] = [f"I-{label}"] * (width - 1)
                return True
        return False

    for label, value in entities.items():
        tag_first_free_match(label, value)

    return {"tokens": tokens, "ner_tags": labels}

In [10]:
# Re-read the batch with every column forced to str and with pandas' default
# NaN markers disabled, so empty cells stay as empty strings and the text
# handed to json.loads / annotate_invoice is always a str.
df = pd.read_csv("../data/batch_1/batch1_1.csv", dtype=str, keep_default_na=False)
# .loc slicing is label-based and inclusive at both ends, so this keeps the
# rows labelled 0 through 5 (six rows with the default integer index).
df_testing = df.loc[0:5]
# One dict per invoice: file name, OCR tokens and their BIO tags.
annotated_rows = []
for idx, row in df_testing.iterrows():
    raw_json = row["Json Data"]

    # Raises json.JSONDecodeError on a malformed cell, which stops the loop.
    json_obj = json.loads(raw_json)

    ocr_text = row["OCRed Text"]

    ann = annotate_invoice(ocr_text, json_obj)

    annotated_rows.append({
        "file_name": row["File Name"],
        "tokens": ann["tokens"],
        "ner_tags": ann["ner_tags"],
    })


In [13]:
# Show the first annotated record (file name, tokens, tags) for a visual check.
print(annotated_rows[0])

{'file_name': 'batch1-0494.jpg', 'tokens': ['Invoice', 'no:', '84652373', 'Date', 'of', 'issue:', '02/23/2021', 'Seller:', 'Client:', 'Nguyen-Roach', 'Clark-Foster', '247', 'David', 'Highway', '77477', 'Cliff', 'Apt.', '853', 'Lake', 'John,', 'WV', '84178', 'Washingtonbury,', 'MS', '78346', 'Tax', 'Id:', '991-72-5826', 'Tax', 'Id:', '937-70-8530', 'IBAN:', 'GB91/YXO05542456978150', 'ITEMS', 'No.', 'Description', 'Qty', 'UM', 'Net', 'price', 'Net', 'worth', 'VAT', '[%]', 'Gross', 'worth', 'Stemware', 'Rack', 'Display', 'Kitchen', '1,00', 'each', '42,32', '42,32', '10%', '46,55', 'Wine', 'Glass', 'Holder', 'Bottle', 'Carbon', 'Steel', 'Free', 'Punch', '2', 'VTG', '(4)', '7', 'Ounce', 'Since', '1852', '1,00', 'each', '14,00', '14,00', '10%', '15,40', 'Milk', 'Bottle', 'Wine', 'Carafe', 'Juice', 'Glass', 'with', 'Cork', 'Lids', '3', 'Vintage', 'Crystal', 'Red', 'Wine', '1,00', 'each', '35,45', '35,45', '10%', '39,00', 'Glasses', 'NOS', 'West', 'Germany', '1983', '6', '10', 'ounce', 'elegan