**HTML File Directory**

In [60]:
!pwd

/content


**Downloading Libraries**

In [61]:
!pip install -U spacy
!python -m spacy download en_core_web_trf
!pip install transformers

Collecting en-core-web-trf==3.6.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.6.1/en_core_web_trf-3.6.1-py3-none-any.whl (460.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.3/460.3 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


**Import Libraries**

In [62]:
import re
import spacy
from bs4 import BeautifulSoup
import torch
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizer

**Loading Models spaCY NER and LayoutLM**

In [63]:
#Load SpaCY NER Model
nlp = spacy.load("en_core_web_sm")
# Load the LayoutLM model and tokenizer
model_name = "microsoft/layoutlm-base-uncased"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = LayoutLMTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft

**Key Information Extraction Function Using Models for Dummy Samples**

In [64]:
def extract_key_information_from_html(html):
    #Function Docstrings for easy readibility in """ for anyone who is looking through my code.
    """
    Extracts key information from the provided eReceipt HTML content.

    Parameters:
        html (str): The HTML content of the eReceipt.

    Returns:
        dict or None: A dictionary containing the extracted key information
                      (store name, receipt date, subtotal, total, order ID, and products).
                      Returns None if the provided HTML is not a genuine eReceipt.
    """
    marketing_pattern = r"(marketing)"
    refund_pattern = r"refunded"
    credit_pattern = r"credit"
    cancellation_pattern = r"cancellation"
    shipping_update = r"shipping update"



    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text()
    """
    Checks whether the provided HTML content is a genuine eReceipt or not.

    Parameters:
        html (str): The HTML content to be checked.

    Returns:
        bool: True if the provided HTML is a genuine eReceipt, False otherwise.
    """
    if re.search(marketing_pattern, text, re.IGNORECASE) or \
       re.search(refund_pattern, text, re.IGNORECASE) or \
       re.search(credit_pattern, text, re.IGNORECASE) or \
       re.search(cancellation_pattern, text, re.IGNORECASE)or \
       re.search(shipping_update, text, re.IGNORECASE):
        return None

    # Apply spaCy NER to identify general entities
    doc = nlp(text)

    org_name = None
    for ent in doc.ents:
        if ent.label_ == "ORG":
            org_name = ent.text

    # Extract specific entities using regular expressions
    receipt_date = re.search(r"\d{1,2}/\d{1,2}/\d{4}", text)
    receipt_subtotal = re.search(r"Sub Total: \$([\d.]+)", text)
    receipt_total = re.search(r"Order Total: \$([\d.]+)", text)
    order_id = re.search(r"Order No\.:\s*(\d+)", text)

    product_info = []
    product_pattern = r"\s+(.*?)\s+(\d+)\s+\$(\d+\.\d{2})\s+(\d{1,2}/\d{1,2}/\d{4})"
    product_matches = re.findall(product_pattern, text)
    for match in product_matches:
        product_description, product_quantity, product_price, product_date = match
        product_info.append({
            "product_description": product_description.strip(),
            "product_quantity": int(product_quantity),
            "product_price": float(product_price),
            "product_date": product_date
        })

    # Check for general entities from spaCy NER
    entities = {}
    for ent in doc.ents:
     if ent.label_ in ["ORG"]:
        if "ORG" not in entities:
            entities["ORG"] = ent.text
            if "Order Confirmation" in entities["ORG"]:
                entities["ORG"] = entities["ORG"].replace("Order Confirmation", "")
            break

    # Add specific entities to the result
    entities["receipt_date"] = receipt_date.group() if receipt_date else None
    entities["receipt_subtotal"] = float(receipt_subtotal.group(1)) if receipt_subtotal else None
    entities["receipt_total"] = float(receipt_total.group(1)) if receipt_total else None
    entities["order_id"] = int(order_id.group(1)) if order_id else None
    entities["products"] = product_info if product_info else None

    return entities

In [65]:
def extract_key_information_from_file(file_path):
    """
    Extracts key information from the provided eReceipt HTML file.

    Parameters:
        file_path (str): The file path of the eReceipt HTML file.

    Returns:
        dict or None: A dictionary containing the extracted key information
                      (store name, receipt date, subtotal, total, order ID, and products).
                      Returns None if the provided HTML is not a genuine eReceipt.
    """
    # Read the HTML content from the file
    with open(file_path, "r") as file:
        html_content = file.read()

    # Load the LayoutLM model and tokenizer
    model_name = "microsoft/layoutlm-base-uncased"
    model = LayoutLMForTokenClassification.from_pretrained(model_name)
    tokenizer = LayoutLMTokenizer.from_pretrained(model_name)

    # Perform inference using the LayoutLM model for token classification
    inputs = tokenizer(html_content, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)

    # Rest of the code for extracting key information remains the same
    key_information = extract_key_information_from_html(html_content)
    if key_information is not None:
        print("\n\nGenuine eReceipt\n\n. Extracted key information:")
        print(key_information)
    else:
        print("\n\n\nNot a genuine eReceipt.")

    return key_information

***Detecting if Dummy_order.html is genuine or not and then extracting key information  ***

In [66]:
# File path of the eReceipt HTML file
file_path = "/content/dummy_order.html"

key_information=extract_key_information_from_file(file_path)

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft



Genuine eReceipt

. Extracted key information:
{'ORG': 'Barnes & Noble ', 'receipt_date': '10/21/2022', 'receipt_subtotal': 43.38, 'receipt_total': 46.83, 'order_id': 241300967, 'products': [{'product_description': 'The Fine Print  by Lauren Asher  Paperback', 'product_quantity': 1, 'product_price': 14.99, 'product_date': '10/21/2022'}, {'product_description': 'Twisted Love - Special Edition (Twisted Series #1)  by Ana Huang  Paperback', 'product_quantity': 1, 'product_price': 13.99, 'product_date': '10/21/2022'}, {'product_description': 'Ice Planet Barbarians  by Ruby Dixon  Paperback', 'product_quantity': 1, 'product_price': 14.4, 'product_date': '10/22/2022'}]}


In [67]:
def print_receipt_information(key_information):
    #function docstring for easy readiblity of what this function is doing in """
    """
    Prints the extracted key information in a formatted manner.

    Parameters:
        key_information (dict): A dictionary containing the extracted key information
                                (store name, receipt date, subtotal, total, order ID, and products).
    """

    print("Genuine eReceipt. Extracted key information:")
    print("-" * 70)
    print(f"{'Store Name:':<25} {key_information['ORG']}")
    print(f"{'Receipt Date:':<25} {key_information['receipt_date']}")
    print(f"{'Receipt Subtotal:':<25} ${key_information['receipt_subtotal']:.2f}")
    print(f"{'Receipt Total:':<25} ${key_information['receipt_total']:.2f}")
    print(f"{'Order ID:':<25} {key_information['order_id']}")
    print("-" * 70)
    print("Products:")
    print("-" * 70)
    for product in key_information['products']:
        print(f"{'Product Description:':<30} {product['product_description']}")
        print(f"{'Product Quantity:':<30} {product['product_quantity']}")
        print(f"{'Product Price:':<30} ${product['product_price']:.2f}")
        print(f"{'Product Date:':<30} {product['product_date']}")
        print("-" * 70)

# Call the function to print the formatted results
print_receipt_information(key_information)


Genuine eReceipt. Extracted key information:
----------------------------------------------------------------------
Store Name:               Barnes & Noble 
Receipt Date:             10/21/2022
Receipt Subtotal:         $43.38
Receipt Total:            $46.83
Order ID:                 241300967
----------------------------------------------------------------------
Products:
----------------------------------------------------------------------
Product Description:           The Fine Print  by Lauren Asher  Paperback
Product Quantity:              1
Product Price:                 $14.99
Product Date:                  10/21/2022
----------------------------------------------------------------------
Product Description:           Twisted Love - Special Edition (Twisted Series #1)  by Ana Huang  Paperback
Product Quantity:              1
Product Price:                 $13.99
Product Date:                  10/21/2022
----------------------------------------------------------------------
Pr

***Detecting if dummy_shipping.html is genuine or not and then detecting the key information***

In [68]:
# File path of the eReceipt HTML file
file_path = "/content/dummy_shipping.html"

key_information=extract_key_information_from_file(file_path)

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft




Not a genuine eReceipt.
