In [4]:
!pip install opencv-python
!pip install numpy
!pip install matplotlib



In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import glob

# ADD YOUR FOLDER PATH
input_folder = 'dataset'
output_folder = 'Cropped_Receipts'

# Ensure the output directory exists. It is derived from output_folder so that
# changing the path above is enough; no second hard-coded copy to keep in sync.
os.makedirs(output_folder, exist_ok=True)

# Gamma correction lookup table: entry i holds 255 * (i / 255) ** invGamma,
# truncated to uint8. The exponent is 1 / 0.3 (> 1), so every intensity below
# 255 is mapped to a darker value.
invGamma = 1.0 / 0.3
table = np.array([((i / 255.0) ** invGamma) * 255 for i in np.arange(0, 256)]).astype("uint8")

def biggestRectangle(contours, min_area=100):
    """Return the index of the largest contour whose area exceeds ``min_area``.

    Despite the name, no rectangle test is applied: the contour is chosen by
    area alone. (An earlier version computed a polygon approximation for each
    contour but never used it; that dead work has been removed.)

    Args:
        contours: Sequence of contours as accepted by ``cv2.contourArea``.
        min_area: Contours with an area less than or equal to this value are
            ignored. Defaults to 100.

    Returns:
        Index into ``contours`` of the largest qualifying contour, or -1 when
        no contour qualifies. On equal areas the earliest contour wins.
    """
    indexReturn = -1
    max_area = 0
    for index, contour in enumerate(contours):
        area = cv2.contourArea(contour)
        # Strict comparison keeps the first contour when several share the maximum area.
        if area > min_area and area > max_area:
            max_area = area
            indexReturn = index
    return indexReturn

# Get all image files from input folder.
# Sorted so that the idx-based output names are assigned in a reproducible
# order; glob returns matches in arbitrary order.
image_files = sorted(glob.glob(os.path.join(input_folder, '*.jpg')))

if not image_files:
    print(f"No .jpg images found in folder: {input_folder}")

# Process each image
for idx, image_path in enumerate(image_files):
    # Read image. cv2.imread reports failure by returning None instead of
    # raising, so unreadable files are skipped here rather than crashing below.
    img = cv2.imread(image_path)
    if img is None:
        print(f"Could not read image: {os.path.basename(image_path)}")
        continue

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply gamma correction
    gray = cv2.LUT(gray, table)

    # Thresholding: pixels brighter than 10 after gamma correction become 255, the rest 0
    ret, thresh1 = cv2.threshold(gray, 10, 255, cv2.THRESH_BINARY)

    # Find contours
    contours, hierarchy = cv2.findContours(thresh1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Find the biggest rectangle contour
    indexReturn = biggestRectangle(contours)

    if indexReturn != -1:
        # Get the bounding box for the largest contour
        hull = cv2.convexHull(contours[indexReturn])
        x, y, w, h = cv2.boundingRect(hull)
        # NOTE(review): the hull outline is drawn onto img before cropping, so the
        # green line is part of the saved crop - confirm this is intended.
        cv2.drawContours(img, [hull], 0, (0, 255, 0), 3)

        # Crop the ROI
        cropped = img[y:y+h, x:x+w]

        # Save cropped image with incremental name
        cropped_filename = os.path.join(output_folder, f'cropped_receipt_{idx+1}.png')
        # cv2.imwrite returns False on failure; only report success when it succeeded
        if cv2.imwrite(cropped_filename, cropped):
            print(f"Saved: {cropped_filename}")
        else:
            print(f"Failed to save: {cropped_filename}")

        # # Optional: Display cropped image
        # plt.imshow(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
        # plt.axis("off")
        # plt.title(f"Cropped Receipt {idx+1}")
        # plt.show()
    else:
        print(f"No valid contours found for image: {os.path.basename(image_path)}")




In [None]:
!pip install spacy
# Download the spaCy English model - this is crucial
!python -m spacy download en_core_web_sm
import spacy
from spacy.tokens import Doc
from spacy.util import filter_spans

def _regex_entity_spans(doc, text, pattern, label):
    """Return a labelled span of ``doc`` for every match of ``pattern`` in ``text``.

    Matches whose character offsets do not line up with token boundaries are
    skipped (``doc.char_span`` returns ``None`` for them).
    """
    import re

    spans = []
    for match in re.finditer(pattern, text):
        span = doc.char_span(match.start(), match.end(), label=label)
        if span is not None:
            spans.append(span)
    return spans


def identify_food_entities(processed_text):
    """
    Use spaCy's NER to identify food items, prices, quantities, and dates in the bill text.
    Custom entity rules are added to recognize food items and menu-specific terminology.

    Regex-derived MONEY and QUANTITY spans are merged with the pipeline's own
    entities in a single step. Overlaps are resolved with ``filter_spans``
    (longest span wins; on an exact tie the regex span takes precedence over the
    pipeline entity) before ``doc.ents`` is assigned. Assigning overlapping spans
    to ``doc.ents`` directly raises ``ValueError``, which is why the spans are
    no longer appended one at a time.

    Note: a later cell in this notebook defines ``identify_food_entities`` again;
    once that cell has run, it replaces this version.

    Args:
        processed_text: The preprocessed text from the hotel bill

    Returns:
        doc: spaCy Doc object with entities identified, or None if the model
            could not be loaded
        extracted_entities: Dictionary mapping each tracked label to a list of
            (text, start_char, end_char) tuples; empty dict if the model could
            not be loaded
    """
    try:
        # Load spaCy model
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # If model not found, provide helpful error
        print("Error: The spaCy English model is not installed.")
        print("Please run: !python -m spacy download en_core_web_sm")
        return None, {}

    # Add custom food entity patterns to the pipeline
    # This helps recognize food items that aren't in the default NER model
    food_patterns = [
        {"label": "FOOD", "pattern": [{"LOWER": "coffee"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "lunch"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "dinner"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "breakfast"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "salad"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "sandwich"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "burger"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "pizza"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "pasta"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "steak"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "chicken"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "fish"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "coke"}, {"LOWER": "cola"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "water"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "juice"}]},
        {"label": "FOOD", "pattern": [{"LOWER": "tea"}]},
    ]

    # Create the matcher and add it to the pipeline
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(food_patterns)

    # Process the text
    doc = nlp(processed_text)

    # Extract entities by category
    extracted_entities = {
        "FOOD": [],
        "MONEY": [],
        "QUANTITY": [],
        "DATE": [],
        "PERSON": [],  # For server names
        "ORG": [],     # For restaurant names
    }

    # Custom recognition for prices and quantities using regex
    # Find prices (e.g., $12.99, 12.99, 12)
    price_pattern = r'\$?\d+\.?\d*'
    # Find quantities (e.g., 2x, x2, 2 items)
    quantity_pattern = r'\d+\s*x|\bx\s*\d+|\b\d+\s+(?:items?|pcs)\b'

    # Regex spans are listed before the pipeline's entities: filter_spans keeps
    # the earlier candidate when two spans cover exactly the same tokens, so a
    # price such as "3.00" stays MONEY even if the model tagged it otherwise.
    candidate_spans = (
        _regex_entity_spans(doc, processed_text, price_pattern, "MONEY")
        + _regex_entity_spans(doc, processed_text, quantity_pattern, "QUANTITY")
        + list(doc.ents)
    )

    # Remove overlapping entities, then assign once
    doc.ents = filter_spans(candidate_spans)

    # Categorize entities
    for ent in doc.ents:
        if ent.label_ in extracted_entities:
            extracted_entities[ent.label_].append((ent.text, ent.start_char, ent.end_char))

    return doc, extracted_entities

def display_entities(doc, extracted_entities):
    """
    Display the extracted entities in a readable format

    Prints every non-empty category with its entity texts, followed by the
    displaCy entity markup for ``doc``. When ``doc`` is None (which is what
    identify_food_entities returns if the spaCy model cannot be loaded) the
    visualization step is skipped instead of raising.

    Args:
        doc: spaCy Doc object with entities identified, or None
        extracted_entities: Dictionary containing the extracted entities by category;
            each entity is a sequence whose first element is the entity text
    """
    print("Extracted Entities:")
    print("-" * 40)

    for category, entities in extracted_entities.items():
        if entities:
            print(f"\n{category}:")
            for entity in entities:
                print(f"  • {entity[0]}")

    if doc is None:
        print("\nNo document available; skipping entity visualization.")
        return

    # Import the submodule explicitly instead of relying on `spacy.displacy`
    # already being reachable as an attribute of the top-level package.
    from spacy import displacy

    print("\nEntity visualization:")
    print(displacy.render(doc, style="ent", jupyter=False))

# Example usage
if __name__ == "__main__":
    # Example processed text from a hotel bill
    processed_text = """
    GREEN FIELD
    5305 E PACIFIC COAST HWY
    Long Beach, CA 90004
    (562) 597-0906
    
    Server: Francis
    Order #: 69923
    Table: B11
    Guests: 2
    
    1 Coffee              3.00
    2 Lunch              45.90
    1 Coke                3.00
    
    SUB TOTAL:           51.90
    Tax:                  4.60
    
    TOTAL:               $56.50
    
    5/26/2016 12:53:10 PM
    
    THANK YOU!
    """
    
    doc, entities = identify_food_entities(processed_text)
    # identify_food_entities returns (None, {}) when the spaCy model cannot be
    # loaded; it has already printed instructions in that case, so only display
    # results when a document was actually produced.
    if doc is not None:
        display_entities(doc, entities)







In [None]:
def apply_rule_based_classification(text):
    """
    Apply rule-based token classification with regex patterns to identify domain-specific entities

    Every pattern of every entity type is run over the whole text. A span that
    is matched by more than one pattern of the same entity type is reported once.
    Spans of different types (or partially overlapping spans) are all kept.

    Args:
        text: The text to scan. ``None`` or an empty string yields no entities.

    Returns:
        A ``defaultdict(list)`` mapping entity type (e.g. 'FOOD_ITEM', 'PRICE')
        to a list of dicts with keys 'text', 'start' and 'end' (character
        offsets into ``text``). Types without a match have no key.
    """
    import re
    from collections import defaultdict

    # Dictionary to store all recognized entities
    entities = defaultdict(list)

    if not text:
        return entities

    # Define patterns for different entity types
    patterns = {
        'FOOD_ITEM': [
            # Keyword as a whole word, optionally followed by further words on the
            # same line. The tail is restricted to letters so that the match does
            # not run on into the price column or the next line.
            r'\b(?:Coffee|Lunch|Dinner|Breakfast|Salad|Sandwich|Burger|Pizza|Pasta|Steak|Chicken|Fish|Coke|Water|Juice|Tea)\b(?:[ \t]+[A-Za-z]+)*',
            r'[A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*'  # Capitalized multi-word items
        ],
        'PRICE': [
            r'\$\s*\d+(?:[,.]\d{1,2})?',  # US currency format
            r'\d+\.\d{2}'  # Decimal price format
        ],
        'QUANTITY': [
            # Simple numbers like 1, 2, 3. The lookbehind/lookahead pair rejects
            # both halves of a decimal: neither "3" nor "00" in "3.00" matches.
            r'(?<!\d\.)\b\d+\b(?!\.\d+)',
            r'\d+\s*x',  # Format: 2x
            r'x\s*\d+',  # Format: x2
            r'\d+\s+(?:pcs|items?|pieces)'  # Format: 2 items, 3 pieces
        ],
        'DATE': [
            r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',  # DD/MM/YYYY or MM/DD/YYYY
            r'\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4}'  # 20 May 18
        ],
        'TIME': [
            r'\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?'  # 12:55, 12:55 PM
        ],
        'ORDER_NUMBER': [
            r'(?:Order|Invoice|Bill)(?:\s+#|:|\s+Number:?)\s*[A-Za-z0-9-]+',
            r'#\s*[A-Za-z0-9-]+'  # #12345
        ],
        'CONTACT': [
            r'(?:\+\d{1,3}\s*)?(?:\(\d{3,4}\)\s*|\d{3,4}[-\s])\d{3,4}[-\s]\d{4}',  # Phone number formats
            # Email. The TLD class is [A-Za-z]; a '|' inside a character class is
            # a literal pipe, not an alternation.
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        ],
        'ADDRESS': [
            # Trailing \b so a short suffix cannot match a prefix of a longer
            # word (e.g. "St" cutting "Street" short).
            r'\d+\s+[A-Za-z\s]+(?:St|Street|Ave|Avenue|Blvd|Boulevard|Rd|Road|Hwy|Highway)\b',
            r'[A-Za-z]+,\s*[A-Z]{2}\s+\d{5}'  # City, State ZIP
        ]
    }

    # Apply each pattern and extract matches
    for entity_type, pattern_list in patterns.items():
        # Spans already recorded for this entity type, to drop exact duplicates
        seen_spans = set()
        for pattern in pattern_list:
            for match in re.finditer(pattern, text):
                span = match.span()
                if span in seen_spans:
                    continue
                seen_spans.add(span)

                # Store the entity with its position
                start, end = span
                entities[entity_type].append({
                    'text': match.group(),
                    'start': start,
                    'end': end
                })

    return entities

def enhance_spacy_ner(text, extracted_entities=None):
    """
    Combine spaCy's named entities with rule-based entities.

    The text is run through the ``en_core_web_sm`` pipeline and each entity it
    finds is recorded as a dict with 'text', 'start', 'end' and 'label'. Every
    rule-based entity is then appended, labelled with its entity type, unless
    its character range overlaps an entity already in the result (a spaCy
    entity or a rule-based one accepted earlier).

    Args:
        text: Text to analyse.
        extracted_entities: Optional mapping of entity type -> list of dicts
            with 'text', 'start' and 'end' keys. Ignored when falsy.

    Returns:
        List of entity dicts. On any exception the error is printed and an
        empty list is returned.
    """
    try:
        pipeline = spacy.load("en_core_web_sm")
        parsed = pipeline(text)

        # Start from what the statistical model recognised
        merged = [
            {
                'text': ent.text,
                'start': ent.start_char,
                'end': ent.end_char,
                'label': ent.label_
            }
            for ent in parsed.ents
        ]

        # Layer the rule-based hits on top, skipping anything that collides
        for label, candidates in (extracted_entities or {}).items():
            for candidate in candidates:
                collides = any(
                    candidate['start'] < existing['end'] and candidate['end'] > existing['start']
                    for existing in merged
                )
                if collides:
                    continue
                merged.append({
                    'text': candidate['text'],
                    'start': candidate['start'],
                    'end': candidate['end'],
                    'label': label
                })

        return merged

    except Exception as e:
        print(f"Error enhancing NER: {e}")
        return []

# Modify the identify_food_entities function to incorporate rule-based classification
def identify_food_entities(processed_text):
    """
    Identify bill entities by combining regex rules with spaCy's NER.

    Rule-based matches from ``apply_rule_based_classification`` are recorded
    first ('FOOD_ITEM' is stored under 'FOOD', 'PRICE' under 'MONEY'; types
    that are not tracked are dropped). Entities from the spaCy pipeline, which
    is extended with an entity ruler for common food words, are then added only
    when their character range does not overlap anything already recorded.

    Args:
        processed_text: The preprocessed bill text.

    Returns:
        doc: The spaCy Doc for the text, or None if the model is not installed.
        extracted_entities: Dict mapping 'FOOD', 'MONEY', 'QUANTITY', 'DATE',
            'PERSON' and 'ORG' to lists of (text, start_char, end_char) tuples;
            an empty dict if the model is not installed.
    """
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("Error: The spaCy English model is not installed.")
        print("Please run: !python -m spacy download en_core_web_sm")
        return None, {}

    # Regex pass first; its results take priority over the model's
    rule_hits = apply_rule_based_classification(processed_text)

    # Each tuple is one food phrase, one element per token (matched case-insensitively)
    food_terms = [
        ("coffee",), ("lunch",), ("dinner",), ("breakfast",),
        ("salad",), ("sandwich",), ("burger",), ("pizza",),
        ("pasta",), ("steak",), ("chicken",), ("fish",),
        ("coke", "cola"),
        ("water",), ("juice",), ("tea",),
    ]
    food_patterns = [
        {"label": "FOOD", "pattern": [{"LOWER": word} for word in words]}
        for words in food_terms
    ]

    # Register the food patterns ahead of the statistical NER component
    nlp.add_pipe("entity_ruler", before="ner").add_patterns(food_patterns)

    doc = nlp(processed_text)

    # Categories reported to the caller (PERSON: server names, ORG: restaurant names)
    tracked_labels = ("FOOD", "MONEY", "QUANTITY", "DATE", "PERSON", "ORG")
    extracted_entities = {label: [] for label in tracked_labels}

    # Rule-based type names that are stored under a different label
    label_aliases = {'FOOD_ITEM': 'FOOD', 'PRICE': 'MONEY'}

    for rule_label, hits in rule_hits.items():
        bucket = extracted_entities.get(label_aliases.get(rule_label, rule_label))
        if bucket is None:
            # Entity type that is not tracked
            continue
        for hit in hits:
            bucket.append((hit['text'], hit['start'], hit['end']))

    # Model entities fill in only where the rules found nothing
    for ent in doc.ents:
        if ent.label_ not in extracted_entities:
            continue
        clashes = any(
            ent.start_char < known[2] and ent.end_char > known[1]
            for recorded in extracted_entities.values()
            for known in recorded
        )
        if not clashes:
            extracted_entities[ent.label_].append((ent.text, ent.start_char, ent.end_char))

    return doc, extracted_entities