In [2]:
!pip install pytesseract
!pip install nltk
!pip install requests
!pip install python-dotenv
!pip install spacy
!python -m spacy download en_core_web_sm

from PIL import Image
import pytesseract
# Import required libraries
import requests
import json
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import spacy
from spacy.tokens import Doc
from spacy.util import filter_spans

























In [3]:
import os
import pytesseract

# Locate the Tesseract binary. The TESSERACT_CMD environment variable takes
# precedence; otherwise fall back to the default Windows install location.
# The path is only applied when the file actually exists, so on machines where
# Tesseract is on PATH (Linux/macOS/conda) pytesseract keeps its own default
# instead of being pointed at a non-existent executable.
_tesseract_cmd = os.getenv('TESSERACT_CMD', r'C:/Program Files/Tesseract-OCR/tesseract.exe')
if os.path.isfile(_tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

# Load the image from file
image_path = 'test4.jpeg'  # Replace with your image file path
image = Image.open(image_path)

In [4]:
# Run OCR over the loaded image; the raw text feeds every later cell
extracted_text = pytesseract.image_to_string(image)

# Show the heading and the OCR output on separate lines
print("Extracted Text:", extracted_text, sep="\n")



In [5]:
def preprocess_text(text):
    """
    Tokenize text, drop English stopwords and count word frequencies.

    Each intermediate result (sentences, words, filtered words and the ten
    most common words) is printed as it is produced.

    Args:
        text: The raw text to analyse.

    Returns:
        dict with keys:
            "sentences"      - list of sentence strings from sent_tokenize
            "filtered_words" - word tokens whose lowercase form is not an
                               English stopword (original casing is kept)
            "freq_dist"      - FreqDist built from the filtered words
    """
    # Sentence-level split
    sentence_list = sent_tokenize(text)
    print("\nTokenized Sentences:")
    print(sentence_list)

    # Word-level split
    tokens = word_tokenize(text)
    print("\nTokenized Words:")
    print(tokens)

    # Stopword removal: compare case-insensitively, keep the original token
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in tokens:
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)
    print("\nFiltered Words (Stopwords Removed):")
    print(kept_tokens)

    # Frequency table over the surviving tokens
    frequencies = FreqDist(kept_tokens)
    print("\nWord Frequencies:")
    for token, count in frequencies.most_common(10):
        print(f"{token}: {count}")

    return dict(
        sentences=sentence_list,
        filtered_words=kept_tokens,
        freq_dist=frequencies,
    )

# Run the preprocessing pipeline on the OCR output and show the returned dict
processed_data = preprocess_text(text=extracted_text)
print(processed_data)



In [6]:
# `processed_data` was already computed by the preprocess_text(extracted_text)
# call at the end of the previous cell. Display it here instead of re-running
# the whole pipeline, which would re-print every intermediate step.
print(processed_data)



In [7]:
import requests
import json
import os
from dotenv import load_dotenv

load_dotenv()

def summarize_with_gemini(api_key, extracted_text, custom_prompt, timeout=30):
    """
    Summarize text using Google's Gemini API.

    Args:
        api_key: Gemini API key. It is sent in the ``x-goog-api-key`` header
            rather than the URL so it cannot leak through error messages
            that include the request URL.
        extracted_text: The text to analyse.
        custom_prompt: Instructions placed before the text in the prompt.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: ``{"success": True, "summary": <str>}`` on success, otherwise
        ``{"success": False, "error": <str>}``. This function reports
        failures through the return value instead of raising.
    """
    # Fail fast instead of sending a request authenticated as "None".
    if not api_key:
        return {
            "success": False,
            "error": "Missing Gemini API key (set GEMINI_API_KEY)"
        }

    # Define the API endpoint (the key is passed as a header, not a query parameter)
    endpoint = "https://generativelanguage.googleapis.com/v1/models/gemini-pro:generateContent"

    # Construct the payload according to Gemini API specifications
    payload = {
        "contents": [{
            "parts": [{
                "text": f"Instructions: {custom_prompt}\n\nText to analyze: {extracted_text}"
            }]
        }],
        "generationConfig": {
            "temperature": 0.7,
            "maxOutputTokens": 300,
            "topP": 0.8,
            "topK": 40
        }
    }

    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": api_key
    }

    try:
        # Send the POST request; the timeout prevents the cell from hanging forever
        response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        return {
            "success": False,
            "error": f"Request failed: {str(e)}"
        }

    # The body may not be JSON (e.g. an HTML error page from a proxy).
    try:
        response_data = response.json()
    except ValueError:
        response_data = None

    if response.status_code != 200:
        error_info = response_data.get('error') if isinstance(response_data, dict) else None
        if isinstance(error_info, dict):
            error_message = error_info.get('message', 'Unknown error occurred')
        else:
            error_message = 'Unknown error occurred'
        return {
            "success": False,
            "error": f"API request failed with status {response.status_code}: {error_message}"
        }

    # Extract the generated text from the first candidate's first part
    try:
        summary = response_data['candidates'][0]['content']['parts'][0]['text']
    except (KeyError, IndexError, TypeError):
        return {
            "success": False,
            "error": "Unexpected response structure"
        }

    return {
        "success": True,
        "summary": summary
    }

# Example usage
if __name__ == "__main__":
    # Read the API key from the environment (load_dotenv() above also picks up a .env file)
    api_key = os.getenv('GEMINI_API_KEY')

    # Example text and prompt
    extracted_text = pytesseract.image_to_string(image)
    custom_prompt = "explain all the dishes?"

    if not api_key:
        # os.getenv returns None when the variable is unset; do not send a
        # request authenticated with the literal string "None".
        print("Error:", "GEMINI_API_KEY is not set. Add it to your environment or .env file.")
    else:
        # Call the function
        result = summarize_with_gemini(api_key, extracted_text, custom_prompt)

        # Handle the result
        if result.get("success", False):
            print("Summary:", result["summary"])
        else:
            print("Error:", result.get("error", "An unknown error occurred"))



In [9]:
def clean_and_normalize_text(text):
    """
    Extract menu item names and their line totals from receipt text.

    Lines are only considered between the table header
    ("item Qty. Rate Total") and the totals section ("Total Qty:" or
    "Sub Total:"); both markers are matched case-insensitively. Each item
    line is expected to look like ``<item name> <qty> <rate> <total>``.

    Args:
        text: Raw receipt text (e.g. OCR output), one receipt row per line.

    Returns:
        dict with two parallel lists:
            'food_items' - title-cased item names with punctuation removed
            'prices'     - the line total of each item as a float
    """
    import re

    def _as_number(token):
        # Return the token as a float if it is a plain number with at most
        # one decimal point, otherwise None.
        if not token.replace('.', '', 1).isdigit():
            return None
        try:
            return float(token)
        except ValueError:
            return None

    def _clean_name(raw_name):
        # Strip punctuation / OCR noise (anything that is neither a word
        # character nor whitespace), then collapse leftover whitespace runs.
        return ' '.join(re.sub(r'[^\w\s]', '', raw_name).split())

    food_items = []
    prices = []

    # Flag to identify menu section
    in_menu_section = False

    for line in text.split('\n'):
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        lowered = line.lower()

        # Start capturing items after the header
        if 'item qty. rate total' in lowered:
            in_menu_section = True
            continue

        # Stop capturing when we hit totals section
        if 'total qty:' in lowered or 'sub total:' in lowered:
            in_menu_section = False
            continue

        if not in_menu_section:
            continue

        # Pattern: [Item name] [quantity] [rate] [total]
        parts = line.split()
        if len(parts) < 4:  # Need at least item name, qty, rate, total
            continue

        # Last token is normally the total price
        try:
            total = float(parts[-1])
        except ValueError:
            total = None

        if total is not None:
            # Drop the last 3 tokens (qty, rate, total) to get the item name
            item_name = _clean_name(' '.join(parts[:-3]))
            if len(item_name) > 2:
                food_items.append(item_name.title())
                prices.append(total)
        elif 'BIRYANI' in line:
            # Fallback for biryani lines whose last token is not a number:
            # the last numeric token is the total, the non-numeric tokens
            # form the name.
            numbers = [value for value in map(_as_number, parts) if value is not None]
            if numbers:
                name_tokens = [p for p in parts if _as_number(p) is None]
                item_name = _clean_name(' '.join(name_tokens))
                food_items.append(item_name.title())
                prices.append(numbers[-1])

    return {
        'food_items': food_items,
        'prices': prices
    }

def generate_summary(processed_text):
    """
    Build a one-sentence description of the bill.

    Args:
        processed_text: dict with 'food_items' (list of item names) and
            'prices' (list of numeric line totals).

    Returns:
        A sentence listing the items and, when prices are present, their
        sum formatted to two decimals; a fixed message when there are no
        items.
    """
    dish_names = processed_text['food_items']
    amounts = processed_text['prices']

    # Guard clause: nothing to describe
    if not dish_names:
        return "No food items found in the bill."

    # Choose the listing form by item count: "A", "A and B", or "A, B, and C"
    count = len(dish_names)
    if count == 1:
        listing = f"{dish_names[0]}"
    elif count == 2:
        listing = f"{dish_names[0]} and {dish_names[1]}"
    else:
        leading = ", ".join(dish_names[:-1])
        listing = leading + f", and {dish_names[-1]}"

    pieces = ["The bill includes ", listing]
    if amounts:
        pieces.append(f". The total amount is Rs. {sum(amounts):.2f}")

    return "".join(pieces)

def extract_food_entities_with_spacy(text):
    """
    Find food-related tokens in text with a spaCy entity ruler.

    An entity ruler holding one case-insensitive single-token pattern per
    keyword is inserted before the model's "ner" component, and every
    entity labelled "FOOD" is collected.

    Args:
        text: The text to scan.

    Returns:
        List of matched entity texts in document order, or an empty list
        (after printing install instructions) when an OSError is raised,
        e.g. because the model cannot be loaded.
    """
    # Keywords that should be tagged as FOOD, matched on the lowercase token
    food_keywords = (
        "tandoori",
        "chicken",
        "biryani",
        "roti",
        "dal",
        "tadka",
        "spicy",
        "lasooni",
        "hyderabadi",
        "murg",
    )

    try:
        pipeline = spacy.load("en_core_web_sm")

        # Register the custom patterns ahead of the statistical NER step
        entity_ruler = pipeline.add_pipe("entity_ruler", before="ner")
        entity_ruler.add_patterns([
            {"label": "FOOD", "pattern": [{"LOWER": keyword}]}
            for keyword in food_keywords
        ])

        parsed = pipeline(text)

        # Keep only the FOOD-labelled spans
        matches = []
        for span in parsed.ents:
            if span.label_ == "FOOD":
                matches.append(span.text)
        return matches

    except OSError:
        print("Error: The spaCy English model is not installed.")
        print("Please run: !python -m spacy download en_core_web_sm")
        return []

# Run the spaCy-based extractor over the OCR text and list what it found
food_items = extract_food_entities_with_spacy(extracted_text)
print("Food items identified using spaCy NER:")
if food_items:
    # One bullet per detected entity
    print("\n".join(f"- {item}" for item in food_items))



In [10]:
def apply_rule_based_classification(text):
    """
    Apply rule-based token classification with regex patterns to identify domain-specific entities.

    Every pattern of every entity type is run over the whole text
    independently, so the same span may be reported under several types
    (or more than once under one type when two patterns match it).

    Args:
        text: The receipt text to scan.

    Returns:
        defaultdict(list) mapping entity type (e.g. 'FOOD_ITEM', 'PRICE',
        'CONTACT') to a list of dicts with keys 'text', 'start' and 'end'
        (character offsets into ``text``). Types with no match are absent.
    """
    import re
    from collections import defaultdict

    # Dictionary to store all recognized entities
    entities = defaultdict(list)

    # Define patterns for different entity types
    patterns = {
        'FOOD_ITEM': [
            r'(?:Tandoori|Lasooni|HYDERABADI|BIRYANI|Dal|Tadka|Roti|chicken|spicy)\s*[\w\s]*',  # Food items
            r'[A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*'  # Capitalized multi-word items
        ],
        'PRICE': [
            r'(?:Rs\.?|₹)?\s*\d+(?:[,.]\d{1,2})?',  # Indian currency format
            r'\d+\.\d{2}'  # Decimal price format
        ],
        'QUANTITY': [
            r'\b\d+\b(?!\.\d+)',  # Simple numbers like 1, 2, 3
            r'\d+\s*x',  # Format: 2x
            r'x\s*\d+',  # Format: x2
            r'\d+\s+(?:pcs|items?|pieces)'  # Format: 2 items, 3 pieces
        ],
        'DATE': [
            r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',  # DD/MM/YYYY or MM/DD/YYYY
            r'\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4}'  # 20 May 18
        ],
        'TIME': [
            r'\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?'  # 12:55, 12:55 PM
        ],
        'ORDER_NUMBER': [
            r'(?:Order|Invoice|Bill)(?:\s+#|:|\s+Number:?)\s*[A-Za-z0-9-]+',  # Invoice Number: IN001001259
            r'#\s*[A-Za-z0-9-]+'  # #12345
        ],
        'CONTACT': [
            r'(?:\+\d{1,3}\s*)?(?:\(\d{3,4}\)\s*|\d{3,4}[-\s])\d{3,4}[-\s]\d{4}',  # Phone number formats
            # Email. The TLD class is [A-Za-z]: inside a character class "|"
            # is a literal pipe, not alternation, so it must not appear here.
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        ],
        'ADDRESS': [
            r'\d+/\d+\s+\w+[-\s]\s*\d+',  # 11/2 Sector- 37
            r'[A-Z][a-z]+[-\s]\s*\d+'  # Faridabad- 121003
        ]
    }

    # Apply each pattern and extract matches
    for entity_type, pattern_list in patterns.items():
        for pattern in pattern_list:
            for match in re.finditer(pattern, text):
                # Get the matched text and its position
                start, end = match.span()
                matched_text = match.group()

                # Some post-processing and validation
                if entity_type == 'FOOD_ITEM' and len(matched_text) < 3:
                    continue  # Skip very short food items

                # Defensive check: the current PRICE patterns always require
                # a digit, so this only matters if the patterns are changed.
                if entity_type == 'PRICE' and not any(c.isdigit() for c in matched_text):
                    continue  # Ensure prices contain digits

                # Store the entity with its position
                entities[entity_type].append({
                    'text': matched_text,
                    'start': start,
                    'end': end
                })

    return entities

def enhance_ner_with_rules(text):
    """
    Merge spaCy entity recognition with the regex-based classifier.

    Rule-based results are taken as-is. A spaCy entity is added only when
    its character span does not overlap any entity already in the merged
    result (rule-based entities and previously accepted spaCy entities).

    Args:
        text: The text to analyse.

    Returns:
        dict mapping entity type to a list of {'text', 'start', 'end'}
        dicts. If the spaCy step raises, the error is printed and only the
        rule-based entities are returned.
    """
    food_terms = (
        "tandoori",
        "chicken",
        "biryani",
        "roti",
        "dal",
        "tadka",
        "spicy",
        "lasooni",
        "hyderabadi",
        "murg",
    )
    # spaCy labels that are collected; any other label is ignored
    tracked_labels = ("FOOD", "CARDINAL", "MONEY", "DATE", "TIME", "ORG", "PERSON")

    # --- spaCy pass -------------------------------------------------------
    try:
        pipeline = spacy.load("en_core_web_sm")

        # Custom patterns run before the statistical NER component
        entity_ruler = pipeline.add_pipe("entity_ruler", before="ner")

        ruler_patterns = [
            {"label": "FOOD", "pattern": [{"LOWER": term}]}
            for term in food_terms
        ]
        ruler_patterns += [
            {"label": "QUANTITY", "pattern": [{"IS_DIGIT": True}]},
            {"label": "PRICE", "pattern": [{"SHAPE": "ddd.dd"}]},
            {"label": "PRICE", "pattern": [{"SHAPE": "d,ddd.dd"}]},
        ]
        entity_ruler.add_patterns(ruler_patterns)

        parsed = pipeline(text)

        spacy_entities = {label: [] for label in tracked_labels}
        for span in parsed.ents:
            bucket = spacy_entities.get(span.label_)
            if bucket is None:
                continue
            bucket.append({
                'text': span.text,
                'start': span.start_char,
                'end': span.end_char
            })

    except Exception as e:
        print(f"Error in spaCy processing: {e}")
        spacy_entities = {}

    # --- rule-based pass --------------------------------------------------
    rule_entities = apply_rule_based_classification(text)

    # Rule-based entities seed the merged result and take priority
    combined_entities = dict(rule_entities)

    def _collides(candidate):
        # True when the candidate's character span intersects any entity
        # already present in the merged result, regardless of its type.
        return any(
            candidate['start'] < existing['end'] and candidate['end'] > existing['start']
            for existing_group in combined_entities.values()
            for existing in existing_group
        )

    # --- merge ------------------------------------------------------------
    for label, found in spacy_entities.items():
        target = combined_entities.setdefault(label, [])
        for candidate in found:
            if not _collides(candidate):
                target.append(candidate)

    return combined_entities

def _print_entity_groups(entities_by_type):
    """Print every non-empty entity group as a header line followed by one bullet per entity."""
    for label, group in entities_by_type.items():
        if not group:
            continue
        print(f"\n{label}:")
        for found in group:
            print(f"  • {found['text']}")


# Regex-only classification of the OCR text
print("\nApplying rule-based token classification to receipt text:")
rule_based_entities = apply_rule_based_classification(extracted_text)
_print_entity_groups(rule_based_entities)

# spaCy NER merged with the regex results
print("\nCombining spaCy NER with rule-based classification:")
combined_entities = enhance_ner_with_rules(extracted_text)
_print_entity_groups(combined_entities)

