In [3]:
import json
import re
from datetime import datetime

def normalize_keys(data):
    """Standardize invoice keys to a consistent format.

    Known aliases (e.g. ``inv_no``, ``InvoiceNumber``, ``total``) are mapped
    to their canonical names. An alias is matched exactly first; otherwise the
    key is lower-cased with spaces replaced by underscores and matched against
    the lower-cased aliases, so ``"Total"`` and ``"Invoice No"`` resolve the
    same way as ``"total"`` and ``"invoice_no"``. Keys with no known alias are
    returned in that lower-cased, underscore-separated form.

    Args:
        data: Mapping of raw invoice field names (strings) to values.

    Returns:
        A new dict with normalized keys; values are passed through unchanged.
        If several input keys resolve to the same normalized key, the value of
        the last one in iteration order wins.
    """
    key_map = {
        "inv_no": "invoice_number",
        "InvoiceNumber": "invoice_number",
        "invoice_no": "invoice_number",
        "date": "invoice_date",
        "Invoice_Date": "invoice_date",
        "due_date": "due_date",
        "amount": "amount",
        "total": "amount",
        "vendor": "vendor_name",
        "VendorName": "vendor_name",
    }
    # Lower-cased view of the alias table so that case/spacing variants of a
    # known alias still reach the canonical name instead of leaking through
    # the generic fallback (e.g. "Total" -> "amount", not "total").
    folded_map = {alias.lower(): target for alias, target in key_map.items()}

    normalized = {}
    for key, value in data.items():
        if key in key_map:
            new_key = key_map[key]
        else:
            fallback = key.lower().replace(" ", "_")
            new_key = folded_map.get(fallback, fallback)
        normalized[new_key] = value
    return normalized

def normalize_date(date_str):
    """Convert various date formats to YYYY-MM-DD.

    The formats are tried in a fixed order and the first one that parses
    wins. ``%m/%d/%Y`` is tried before ``%d/%m/%Y``, so an ambiguous value
    such as ``"03/04/2024"`` is read as month/day/year.

    Args:
        date_str: The raw date value. Surrounding whitespace is ignored when
            parsing.

    Returns:
        ``None`` if ``date_str`` is falsy; the date as a ``YYYY-MM-DD`` string
        if one of the known formats matches; otherwise ``date_str`` itself,
        unchanged (this includes non-string values).
    """
    if not date_str:
        return None
    # Values loaded from JSON may be numbers or other non-string types;
    # strptime raises TypeError (not ValueError) on those, so pass them
    # through like any other unrecognised value instead of crashing.
    if not isinstance(date_str, str):
        return date_str

    candidate = date_str.strip()
    date_formats = (
        "%m/%d/%Y", "%Y-%m-%d", "%d-%b-%Y", "%d/%m/%Y", "%Y/%m/%d",
    )
    for fmt in date_formats:
        try:
            parsed_date = datetime.strptime(candidate, fmt)
        except ValueError:
            continue
        return parsed_date.strftime("%Y-%m-%d")
    return date_str  # Return original if no format matches

def normalize_amount(amount_str):
    """Clean and standardize currency amounts.

    Currency symbols, letters, thousands separators (commas) and whitespace
    are removed and the remaining digits are parsed as a float. A period is
    treated as the decimal point only when a digit follows it, so
    abbreviations such as ``"Rs. 500"`` parse as ``500.0``.

    The value is negative when the text starts with ``-``, is wrapped in
    parentheses (accounting notation), or has a ``-`` directly before the
    number that is not itself preceded by a digit.

    Args:
        amount_str: The raw amount. Usually a string, but ``int``/``float``
            values (as produced by JSON numbers) are accepted too.

    Returns:
        The amount as a float, or ``0.0`` if the input is falsy, is of an
        unsupported type, or contains no parseable number.
    """
    if not amount_str:
        return 0.0
    # JSON numbers arrive as int/float; re.sub would raise TypeError on them.
    if isinstance(amount_str, (int, float)):
        return float(amount_str)
    if not isinstance(amount_str, str):
        return 0.0

    text = amount_str.strip()
    # Capture the sign before stripping, because the cleanup below removes
    # every non-digit character. The lookbehind keeps a hyphen between two
    # digit groups (e.g. "10-20") from being read as a minus sign.
    is_negative = (
        text.startswith("-")
        or (text.startswith("(") and text.endswith(")"))
        or re.search(r"(?<!\d)-\s*\.?\d", text) is not None
    )
    # Drop periods that are punctuation rather than a decimal point.
    text = re.sub(r"\.(?!\d)", "", text)
    # Remove currency symbols and text
    cleaned = re.sub(r"[^\d.]", "", text)
    try:
        value = float(cleaned)
    except ValueError:
        return 0.0
    # "and value" avoids returning -0.0 for a negative-looking zero.
    return -value if is_negative and value else value

def normalize_invoice_data(input_file, output_file):
    """Main function to normalize invoice data.

    Loads JSON from ``input_file``, normalizes every invoice in it, writes the
    result to ``output_file`` as JSON indented by four spaces, and prints a
    confirmation message. The top-level JSON value is expected to be a list
    of objects, since each element is passed to ``normalize_keys``.

    For each invoice: keys are normalized; ``invoice_date`` and ``due_date``
    (when present) go through ``normalize_date``; ``amount`` (when present)
    goes through ``normalize_amount``; and ``vendor_name`` is set to
    ``"Unknown"`` when it is missing or empty.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write (overwritten if present).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``input_file`` does not contain valid JSON.
    """
    # Read input JSON. The encoding is explicit because open() otherwise
    # falls back to a locale-dependent default.
    with open(input_file, 'r', encoding='utf-8') as f:
        invoices = json.load(f)

    normalized_invoices = []
    for invoice in invoices:
        # Normalize keys
        normalized = normalize_keys(invoice)

        # Normalize dates
        for date_field in ("invoice_date", "due_date"):
            if date_field in normalized:
                normalized[date_field] = normalize_date(normalized[date_field])

        # Normalize amount
        if "amount" in normalized:
            normalized["amount"] = normalize_amount(normalized["amount"])

        # Handle missing or empty vendor_name. A plain .get() default would
        # only cover the missing-key case and let "" / None through.
        normalized["vendor_name"] = normalized.get("vendor_name") or "Unknown"

        normalized_invoices.append(normalized)

    # Write normalized data to output JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(normalized_invoices, f, indent=4)

    print(f"Normalized data written to {output_file}")

if __name__ == "__main__":
    # Script entry point: normalize the sample invoices into the output file.
    normalize_invoice_data(
        input_file="sample_input.json",
        output_file="normalized_output.json",
    )

Normalized data written to normalized_output.json
