In [7]:
import datetime
import re
import subprocess
import pdfplumber # Required for reading the text layer of the OCR'd PDF

# --- 1. Define Document Schema ---

# Field-by-field description of the document being validated.
# Each entry maps a field name to its properties:
#   type            - 'string' or 'date'
#   required        - whether the field must be present
#   format          - optional format label ('alphanumeric', 'MM/DD/YYYY')
#   allowed_values  - optional list of accepted values
document_schema = dict(
    # Identity
    first_name=dict(type='string', required=True),
    last_name=dict(type='string', required=True),
    license_number=dict(type='string', required=True, format='alphanumeric'),
    # Dates
    date_of_birth=dict(type='date', required=True, format='MM/DD/YYYY'),
    expiration_date=dict(type='date', required=True, format='MM/DD/YYYY'),
    # Optional details
    address=dict(type='string', required=False),
    sex=dict(type='string', required=False, allowed_values=['M', 'F']),
)

print("Document Schema Defined.")

# ----------------------------------------------------------------------

# --- 2. OCR and Data Extraction ---

def run_ocrmypdf(input_pdf_path, output_pdf_path):
    """
    Run the external `ocrmypdf` command to produce a searchable PDF.

    Args:
        input_pdf_path: Path of the PDF handed to ocrmypdf as input.
        output_pdf_path: Path ocrmypdf is told to write its output to.

    Returns:
        bool: True when the command exits with status 0; False when the
        executable is not found, it exits non-zero, or any other exception
        is raised while launching it. Failures are reported via print().
    """
    print(f"\n--- Executing OCRmyPDF on {input_pdf_path} ---")

    command = ['ocrmypdf', '--output-type', 'pdfa', '--skip-text', input_pdf_path, output_pdf_path]
    succeeded = False

    try:
        # check=True turns a non-zero exit status into CalledProcessError;
        # both streams are captured as text so stderr can be echoed on failure.
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        print(f"OCRmyPDF executed successfully. OCR'd PDF saved to {output_pdf_path}.")
        succeeded = True
    except FileNotFoundError:
        # Raised when the 'ocrmypdf' executable itself cannot be located.
        print("\n❌ ERROR: OCRmyPDF command not found. Ensure 'ocrmypdf' is installed and in your system PATH.")
    except subprocess.CalledProcessError as proc_err:
        print(f"\n❌ ERROR: OCRmyPDF failed with return code {proc_err.returncode}.")
        # Give a friendlier hint for the common return code 5.
        if proc_err.returncode == 5:
            print("HINT: Return code 5 often means a file permission error (cannot write to the output path).")
        print(f"Standard Error:\n{proc_err.stderr}")
    except Exception as unexpected:
        print(f"\n❌ An unexpected error occurred during OCR: {unexpected}")

    return succeeded


def extract_structured_data(ocr_pdf_path):
    """
    Read the text layer of the OCR'd PDF and extract fields using simple regex rules.

    *** You MUST customize the regex rules below for your specific document layout. ***

    Args:
        ocr_pdf_path: Path of the searchable PDF opened with pdfplumber.

    Returns:
        dict: Raw (un-normalized) values for whichever patterns matched, under
        the keys 'license_number', 'date_of_birth', 'gender',
        'expiration_date', 'street_address', 'first_name' and 'last_name'.
        Returns {} if the PDF cannot be read. If the PDF yields no text at
        all, a hard-coded simulated record is returned instead so the
        downstream validation can still be exercised.
    """
    print(f"\n--- Attempting text extraction from {ocr_pdf_path} ---")

    data = {}

    try:
        with pdfplumber.open(ocr_pdf_path) as pdf:
            # extract_text() may yield None for a page without a text layer;
            # treat that as an empty page instead of failing on None + "\n",
            # which would discard the text of every other page as well.
            full_text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
    except Exception as e:
        print(f"Error reading PDF text with pdfplumber (Is the OCR'd file valid?): {e}")
        return {}

    print("\n--- RAW TEXT EXTRACTED ---")
    # Only print a snippet to prevent massive output
    print(full_text[:500])
    print("--------------------------\n")
    # ------------------ CUSTOM EXTRACTION LOGIC GOES HERE --------------------

    if not full_text.strip():
        # Placeholder: deliberately imperfect SIMULATED data so the
        # normalization/validation steps can be tested without real text.
        print("WARNING: Extracted text is empty. Using simulated data for validation test.")
        data = {
            'first_name': 'JANE',
            'last_name': 'SMITH',
            'license_number': '123-ABC-456', # Format error test (non-alphanumeric chars)
            'date_of_birth': '1985/05/20', # Date format test (YYYY/MM/DD)
            'expiration_date': '01/01/2030',
            'address': '789 Oak Ave, Example City',
            'gender': 'Female' # Needs normalization to 'F'
        }
    else:
        # License number: a label (DLN / Driver's License / License) followed,
        # possibly several lines later, by three dash-separated digit triples.
        license_match = re.search(r'(?:DLN|Driver(?:\'|’)?s?\s+License|License)\b[\s\S]*?([0-9]{3}-[0-9]{3}-[0-9]{3})', full_text, re.IGNORECASE)
        if license_match:
            # Only the digit groups are captured; a literal 'S' is prepended.
            # NOTE(review): presumably the number's leading letter, which the
            # pattern does not capture — confirm for your document.
            data['license_number'] = 'S' + license_match.group(1)

        # Date of birth: label then a date with '-' or '/' separators.
        # NOTE(review): '3008' looks like an OCR misreading of the label — confirm.
        dob_match = re.search(r'(Date of Birth|DOB|3008)[:\s]*(\d{1,4}[-/]\d{1,2}[-/]\d{2,4})', full_text, re.IGNORECASE)
        if dob_match:
            data['date_of_birth'] = dob_match.group(2)

        # Sex: stored under 'gender'; mapping to M/F happens during normalization.
        sex_match = re.search(r'Sex[:\s]*(Male|Female|M|F)', full_text, re.IGNORECASE)
        if sex_match:
            data['gender'] = sex_match.group(1)

        # Expiration date: same date shape as the date of birth.
        # NOTE(review): 'exe' looks like an OCR misreading of the label — confirm.
        exp_match = re.search(r'(Expiration Date|Expiry|exe)[:\s]*(\d{1,4}[-/]\d{1,2}[-/]\d{2,4})', full_text, re.IGNORECASE)
        if exp_match:
            data['expiration_date'] = exp_match.group(2)

        # Street address: house number, optional one-letter token, one or more
        # words, then a street-type suffix.
        address_pattern = re.compile(r'\b\d{1,6}\s+(?:[A-Z]\s+)?[A-Z0-9]+(?:\s+[A-Z0-9]+)*\s+(?:ST|RD|AVE|BLVD|DR|LN|CT|HWY|PKWY|PL|TER|WAY|CIR)\b',re.IGNORECASE)
        address_match = address_pattern.search(full_text)
        if address_match:
            # NOTE(review): this key is 'street_address' while the schema field
            # (and the simulated record above) use 'address'; whatever consumes
            # this dict must map one onto the other or the value is lost.
            data['street_address'] = address_match.group(0)

        # NOTE(review): in the original code the match object named
        # `last_name_match` populated 'first_name' and `first_name_match`
        # populated 'last_name'. The key assignments are preserved exactly;
        # confirm against a real document that they are not swapped.
        single_word_match = re.search(r'~ \+([A-Z]+)',full_text)
        if single_word_match:
            data['first_name'] = single_word_match.group(1)

        two_word_match = re.search(r'(?:^|\n)[^A-Za-z]*([A-Z]+ [A-Z]+)(?!.*LICENSE)',full_text)
        if two_word_match:
            data['last_name'] = two_word_match.group(1)

        # You will need much more advanced logic for names and addresses!

    # ------------------------------------------------------------------------

    print("Structured Data Extracted (Pre-Normalization):")
    print(data)
    return data


def normalize_ocr_data(ocr_data, document_schema):
    """
    Normalize keys and values from OCR output.

    Keys are stripped and lower-cased, then mapped onto schema field names
    through an alias table; keys that are neither an alias nor a schema field
    are dropped. Values are cleaned per target field:
      * dates are rewritten as MM/DD/YYYY when they parse in a known format,
      * sex is collapsed to 'M' / 'F' when recognised,
      * license numbers lose every non-alphanumeric character,
      * all string values are stripped.

    Args:
        ocr_data: Mapping of raw field name -> raw value (None is treated as
            an empty mapping).
        document_schema: Mapping whose keys are the canonical field names.

    Returns:
        dict: One entry per schema field; fields with no extracted value are
        set to None.
    """
    normalized = {}

    # Alias -> schema field. Aliases MUST be lower-case because incoming keys
    # are lower-cased before the lookup.
    key_map = {
        'gender': 'sex', 'sex': 'sex',
        'dob': 'date_of_birth', 'birth_date': 'date_of_birth',
        'expiry': 'expiration_date', 'exp_date': 'expiration_date',
        '4d dln': 'license_number',
        'street_address': 'address',
    }

    def normalize_date(val):
        """Return val as MM/DD/YYYY if it parses in a known format, else unchanged."""
        if not isinstance(val, str):
            return val
        val = val.strip()
        # First matching format wins, so an ambiguous value such as 01/02/2030
        # is read month-first.
        for fmt in ('%m/%d/%Y', '%m-%d-%Y', '%Y-%m-%d', '%Y/%m/%d', '%d-%m-%Y', '%d/%m/%Y'):
            try:
                return datetime.datetime.strptime(val, fmt).strftime('%m/%d/%Y')
            except ValueError:
                continue
        return val

    def normalize_gender(val):
        """Map M/MALE/F/FEMALE (any case) to 'M'/'F'; leave anything else as-is."""
        if not isinstance(val, str):
            return val
        v = val.strip().upper()
        if v in ('M', 'MALE'):
            return 'M'
        if v in ('F', 'FEMALE'):
            return 'F'
        return val

    for k, v in (ocr_data or {}).items():
        if not isinstance(k, str):
            # Non-string keys cannot name a schema field.
            continue
        k_norm = k.strip().lower()
        target = key_map.get(k_norm, k_norm if k_norm in document_schema else None)

        if target:
            if target in ('date_of_birth', 'expiration_date'):
                v = normalize_date(v)
            elif target == 'sex':
                v = normalize_gender(v)
            elif target == 'license_number' and isinstance(v, str):
                v = re.sub(r'[^A-Za-z0-9]', '', v)

            if isinstance(v, str):
                v = v.strip()

            normalized[target] = v

    # Make every schema field present so downstream code can rely on the keys.
    for schema_key in document_schema.keys():
        if schema_key not in normalized:
            normalized[schema_key] = None

    print('\nNormalized OCR data (pre-validation):')
    print(normalized)
    return normalized

# ----------------------------------------------------------------------

# --- 3. Validation Function ---

def _strptime_pattern(date_format):
    """Translate a schema date format such as 'MM/DD/YYYY' into a strptime pattern.

    A format that already contains '%' is used unchanged. A format with no
    recognised token falls back to '%m/%d/%Y'.
    """
    fallback = '%m/%d/%Y'
    if not isinstance(date_format, str):
        return fallback
    if '%' in date_format:
        return date_format
    tokens = {'YYYY': '%Y', 'YY': '%y', 'MM': '%m', 'DD': '%d'}
    translated = re.sub(r'YYYY|YY|MM|DD', lambda m: tokens[m.group(0)], date_format)
    return translated if '%' in translated else fallback


def validate_ocr_data(ocr_data, document_schema):
    """
    Validate extracted OCR data against a defined schema.

    Args:
        ocr_data: Mapping of field name -> value (already normalized).
        document_schema: Mapping of field name -> properties; the properties
            read here are 'required', 'type', 'format' and 'allowed_values'.

    Returns:
        dict: Report with the list-valued keys 'missing_fields',
        'type_mismatches', 'format_errors', 'value_out_of_range' and
        'invalid_values'. 'value_out_of_range' is never populated by this
        function. A field is checked in this order and stops at the first
        missing/type/date-format failure.
    """
    validation_report = {
        'missing_fields': [], 'type_mismatches': [], 'format_errors': [],
        'value_out_of_range': [], 'invalid_values': []
    }

    for field_name, schema_props in document_schema.items():
        field_value = ocr_data.get(field_name)

        # None and blank strings both count as "not provided".
        is_missing = field_value is None or (isinstance(field_value, str) and field_value.strip() == '')
        if is_missing:
            if schema_props.get('required'):
                validation_report['missing_fields'].append(field_name)
            # A blank optional field has nothing to validate; without this a
            # blank value would be reported as an invalid allowed_values entry.
            continue

        expected_type = schema_props.get('type')

        if expected_type in ('string', 'date') and not isinstance(field_value, str):
            validation_report['type_mismatches'].append({'field': field_name, 'expected': expected_type, 'actual': type(field_value).__name__})
            continue

        if expected_type == 'date':
            date_format = schema_props.get('format')
            if date_format:
                try:
                    # Honour the schema's declared format instead of assuming MM/DD/YYYY.
                    datetime.datetime.strptime(field_value, _strptime_pattern(date_format))
                except ValueError:
                    validation_report['format_errors'].append({'field': field_name, 'expected_format': date_format, 'actual_value': field_value})
                    continue

        # Applies to any string-valued field the schema marks as alphanumeric.
        if schema_props.get('format') == 'alphanumeric' and isinstance(field_value, str):
            if not re.match(r'^[a-zA-Z0-9]+$', field_value):
                validation_report['format_errors'].append({'field': field_name, 'expected_format': 'alphanumeric', 'actual_value': field_value})

        allowed_values = schema_props.get('allowed_values')
        if allowed_values and field_value not in allowed_values:
            validation_report['invalid_values'].append({'field': field_name, 'expected_one_of': allowed_values, 'actual_value': field_value})

    return validation_report

# ----------------------------------------------------------------------

# --- 4. Execution and Reporting ---

# ⚠️ Set your local file paths here ⚠️
# NOTE(review): INPUT_PDF is an absolute, machine-specific path; change it
# before running this notebook anywhere else.
INPUT_PDF = '/Users/octane.hinojosa/Downloads/aitest (1).pdf'
OUTPUT_PDF = 'document_ocrd.pdf'

print("\n--- Starting Data Processing and Validation ---")

# 1. Run OCRmyPDF; every later step needs the searchable PDF it writes.
if run_ocrmypdf(INPUT_PDF, OUTPUT_PDF):

    # 2. Extract Structured Data
    ocr_data = extract_structured_data(OUTPUT_PDF)

    # 3. Normalize extracted data
    ocr_data_normalized = normalize_ocr_data(ocr_data, document_schema)

    # 4. Run your Validation Logic
    report = validate_ocr_data(ocr_data_normalized, document_schema)

    print("\n\n---  FINAL VALIDATION REPORT ---")

    # Print one section per non-empty report category.
    has_discrepancies = False
    for category, findings in report.items():
        if findings:
            has_discrepancies = True
            print(f"\n⚠️ **{category.replace('_', ' ').upper()}:** ⚠️")
            for item in findings:
                if isinstance(item, dict):
                    # Structured finding: render every key/value pair.
                    details = "; ".join([f"**{k.replace('_', ' ')}**: '{v}'" for k, v in item.items()])
                    print(f"  - {details}")
                else:
                    # Plain finding (e.g. a missing field name).
                    print(f"  - **Field**: {item}")

    if has_discrepancies:
        # Use a warning mark here: a success check mark on the failure summary
        # makes a failed validation look like a pass at a glance.
        print("\n **⚠️ SUMMARY: Validation completed with critical discrepancies found. Review and manual correction required.**")
    else:
        print("\n **✅ SUMMARY: Validation completed successfully. No discrepancies found.**")
else:
    print("\nProcessing Halted: Could not successfully complete the OCR step.")

Document Schema Defined.

--- Starting Data Processing and Validation ---

--- Executing OCRmyPDF on /Users/octane.hinojosa/Downloads/aitest (1).pdf ---

❌ ERROR: OCRmyPDF command not found. Ensure 'ocrmypdf' is installed and in your system PATH.

Processing Halted: Could not successfully complete the OCR step.


In [5]:
pip install --upgrade ocrmypdf

Collecting ocrmypdf
  Using cached ocrmypdf-15.4.4-py39-none-any.whl (152 kB)
Collecting reportlab>=3.6.8
  Using cached reportlab-4.4.5-py3-none-any.whl (2.0 MB)
Collecting rich>=13
  Using cached rich-14.2.0-py3-none-any.whl (243 kB)
Collecting pikepdf>=8.7.1
  Using cached pikepdf-9.11.0.tar.gz (4.5 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting pluggy>=0.13.0
  Using cached pluggy-1.6.0-py3-none-any.whl (20 kB)
Collecting img2pdf>=0.4.4
  Using cached img2pdf-0.6.3-py3-none-any.whl (49 kB)
Collecting deprecation>=2.1.0
  Using cached deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting Deprecated
  Using cached deprecated-1.3.1-py2.py3-none-any.whl (11 kB)
Collecting lxml>=4.8
  Using cached lxml-6.0.2-cp39-cp39-macosx_10_9_universal2.whl (8.6 MB)
Collecting markdown-it-py>=2.2.0
  Using cached markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
Collecting mdur

In [4]:
!apt-get update && apt-get install ocrmypdf

zsh:1: command not found: apt-get


In [2]:
pip install pdfplumber

Collecting pdfplumber
  Using cached pdfplumber-0.11.8-py3-none-any.whl (60 kB)
Collecting Pillow>=9.1
  Using cached pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl (4.7 MB)
Collecting pypdfium2>=4.18.0
  Using cached pypdfium2-5.1.0-py3-none-macosx_11_0_arm64.whl (2.8 MB)
Collecting pdfminer.six==20251107
  Using cached pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
Collecting cryptography>=36.0.0
  Using cached cryptography-46.0.3-cp38-abi3-macosx_10_9_universal2.whl (7.2 MB)
Collecting charset-normalizer>=2.0.0
  Using cached charset_normalizer-3.4.4-cp39-cp39-macosx_10_9_universal2.whl (209 kB)
Collecting cffi>=2.0.0
  Using cached cffi-2.0.0-cp39-cp39-macosx_11_0_arm64.whl (180 kB)
Collecting pycparser
  Using cached pycparser-2.23-py3-none-any.whl (118 kB)
Installing collected packages: pycparser, cffi, cryptography, charset-normalizer, pypdfium2, Pillow, pdfminer.six, pdfplumber
Successfully installed Pillow-11.3.0 cffi-2.0.0 charset-normalizer-3.4.4 cryptography-46.0.3 pdfminer