In [8]:
import datetime
import re
import subprocess
import pdfplumber # Required for reading the text layer of the OCR'd PDF

# --- 1. Define Document Schema ---

document_schema = {
    'first_name': {
        'type': 'string',
        'required': True
    },
    'last_name': {
        'type': 'string',
        'required': True
    },
    'license_number': {
        'type': 'string',
        'required': True,
        'format': 'alphanumeric'
    },
    'date_of_birth': {
        'type': 'date',
        'required': True,
        'format': 'MM/DD/YYYY'
    },
    'expiration_date': {
        'type': 'date',
        'required': True,
        'format': 'MM/DD/YYYY'
    },
    'address': {
        'type': 'string',
        'required': False
    },
    'sex': {
        'type': 'string',
        'required': False,
        'allowed_values': ['M', 'F']
    }
}

print("Document Schema Defined.")

# ----------------------------------------------------------------------

# --- 2. OCR and Data Extraction ---

def run_ocrmypdf(input_pdf_path, output_pdf_path):
    """
    Calls the external OCRmyPDF command line utility to create a searchable PDF.
    """
    print(f"\n--- Executing OCRmyPDF on {input_pdf_path} ---")
    try:
        subprocess.run(
            ['ocrmypdf', '--output-type', 'pdfa', '--skip-text', input_pdf_path, output_pdf_path],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )

        print(f"OCRmyPDF executed successfully. OCR'd PDF saved to {output_pdf_path}.")
        return True
    except FileNotFoundError:
        print("\n❌ ERROR: OCRmyPDF command not found. Ensure 'ocrmypdf' is installed and in your system PATH.")
        return False
    except subprocess.CalledProcessError as e:
        print(f"\n❌ ERROR: OCRmyPDF failed with return code {e.returncode}.")
        # Use a more user-friendly message for the common return code 5 (permission denied)
        if e.returncode == 5:
            print("HINT: Return code 5 often means a file permission error (cannot write to the output path).")
        print(f"Standard Error:\n{e.stderr}")
        return False
    except Exception as e:
        print(f"\n❌ An unexpected error occurred during OCR: {e}")
        return False


def extract_structured_data(ocr_pdf_path):
    """
    Reads the text layer of the OCR'd PDF and extracts fields using simple regex rules.
    *** You MUST customize the regex rules below for your specific document layout. ***
    """
    print(f"\n--- Attempting text extraction from {ocr_pdf_path} ---")

    full_text = ""
    data = {}

    try:
        with pdfplumber.open(ocr_pdf_path) as pdf:
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error reading PDF text with pdfplumber (Is the OCR'd file valid?): {e}")
        return {}

    print("\n--- RAW TEXT EXTRACTED ---")
    # Only print a snippet to prevent massive output
    print(full_text[:500])
    print("--------------------------\n")
    # ------------------ CUSTOM EXTRACTION LOGIC GOES HERE --------------------

    # Placeholder/Example Logic: Use the known-failing SIMULATED data for robust validation testing
    # If your regex fails, this will be the output.
    if full_text.strip() == "":
        print("WARNING: Extracted text is empty. Using simulated data for validation test.")
        data = {
            'first_name': 'JANE',
            'last_name': 'SMITH',
            'license_number': '123-ABC-456', # Format error test (non-alphanumeric chars)
            'date_of_birth': '1985/05/20', # Date format test (YYYY/MM/DD)
            'expiration_date': '01/01/2030',
            'address': '789 Oak Ave, Example City',
            'gender': 'Female' # Needs normalization to 'F'
        }
    else:
        # Example 1: Extract License Number (assuming pattern DLN: XXXX)

        license_match = re.search(r'(?:DLN|Driver(?:\'|’)?s?\s+License|License)\b[\s\S]*?([0-9]{3}-[0-9]{3}-[0-9]{3})', full_text, re.IGNORECASE)
        if license_match:
            data['license_number'] = 'S' + license_match.group(1)

        # Example 2: Extract Date of Birth (assuming DOB: XX/XX/XXXX or XXXX-XX-XX)
        dob_match = re.search(r'(Date of Birth|DOB|3008)[:\s]*(\d{1,4}[-/]\d{1,2}[-/]\d{2,4})', full_text, re.IGNORECASE)
        if dob_match:
            data['date_of_birth'] = dob_match.group(2)

        # Example 3: Extract Sex (needs robust mapping)
        sex_match = re.search(r'Sex[:\s]*(Male|Female|M|F)', full_text, re.IGNORECASE)
        if sex_match:
            data['gender'] = sex_match.group(1)

        exp_match = re.search(r'(Expiration Date|Expiry|exe)[:\s]*(\d{1,4}[-/]\d{1,2}[-/]\d{2,4})', full_text, re.IGNORECASE)
        if exp_match:
            data['expiration_date'] = exp_match.group(2)

        address_pattern = re.compile(r'\b\d{1,6}\s+(?:[A-Z]\s+)?[A-Z0-9]+(?:\s+[A-Z0-9]+)*\s+(?:ST|RD|AVE|BLVD|DR|LN|CT|HWY|PKWY|PL|TER|WAY|CIR)\b',re.IGNORECASE)

        match = address_pattern.search(full_text)
        if match:
            data['street_address'] = match.group(0)


        last_name_match = re.search(r'~ \+([A-Z]+)',full_text)
        if last_name_match:
            data['first_name'] = last_name_match.group(1)

        first_name_match = re.search(r'(?:^|\n)[^A-Za-z]*([A-Z]+ [A-Z]+)(?!.*LICENSE)',full_text)
        if first_name_match:
            data['last_name'] = first_name_match.group(1)

        # You will need much more advanced logic for names and addresses!

    # ------------------------------------------------------------------------

    print("Structured Data Extracted (Pre-Normalization):")
    print(data)
    return data


def normalize_ocr_data(ocr_data, document_schema):
    """Normalize keys and values from OCR output."""
    normalized = {}

    key_map = {
        'gender': 'sex', 'dob': 'date_of_birth', 'birth_date': 'date_of_birth',
        'expiry': 'expiration_date', 'exp_date': 'expiration_date',
        '4d DLN': 'license_number', 'sex': 'sex'
    }

    def normalize_date(val):
        if not isinstance(val, str): return val
        val = val.strip()
        # Try common date formats and convert to MM/DD/YYYY
        for fmt in ('%m/%d/%Y', '%m-%d-%Y', '%Y-%m-%d', '%Y/%m/%d', '%d-%m-%Y', '%d/%m/%Y'):
            try:
                dt = datetime.datetime.strptime(val, fmt)
                return dt.strftime('%m/%d/%Y')
            except Exception:
                continue
        return val

    def normalize_gender(val):
        if not isinstance(val, str): return val
        v = val.strip().upper()
        if v in ('M', 'MALE'): return 'M'
        if v in ('F', 'FEMALE'): return 'F'
        return val

    for k, v in ocr_data.items():
        k_norm = k.strip().lower()
        target = key_map.get(k_norm, k_norm if k_norm in document_schema else None)

        if target:
            if target in ('date_of_birth', 'expiration_date'):
                v = normalize_date(v)
            elif target == 'sex':
                v = normalize_gender(v)
            elif target == 'license_number' and isinstance(v, str):
                v = re.sub(r'[^A-Za-z0-9]', '', v)

            if isinstance(v, str):
                v = v.strip()

            normalized[target] = v

    for schema_key in document_schema.keys():
        if schema_key not in normalized:
            normalized[schema_key] = None

    print('\nNormalized OCR data (pre-validation):')
    print(normalized)
    return normalized

# ----------------------------------------------------------------------

# --- 3. Validation Function ---

def validate_ocr_data(ocr_data, document_schema):
    """Validates extracted OCR data against a defined schema."""
    validation_report = {
        'missing_fields': [], 'type_mismatches': [], 'format_errors': [],
        'value_out_of_range': [], 'invalid_values': []
    }

    for field_name, schema_props in document_schema.items():
        field_value = ocr_data.get(field_name)

        is_missing = field_value is None or (isinstance(field_value, str) and field_value.strip() == '')
        if schema_props.get('required') and is_missing:
            validation_report['missing_fields'].append(field_name)
            continue

        if field_value is None: continue

        expected_type = schema_props.get('type')

        if expected_type in ('string', 'date') and not isinstance(field_value, str):
            validation_report['type_mismatches'].append({'field': field_name, 'expected': expected_type, 'actual': type(field_value).__name__})
            continue

        if expected_type == 'date':
            date_format = schema_props.get('format')
            if date_format:
                try:
                    datetime.datetime.strptime(field_value, '%m/%d/%Y')
                except ValueError:
                    validation_report['format_errors'].append({'field': field_name, 'expected_format': date_format, 'actual_value': field_value})
                    continue

        if field_name == 'license_number' and schema_props.get('format') == 'alphanumeric':
            if not re.match(r'^[a-zA-Z0-9]+$', field_value):
                validation_report['format_errors'].append({'field': field_name, 'expected_format': 'alphanumeric', 'actual_value': field_value})

        allowed_values = schema_props.get('allowed_values')
        if allowed_values and field_value not in allowed_values:
            validation_report['invalid_values'].append({'field': field_name, 'expected_one_of': allowed_values, 'actual_value': field_value})

    return validation_report

# ----------------------------------------------------------------------

# --- 4. Execution and Reporting ---

# ⚠️ Set your local file paths here ⚠️
INPUT_PDF = '/content/aitest.pdf'
OUTPUT_PDF = 'document_ocrd.pdf'

print("\n--- Starting Data Processing and Validation ---")

# 1. Run OCRmyPDF
if run_ocrmypdf(INPUT_PDF, OUTPUT_PDF):

    # 2. Extract Structured Data
    ocr_data = extract_structured_data(OUTPUT_PDF)

    # 3. Normalize extracted data
    ocr_data_normalized = normalize_ocr_data(ocr_data, document_schema)

    # 4. Run your Validation Logic
    report = validate_ocr_data(ocr_data_normalized, document_schema)

    print("\n\n---  FINAL VALIDATION REPORT ---")

    has_discrepancies = False
    for category, findings in report.items():
        if findings:
            has_discrepancies = True
            print(f"\n⚠️ **{category.replace('_', ' ').upper()}:** ⚠️")
            for item in findings:
                if isinstance(item, dict):
                    details = "; ".join([f"**{k.replace('_', ' ')}**: '{v}'" for k, v in item.items()])
                    print(f"  - {details}")
                else:
                    print(f"  - **Field**: {item}")

    if has_discrepancies:
        print("\n **✅ SUMMARY: Validation completed with critical discrepancies found. Review and manual correction required.**")
    else:
        print("\n **✅ SUMMARY: Validation completed successfully. No discrepancies found.**")
else:
    print("\nProcessing Halted: Could not successfully complete the OCR step.")

Document Schema Defined.

--- Starting Data Processing and Validation ---

--- Executing OCRmyPDF on /content/aitest.pdf ---
OCRmyPDF executed successfully. OCR'd PDF saved to document_ocrd.pdf.

--- Attempting text extraction from document_ocrd.pdf ---

--- RAW TEXT EXTRACTED ---
KENTUCKYTM DRIVER’S LICENSE
_, won $123-259-256 Oo.
~ +HARRISON
2MONA COOPER
8313 E 3RD ST ‘
FRANKFORT, KY 40601
' 3008 02/23/1953
nes /avexe 02/23/2027
Mw) 9CLASs D —
|
9aEND NONE OR
12RES 1 g
15 SEX F 16 HGT 5’-04” 18 EYES BRO 4alSS
5 DD $0000001CL10234 090-0 = avo __— 06/09/2020

--------------------------

Structured Data Extracted (Pre-Normalization):
{'license_number': 'S123-259-256', 'date_of_birth': '02/23/1953', 'gender': 'F', 'expiration_date': '02/23/2027', 'street_address': '8313 E 3RD ST', 'first_name': 'HARRISON', 'last_name': 'MONA COOPER'}

Normalized OCR data (pre-validation):
{'license_number': 'S123259256', 'date_of_birth': '02/23/1953', 'sex': 'F', 'expiration_date': '02/23/2027', 'first_n

In [7]:
from google.colab import files

print('Please upload the PDF file (e.g., aitest.pdf) that you want to OCR:')
uploaded = files.upload()

for fn in uploaded.keys():
  print(f'User uploaded file "{fn}"')
  # Optionally, rename the uploaded file to match the expected INPUT_PDF path
  # if the uploaded filename is different from 'aitest.pdf'
  if fn != 'aitest.pdf':
    import os
    os.rename(fn, 'aitest.pdf')
    print(f'Renamed "{fn}" to "aitest.pdf" to match INPUT_PDF.')

INPUT_PDF = '/content/aitest.pdf'
print(f'INPUT_PDF is set to: {INPUT_PDF}')

# After uploading, you can re-run the previous cell to execute the OCR and data extraction process.

Please upload the PDF file (e.g., aitest.pdf) that you want to OCR:


Saving aitest.pdf to aitest.pdf
User uploaded file "aitest.pdf"
INPUT_PDF is set to: /content/aitest.pdf


In [4]:
pip install --upgrade ocrmypdf

Collecting ocrmypdf
  Downloading ocrmypdf-16.12.0-py3-none-any.whl.metadata (11 kB)
Collecting img2pdf>=0.5 (from ocrmypdf)
  Downloading img2pdf-0.6.3-py3-none-any.whl.metadata (13 kB)
Collecting pi-heif (from ocrmypdf)
  Downloading pi_heif-1.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.5 kB)
Collecting pikepdf>=10 (from ocrmypdf)
  Downloading pikepdf-10.0.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.3 kB)
Collecting Deprecated (from pikepdf>=10->ocrmypdf)
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading ocrmypdf-16.12.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading img2pdf-0.6.3-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pikepdf-10.0.3-cp312-cp312-manylinux_

In [3]:
!apt-get update && apt-get install ocrmypdf

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Waiting for headers] [Waiting for headers] [1 InRelease 0 B/3,632 B 0%] [Co0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InReleas

In [2]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-5.2.0-py3