# Debugging PDF Text Extraction and Data Extraction Pipeline

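This notebook steps through PDF text extraction and the OpenAI-based data extraction that follows it, one stage per cell, so you can identify where extraction is failing. Run it with the working directory set to the folder that contains `src`: both the import path and the sample PDF path are resolved relative to the current working directory.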

In [1]:
# Import PDF text extractor and OpenAIService
import sys
import os
# Make the local `src` directory importable. `os.path.abspath` resolves the
# relative name against the current working directory, so the working
# directory must be the folder that contains `src`.
sys.path.append(os.path.abspath('src'))

# Project-local services used by the cells below.
from services.pdf_processor import PDFProcessor  # Adjust if your extractor class/module is named differently
from services.openai_service import OpenAIService

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [2]:
# Sample PDF used for this debugging session; point `pdf_path` at another file to test a different form.
pdf_path = 'Filled Forms AXA XL/FNOL-form-MOTOR-v10_filled.pdf'

# Stop here if the file is missing so that later cells do not fail with a less obvious error.
pdf_is_present = os.path.exists(pdf_path)
assert pdf_is_present, f"PDF file not found: {pdf_path}"
print(f"PDF file found: {pdf_path}")

PDF file found: Filled Forms AXA XL/FNOL-form-MOTOR-v10_filled.pdf


In [3]:
# Extract text and tables from the PDF
pdf_processor = PDFProcessor()

# Bind every result name up front. If the extractor raises, none of the names
# in the tuple assignment below are assigned, so without these defaults they
# would be unbound on a fresh kernel or hold stale values from an earlier run.
extraction_success, extracted_text, extracted_tables = False, None, None

try:
    # The fourth element of the returned tuple is not used in this notebook.
    extraction_success, extracted_text, extracted_tables, _ = pdf_processor.extract_text_and_tables(pdf_path)
except Exception as e:
    # Broad catch kept so that any extractor error is reported here instead of
    # aborting the run.
    print(f"Text extraction failed: {e}")
else:
    # Report the extractor's own status flag; the absence of an exception alone
    # does not mean the extraction worked.
    if extraction_success:
        print("Text extraction succeeded.")
    else:
        print("Text extraction failed: extractor returned a falsy success flag.")

Text extraction succeeded.


In [4]:
# Display extracted text
# Guard on the text itself, matching the check used before the OpenAI call:
# a truthy status flag alone does not guarantee there is text to slice.
if extracted_text:
    print(extracted_text[:2000])  # Print first 2000 characters for inspection
    print(extracted_tables)
else:
    print("No text extracted.")

--- Page 1 ---
First notification of loss (FNOL)
MOTOR CLAIMS
Broker details - internal use
2025/05/02
Date of Claim Notification
Jenny Lauren
Lycetts Claims Handler
A827100GV
Lycetts Ref
Client Details
Penelope Tulley
Client Name
123 Little Village St., Cambridge, UK
Client Address
CB1 234
Postcode
VAT Registered
 Yes    
 No
Accountant
Business Description / 
Occupation(s)
098271283
Policy Number
Client Contact Name
+44 1234 567823
Contact Details
Telephone
penelopetulley123@gmail.com
Email
Incident Details
Type of incident / claim
 Accident    
 Theft    
 Break-in / Malicious damage    
 Fire
2024/03/23
Date of Incident
14:00
Time of Incident
Cambridge mall
Location of Incident
Use of vehicle at time of or 
prior to incident
Used for contracting 
 Yes    
 No
yes
If Theft, was the vehicle locked 
and keys removed
 Yes    
 No
If Theft, has the vehicle been
recovered
 Yes    
 No
If yes, please provide location 
details and postcode
Postcode

--- Page 2 ---
Incident Details
Narrow r

In [5]:
# Run the field extraction on the PDF text and print the returned result for inspection
openai_service = OpenAIService()

# Fields requested from the extraction service; adjust the list as needed for your form
extraction_fields = ["Policy Number", "Insured Name", "Date of Loss", "Claim Number"]

if not extracted_text:
    # Nothing to pass to the extraction service when the PDF step produced no text
    print("Skipping extraction: No text to process.")
else:
    result = openai_service.extract_data_from_text(
        text=extracted_text,
        tables=extracted_tables,
        extraction_fields=extraction_fields,
        form_type="FNOL-form-MOTOR",
    )
    print(result)

{'success': True, 'extracted_data': {'Policy Number': 'Not found', 'Insured Name': 'Penelope Tulley', 'Date of Loss': '2024-03-23', 'Claim Number': 'A827100GV'}, 'confidence_scores': {'Policy Number': 0.9, 'Insured Name': 0.95, 'Date of Loss': 0.85, 'Claim Number': 0.85}, 'reasoning': {'Policy Number': "No value appeared directly after the 'Policy Number' label in the provided text.", 'Insured Name': "Found under 'Client Name' as Penelope Tulley.", 'Date of Loss': 'Date of Incident is listed as 2024/03/23; interpreted as the date of loss and reformatted to ISO.', 'Claim Number': "The value A827100GV appears next to 'Lycetts Ref', treated as the claim reference/number."}, 'raw_response': '{\n  "extracted_data": {\n    "Policy Number": "Not found",\n    "Insured Name": "Penelope Tulley",\n    "Date of Loss": "2024-03-23",\n    "Claim Number": "A827100GV"\n  },\n  "confidence_scores": {\n    "Policy Number": 0.9,\n    "Insured Name": 0.95,\n    "Date of Loss": 0.85,\n    "Claim Number": 0