# Debugging PDF Text Extraction and Data Extraction Pipeline

This notebook will help you step through the PDF text extraction and OpenAI-based data extraction process to identify where extraction is failing.

In [1]:
# Import PDF text extractor and OpenAIService
import sys
import os
sys.path.append(os.path.abspath('src'))

from services.pdf_processor import PDFProcessor  # Adjust if your extractor class/module is named differently
from services.openai_service import OpenAIService

In [2]:
# Load and inspect a sample PDF file
pdf_path = 'Filled Forms AXA XL/FNOL-form-MOTOR-v10_filled.pdf'  # Change to your test file if needed

assert os.path.exists(pdf_path), f"PDF file not found: {pdf_path}"
print(f"PDF file found: {pdf_path}")

PDF file found: Filled Forms AXA XL/FNOL-form-MOTOR-v10_filled.pdf


In [3]:
# Extract text from PDF
pdf_processor = PDFProcessor()

try:
    extracted_text = pdf_processor.extract_text(pdf_path)
    print("Text extraction succeeded.")
except Exception as e:
    print(f"Text extraction failed: {e}")
    extracted_text = None

Unexpected error processing PDF: a bytes-like object is required, not 'str'


Text extraction succeeded.


In [4]:
# Display extracted text
if extracted_text:
    print(extracted_text[:2000])  # Print first 2000 characters for inspection
else:
    print("No text extracted.")

(False, '', "Unexpected error processing PDF: a bytes-like object is required, not 'str'")


In [5]:
# Call extract_data_from_text and inspect output
openai_service = OpenAIService()

extraction_fields = [
    "Policy Number",
    "Insured Name",
    "Date of Loss",
    "Claim Number"
]  # Adjust fields as needed for your form

if extracted_text:
    result = openai_service.extract_data_from_text(
        text=extracted_text,
        extraction_fields=extraction_fields,
        form_type="FNOL-form-MOTOR"
    )
    print(result)
else:
    print("Skipping extraction: No text to process.")

{'success': True, 'extracted_data': {'Policy Number': 'Not found', 'Insured Name': 'Not found', 'Date of Loss': 'Not found', 'Claim Number': 'Not found'}, 'confidence_scores': {'Policy Number': 0.0, 'Insured Name': 0.0, 'Date of Loss': 0.0, 'Claim Number': 0.0}, 'reasoning': {'Policy Number': 'Input text contains no policy numbers or identifiers; only a tuple describing an error.', 'Insured Name': 'Input text contains no insured name; no person identifiers present.', 'Date of Loss': 'Input text contains no date; no loss date present.', 'Claim Number': 'Input text contains no claim/claim number; no such field present.'}, 'raw_response': '{\n  "extracted_data": {\n    "Policy Number": "Not found",\n    "Insured Name": "Not found",\n    "Date of Loss": "Not found",\n    "Claim Number": "Not found"\n  },\n  "confidence_scores": {\n    "Policy Number": 0.0,\n    "Insured Name": 0.0,\n    "Date of Loss": 0.0,\n    "Claim Number": 0.0\n  },\n  "reasoning": {\n    "Policy Number": "Input text 

In [6]:
# Debug extraction pipeline step-by-step
import pprint

# 1. PDF Reading
try:
    with open(pdf_path, 'rb') as f:
        pdf_bytes = f.read()
    print(f"PDF read: {len(pdf_bytes)} bytes")
except Exception as e:
    print(f"PDF reading failed: {e}")

# 2. Text Extraction
try:
    text = pdf_processor.extract_text(pdf_path)
    print(f"Extracted text length: {len(text)}")
except Exception as e:
    print(f"Text extraction error: {e}")
    text = None

# 3. Prompt Creation
if text:
    try:
        prompt = openai_service._create_extraction_prompt(text, extraction_fields, "FNOL-form-MOTOR")
        print("Prompt created. First 1000 chars:")
        print(prompt[:1000])
    except Exception as e:
        print(f"Prompt creation error: {e}")

# 4. API Call
if text:
    try:
        response = openai_service._make_api_call(prompt, openai_service.default_model, 1)
        print("API call succeeded. First 1000 chars of response:")
        print(response[:1000])
    except Exception as e:
        print(f"API call error: {e}")
        response = None

# 5. Response Parsing
if text and response:
    try:
        parsed = openai_service._parse_extraction_response(response, extraction_fields)
        print("Parsed extraction response:")
        pprint.pprint(parsed)
    except Exception as e:
        print(f"Response parsing error: {e}")

Unexpected error processing PDF: a bytes-like object is required, not 'str'


PDF read: 168402 bytes
Extracted text length: 3
Prompt created. First 1000 chars:

You are an expert at extracting structured data from FNOL-form-MOTOR forms. 

Please extract the following fields from the provided text: "Policy Number", "Insured Name", "Date of Loss", "Claim Number"

Instructions:
1. Analyze the text carefully and identify the requested information
2. If a field is not found, use "Not found" as the value
3. Return the data in JSON format with the exact field names requested
4. Include confidence scores (0.0 to 1.0) for each extraction
5. Provide brief reasoning for each extraction

Text to analyze:
(False, '', "Unexpected error processing PDF: a bytes-like object is required, not 'str'")  # Limit text length to avoid token limits

Please respond with a JSON object in this exact format:
{
    "extracted_data": {
        "field_name": "extracted_value"
    },
    "confidence_scores": {
        "field_name": 0.95
    },
    "reasoning": {
        "field_name": "Brief exp