# Process


**Do** some test what data can we get with tools like fitz.

In [6]:
import fitz
import os

def extract_detailed_font_text(pdf_path):
    doc = fitz.open(pdf_path)
    results = []
    
    for page_num, page in enumerate(doc):
        blocks = page.get_text("dict")
        import json

        # Save the blocks dict for each page to a separate JSON file for inspection,
        # but remove any non-JSON-serializable objects (like bytes) before dumping.
        def remove_bytes(obj):
            if isinstance(obj, dict):
                return {k: remove_bytes(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [remove_bytes(i) for i in obj]
            elif isinstance(obj, bytes):
                return obj.decode(errors="replace")
            else:
                return obj

        cleaned_blocks = remove_bytes(blocks)
        import os
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
        output_dir = f"blocks_output/{pdf_filename}"
        os.makedirs(output_dir, exist_ok=True)
        with open(f"{output_dir}/page_{page_num+1}.json", "w", encoding="utf-8") as f:
            json.dump(cleaned_blocks, f, ensure_ascii=False, indent=2)
        
        for block in blocks["blocks"]:
            if "lines" in block:
                #print(block["lines"])          #<--- Ucomment this to see output
                for line in block["lines"]:
                    for span in line["spans"]:
                        #print(span)
                        pass
    
    return results

# Usage
detailed_data = []
samples_dir = "samples"
for filename in os.listdir(samples_dir):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(samples_dir, filename)
        detailed_data.extend(extract_detailed_font_text(pdf_path))

for item in detailed_data:
    print(f"Text: {item['text']}")
    print(f"Font: {item['font_name']} (Size: {item['font_size']})")
    print(f"Bold: {item['is_bold']}, Italic: {item['is_italic']}")
    print(f"Page: {item['page']}")
    print("---")

## output
 The text is structed in a way where you can extract 
- blocks -> a blocks is a group of lines
- lines -> a line is a group of spans
- spans -> a span is a group of text
- text -> the text of the span



so we will probably mine the lines and get font size and font name and then we can use that to classify the document and color of the text.


I also added a save function for all the pages in blocks_output so we can see all the data there.


## TODO: Lets look at the text data to see if its all there and if i can learn anything from it.

In [7]:
#!/usr/bin/env python3
"""
Script to extract all text from JSON files in the blocks_output directory.
Filters out dots (".") and empty text entries.
"""#

import json
import os
from pathlib import Path


def extract_text_from_json(file_path):
    """Extract all text from a single JSON file, filtering out dots and empty text."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        all_text = []
        
        # Check if the file has the expected structure
        if 'blocks' in data:
            for block in data['blocks']:
                if 'lines' in block:
                    for line in block['lines']:
                        if 'spans' in line:
                            for span in line['spans']:
                                if 'text' in span:
                                    text = span['text'].strip()
                                    # Filter out dots, empty text, and whitespace-only text
                                    if text and text != "." and text != "" and not text.isspace():
                                        all_text.append(text)
        
        return all_text
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return []

def main():
    """Main function to process all JSON files in blocks_output directory."""
    blocks_dir = Path("blocks_output")
    
    if not blocks_dir.exists():
        print("blocks_output directory not found!")
        return
    
    all_documents_text = {}
    
    # Process each subdirectory
    for subdir in blocks_dir.iterdir():
        if subdir.is_dir():
            print(f"\n{'='*60}")
            print(f"Processing: {subdir.name}")

            
            document_text = []
            
            # Process each JSON file in the subdirectory
            for json_file in subdir.glob("*.json"):
                print(f"\n--- {json_file.name} ---")
                
                text_parts = extract_text_from_json(json_file)
                document_text.extend(text_parts)
                
                # Print text from this file
                for i, text in enumerate(text_parts, 1):
                    #print(f"{i:3d}. {text}") # <--- Ucomment this to see output
                    pass
            
            all_documents_text[subdir.name] = document_text
    
    # Print summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    
    total_text_parts = 0
    for doc_name, text_parts in all_documents_text.items():
        print(f"{doc_name}: {len(text_parts)} text parts")
        total_text_parts += len(text_parts)
    
    print(f"\nTotal text parts across all documents: {total_text_parts}")
    
    # Save all text to a single file
    output_file = "all_extracted_text_filtered.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        for doc_name, text_parts in all_documents_text.items():
            f.write(f"\n{'='*60}\n")
            f.write(f"DOCUMENT: {doc_name}\n")
            f.write(f"{'='*60}\n\n")
            
            for i, text in enumerate(text_parts, 1):
                f.write(f"{i:3d}. {text}\n")
    
    print(f"\nAll filtered text saved to: {output_file}")

if __name__ == "__main__":
    main()



Processing: f1098

--- page_1.json ---

Processing: f1099int

--- page_1.json ---

Processing: f1099div

--- page_1.json ---

Processing: handwritten

--- page_1.json ---

Processing: idcard

--- page_1.json ---

Processing: fw2

--- page_2.json ---

--- page_1.json ---

Processing: f1099div-2031

--- page_1.json ---

Processing: Morris_Simons_CV_EN 

--- page_1.json ---

Processing: f1040--2022

--- page_2.json ---

--- page_1.json ---

SUMMARY
f1098: 105 text parts
f1099int: 105 text parts
f1099div: 118 text parts
handwritten: 0 text parts
idcard: 0 text parts
fw2: 221 text parts
f1099div-2031: 119 text parts
Morris_Simons_CV_EN : 0 text parts
f1040--2022: 426 text parts

Total text parts across all documents: 1094

All filtered text saved to: all_extracted_text_filtered.txt


## problem with years:

Looking at year i found that for documents like 1098 it uses a 20/ {edit_your_year_here} format while the other documents seems to use a 2024 format but are all empty.
This part is a bit tricky because we can use the font and size etc to find it because there are other cols with the same font and size so we might get the wrong year.

Here we have a couple of options:
- Mabye use OCR on that part of the document.
- but the best solution i think is to use the box cordinates of the text and then use that to find the year. (this is something we could add on the other documents awell if we wanted to) Because cordinates tells us where the text is on the page.



Looking at the documents w2 and f1040 i think the years are 2024 for w2 and 2022 for 1040.

Looking at online documents it seems that From 1040 (2021) - > indicates year 2021 is filed year and it does not based on when the document was modifed.




## ID handling


Now to check id i want to use google document ai as this is a library i used before. But if this where to be a comercial project we might want to use azure document ai to scan the id as they have solution to run it on edge. But foe now this is not a on prem solution. But its good and fast.

In [8]:
from dotenv import load_dotenv
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1
from typing import Dict, List, Optional
import shutil





# Load environment variables
load_dotenv()

# === CONFIGURATION ===
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
project_id = os.getenv("GOOGLE_CLOUD_PROJECT_ID")
location = os.getenv("GOOGLE_CLOUD_LOCATION")
processor_id = os.getenv("GOOGLE_CLOUD_PROCESSOR_ID")


In [9]:
def ID_check(pdf_path: str) -> Optional[str]:
    """
    Check if the PDF is an ID card using Google Document AI
    Returns "ID Card" if detected, None otherwise
    """
    try:
        # Set `api_endpoint` if you use a location other than "us".
        opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
        
        # Initialize Document AI client.
        client = documentai_v1.DocumentProcessorServiceClient(client_options=opts)
        
        # Build request
        name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
        
        # Read the file into memory.
        with open(pdf_path, "rb") as file:
            file_content = file.read()
        
        # Load binary data.
        raw_document = documentai_v1.RawDocument(
            content=file_content,
            mime_type="application/pdf",
        )
        
        # Send a request and get the processed document.
        request = documentai_v1.ProcessRequest(name=name, raw_document=raw_document)
        result = client.process_document(request=request)
        document = result.document

        # Check if document contains ID-related entities
        if document.entities:
            for entity in document.entities:
                if entity.mention_text:
                    mention_text = entity.mention_text.strip().upper()
                    if mention_text == "PASS":
                        print(f"ID Card detected: {mention_text}")
                        return "ID Card"
                    elif mention_text == "NOT_AN_ID":
                        print(f"Not an ID card: {mention_text}")
                        return None
        return None
        
    except Exception as e:
        print(f"Error in ID check: {str(e)}")
        return None

# Handwritten check

For this, I used GPT Vision because it is fast, easy to use, and performs well on these types of tasks. However, you can also use specialized models if you prefer.

**A kind of quick and dirty solution**

In [10]:
# Handwritten check
from pdf2image import convert_from_path
from PIL import Image
import base64
import openai
import io

In [11]:
def handwritten_check(pdf_path: str) -> str:
    """Use OpenAI GPT to check if the PDF contains handwritten notes"""
    try:
        

        # Get OpenAI API key from environment
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            print("OpenAI API key not found in environment variables")
            return None
            
        # Convert PDF to image (first page only for quick check)
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not images:
            return None
            
        # Convert image to base64
        img = images[0]
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        img_base64 = base64.b64encode(buffer.getvalue()).decode()
        
        # Create OpenAI client
        client = openai.OpenAI(api_key=openai_api_key)
        
        # Send request to GPT-4 Vision
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Look at this document image. Is this a handwritten notes? Answer with just 'YES' if it's mostly handwritten, or 'NO' if it's something else or typed."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{img_base64}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=10
        )
        
        result = response.choices[0].message.content.strip().upper()
        
        if result == "YES":
            print(f"Handwritten document detected via GPT-4 Vision")
            return "Handwritten Notes"
        else:
            print(f"Not primarily handwritten (GPT-4 Vision result: {result})")
            return None
            
    except Exception as e:
        print(f"Error in handwritten check: {str(e)}")
        return None

In [13]:
from typing import Dict, List, Optional
import shutil

def extract_text_from_pdf(pdf_path: str) -> List[Dict]:
    """
    Extract text content from a PDF using the same method as test_1_get_data.py
    Returns a list of text spans with their properties and also complete lines
    """
    doc = fitz.open(pdf_path)
    results = []
    lines = []
    
    for page_num, page in enumerate(doc):
        blocks = page.get_text("dict")
        
        # Clean blocks for JSON serialization
        def remove_bytes(obj):
            if isinstance(obj, dict):
                return {k: remove_bytes(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [remove_bytes(i) for i in obj]
            elif isinstance(obj, bytes):
                return obj.decode(errors="replace")
            else:
                return obj

        
        # Extract text spans and complete lines
        for block in blocks["blocks"]:
            if "lines" in block:
                for line in block["lines"]:
                    line_text = ""
                    line_spans = []
                    
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if text:
                            span_info = {
                                "text": text,
                                "font_name": span["font"],
                                "font_size": span["size"],
                                "font_color": span["color"],
                                "bbox": span.get("bbox", None),
                                "page": page_num + 1
                            }
                            results.append(span_info)
                            line_spans.append(span_info)
                            line_text += text + " "
                    
                    if line_text.strip():
                        lines.append({
                            "text": line_text.strip(),
                            "spans": line_spans,
                            "page": page_num + 1
                        })
    
    doc.close()
    return results, lines

def analyze_form_content(text_spans: List[Dict], lines: List[Dict]) -> Optional[str]:
    """
    Analyze text content to determine form type
    Returns the form number if found, None otherwise
    """
    for line in lines:
        text = line["text"].strip()
        
        # Check for various form types with their specific font requirements
        if text == "Form 1098":
            if len(line["spans"]) == 2 and line["spans"][0]["text"] == "Form" and line["spans"][1]["text"] == "1098":
                form_span, num_span = line["spans"][0], line["spans"][1]
                if (abs(form_span["font_size"] - 7.0) <= 2 and form_span["font_name"] == "HelveticaNeueLTStd-Roman" and
                    abs(num_span["font_size"] - 14.0) <= 2 and num_span["font_name"] == "HelveticaNeueLTStd-Bd"):

                    # Get year from the document by looking for a span with text "24" in the expected box area, font, and size
                    year = None
                    # The expected box for the year "24" is approximately:
                    # [437.46307373046875, 98.37499237060547, 444.1362609863281, 104.37500762939453]
                    year_bbox = [437.46, 98.37, 444.14, 104.38]
                    
                    # FIXED: Use a single large tolerance for all coordinates
                    tolerance = 20  # Reduced tolerance to avoid matching unrelated text
                    
                    print(f"Looking for year in 1098 form with bbox: {year_bbox}")
                    print(f"Using tolerance: {tolerance}")
                    
                    for l in lines:
                        for s in l.get("spans", []):
                            bbox = s.get("bbox", None)
                            # Check if bbox is available and matches the expected area with large tolerance
                            if bbox and all(abs(b - e) < tolerance for b, e in zip(bbox, year_bbox)):
                                # Debug: Show what we found
                                print(f"Found potential year text: '{s.get('text', '')}' at bbox: {bbox}")
                                print(f"Font: {s.get('font_name', '')}, Size: {s.get('font_size', 0)}")
                                
                                # Now check if the text is a 2-digit year (e.g., "24", "21", "22"), font is Helvetica, and size is about 6
                                text_val = s.get("text", "")
                                if (
                                    text_val.isdigit() and len(text_val) == 2
                                    and abs(s.get("font_size", 0) - 6.0) < 2  # Increased tolerance from 1 to 2
                                    and "Helvetica" in s.get("font_name", "")  # More flexible font matching
                                ):
                                    # Add millennium prefix
                                    year = "20" + text_val
                                    print(f"Matched year: {year}")
                                    break
                        if year:
                            break

                    return "1098", year


        
        elif text in ["Form 1099-INT", "Form 1099-DIV"]:
            if len(line["spans"]) == 2 and line["spans"][0]["text"] == "Form":
                form_span, num_span = line["spans"][0], line["spans"][1]
                document_type = num_span["text"]  # "1099-INT" or "1099-DIV"
                
                # Common font requirements for 1099 forms
                if (
                    abs(form_span["font_size"] - 7.0) <= 2
                    and form_span["font_name"] == "HelveticaNeueLTStd-Roman"
                    and abs(num_span["font_size"] - 12.0) <= 2
                    and num_span["font_name"] == "HelveticaNeueLTStd-Bd"
                    and text in ["Form 1099-INT", "Form 1099-DIV"]
                ):

                    # Get year from the document - look for year in various formats
                    year = None

                    # Method 1: Look for 4-digit years in the document
                    for l in lines:
                        for s in l.get("spans", []):
                            text_val = s.get("text", "").strip()
                            if text_val.isdigit() and len(text_val) == 4 and text_val.startswith("20"):
                                year = text_val
                                break
                        if year:
                            break


                    return "1099", year
        
        elif text == "Form W-2":
            if len(line["spans"]) == 2 and line["spans"][0]["text"] == "Form" and line["spans"][1]["text"] == "W-2":
                form_span, num_span = line["spans"][0], line["spans"][1]
                if (abs(form_span["font_size"] - 7.0) <= 2 and form_span["font_name"] == "HelveticaNeueLTStd-Bd" and
                    abs(num_span["font_size"] - 24.0) <= 2 and num_span["font_name"] == "HelveticaNeueLTStd-BlkCn"):

                    # Get year from the document
                    # Find the year from a span with font size 24.0 and font "OCRAStd"
                    year = ""
                    for l in lines:
                        for s in l.get("spans", []):
                            if (
                                abs(s.get("font_size", 0) - 24.0) <= 2
                                and s.get("font_name", "") == "OCRAStd"
                                and s.get("text", "").isdigit()
                                and len(s.get("text", "")) == 4
                            ):
                                year = s["text"]
                                break
                        if year:
                            break

                    return "W2", year
        
        elif text.startswith("Form 1040"):
            if len(line["spans"]) >= 2 and line["spans"][0]["text"] == "Form" and line["spans"][1]["text"] == "1040":
                form_span, num_span = line["spans"][0], line["spans"][1]
                if (abs(form_span["font_size"] - 6.0) <= 2 and form_span["font_name"] == "HelveticaNeueLTStd-Roman" and
                    abs(num_span["font_size"] - 9.0) <= 2 and num_span["font_name"] == "HelveticaNeueLTStd-Bd"):

                    # Get year from the document - look for year in various formats
                    year = None
                    
                    # Method 1: Look for year in the form title (e.g., "Form 1040 (2022)")
                    line_text = line["text"]
                    if "(" in line_text and ")" in line_text:
                        # Extract year from parentheses
                        start = line_text.find("(") + 1
                        end = line_text.find(")")
                        if start < end:
                            year_text = line_text[start:end].strip()
                            if year_text.isdigit() and len(year_text) == 4 and year_text.startswith("20"):
                                year = year_text
                    


                    return "1040", year
    
    return None, None


def ID_check(pdf_path: str) -> Optional[str]:
    """
    Check if the PDF is an ID card using Google Document AI
    Returns "ID Card" if detected, None otherwise
    """
    try:
        # Set `api_endpoint` if you use a location other than "us".
        opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
        
        # Initialize Document AI client.
        client = documentai_v1.DocumentProcessorServiceClient(client_options=opts)
        
        # Build request
        name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
        
        # Read the file into memory.
        with open(pdf_path, "rb") as file:
            file_content = file.read()
        
        # Load binary data.
        raw_document = documentai_v1.RawDocument(
            content=file_content,
            mime_type="application/pdf",
        )
        
        # Send a request and get the processed document.
        request = documentai_v1.ProcessRequest(name=name, raw_document=raw_document)
        result = client.process_document(request=request)
        document = result.document

        # Check if document contains ID-related entities
        if document.entities:
            for entity in document.entities:
                if entity.mention_text:
                    mention_text = entity.mention_text.strip().upper()
                    if mention_text == "PASS":
                        print(f"ID Card detected: {mention_text}")
                        return "ID Card"
                    elif mention_text == "NOT_AN_ID":
                        print(f"Not an ID card: {mention_text}")
                        return None
        return None
        
    except Exception as e:
        print(f"Error in ID check: {str(e)}")
        return None



def classify_and_copy_pdfs(samples_dir: str = "samples", output_base_dir: str = "classified_pdfs"):
    """
    Main function to classify PDFs and copy them to appropriate folders
    """
    # Create output base directory
    os.makedirs(output_base_dir, exist_ok=True)
    
    # Process each PDF in the samples directory
    for filename in os.listdir(samples_dir):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(samples_dir, filename)
            print(f"\nProcessing: {filename}")
            
            try:
                # Extract text content
                text_spans, lines = extract_text_from_pdf(pdf_path)
                
                # Analyze content to determine form type
                document_type, year = analyze_form_content(text_spans, lines)
                
                if not document_type:
                    document_type = ID_check(pdf_path) # Check ID

                if not document_type:
                    document_type = handwritten_check(pdf_path) # Check Handwritten


                if document_type:
                    # Create form-specific directory
                    form_dir = os.path.join(output_base_dir, document_type)
                    os.makedirs(form_dir, exist_ok=True)
                    
                    # Copy PDF to form directory
                    destination = os.path.join(form_dir, filename)
                    shutil.copy2(pdf_path, destination)
                    print(f"✓ Copied {filename} to {form_dir}/")
                    print(f"✓ Year: {year}")
                else:
                    # If no form type detected, put in "unknown" folder
                    unknown_dir = os.path.join(output_base_dir, "Other")
                    os.makedirs(unknown_dir, exist_ok=True)
                    destination = os.path.join(unknown_dir, filename)
                    shutil.copy2(pdf_path, destination)
                    print(f"? No form type detected for {filename}, copied to Other/")
                    
            except Exception as e:
                print(f"✗ Error processing {filename}: {str(e)}")
    
    print(f"\nClassification complete! Check the '{output_base_dir}' directory for results.")


if __name__ == "__main__":
    classify_and_copy_pdfs()



Processing: f1098.pdf
Looking for year in 1098 form with bbox: [437.46, 98.37, 444.14, 104.38]
Using tolerance: 20
Found potential year text: '20' at bbox: (421.10400390625, 95.09298706054688, 430.0, 104.42098236083984)
Font: HelveticaNeueLTStd-Roman, Size: 8.0
Found potential year text: '24' at bbox: (437.46307373046875, 98.37499237060547, 444.1362609863281, 104.37500762939453)
Font: Helvetica, Size: 6.000009059906006
Matched year: 2024
✓ Copied f1098.pdf to classified_pdfs/1098/
✓ Year: 2024

Processing: handwritten.pdf
Not an ID card: NOT_AN_ID
Handwritten document detected via GPT-4 Vision
✓ Copied handwritten.pdf to classified_pdfs/Handwritten Notes/
✓ Year: None

Processing: fw2.pdf
✓ Copied fw2.pdf to classified_pdfs/W2/
✓ Year: 2024

Processing: idcard.pdf
ID Card detected: PASS
✓ Copied idcard.pdf to classified_pdfs/ID Card/
✓ Year: None

Processing: f1099div-2031.pdf
✓ Copied f1099div-2031.pdf to classified_pdfs/1099/
✓ Year: 2031

Processing: f1040--2022.pdf
✓ Copied f1040-

KeyboardInterrupt: 