# 

# Process


**Do** some tests to see what data we can get with tools like fitz (the cells below use PyPDF2).

In [5]:
import os
from pathlib import Path

from PyPDF2 import PdfReader


In [8]:
# Dump every metadata field of each PDF in the 'samples' folder.
# Written as a flat script on purpose (no helper functions).

samples_dir = Path('samples')

if samples_dir.exists():
    pdf_files = [*samples_dir.glob('*.pdf')]

    if pdf_files:
        print(f"📚 PDF All Metadata ({len(pdf_files)} files):")
        print("=" * 60)

        for pdf_file in pdf_files:
            try:
                with pdf_file.open('rb') as file:
                    reader = PdfReader(file)
                    # Normalise keys and values to plain strings; a missing
                    # (None) value becomes an empty string.
                    metadata = {
                        str(key): "" if value is None else str(value)
                        for key, value in (reader.metadata.items() if reader.metadata else ())
                    }
            except Exception as e:
                # Keep going on unreadable files; show the error in place of the metadata.
                metadata = {"error": str(e)}

            print(f"Metadata for {pdf_file.name}:")
            for k, v in metadata.items():
                print(f"  {k}: {v}")
            print("-" * 40)

        print("\n" + "=" * 60)
    else:
        print("❌ No PDF files found!")
else:
    print("❌ Samples folder not found!")



📚 PDF All Metadata (7 files):
Metadata for f1098.pdf:
  /Title: Form 1098 (Rev. January 2022)
  /Producer: macOS Version 15.1 (Build 24B5055e) Quartz PDFContext
  /Author: SE:W:CAR:MP
  /Subject: Mortgage Interest Statement 
  /Creator: Adobe LiveCycle Designer ES 9.0
  /CreationDate: D:20250114181627Z00'00'
  /ModDate: D:20250114181627Z00'00'
----------------------------------------
Metadata for handwritten.pdf:
  /Producer: macOS Version 15.4.1 (Build 24E263) Quartz PDFContext, AppendMode 1.1
  /Author: Atul
  /CreationDate: D:20250519094720Z00'00'
  /ModDate: D:20250519094736Z00'00'
  /Title: handwritten.jpg
  /Creator: Preview
----------------------------------------
Metadata for fw2.pdf:
  /Title: 2024 Form W-2
  /Producer: macOS Version 15.2 (Build 24C101) Quartz PDFContext
  /Author: SE:W:CAR:MP
  /Subject: Wage and Tax Statement
  /Creator: Designer 6.5
  /CreationDate: D:20250211182827Z00'00'
  /ModDate: D:20250211182827Z00'00'
----------------------------------------
Metadata

## What I learned

**Outcome:** Some of the documents have fields like **/Title**. From this field we can extract the title of the document (type and year), e.g. Form 1099-DIV (Rev. January 2024).


**Assumption:** We get these documents from a direct source, and they have not been modified (nobody has, for example, uploaded an image instead), so the title is always present and correct.

**Process:**

1. Get all the documents in the samples folder
2. For each document, extract the title
3. Print the title

**Code:**

In [10]:

# Extract and print the metadata title for all PDF files in the 'samples' folder.

# Metadata fields tried, in order of preference, when looking for a title.
TITLE_METADATA_KEYS = ('/Title', '/Subject', '/Creator')


def first_metadata_text(metadata, keys=TITLE_METADATA_KEYS):
    """Return the first non-blank, stripped value found under `keys`.

    Args:
        metadata: Mapping of PDF info keys (e.g. '/Title') to text values,
            or None when the document has no metadata.
        keys: Field names to try, in priority order.

    Returns:
        The stripped text of the first field that is non-empty *after*
        stripping, or "" when no field qualifies.
    """
    if not metadata:
        return ""
    for key in keys:
        value = metadata.get(key, '')
        if not value:
            continue
        # Strip before testing: a whitespace-only '/Title' must not block
        # the fallback to '/Subject' and '/Creator'.
        text = value.strip()
        if text:
            return text
    return ""


samples_dir = Path('samples')

if not samples_dir.exists():
    print("❌ Samples folder not found!")
else:
    pdf_files = list(samples_dir.glob('*.pdf'))

    if not pdf_files:
        print("❌ No PDF files found!")
    else:
        print(f"📚 PDF Metadata Titles ({len(pdf_files)} files):")
        print("=" * 60)

        for pdf_file in pdf_files:
            try:
                with open(pdf_file, 'rb') as file:
                    reader = PdfReader(file)
                    title = first_metadata_text(reader.metadata)
                    if not title and len(reader.pages) > 0:
                        # Fallback: first meaningful line of text on page one.
                        text = reader.pages[0].extract_text()
                        if text:
                            for line in text.strip().split('\n'):
                                line = line.strip()
                                # Skip very short lines and known boilerplate prefixes.
                                if len(line) > 3 and not line.startswith(('a ', 'OMB')):
                                    title = line
                                    break
                    if not title:
                        title = "No metadata title found"
            except Exception as e:
                # Keep going on unreadable files; report the error as the title.
                title = f"Error: {e}"

            print(f"{title} = {pdf_file.name}")

        print("\n" + "=" * 60)

📚 PDF Metadata Titles (7 files):
Form 1098 (Rev. January 2022) = f1098.pdf
handwritten.jpg = handwritten.pdf
2024 Form W-2 = fw2.pdf
idcard.jpeg = idcard.pdf
2022 Form 1040 = f1040--2022.pdf
Form 1099-INT (Rev. January 2024) = f1099int.pdf
Form 1099-DIV (Rev. January 2024) = f1099div.pdf



In [2]:
#!/usr/bin/env python3
"""
Test script for the document classifier API
Sends all PDFs from the samples folder to the /classify endpoint
"""

import os
from pathlib import Path

import requests

base_url = "http://localhost:8000"
# (connect, read) timeouts in seconds. Without a timeout `requests` can block
# forever if the server accepts the connection but never answers.
request_timeout = (5, 60)

# Check if server is running. Only network-level failures count as "not
# running"; a bare `except:` would also hide KeyboardInterrupt and real bugs.
try:
    response = requests.get(f"{base_url}/docs", timeout=request_timeout)
except requests.exceptions.RequestException:
    server_running = False
    print("❌ Server is not running. Please start it with: uvicorn main:app --reload")
else:
    server_running = True
    print("✅ Server is running!")

# Guarded flow instead of `exit()`: inside a notebook `exit()` asks the kernel
# to shut down, which throws away all state instead of just ending this cell.
samples_dir = Path('samples')
pdf_files = []
if server_running:
    if not samples_dir.exists():
        print("❌ Samples folder not found!")
    else:
        pdf_files = list(samples_dir.glob('*.pdf'))
        if not pdf_files:
            print("❌ No PDF files found!")

if pdf_files:
    print(f"\n📚 Testing classifier with {len(pdf_files)} PDF files:")
    print("=" * 80)

    for pdf_file in pdf_files:
        print(f"\n📄 Testing: {pdf_file.name}")

        try:
            with open(pdf_file, 'rb') as f:
                # Multipart upload: (filename, file object, content type).
                files = {'file': (pdf_file.name, f, 'application/pdf')}
                response = requests.post(
                    f"{base_url}/classify", files=files, timeout=request_timeout
                )

                if response.status_code == 200:
                    result = response.json()
                    print(f"   ✅ Document Type: {result.get('document_type', 'N/A')}")
                    print(f"   ✅ Year: {result.get('year', 'N/A')}")
                else:
                    print(f"   ❌ Error: {response.status_code} - {response.text}")

        except Exception as e:
            # One bad file (I/O error, timeout, invalid JSON) must not stop the run.
            print(f"   ❌ Failed to process: {e}")

    print("\n" + "=" * 80)
    print("🎯 Classification test completed!")

✅ Server is running!

📚 Testing classifier with 7 PDF files:

📄 Testing: f1098.pdf
   ✅ Document Type: 1098
   ✅ Year: 2022

📄 Testing: handwritten.pdf
   ✅ Document Type: handwritten note
   ✅ Year: Unknown

📄 Testing: fw2.pdf
   ✅ Document Type: w2
   ✅ Year: 2024

📄 Testing: idcard.pdf
   ✅ Document Type: OTHER
   ✅ Year: Unknown

📄 Testing: f1040--2022.pdf
   ✅ Document Type: 1040
   ✅ Year: 2022

📄 Testing: f1099int.pdf
   ✅ Document Type: 1099int
   ✅ Year: 2024

📄 Testing: f1099div.pdf
   ✅ Document Type: 1099div
   ✅ Year: 2024

🎯 Classification test completed!


## Note: the year is not reliably in the title, so don't trust this solution to give you the year. Solution 2 will be better.

The year we want is the year the document was filed by the user, not the year the form was last revised by the IRS.



## Also comments on my solution

**This solution is probably not the best solution.** To be honest, it's quick, but it does not actually check the content of the document and relies only on the title and metadata. So I'm going to build a more complex solution that also gives us the year.