In [3]:
!pip install torch torchvision torchaudio  # Remove CUDA index for Mac compatibility
!pip install transformers
!pip install olmocr
!pip install flask pillow

Looking in indexes: https://download.pytorch.org/whl/cu121
[31mERROR: Could not find a version that satisfies the requirement torch (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torch[0m[31m
[0mCollecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.3.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting requests (from transformers)
  Downloading requests-2.

In [None]:
import os
import json
import re
import uuid
import tempfile
import base64
from io import BytesIO
from PIL import Image
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import torch

# OlmOCR Prompting & Rendering
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Load the model and processor once, at module level, so every request reuses them
# instead of paying the load cost again.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    # bfloat16 on GPU, full float32 precision when falling back to CPU
    torch_dtype=torch.bfloat16 if device.type == "cuda" else torch.float32,
)
model = model.eval().to(device)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Flask application and upload settings
ALLOWED_EXTENSIONS = {'pdf'}
UPLOAD_FOLDER = 'temp_files'

app = Flask(__name__)
app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=16 * 1024 * 1024,  # 16 MB limit
)

# Uploaded files are written here temporarily; create the directory up front.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

def allowed_file(filename):
    """Return True when ``filename`` ends in an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    A name without any dot is never allowed.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

# OlmOCR Manual Inference
def process_document_with_olmocr(filepath, page_num=1, max_new_tokens=512, temperature=0.8):
    """Run the olmOCR model on a single page of a PDF and return the generated text.

    Args:
        filepath: Path of the PDF file to process.
        page_num: Page handed to the olmOCR renderer and anchor-text extractor.
            Defaults to 1, the value that used to be hard-coded
            (presumably 1-based -- confirm against the olmocr API).
        max_new_tokens: Upper bound on the number of tokens to generate.
            Long pages may be cut off at the default of 512.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only (prompt tokens are
        sliced off and special tokens are skipped).
    """
    print(f"Running OlmOCR model on file: {filepath}")
    image_base64 = render_pdf_to_base64png(filepath, page_num, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(filepath, page_num, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    # Move every input tensor to the same device as the model.
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

    output = model.generate(
        **inputs,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=True,
    )

    # The generated sequence starts with the prompt; keep only what the model added.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    return text_output[0]  # Return the raw output string

# Optional: Simple parsing logic (can be improved)
def parse_olmocr_output(olmocr_raw_output):
    """Extract a few well-known fields and an invoice table from olmOCR output.

    If the raw output is a JSON object with a ``natural_text`` field, the page
    text is taken from that field; otherwise the raw output is parsed as-is.

    Args:
        olmocr_raw_output: The string produced by the OCR model.

    Returns:
        A dict with the keys:
          - ``document_type``: "invoice", "medical_report" or "unknown"
            (a diagnosis line takes precedence over an invoice total).
          - ``extracted_entities``: any of ``name``, ``date``, ``total_amount``,
            ``diagnosis`` that were found.
          - ``tables``: at most one ``invoice_items`` table.
          - ``raw_text``: the unmodified ``olmocr_raw_output``.
    """
    text = _extract_natural_text(olmocr_raw_output)
    entities = {}
    structured_data = {
        "document_type": "unknown",
        "extracted_entities": entities,
        "tables": [],
        "raw_text": olmocr_raw_output
    }

    name_match = re.search(r'(Customer Name|Patient Name):\s*(.+)', text, re.IGNORECASE)
    if name_match:
        entities["name"] = name_match.group(2).strip()

    date_match = re.search(r'(Date:|Report Date:)\s*(\d{4}-\d{2}-\d{2})', text, re.IGNORECASE)
    if date_match:
        entities["date"] = date_match.group(2).strip()

    # Accept an optional currency sign and thousands separators ("$1,234.50").
    total_match = re.search(r'Total Amount Due:\s*\$?\s*([\d.,]+)', text)
    if total_match:
        structured_data["document_type"] = "invoice"
        try:
            entities["total_amount"] = _parse_amount(total_match.group(1))
        except ValueError:
            # The label is present but the number is unreadable (e.g. "1.2.3");
            # keep the other extracted fields instead of failing the whole parse.
            pass

    diagnosis_match = re.search(r'Diagnosis:\s*(.+)', text, re.IGNORECASE)
    if diagnosis_match:
        entities["diagnosis"] = diagnosis_match.group(1).strip()
        structured_data["document_type"] = "medical_report"

    items = _extract_invoice_items(text)
    if items:
        structured_data["tables"].append({
            "name": "invoice_items",
            "headers": ["Item Code", "Description", "Quantity", "Unit Price", "Total"],
            "rows": items
        })

    return structured_data


def _extract_natural_text(raw_output):
    """Return the ``natural_text`` field of a JSON-object output, else the input unchanged."""
    stripped = raw_output.strip()
    if not stripped.startswith('{'):
        return raw_output
    try:
        payload = json.loads(stripped)
    except ValueError:
        # Not valid JSON (possibly truncated generation): parse it as plain text.
        return raw_output
    if isinstance(payload, dict) and "natural_text" in payload:
        natural_text = payload["natural_text"]
        # A null/non-string value means there is no page text to parse.
        return natural_text if isinstance(natural_text, str) else ""
    return raw_output


def _parse_amount(text):
    """Convert a money string such as "$1,234.50" to a float; raises ValueError if malformed."""
    return float(text.strip().lstrip('$').replace(',', '').strip())


def _extract_invoice_items(text):
    """Collect the rows that follow the invoice table header and convert them to dicts."""
    section_end = re.compile(r'^\s*(Total Amount Due|Payment Terms):')
    table_lines = []
    in_table_section = False
    for line in text.split('\n'):
        if "Item Code | Description | Quantity | Unit Price | Total" in line:
            in_table_section = True
            continue
        if not in_table_section or "---" in line:
            # Outside the table, or a separator row.
            continue
        if not line.strip() or section_end.match(line):
            # A blank line or a summary line closes the table.
            in_table_section = False
            continue
        table_lines.append(line.strip())

    items = []
    for item_line in table_lines:
        parts = [p.strip() for p in item_line.split('|')]
        # Markdown-style rows ("| a | b | c | d | e |") produce empty outer cells.
        if len(parts) == 7 and parts[0] == "" and parts[-1] == "":
            parts = parts[1:-1]
        if len(parts) != 5:
            continue
        try:
            items.append({
                "item_code": parts[0],
                "description": parts[1],
                "quantity": int(parts[2]),
                "unit_price": _parse_amount(parts[3]),
                "total": _parse_amount(parts[4])
            })
        except ValueError:
            # Skip rows whose numeric cells cannot be converted.
            continue
    return items

@app.route('/')
def home():
    """Landing route: report that the server is up and point callers at /extract."""
    status_message = "Backend is running! Send files to /extract."
    return status_message

@app.route('/extract', methods=['POST'])
def extract_file():
    """Accept an uploaded PDF, run OCR on it and return the parsed data as JSON.

    Expects a multipart form upload under the field name ``file``.

    Returns:
        - 400 with an ``error`` message when the file part is missing, no file
          was selected, or the extension is not allowed.
        - 200 with ``message``, ``filename`` (sanitised) and ``extracted_data``
          on success.
        - 500 with an ``error`` message when saving, OCR or parsing raises.

    The upload is stored under a unique name in the upload folder and removed
    again before the response is returned, whether or not processing succeeded.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if not file or not allowed_file(file.filename):
        return jsonify({'error': 'File type not allowed or invalid file'}), 400

    filepath = None
    try:
        filename = secure_filename(file.filename)
        # Prefix with a UUID so concurrent uploads of the same name cannot collide.
        unique_filename = f"{uuid.uuid4()}_{filename}"
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], unique_filename)
        file.save(filepath)

        print(f"Calling OlmOCR for file: {filepath}")
        olmocr_output = process_document_with_olmocr(filepath)
        print("OlmOCR processing complete. Parsing output...")

        structured_data = parse_olmocr_output(olmocr_output)

        return jsonify({
            'message': 'File processed and data extracted successfully!',
            'filename': filename,
            'extracted_data': structured_data
        }), 200

    except Exception as e:
        print(f"An error occurred: {e}")
        # NOTE(review): the exception text is echoed to the client and may expose
        # internal details such as server paths -- consider a generic message.
        return jsonify({'error': f'An internal server error occurred: {str(e)}'}), 500
    finally:
        if filepath is not None:
            # An exception raised inside `finally` would replace the response that
            # is already being returned, so a failed cleanup is only reported.
            try:
                os.remove(filepath)
                print(f"Cleaned up temporary file: {filepath}")
            except FileNotFoundError:
                pass  # the file was never written
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {filepath}: {cleanup_error}")

if __name__ == '__main__':
    # debug=True also turns on the auto-reloader, which re-executes the launching
    # script in a child process. That does not work from a notebook kernel and
    # would run the module-level model load a second time, so it is disabled here.
    app.run(debug=True, use_reloader=False, port=5000)