In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import json
import re
import requests

class QwenFieldMapper:
    """Map free-form OCR text onto a fixed set of named fields via an Ollama model.

    The model is queried through an HTTP POST to ``api_url`` (by default the
    ``/api/generate`` route of an Ollama server on localhost). Every public
    result is a dict whose keys are exactly the requested field names.
    """

    def __init__(self, model_name="qwen2.5:1.5b",
                 api_url="http://localhost:11434/api/generate", timeout=120):
        """
        Uses an Ollama model that is already pulled locally.
        Example: ollama pull qwen2.5:1.5b

        Args:
            model_name: Model tag sent in the ``model`` entry of each request.
            api_url: URL the generation request is POSTed to.
            timeout: Timeout in seconds passed to ``requests.post``.
        """
        self.model_name = model_name
        self.api_url = api_url
        self.timeout = timeout
        print(f"Using Ollama model (offline): {self.model_name}")

    def extract_fields(self, ocr_text: str, required_fields: list[str]) -> dict:
        """
        Extract the given required_fields from ocr_text and return only JSON
        with those exact keys.

        Args:
            ocr_text: Text to search; embedded verbatim in the prompt.
            required_fields: Field names to extract.

        Returns:
            A dict with one entry per name in ``required_fields``. A field the
            model did not supply is ``""``. If the HTTP call fails for any
            reason, the error is printed and every field is ``""`` (this
            method is deliberately best-effort and does not raise for
            communication errors).
        """
        # Build the JSON skeleton shown to the model. json.dumps escapes quotes
        # and backslashes inside field names so the skeleton is always valid
        # JSON; for ordinary names the text is identical to a plain "name": "".
        skeleton = "{\n"
        skeleton += ",\n".join(
            f'  {json.dumps(str(field), ensure_ascii=False)}: ""'
            for field in required_fields
        )
        skeleton += "\n}"

        prompt = f"""Extract information from the following text and return ONLY a JSON object
with these exact field names (no other text or keys):

Text: {ocr_text}

Return only this JSON format:
{skeleton}
"""

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False
        }

        try:
            response = requests.post(self.api_url, json=payload, timeout=self.timeout)
            response.raise_for_status()
            generated = response.json().get("response", "")
        except Exception as e:
            # Best-effort by design: report the problem and fall back to blanks.
            print("Error communicating with Ollama:", e)
            return {field: "" for field in required_fields}

        return self._extract_strict_json(generated, required_fields)

    def _extract_strict_json(self, text: str, required_fields: list[str]) -> dict:
        """
        Extracts the first valid JSON object from text and ensures all required fields exist.

        The text is scanned for ``{`` characters and a real JSON decoder is
        started at each one, so nested objects and ``}`` characters inside
        string values are handled, and surrounding prose or Markdown code
        fences are ignored. Candidates that fail to decode are skipped.

        Args:
            text: Raw model output. Anything that is not a ``str`` is treated
                as containing no JSON.
            required_fields: Field names the result must contain.

        Returns:
            A dict holding exactly ``required_fields`` as keys, in that order.
            Values come from the first decodable JSON object; keys it lacks
            (or all keys, when no object is found) map to ``""``. Keys the
            model added beyond the requested ones are dropped.
        """
        default_result = {field: "" for field in required_fields}
        if not isinstance(text, str):
            return default_result

        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                candidate, _end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                # Keep only the requested keys; blank out the missing ones.
                return {field: candidate.get(field, "") for field in required_fields}
            start = text.find("{", start + 1)
        return default_result
# Create the mapper, naming the model tag explicitly via its keyword.
mapper = QwenFieldMapper(model_name="qwen2.5:1.5b")




Using Ollama model (offline): qwen2.5:1.5b
{'Name': 'Alice Johnson', 'Date': '2025-09-29', 'ID': 'A12345'}


In [11]:
! pip install pyngrok

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [12]:
!ngrok authtoken 33CVblnubvJ0XkO6OFIoahA1ayu_9wXpjRCRthtwcvgUPQVq

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import json
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import nest_asyncio


# ----------------- FastAPI Setup -----------------
# Application object: the @app.post route defined below is registered on it,
# and it is the object handed to uvicorn.run at the end of this cell.
app = FastAPI(title="OCR Field Extractor API")

# ----------------- Request Body Model -----------------
# Body accepted by POST /extract. Documented with comments rather than a class
# docstring so the generated schema stays exactly as it was.
class OCRRequest(BaseModel):
    # OCR output to search; passed unchanged to mapper.extract_fields.
    text: str
    # Names of the fields to extract; passed as required_fields to the mapper.
    fields: list[str]

# ----------------- Initialize the Qwen Model -----------------
# NOTE: QwenFieldMapper.__init__ only stores its settings and prints one line;
# no model is downloaded or loaded at this point, despite the messages printed here.
# This also rebinds the `mapper` name created in the earlier cell.
print("Loading model, this may take a few minutes...")
mapper = QwenFieldMapper()  # Ollama offline model
print("Model ready!")

# ----------------- API Endpoint -----------------
# POST /extract: hand the submitted text and field names to the module-level
# mapper and send its dict back as the response body. Any exception raised by
# the mapper is converted into an HTTP 500 whose detail is the exception text.
# (Comments are used instead of a docstring so the published endpoint
# description is left unchanged.)
@app.post("/extract")
def extract_fields(request: OCRRequest):
    try:
        return mapper.extract_fields(request.text, request.fields)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# NOTE(review): presumably applied so uvicorn.run() can start inside the notebook
# kernel's already-running event loop — confirm against the nest_asyncio docs.
# It must stay ahead of the uvicorn.run() call below.
nest_asyncio.apply()

# ----------------- Run FastAPI -----------------
if __name__ == "__main__":
    import uvicorn
    # Change the port to whatever you want, e.g., 9000
    # host="127.0.0.1" is the loopback address, so the server is reachable only
    # from this machine; this call blocks the cell while the server is running.
    uvicorn.run(app, host="127.0.0.1", port=9000)


Loading model, this may take a few minutes...
Using Ollama model (offline): qwen2.5:1.5b
Model ready!


INFO:     Started server process [31352]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:9000 (Press CTRL+C to quit)


INFO:     127.0.0.1:50837 - "POST /extract HTTP/1.1" 200 OK
