In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [10]:
import json
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class QwenFieldMapper:
    def __init__(self, model_name="Qwen/Qwen2.5-1.5B-Instruct"):
        print(f"Loading Qwen model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )
        if device == "cpu":
            self.model = self.model.to(device)
        self.device = device
        print(f"Model loaded on {device}")

    def extract_fields(self, ocr_text: str, required_fields: list[str]) -> dict:
        """
        Extract the given required_fields from ocr_text and return only JSON 
        with those exact keys.
        """
        # Build a JSON skeleton dynamically
        skeleton = "{\n"
        skeleton += ",\n".join([f'  "{field}": ""' for field in required_fields])
        skeleton += "\n}"

        prompt = f"""Extract information from the following text and return ONLY a JSON object
with these exact field names (no other text or keys):

Text: {ocr_text}

Return only this JSON format:
{skeleton}
"""

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=300,
                temperature=0.05,
                do_sample=False,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        generated_tokens = outputs[0][inputs['input_ids'].shape[-1]:]
        generated = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return self._extract_strict_json(generated, required_fields)

    def _extract_strict_json(self, text: str, required_fields: list[str]) -> dict:
        """
        Extracts the first valid JSON object from text and ensures all required fields exist.
        """
        default_result = {field: "" for field in required_fields}

        try:
            clean_text = text.replace("```json", "").replace("```", "").strip()
            match = re.search(r'\{.*?\}', clean_text, re.DOTALL)
            if match:
                parsed_json = json.loads(match.group(0))
                # Guarantee all requested fields exist
                for key in default_result.keys():
                    if key not in parsed_json:
                        parsed_json[key] = ""
                return parsed_json
        except json.JSONDecodeError:
            pass
        return default_result



    

In [11]:
! pip install pyngrok

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [12]:
!ngrok authtoken 33CVblnubvJ0XkO6OFIoahA1ayu_9wXpjRCRthtwcvgUPQVq

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import json
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
  # Assuming your class is in qwen_mapper.py
from pyngrok import ngrok

# ----------------- FastAPI Setup -----------------
app = FastAPI(title="OCR Field Extractor API")

# ----------------- Request Body Model -----------------
class OCRRequest(BaseModel):
    text: str
    fields: list[str]

# ----------------- Initialize the Qwen Model -----------------
print("Loading model, this may take a few minutes...")
mapper = QwenFieldMapper()
print("Model ready!")

# ----------------- API Endpoint -----------------
@app.post("/extract")
def extract_fields(request: OCRRequest):
    try:
        result = mapper.extract_fields(request.text,request.fields)
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
import nest_asyncio
nest_asyncio.apply()

# ----------------- Run with ngrok -----------------
if __name__ == "__main__":
    # Expose API via ngrok
    public_url = ngrok.connect(8000)
    print(f"ngrok tunnel URL: {public_url}")

    # Start FastAPI server
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)


Loading model, this may take a few minutes...
Loading Qwen model: Qwen/Qwen2.5-1.5B-Instruct
Model loaded on cuda
Model ready!
ngrok tunnel URL: NgrokTunnel: "https://apologal-flutelike-bert.ngrok-free.dev" -> "http://localhost:8000"


INFO:     Started server process [36]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     119.161.98.68:0 - "POST /extract HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     119.161.98.68:0 - "POST /extract HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     119.161.98.68:0 - "POST /extract HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     119.161.98.68:0 - "POST /extract HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     119.161.98.68:0 - "POST /extract HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     119.161.98.68:0 - "POST /extract HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     119.161.98.68:0 - "POST /extract HTTP/1.1" 200 OK
