In [None]:
import json
import logging
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

logger = logging.getLogger(__name__)

# -------------------------------
# NuExtractFieldMapper (your code)
# -------------------------------

class NuExtractFieldMapper:
    def __init__(self, model_name="numind/NuExtract-1.5"):
        try:
            print(f"Loading NuExtract model: {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Using device: {device}")

            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if device == "cuda" else torch.float32,
                device_map="auto" if device == "cuda" else None,
                low_cpu_mem_usage=True
            )

            if device == "cpu":
                self.model = self.model.to(device)

            self.device = device
            logger.info(f"Successfully loaded NuExtract model on {device}")

        except Exception as e:
            logger.error(f"Failed to load NuExtract model: {e}")
            self.model = None
            self.tokenizer = None

    def extract_fields(self, ocr_text: str) -> dict:
        if not self.model or not self.tokenizer:
            logger.error("Model not loaded")
            return {}

        try:
            template = {
                "Name": "",
                "Age": "",
                "Gender": "",
                "DOB": "",
                "Address": "",
                "Phone": "",
                "Email": "",
                "ID": "",
                "Country": ""
            }

            prompt = f"""<|input|>
{ocr_text}
<|template|>
{json.dumps(template, indent=2)}
<|output|>
"""

            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=2048,
                truncation=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=500,
                    temperature=0.1,
                    do_sample=False,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            output_start = generated_text.find("<|output|>")
            if output_start != -1:
                json_part = generated_text[output_start + len("<|output|>"):].strip()
            else:
                json_part = generated_text.split(prompt)[-1].strip()

            try:
                result = json.loads(json_part)
                logger.info(f"Successfully extracted fields: {result}")
                return result
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse JSON output: {e}")
                logger.error(f"Raw output: {json_part}")
                return {}

        except Exception as e:
            logger.error(f"Error in field extraction: {e}")
            return {}

nuextract_mapper = NuExtractFieldMapper()

def map_fields_with_nuextract(llm_output, ocr_result=None):
    if isinstance(llm_output, dict) and 'text' in llm_output:
        ocr_text = llm_output['text']
    else:
        ocr_text = str(llm_output)

    extracted_data = nuextract_mapper.extract_fields(ocr_text)

    fields = {
        "name": {"value": None, "confidence": None},
        "age": {"value": None, "confidence": None},
        "gender": {"value": None, "confidence": None},
        "dob": {"value": None, "confidence": None},
        "address": {"value": None, "confidence": None},
        "country": {"value": None, "confidence": None},
        "phone": {"value": None, "confidence": None},
        "email": {"value": None, "confidence": None},
        "id_number": {"value": None, "confidence": None},
    }

    if not extracted_data:
        logger.warning("NuExtract failed to extract fields")
        return fields

    def calculate_confidence(field_value, ocr_result, field_name):
        if not field_value or not ocr_result or "detections" not in ocr_result:
            return 0.9
        field_str = str(field_value).lower().strip()
        matching_confidences = []
        for detection in ocr_result.get("detections", []):
            detection_text = str(detection.get("text", "")).lower().strip()
            detection_confidence = float(detection.get("confidence", 0.0))
            if field_str in detection_text or detection_text in field_str:
                matching_confidences.append(detection_confidence)
            else:
                field_words = field_str.split()
                if any(word in detection_text for word in field_words if len(word) > 2):
                    matching_confidences.append(detection_confidence * 0.8)
        if matching_confidences:
            return round(sum(matching_confidences) / len(matching_confidences), 3)
        else:
            return 0.85

    field_mapping = {
        "Name": "name",
        "Age": "age",
        "Gender": "gender",
        "DOB": "dob",
        "Address": "address",
        "Phone": "phone",
        "Email": "email",
        "ID": "id_number",
        "Country": "country"
    }

    for extract_key, internal_key in field_mapping.items():
        value = extracted_data.get(extract_key)
        if value and str(value).strip():
            clean_value = str(value).strip()
            confidence = calculate_confidence(clean_value, ocr_result, internal_key)
            fields[internal_key] = {"value": clean_value, "confidence": confidence}
            logger.info(f"Mapped {extract_key} -> {internal_key}: '{clean_value}'")

    return fields

# -------------------------------
# FastAPI setup
# -------------------------------

app = FastAPI(title="NuExtract API")

class ExtractRequest(BaseModel):
    text: str

@app.post("/extract")
def extract_fields_api(request: ExtractRequest):
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    result = map_fields_with_nuextract({"text": request.text})
    print(result)
    return {"mapped_fields": result}

# -------------------------------
# Run the server in Colab with ngrok
# -------------------------------
if __name__ == "__main__":
    import nest_asyncio
    import uvicorn
    from pyngrok import ngrok

    nest_asyncio.apply()

    # Expose port 8000 via ngrok
    public_url = ngrok.connect(8000)
    print("NuExtract API URL:", public_url)

    # Start FastAPI server
    uvicorn.run(app, host="0.0.0.0", port=8000)


Loading NuExtract model: numind/NuExtract-1.5


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

NuExtract API URL: NgrokTunnel: "https://apologal-flutelike-bert.ngrok-free.dev" -> "http://localhost:8000"


INFO:     Started server process [12443]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'name': {'value': 'Ananya Sharma', 'confidence': 0.9}, 'age': {'value': '29', 'confidence': 0.9}, 'gender': {'value': 'Female', 'confidence': 0.9}, 'dob': {'value': None, 'confidence': None}, 'address': {'value': '123, MG Road, Bengaluru, Karnataka - 560001', 'confidence': 0.9}, 'country': {'value': 'India', 'confidence': 0.9}, 'phone': {'value': '+91-9876543210', 'confidence': 0.9}, 'email': {'value': 'ananya.sharma@example.com', 'confidence': 0.9}, 'id_number': {'value': None, 'confidence': None}}
INFO:     119.161.98.68:0 - "POST /extract HTTP/1.1" 200 OK
{'name': {'value': 'JOHN DOE', 'confidence': 0.9}, 'age': {'value': None, 'confidence': None}, 'gender': {'value': None, 'confidence': None}, 'dob': {'value': '08/03/2000', 'confidence': 0.9}, 'address': {'value': None, 'confidence': None}, 'country': {'value': None, 'confidence': None}, 'phone': {'value': None, 'confidence': None}, 'email': {'value': None, 'confidence': None}, 'id_number': {'value': '012 345 678 9', 'confidence':

In [1]:
!pip install flask flask-ngrok transformers torch




In [2]:
!pip install pyngrok

^C


In [3]:
!ngrok authtoken 33CVblnubvJ0XkO6OFIoahA1ayu_9wXpjRCRthtwcvgUPQVq


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [4]:
from pyngrok import ngrok

# Open an HTTP tunnel on port 8000
public_url = ngrok.connect(8000)
print("NuExtract server URL:", public_url)

NuExtract server URL: NgrokTunnel: "https://apologal-flutelike-bert.ngrok-free.dev" -> "http://localhost:8000"
