<a href="https://colab.research.google.com/github/NgoKien15802/DemoIdentityKeyCloak/blob/main/ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U -q transformers==4.44.2 bitsandbytes
!pip install -U -q huggingface_hub
!pip install -q flask flask-cors pyngrok flash_attn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m98.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m112.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import requests


# Thư viện xử lý ảnh đầu vào

In [3]:
# Input-image preprocessing helpers (taken from the original HF source).
# Per-channel mean / std handed to T.Normalize inside build_transform().
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    The returned ``T.Compose`` converts non-RGB images to RGB, resizes to a
    square of ``input_size`` x ``input_size`` with bicubic interpolation,
    converts the result to a tensor and normalizes it with
    IMAGENET_MEAN / IMAGENET_STD.
    """
    def _ensure_rgb(img):
        # Images already in RGB mode are passed through untouched.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (columns, rows) grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: width / height of the source image.
        target_ratios: iterable of (columns, rows) candidates, scanned in order.
        width, height: source image size in pixels.
        image_size: edge length of one square tile.

    Returns:
        The best candidate, or (1, 1) when ``target_ratios`` is empty. On an
        exact tie with the current best distance, the later candidate wins
        only if the image area exceeds half of that candidate's pixel area.
    """
    pixel_area = width * height
    smallest_gap = float('inf')
    chosen = (1, 1)
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            # Strictly closer aspect ratio: take it unconditionally.
            smallest_gap, chosen = gap, candidate
            continue
        # Equal distance: switch only when the image is large enough for the grid.
        if gap == smallest_gap and pixel_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate
    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split ``image`` into square tiles laid out on the best-matching grid.

    Candidate grids are all (columns, rows) pairs whose tile count lies in
    [min_num, max_num]. ``find_closest_aspect_ratio`` selects one of them, the
    image is resized to fill that grid exactly, and the result is cropped into
    ``image_size`` x ``image_size`` tiles in row-major order.

    Returns:
        A list of tiles. When ``use_thumbnail`` is true and more than one tile
        was produced, a whole-image thumbnail resized to
        ``image_size`` x ``image_size`` is appended as the last element.
    """
    src_w, src_h = image.size
    src_aspect = src_w / src_h

    # Enumerate every candidate grid, then order them by total tile count.
    candidate_grids = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if cols * rows <= max_num and cols * rows >= min_num
    )
    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    # Grid whose aspect ratio best matches the source image.
    best_grid = find_closest_aspect_ratio(
        src_aspect, candidate_grids, src_w, src_h, image_size)
    grid_cols, grid_rows = best_grid[0], best_grid[1]

    # Pixel size of the resized canvas and number of tiles it holds.
    canvas_w = image_size * grid_cols
    canvas_h = image_size * grid_rows
    tile_count = grid_cols * grid_rows

    canvas = image.resize((canvas_w, canvas_h))
    tiles_per_row = canvas_w // image_size
    tiles = []
    for index in range(tile_count):
        # Row-major walk over the grid: index -> (row, column).
        row, col = divmod(index, tiles_per_row)
        crop_box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(canvas.crop(crop_box))
    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        # Extra low-resolution view of the whole image.
        tiles.append(image.resize((image_size, image_size)))
    return tiles

def load_image(image_file, input_size=448, max_num=12, timeout=30):
    """Download an image from a URL and turn it into a batch of tile tensors.

    Args:
        image_file: HTTP(S) URL of the image to fetch.
        input_size: edge length (pixels) of every square tile.
        max_num: upper bound on the tile grid passed to ``dynamic_preprocess``.
        timeout: seconds to wait for the HTTP server before giving up, so a
            stalled host cannot block the caller forever.

    Returns:
        A tensor made by stacking the transformed tiles (plus the thumbnail
        that ``dynamic_preprocess`` appends when more than one tile exists).

    Raises:
        requests.RequestException: the download failed or the server returned
            an HTTP error status.
        OSError: the downloaded bytes could not be decoded as an image.
    """
    from io import BytesIO  # local import: only needed for in-memory decoding

    # Read the fully decoded body instead of the raw socket stream: `.content`
    # handles Content-Encoding, and the context manager releases the connection.
    with requests.get(image_file, timeout=timeout) as resp:
        # Fail loudly on 4xx/5xx instead of trying to decode an error page.
        resp.raise_for_status()
        payload = resp.content

    image = Image.open(BytesIO(payload)).convert('RGB')
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = torch.stack([transform(tile) for tile in tiles])
    return pixel_values


# Load model và test model trên Colab

In [4]:
# Hugging Face Hub repository id of the checkpoint to load.
model_name = "5CD-AI/Vintern-1B-v2"

# trust_remote_code=True lets transformers run the modeling code that ships
# with the model repository.
model = AutoModel.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
# Switch to inference mode and move the weights to the GPU.
model = model.eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False,
)

# Generation settings passed to model.chat() in the cells below.
generation_config = {
    "max_new_tokens": 512,
    "do_sample": False,
    "num_beams": 3,
    "repetition_penalty": 3.5,
}



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/5.33k [00:00<?, ?B/s]

configuration_internvl_chat.py:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

configuration_intern_vit.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- configuration_internvl_chat.py
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internvl_chat.py:   0%|          | 0.00/15.2k [00:00<?, ?B/s]

conversation.py:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_intern_vit.py:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- modeling_internvl_chat.py
- conversation.py
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.38M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

In [5]:
# Sample receipt image used to smoke-test the model.
test_image = 'https://media-cdn-v2.laodong.vn/Storage/NewsPortal/2022/12/7/1124909/Karaoke-2.jpg'

# Download and tile the image (max_num=6 caps the tile grid), then move it to
# the GPU in bfloat16, the same dtype the model was loaded with.
pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()

# Vietnamese prompt: read the receipt in the image and return only the list of
# line items as JSON with name / quantity / unit price / amount fields.
prompt = '''<image>\nNhận diện hoá đơn trong ảnh. Chỉ trả về phần liệt kê các mặt hàng hàng dưới dạng JSON:
[
  {
    "Tên món": "Tên món",
    "Số lượng": "Số lượng",
    "Đơn giá": "Đơn giá",
    "Thành tiền": "Thành tiền"
  },
]
'''
response = model.chat(tokenizer, pixel_values, prompt, generation_config)

# Drop the reference to the image tensor once generation is done; the bare
# `response` expression below makes the notebook display the model's answer.
del pixel_values
response

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


'[\n  {\n    "Tên món": "Giờ VIP222",\n    "Số lượng": "1h54\'",\n    "Đơn giá": "500 000",\n    "Thành tiền": "950 000"\n  },\n  {\n    "Tên món": "Suối",\n    "Số lượng": "3",\n    "Đơn giá": "12 000",\n    "Thành tiền": "36 000"\n  },\n  {\n    "Tên món": "Hoa quả thập cẩm",\n    "Số lượng": "1",\n    "Đơn giá": "140 000",\n    "Thành tiền": "140 000"\n  },\n  {\n    "Tên món": "Hoa quả Bưởi",\n    "Số lượng": "2",\n    "Đơn giá": "220 000",\n    "Thành tiền": "440 000"\n  },\n  {\n    "Tên món": "Hoa Quả Roi",\n    "Số lượng": "1",\n    "Đơn giá": "100 000",\n    "Thành tiền": "100 000"\n  },\n  {\n    "Tên món": "Ken ngoại",\n    "Số lượng": "14",\n    "Đơn giá": "60 000",\n    "Thành tiền": "840 000"\n  }\n]'

# Triển khai Flask và Expose ra API qua Ngrok

In [6]:
# Setup Ngrok Token
from google.colab import userdata
from flask import Flask, jsonify, request
from flask_cors import CORS
from pyngrok import ngrok

# Read the ngrok auth token from the Colab secret named "ngrok_token" (so it is
# never hardcoded in the notebook) and register it with pyngrok.
authtoken = userdata.get("ngrok_token")
ngrok.set_auth_token(authtoken)



# Viết code Flask để expose ra API

In [38]:
import json
from flask import Flask, request, jsonify
from flask_cors import CORS

# Create the Flask application and enable CORS on it with flask_cors' default
# settings, so the OCR routes defined below can be called cross-origin.
app = Flask(__name__)
CORS(app)

# Prompt for the FRONT side of the Vietnamese citizen ID card (CCCD).
# It asks the model for a JSON object with the keys No, FullName, DateOfBirth,
# Sex, Nationality, PlaceOfOrigin, PlaceOfResidence and DateOfExpiry.
# The text is sent to the model verbatim, so it is kept in Vietnamese.
prompt_front = '''<image> Hãy nhận diện các thông tin trên căn cước công dân và trích xuất thành JSON có cấu trúc sau:
{
  "No": "Số căn cước công dân",
  "FullName": "Họ và tên đầy đủ",
  "DateOfBirth": "Ngày tháng năm sinh (định dạng DD/MM/YYYY)",
  "Sex": "Giới tính (Nam/Nữ)",
  "Nationality": "Quốc tịch",
  "PlaceOfOrigin": "Nguyên quán",
  "PlaceOfResidence": "Nơi thường trú (bao gồm cả thông tin đầy đủ ở phần dưới, không chỉ ghi vắn tắt như 'VD Xóm 1, Hưng Yên,...')",
  "DateOfExpiry": "Ngày hết hạn (định dạng DD/MM/YYYY)"
}

Lưu ý:
- Trả về JSON hợp lệ.
- Đảm bảo trích xuất đúng định dạng ngày tháng.
- "PlaceOfResidence" phải bao gồm đầy đủ cả phần ghi chú phía dưới (nếu có).
'''

# Prompt for the BACK side of the Vietnamese citizen ID card (CCCD).
# It asks the model for a JSON object with the keys IssueDate and No, where No
# is the last 12 digits of the machine-readable line segment before `<<`.
# The text is sent to the model verbatim, so it is kept in Vietnamese.
prompt_back = '''<image>
Hãy nhận diện các thông tin trên mặt sau của căn cước công dân và trích xuất thành JSON có cấu trúc sau:

{
  "IssueDate": "Ngày cấp (định dạng DD/MM/YYYY)",
  "No": "Số căn cước công dân (12 số cuối từ chuỗi 27 ký tự đầu tiên trước dấu `<<`)"
}

Lưu ý:
- Trả về JSON hợp lệ.
- "IssueDate" phải đúng định dạng DD/MM/YYYY.
- "No" phải lấy **chính xác 12 số cuối của chuỗi 27 ký tự đầu tiên trước dấu `<<`**, không lấy nhầm số khác.

Ví dụ:
- Nếu chuỗi trên thẻ là: `IDVNM202004857036203995703<<8239489023894`
- Kết quả mong muốn: `"No": "036203995703"`
'''




def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if present.

    Chat models often wrap JSON answers in a fenced block (optionally tagged
    ``json``), which ``json.loads`` rejects; plain text is returned unchanged
    apart from surrounding whitespace.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the whole opening fence line, then a trailing closing fence.
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def _normalize_back_number(payload):
    """Reduce a raw machine-readable line in ``payload["No"]`` to the ID number.

    prompt_back asks for the last 12 digits of the segment before ``<<``, but
    the model sometimes echoes the whole line instead. When that happens the
    same rule is applied here; any other value is left untouched.
    """
    if not isinstance(payload, dict):
        return payload
    number = payload.get("No")
    if isinstance(number, str) and "<<" in number:
        head = number.split("<<", 1)[0]
        digits = "".join(ch for ch in head if ch.isascii() and ch.isdigit())
        if len(digits) >= 12:
            payload["No"] = digits[-12:]
    return payload


def _handle_ocr_request(prompt, postprocess=None):
    """Shared body of the OCR routes: validate input, run the model, parse JSON.

    Args:
        prompt: prompt text sent to the model together with the image.
        postprocess: optional callable applied to the parsed JSON value.

    Returns:
        A Flask response: the parsed JSON on success, HTTP 400 when
        ``image_url`` is missing or the image cannot be loaded, HTTP 500 (with
        the raw model text) when the model output is not valid JSON.
    """
    # silent=True returns None instead of raising on a missing/invalid JSON body.
    data = request.get_json(silent=True)
    image_url = data.get('image_url') if isinstance(data, dict) else None
    if not isinstance(image_url, str) or not image_url.strip():
        return jsonify({"error": "Thiếu 'image_url' trong body JSON"}), 400

    # NOTE(review): image_url is client-controlled and fetched server-side as
    # given; consider restricting allowed hosts before exposing this publicly.
    try:
        response_message = ocr_by_llm(image_url, prompt)
    except (requests.RequestException, OSError) as exc:
        # Download failures and undecodable image data are the client's problem.
        return jsonify({"error": "Không tải được ảnh từ image_url", "detail": str(exc)}), 400

    try:
        json_object = json.loads(_strip_code_fence(response_message))
    except json.JSONDecodeError:
        return jsonify({"error": "Lỗi định dạng JSON", "raw": response_message}), 500

    if postprocess is not None:
        json_object = postprocess(json_object)
    return jsonify(json_object)


@app.route('/ocr/front', methods=['POST'])
def ocr_front():
    """POST /ocr/front — OCR the front of an ID card given ``{"image_url": ...}``."""
    return _handle_ocr_request(prompt_front)


@app.route('/ocr/back', methods=['POST'])
def ocr_back():
    """POST /ocr/back — OCR the back of an ID card given ``{"image_url": ...}``."""
    return _handle_ocr_request(prompt_back, postprocess=_normalize_back_number)


def ocr_by_llm(image_url, prompt):
    """Run the vision-language model on the image at ``image_url``.

    The image is downloaded and tiled by ``load_image`` (max_num=6), moved to
    the GPU in bfloat16 and passed to ``model.chat`` with ``prompt`` and the
    global ``generation_config``. The raw model text is printed to the cell
    output and returned unchanged.
    """
    pixel_values = load_image(image_url, max_num=6).to(torch.bfloat16).cuda()
    response_message = model.chat(tokenizer, pixel_values, prompt, generation_config)
    # Release the reference to the GPU tensor as soon as generation is done.
    del pixel_values

    print(response_message)
    return response_message

if __name__ == '__main__':
    # Single source of truth for the port shared by the tunnel and Flask.
    port = 5555

    # Open an ngrok tunnel to the local port before starting the server,
    # because app.run() blocks until the server is stopped.
    ngrok_url = ngrok.connect(port)

    # ngrok.connect() returns a tunnel object here, whose repr is what used to
    # be printed; show only its public URL. The getattr fallback keeps the
    # line working should connect() return a plain string.
    print(f"Ngrok URL: {getattr(ngrok_url, 'public_url', ngrok_url)}")
    app.run(port=port)


Ngrok URL: NgrokTunnel: "https://7c03-34-16-171-22.ngrok-free.app" -> "http://localhost:5555"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5555
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [19/Mar/2025 16:42:32] "POST /ocr/back HTTP/1.1" 200 -


{
  "IssueDate": "05/06/2021",
  "No": "IDVNM202004857036203995703<<8239489023894"
}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [19/Mar/2025 16:43:02] "POST /ocr/front HTTP/1.1" 200 -


{
  "No": "036202004987",
  "FullName": "NGÔ TRUNG KIÊN",
  "DateOfBirth": "15/08/2002",
  "Sex": "Nam",
  "Nationality": "Việt Nam",
  "PlaceOfOrigin": "Hải Vân, Hải Hậu, Nam Định",
  "PlaceOfResidence": "Xóm 11, Hải Vân, Hải Hậu, Nam Định",
  "DateOfExpiry": "15/08/2027"
}
