In [2]:
from datasets import load_dataset
import numpy as np
import easyocr
import google.generativeai as genai
import ast
import re
from tqdm import tqdm
from PIL import Image, ImageFile
import imagehash
from dotenv import load_dotenv
from time import sleep
import os

In [3]:
# OCR engine that recognises Vietnamese and English text.
reader = easyocr.Reader(lang_list = ['vi', 'en'])

# Pull variables from a local .env file into the process environment, then
# hand the Gemini key to the SDK and create the model handle used below.
load_dotenv()
genai.configure(api_key = os.environ.get("GEMINI_API_KEY"))
model = genai.GenerativeModel(model_name = "gemini-2.0-flash")

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [4]:
# Import Hugging Face Hub utilities
from huggingface_hub.hf_api import HfFolder  # For handling authentication tokens
from huggingface_hub import Repository, HfApi  # Tools for managing repositories on Hugging Face Hub

# SECURITY: the access token used to be hard-coded on this line. A token that
# has been saved in a notebook must be treated as leaked: revoke it on the
# Hugging Face website and provide a fresh one through the HF_TOKEN environment
# variable (for example in the .env file that load_dotenv() reads).
REPO_ACCESS_TOKEN = os.getenv("HF_TOKEN")

if REPO_ACCESS_TOKEN:
    # Save the Hugging Face authentication token
    HfFolder.save_token(REPO_ACCESS_TOKEN)
else:
    # No token supplied: leave whatever credentials are already stored untouched.
    print("[!] HF_TOKEN is not set; continuing with any previously saved Hugging Face credentials.")

In [5]:
# Open the validation and test splits in streaming mode (samples are fetched
# lazily while iterating).
val_hf_dataset, test_hf_dataset = (
    load_dataset("Namronaldo2004/ViInfographicsVQA", split = split_name, streaming = True)
    for split_name in ("val", "test")
)

Resolving data files:   0%|          | 0/225 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/62 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/225 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/62 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/225 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/62 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/225 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/62 [00:00<?, ?it/s]

In [6]:
def fix_list_string(raw_str: str) -> str:
    """Repair an almost-valid Python list of strings so it can be parsed.

    The input is expected to look like a list literal with one double-quoted
    item per line, but may be missing commas between lines, may contain
    unescaped double quotes inside an item, or may be written on one line.

    Args:
        raw_str: Text of the list, with or without the surrounding brackets.

    Returns:
        A string of the form ``[\\n"item 1",\\n"item 2"\\n]`` in which every
        item is a properly escaped double-quoted literal.
    """
    # Step 1: drop surrounding whitespace and the outer list brackets.
    body = raw_str.strip().strip('[]')

    # Step 2: a list written on a single line has no newlines between items,
    # so break it at each `", "` boundary. Multi-line input is left alone here:
    # the per-line pass below already supplies any missing commas, and joining
    # adjacent lines would fuse two items into one string.
    if '\n' not in body.strip():
        body = re.sub(r'"\s*,\s*"', '",\n"', body)

    # Step 3: rebuild every non-empty line as one escaped string literal.
    items = []
    for line in body.split('\n'):
        line = line.strip().rstrip(',').rstrip()

        if not line:
            continue

        # Remove the enclosing double quotes, if present.
        if line.startswith('"') and line.endswith('"'):
            line = line[1:-1]

        # Quotes that arrive already escaped are unescaped first, otherwise the
        # escaping below would double them and leave stray backslashes.
        line = line.replace('\\"', '"')

        # Escape backslashes and inner double quotes.
        line = line.replace('\\', '\\\\').replace('"', '\\"')

        items.append(f'"{line}"')

    return '[\n' + ',\n'.join(items) + '\n]'

implementor = "Nam"

# The index file holds "<start> <end>": the inclusive range of sample
# positions this run should cover. It is rewritten on failure (see the
# exception handler below) so a later run can resume.
with open (f"{implementor}_val_index.txt", 'r', encoding = 'utf-8') as file:
    lst = file.read().strip('\n').split()
    start_index = int(lst[0])
    end_index = int(lst[1])
    print(f"Start to read dataset from the index {start_index} to the index {end_index}")

ImageFile.LOAD_TRUNCATED_IMAGES = True

# Settings that never change between samples, set up once instead of on
# every loop iteration.
MAX_WEBP_SIZE = 16383
try:
    resample_mode = Image.Resampling.LANCZOS
except AttributeError:
    resample_mode = Image.LANCZOS  # Pillow releases without Image.Resampling
MAX_429_ATTEMPTS = 6       # attempts before a 429 is treated as fatal
RETRY_DELAY_SECONDS = 10   # pause between attempts after a 429

count = 0          # position of the current sample in the stream
count_ocr = 0      # position of the most recent sample that was sent to OCR
old_hash = None    # perceptual hash of that sample's image
api_error = False  # set when a Gemini failure should stop the whole run

for sample in tqdm(val_hf_dataset, desc = "Scanning dataset"):
    try:
        # Fast-forward through the samples that precede the requested range.
        if (count < start_index):
            count += 1
            count_ocr += 1
            continue

        # OCR + Gemini
        image = sample["image"]  # Load image from dataset
        image_np = np.array(image)
        image = Image.fromarray(image_np)

        # Only the first sample of a run of identical images is processed.
        # The hash of that first image is cached rather than recomputed for
        # every following sample.
        current_hash = imagehash.phash(image)
        if count == start_index or current_hash != old_hash:
            old_hash = current_hash
            count_ocr = count
            ocr_chunks = reader.readtext(image_np)
            ocr_chunks = list(chunk[1] for chunk in ocr_chunks)

            prompt = f"""Bạn nhận được một danh sách văn bản ngắn, trích xuất từ ảnh infographic (OCR).
            
            Yêu cầu:
            - Gom nhóm các dòng có liên quan theo ngữ cảnh.
            - Viết lại thành các câu hoàn chỉnh, ngắn gọn, rõ nghĩa dùng để thực hiện text embedding.
            - Trả về **duy nhất một Python list hợp lệ** chứa các câu, ví dụ:
            ```python
            [
                "Câu hoàn chỉnh 1.",
                "Câu hoàn chỉnh 2.",
                ...
            ]
            
            Dữ liệu:
            {ocr_chunks}
            
            """

            # Shrink images whose side exceeds the limit before sending them
            # to the model, keeping the aspect ratio. OCR above already ran on
            # the full-size pixels.
            if image.width > MAX_WEBP_SIZE or image.height > MAX_WEBP_SIZE:
                print(f"[!] Resizing large image at index {count}: original size {image.size}")

                resize_ratio = min(MAX_WEBP_SIZE / image.width, MAX_WEBP_SIZE / image.height)
                # Clamp to 1 so an extreme aspect ratio cannot yield a 0-pixel side.
                new_size = (
                    max(1, int(image.width * resize_ratio)),
                    max(1, int(image.height * resize_ratio)),
                )

                image = image.resize(new_size, resample=resample_mode)

            count_429 = 0
            while (True):
                try:
                    response = model.generate_content([prompt, image])
                    break
                except Exception as e:
                    error_text = str(e)
                    if "429" in error_text:
                        count_429 += 1
                        if (count_429 < MAX_429_ATTEMPTS):
                            # Announce a retry only when one really follows.
                            print(f"[-] Received 429 Too Many Requests. Retrying after {RETRY_DELAY_SECONDS} seconds...")
                            sleep(RETRY_DELAY_SECONDS)
                            continue
                        api_error = True
                        raise
                    # An oversized image only affects this sample; any other
                    # failure stops the run.
                    if "Image size exceeds" not in error_text:
                        api_error = True
                    raise  # Other errors should not be ignored

            # Post-process the model output.
            # Strip the ```python ... ``` fence around the answer.
            response = response.text
            response = re.sub(r"^```python\s*", "", response.strip(), flags = re.IGNORECASE)
            response = re.sub(r"\s*```$", "", response.strip())
            # Replace curly double quotes with straight ones.
            response = response.replace("“", '"').replace("”", '"')
            # Repair the list text, then parse it.
            response = fix_list_string(response)
            meaning_sentences = ast.literal_eval(response)
            joined_text = " ".join(meaning_sentences)

            with open(f"{implementor}_val_OCR.txt", "a+", encoding = "utf-8") as out_file:
                out_file.write(f"Index {count}: " + joined_text + "\n")

        count += 1
        if (count > end_index):
            break

    except (Exception, KeyboardInterrupt) as e:
        print(f"[-] Error: {e if not isinstance(e, KeyboardInterrupt) else 'Interrupted'}")

        # No sample past the starting index has been sent to OCR yet: leave
        # the index file untouched and stop.
        if (count_ocr <= start_index):
            break

        # Record the position of the last image sent to OCR as the new start.
        with open(f"{implementor}_val_index.txt", "w", encoding = "utf-8") as f:
            f.write(f"{count_ocr} {end_index}")

        if isinstance(e, KeyboardInterrupt):
            break

        if (api_error):
            break

        # Recoverable failure: move on to the next sample.
        count += 1
        continue

Start to read dataset from the index 13559 to the index 13885


Scanning dataset: 13716it [1:34:46, 18.93s/it]

[!] Resizing large image at index 13716: original size (1500, 21406)


Scanning dataset: 13885it [2:37:52,  1.47it/s] 


In [11]:
# The index file holds "<start> <end>": the inclusive range of sample
# positions this run should cover. It is rewritten on failure (see the
# exception handler below) so a later run can resume.
with open (f"{implementor}_test_index.txt", 'r', encoding = 'utf-8') as file:
    lst = file.read().strip('\n').split()
    start_index = int(lst[0])
    end_index = int(lst[1])
    print(f"Start to read dataset from the index {start_index} to the index {end_index}")

ImageFile.LOAD_TRUNCATED_IMAGES = True

# Settings that never change between samples, set up once instead of on
# every loop iteration.
MAX_WEBP_SIZE = 16383
try:
    resample_mode = Image.Resampling.LANCZOS
except AttributeError:
    resample_mode = Image.LANCZOS  # Pillow releases without Image.Resampling
MAX_429_ATTEMPTS = 6       # attempts before a 429 is treated as fatal
RETRY_DELAY_SECONDS = 10   # pause between attempts after a 429

count = 0          # position of the current sample in the stream
count_ocr = 0      # position of the most recent sample that was sent to OCR
old_hash = None    # perceptual hash of that sample's image
api_error = False  # set when a Gemini failure should stop the whole run

for sample in tqdm(test_hf_dataset, desc = "Scanning dataset"):
    try:
        # Fast-forward through the samples that precede the requested range.
        if (count < start_index):
            count += 1
            count_ocr += 1
            continue

        # OCR + Gemini
        image = sample["image"]  # Load image from dataset
        image_np = np.array(image)
        image = Image.fromarray(image_np)

        # Only the first sample of a run of identical images is processed.
        # The hash of that first image is cached rather than recomputed for
        # every following sample.
        current_hash = imagehash.phash(image)
        if count == start_index or current_hash != old_hash:
            old_hash = current_hash
            count_ocr = count
            ocr_chunks = reader.readtext(image_np)
            ocr_chunks = list(chunk[1] for chunk in ocr_chunks)

            prompt = f"""Bạn nhận được một danh sách văn bản ngắn, trích xuất từ ảnh infographic (OCR).
            
            Yêu cầu:
            - Gom nhóm các dòng có liên quan theo ngữ cảnh.
            - Viết lại thành các câu hoàn chỉnh, ngắn gọn, rõ nghĩa dùng để thực hiện text embedding.
            - Trả về **duy nhất một Python list hợp lệ** chứa các câu, ví dụ:
            ```python
            [
                "Câu hoàn chỉnh 1.",
                "Câu hoàn chỉnh 2.",
                ...
            ]
            
            Dữ liệu:
            {ocr_chunks}
            
            """

            # Shrink images whose side exceeds the limit before sending them
            # to the model, keeping the aspect ratio. OCR above already ran on
            # the full-size pixels.
            if image.width > MAX_WEBP_SIZE or image.height > MAX_WEBP_SIZE:
                print(f"[!] Resizing large image at index {count}: original size {image.size}")

                resize_ratio = min(MAX_WEBP_SIZE / image.width, MAX_WEBP_SIZE / image.height)
                # Clamp to 1 so an extreme aspect ratio cannot yield a 0-pixel side.
                new_size = (
                    max(1, int(image.width * resize_ratio)),
                    max(1, int(image.height * resize_ratio)),
                )

                image = image.resize(new_size, resample=resample_mode)

            count_429 = 0
            while (True):
                try:
                    response = model.generate_content([prompt, image])
                    break
                except Exception as e:
                    error_text = str(e)
                    if "429" in error_text:
                        count_429 += 1
                        if (count_429 < MAX_429_ATTEMPTS):
                            # Announce a retry only when one really follows.
                            print(f"[-] Received 429 Too Many Requests. Retrying after {RETRY_DELAY_SECONDS} seconds...")
                            sleep(RETRY_DELAY_SECONDS)
                            continue
                        api_error = True
                        raise
                    # An oversized image only affects this sample; any other
                    # failure stops the run.
                    if "Image size exceeds" not in error_text:
                        api_error = True
                    raise  # Other errors should not be ignored

            # Post-process the model output.
            # Strip the ```python ... ``` fence around the answer.
            response = response.text
            response = re.sub(r"^```python\s*", "", response.strip(), flags = re.IGNORECASE)
            response = re.sub(r"\s*```$", "", response.strip())
            # Replace curly double quotes with straight ones.
            response = response.replace("“", '"').replace("”", '"')
            # Repair the list text, then parse it.
            response = fix_list_string(response)
            meaning_sentences = ast.literal_eval(response)
            joined_text = " ".join(meaning_sentences)

            with open(f"{implementor}_test_OCR.txt", "a+", encoding = "utf-8") as out_file:
                out_file.write(f"Index {count}: " + joined_text + "\n")

        count += 1
        if (count > end_index):
            break

    except (Exception, KeyboardInterrupt) as e:
        print(f"[-] Error: {e if not isinstance(e, KeyboardInterrupt) else 'Interrupted'}")

        # No sample past the starting index has been sent to OCR yet: leave
        # the index file untouched and stop.
        if (count_ocr <= start_index):
            break

        # Record the position of the last image sent to OCR as the new start.
        with open(f"{implementor}_test_index.txt", "w", encoding = "utf-8") as f:
            f.write(f"{count_ocr} {end_index}")

        if isinstance(e, KeyboardInterrupt):
            break

        if (api_error):
            break

        # Recoverable failure: move on to the next sample.
        count += 1
        continue

Start to read dataset from the index 22467 to the index 27997


Scanning dataset: 9816it [27:11, 21.37it/s]'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 86aa1d1d-0be7-4fdc-96d0-9e44ce43018b)')' thrown while requesting GET https://huggingface.co/datasets/Namronaldo2004/ViInfographicsVQA/resolve/5823ee92b7596d432a651d64bc8423830b2864f5/data/test_part2-00012-of-00013.parquet
Retrying in 1s [Retry 1/5].
Scanning dataset: 25324it [11:24:25, 10.56s/it]

[-] Error: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.


Scanning dataset: 27829it [19:33:03,  8.46s/it]

[!] Resizing large image at index 27829: original size (2083, 22977)


Scanning dataset: 27997it [20:01:25,  2.57s/it]
