In [None]:
import mimetypes
import os

import google.generativeai as genai
import pandas as pd

# Configure the Gemini client.
# The API key is read from the environment so the secret never lives in the
# notebook source (notebooks get shared and committed). Any key that was
# previously pasted into this cell should be considered leaked and be revoked.
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export your Gemini API key as the "
        "GOOGLE_API_KEY environment variable before running this notebook."
    )
genai.configure(api_key=gemini_api_key)

# Per-user locations: input images, generated HTML output, and the review log CSV.
user_name = "Ant"
image_folder = f"user/dataset/table_ocr/image/local/{user_name}"
output_folder = f"user/dataset/table_ocr/text/local/{user_name}"
log_path = f"user/file_logs/ocr_logs/{user_name}_image_log.csv"

In [None]:


from tqdm import tqdm
import time

# Load the per-user image log that drives the OCR run.
log_file = pd.read_csv(log_path)

# Logs without a status column get one, with every row marked as not yet started.
if 'Processing Status' not in log_file.columns:
    log_file['Processing Status'] = 'Not Started'

# Work queue: rows whose "Status" is "Accept" and whose processing status is
# one of the unfinished states ('Not Started', 'Failed', 'Processing').
is_accepted = log_file["Status"] == "Accept"
is_unfinished = log_file["Processing Status"].isin(['Not Started', 'Failed', 'Processing'])
reviewed_images = log_file[is_accepted & is_unfinished]

# Make sure the destination folder for the generated HTML files exists.
os.makedirs(output_folder, exist_ok=True)


# Generation settings handed to the model constructor below.
generation_config = dict(
    temperature=0,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Shared Gemini model instance used by process_ocr().
model = genai.GenerativeModel(
    generation_config=generation_config,
    model_name="gemini-2.0-flash-exp",
)


def upload_to_gemini(file_path, mime_type="image/png"):
    """Upload a local file through the Gemini file API.

    Args:
        file_path: Path of the file to upload.
        mime_type: MIME type declared for the upload; defaults to "image/png".

    Returns:
        Whatever `genai.upload_file` returns for the uploaded file.
    """
    return genai.upload_file(file_path, mime_type=mime_type)


def process_ocr(file_path):
    """Run table OCR on one image with Gemini and return the generated HTML text.

    The image is uploaded, placed together with the extraction instructions in
    the history of a new chat session, and a final message asks the model to
    process it.

    Args:
        file_path: Path of the image to transcribe.

    Returns:
        The text of the model's response (`response.text`).

    Raises:
        Nothing is caught here: any exception from the upload or from the model
        call propagates to the caller.
    """
    # Declare the image's real MIME type instead of always claiming PNG, so
    # JPEG/WebP/etc. inputs are not mislabelled. Fall back to the previous
    # default when the extension is unknown or does not map to an image type.
    guessed_type, _ = mimetypes.guess_type(str(file_path))
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/png"

    gemini_file = upload_to_gemini(file_path, mime_type=mime_type)
    chat_session = model.start_chat(
        history=[
            {
                "role": "user",
                "parts": [
                    gemini_file,
                    # Adjacent string literals are concatenated into one prompt.
                    # NOTE(review): there is no newline between the last
                    # instruction and the HTML template; the text is kept
                    # unchanged so model output stays comparable with earlier runs.
                    (
                        "Act as an OCR assistant and table extractor. Extract information with this step:\n"
                        "+ Please provide output with this instruction: Do not provide any additional explanations\n"
                        "+ Correct spelling mistakes or words you think don't make sense if necessary.\n"
                        "+ For tables in the image, generate a responsive HTML table that preserves the original formatting and layout.\n"
                        "+ Wrap each table in a <div class='table-container'> element.\n"
                        "+ Include basic CSS styling to ensure tables are mobile-friendly and maintain proper alignment.\n"
                        "+ Output format should be a complete HTML document with necessary styling."
                        '''<!DOCTYPE html>
<html>
<head>
    <style>
        .table-container {
            margin: 20px 0;
            overflow-x: auto;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            min-width: 300px;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        th {
            background-color: #f2f2f2;
        }
        @media screen and (max-width: 600px) {
            table {
                font-size: 14px;
            }
            th, td {
                padding: 4px;
            }
        }
    </style>
</head>
<body>
    <div class='table-container'>
        <table>
            <!-- Table content here -->
        </table>
    </div>
</body>
</html>'''
                    ),
                ],
            }
        ]
    )
    response = chat_session.send_message("Process this image as instructed above.")
    return response.text

In [None]:
import time  # needed for the sleep between retries


MAX_RETRIES = 5  # attempts per image before it is marked "Failed"
RETRY_DELAY = 5  # seconds to wait between attempts

# iterrows() is a generator, so tqdm cannot infer its length; pass the row
# count explicitly so the progress bar shows a percentage and an ETA.
for index, row in tqdm(reviewed_images.iterrows(), total=len(reviewed_images)):
    image_name = row["Image Name"]
    image_path = row["Image Path"]

    # Record the in-flight state once, before the first attempt, and persist it
    # so an interrupted run leaves this row marked "Processing" in the log.
    log_file.loc[index, "Processing Status"] = "Processing"
    log_file.to_csv(log_path, index=False)

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(image_path)
            html_content = process_ocr(image_path)

            # Output file is named after the image, with an .html extension.
            output_file_path = os.path.join(output_folder, f"{os.path.splitext(image_name)[0]}.html")
            with open(output_file_path, "w", encoding="utf-8") as html_file:
                html_file.write(html_content)
        except Exception as e:
            # Best-effort batch: report the failure and retry rather than abort the run.
            print(f"Failed to process {image_name} (Attempt {attempt}/{MAX_RETRIES}): {e}")

            if attempt < MAX_RETRIES:
                print(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                print(f"Exceeded max retries for {image_name}. Skipping.")
                log_file.loc[index, "Processing Status"] = "Failed"
        else:
            log_file.loc[index, "HTML Link"] = output_file_path
            log_file.loc[index, "Processing Status"] = "Completed"

            print(f"Processed {image_name} and saved to {output_file_path}")
            break

    # Persist the final status ("Completed" or "Failed") for this image.
    log_file.to_csv(log_path, index=False)

print("OCR process completed and log file updated.")