In [None]:
# Reviewer whose per-user log files and dataset folders are used below.
user_name = "Ant"

# Directory searched for "<user_name>_image_log.csv" (the accepted-image log).
# NOTE(review): identical to ocr_log_path below -- confirm this is intended.
log_path = "user/file_logs/ocr_logs"
# Directory where "<user_name>_ocr_log.csv" is created and kept in sync.
ocr_log_path = "user/file_logs/ocr_logs"
# Per-user dataset folders; not referenced by the code visible in this notebook.
image_folder = "user/dataset/table_ocr/image/local/" + user_name
output_folder = "user/dataset/table_ocr/text/local/" + user_name

In [2]:
import os
import pandas as pd
from PIL import Image
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

def load_accepted_files_with_metadata(user_name, log_path):
    """Return the accepted rows of a user's image log.

    Reads ``<log_path>/<user_name>_image_log.csv`` and keeps the rows whose
    ``Status`` column equals ``'Accept'``.

    Args:
        user_name: Name used as the prefix of the log file.
        log_path: Directory containing the image log CSV.

    Returns:
        pandas.DataFrame with the columns ``'Image Name'``, ``'HTML Link'``
        and ``'Image Path'``. When the log file does not exist, a message is
        printed and an empty frame with those same three columns is returned.

    Raises:
        KeyError: If the CSV lacks ``'Status'`` or one of the three columns.
    """
    metadata_columns = ['Image Name', 'HTML Link', 'Image Path']

    log_file = os.path.join(log_path, f"{user_name}_image_log.csv")
    if not os.path.exists(log_file):
        print(f"Table log for {user_name} not found at {log_file}.")
        # Same schema as the normal return value, so callers that select
        # 'Image Path' do not fail on the empty result.
        return pd.DataFrame(columns=metadata_columns)

    log_df = pd.read_csv(log_file)
    # .copy() hands the caller an independent frame; adding columns to it
    # later will not trigger SettingWithCopyWarning.
    return log_df.loc[log_df['Status'] == 'Accept', metadata_columns].copy()

def sync_ocr_log_with_metadata(user_name, log_path, ocr_log_path):
    """Create or refresh ``<ocr_log_path>/<user_name>_ocr_log.csv``.

    The OCR log mirrors the accepted images returned by
    ``load_accepted_files_with_metadata``:

    * if the OCR log does not exist, it is created from the accepted rows
      with ``Status`` set to ``'Not Review'``;
    * otherwise newly accepted images are appended as ``'Not Review'``, rows
      whose image is no longer accepted are dropped, and ``'HTML Link'`` /
      ``'Image Path'`` are refreshed from the accepted metadata (existing
      values are kept where the metadata value is missing).

    Args:
        user_name: Name used as the prefix of both log files.
        log_path: Directory containing ``<user_name>_image_log.csv``.
        ocr_log_path: Directory of the OCR log; created if missing.

    Returns:
        Path of the OCR log CSV that was written.
    """
    metadata_columns = ['Image Name', 'HTML Link', 'Image Path']
    refreshed_columns = ['HTML Link', 'Image Path']

    accepted_metadata = load_accepted_files_with_metadata(user_name, log_path)
    accepted_metadata = (
        accepted_metadata
        # Guarantee all three columns exist (absent ones become NaN) so the
        # selections and the merge below cannot raise KeyError.
        .reindex(columns=metadata_columns)
        # One row per image name (last occurrence wins): duplicate keys would
        # multiply OCR-log rows in the merge on every synchronisation.
        .drop_duplicates(subset='Image Name', keep='last')
    )

    ocr_log_file = os.path.join(ocr_log_path, f"{user_name}_ocr_log.csv")
    os.makedirs(ocr_log_path, exist_ok=True)

    if not os.path.exists(ocr_log_file):
        # assign() builds a new frame instead of mutating the metadata.
        ocr_log_df = accepted_metadata.assign(Status='Not Review')
        ocr_log_df.to_csv(ocr_log_file, index=False)
        print(f"Created new OCR log for {user_name} at {ocr_log_file}")
        return ocr_log_file

    ocr_log_df = pd.read_csv(ocr_log_file)
    current_files = set(ocr_log_df['Image Name'])

    # Append images that were accepted after the OCR log was last written.
    new_files = accepted_metadata[~accepted_metadata['Image Name'].isin(current_files)]
    if not new_files.empty:
        ocr_log_df = pd.concat(
            [ocr_log_df, new_files.assign(Status='Not Review')],
            ignore_index=True,
        )

    # Drop rows whose image is no longer in the accepted set.
    accepted_files_set = set(accepted_metadata['Image Name'])
    ocr_log_df = ocr_log_df[ocr_log_df['Image Name'].isin(accepted_files_set)].copy()

    # merge() only applies the '_new' suffix to columns present on both
    # sides, so make sure the OCR log has both refreshed columns first.
    for column in refreshed_columns:
        if column not in ocr_log_df.columns:
            ocr_log_df[column] = None

    ocr_log_df = ocr_log_df.merge(
        accepted_metadata,
        on='Image Name',
        how='left',
        suffixes=('', '_new'),
    )

    # Prefer the fresh metadata value; fall back to what the log already had.
    for column in refreshed_columns:
        ocr_log_df[column] = ocr_log_df[f'{column}_new'].fillna(ocr_log_df[column])
    ocr_log_df = ocr_log_df.drop(columns=[f'{column}_new' for column in refreshed_columns])

    ocr_log_df.to_csv(ocr_log_file, index=False)
    print(f"OCR log synchronized for {user_name} at {ocr_log_file}")
    return ocr_log_file

In [None]:
import os
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

def review_images_gui(user_name, log_path, skip_reviewed=True, ocr_log_path=None):
    """Display an ipywidgets UI for reviewing table images and their HTML.

    The OCR log is synchronised first (``sync_ocr_log_with_metadata``) and
    then loaded. For each row the UI shows the image, the rendered HTML and
    an editable copy of the HTML. Accept / Decline write the new status to
    the OCR log CSV immediately and move on to the next item; Skip moves on
    without changing the status.

    Args:
        user_name: Name used as the prefix of the log files.
        log_path: Directory containing ``<user_name>_image_log.csv``.
        skip_reviewed: When True the status filter starts on ``'Not Review'``,
            otherwise on ``'All'``.
        ocr_log_path: Directory of the OCR log. When omitted, the
            notebook-level ``ocr_log_path`` variable is used if it exists
            (the previous behaviour), otherwise ``log_path``.

    Returns:
        None. Prints a message and returns early when the initial filter
        matches no rows.
    """
    if ocr_log_path is None:
        # Earlier versions silently read the notebook global; keep honouring
        # it, but no longer fail with NameError when it is not defined.
        ocr_log_path = globals().get('ocr_log_path', log_path)

    log_file = sync_ocr_log_with_metadata(user_name, log_path, ocr_log_path)
    log_df = pd.read_csv(log_file).reset_index(drop=True)

    status_filter = widgets.Dropdown(
        options=['All', 'Not Review', 'Accept', 'Decline'],
        value='Not Review' if skip_reviewed else 'All',
        description='Filter:',
        disabled=False
    )

    def update_idx_list():
        """Return the log_df row labels matching the current status filter."""
        if status_filter.value == 'All':
            return log_df.index.tolist()
        return log_df[log_df['Status'] == status_filter.value].index.tolist()

    idx_list = update_idx_list()
    if not idx_list:
        print("No images to display!")
        return

    image_widget = widgets.Image(format='png', layout=widgets.Layout(width='800px', height='auto', margin='10px 0'))
    html_display = widgets.HTML(layout=widgets.Layout(border='1px solid #ccc', padding='10px', margin='20px 0'))
    html_edit_area = widgets.Textarea(
        placeholder='Chỉnh sửa HTML...',
        description='HTML:',
        disabled=False,
        layout=widgets.Layout(width='100%', height='300px', margin='20px 0', display='block')
    )
    save_button = widgets.Button(description="Save Changes", button_style="primary")

    status_label = widgets.HTML(value="<b>Reviewing:</b>")
    current_status_label = widgets.HTML(value="<b>Status:</b> Not Review")

    accept_button = widgets.Button(description="Accept", button_style="success")
    decline_button = widgets.Button(description="Decline", button_style="danger")
    skip_button = widgets.Button(description="Skip", button_style="info")
    back_button = widgets.Button(description="Back", button_style="warning")
    next_button = widgets.Button(description="Next", button_style="primary")
    show_latest_button = widgets.Button(description="Show Latest", button_style="warning")

    jump_to_input = widgets.BoundedIntText(
        value=0,
        min=0,
        max=len(log_df) - 1,
        step=1,
        description='Jump to:',
        layout=widgets.Layout(width='150px')
    )
    jump_to_button = widgets.Button(description="Go", button_style="info")

    # The jump box is only offered for the 'All' filter; its initial
    # visibility must match the initial filter value.
    jump_box = widgets.HBox(
        [jump_to_input, jump_to_button],
        layout=widgets.Layout(
            margin='0 0 0 20px',
            display='flex' if status_filter.value == 'All' else 'none',
        ),
    )
    # Messages from button callbacks are printed into this widget; a bare
    # print() inside a callback is not shown in the cell output.
    output = widgets.Output()

    current_index = idx_list[0]

    def show_message(text):
        """Replace the contents of the message area with ``text``."""
        output.clear_output()
        with output:
            print(text)

    def show_empty_state(message_html):
        """Blank the viewer and hide the editor when nothing is selected."""
        status_label.value = message_html
        image_widget.value = b''
        html_display.value = ''
        html_edit_area.value = ''
        html_edit_area.layout.display = 'none'

    def is_existing_file(path):
        """True only for string paths that exist (empty CSV cells load as NaN)."""
        return isinstance(path, str) and os.path.exists(path)

    def load_review_item(idx):
        """Show the image, HTML and status of the log row labelled ``idx``."""
        img_path = log_df.loc[idx, 'Image Path']
        html_path = log_df.loc[idx, 'HTML Link']

        output.clear_output()

        if is_existing_file(img_path):
            with open(img_path, "rb") as file:
                image_widget.value = file.read()
        else:
            image_widget.value = b''
            show_message(f"Image file not found: {img_path}")

        status_label.value = f"<b>Reviewing:</b> {log_df.loc[idx, 'Image Name']} ({idx_list.index(idx) + 1}/{len(idx_list)})"
        current_status_label.value = f"<b>Status:</b> {log_df.loc[idx, 'Status']}"

        if is_existing_file(html_path):
            with open(html_path, "r", encoding="utf8") as file:
                html_content = file.read()
            html_display.value = html_content
            # Always mirror the file on disk in the editor. Otherwise "Save
            # Changes" could write the previous item's text (or an empty
            # string) over this item's HTML file.
            html_edit_area.value = html_content
            html_edit_area.layout.display = 'block'
        else:
            html_display.value = "<p style='color: red;'>HTML file not found!</p>"
            html_edit_area.value = ''
            html_edit_area.layout.display = 'none'

    def save_html_changes(b):
        """Write the editor contents back to the current item's HTML file."""
        html_path = log_df.loc[current_index, 'HTML Link']
        # The editor is hidden whenever no HTML file is loaded; saving then
        # would create or overwrite a file with unrelated content.
        if not isinstance(html_path, str) or not html_path or html_edit_area.layout.display == 'none':
            show_message("Nothing to save: no HTML file is loaded for this item.")
            return
        with open(html_path, 'w', encoding="utf8") as file:
            file.write(html_edit_area.value)
        html_display.value = html_edit_area.value
        show_message(f"Changes saved to {html_path}")

    def handle_button_click(button):
        """Apply Accept / Decline / Skip to the current item and advance."""
        nonlocal current_index, idx_list

        if current_index not in idx_list:
            # Nothing is under review (the filtered list is empty).
            return

        if button.description == "Accept":
            log_df.loc[current_index, 'Status'] = 'Accept'
        elif button.description == "Decline":
            log_df.loc[current_index, 'Status'] = 'Decline'

        log_df.to_csv(log_file, index=False)
        current_status_label.value = f"<b>Status:</b> {log_df.loc[current_index, 'Status']}"

        previous_pos = idx_list.index(current_index)
        if status_filter.value != 'All':
            idx_list = update_idx_list()

        if not idx_list:
            show_empty_state(f"<b>No more images in '{status_filter.value}' status!</b>")
            return

        if current_index in idx_list:
            # Item still matches the filter: step to the next one, staying on
            # the last item once the end of the list is reached.
            next_pos = min(idx_list.index(current_index) + 1, len(idx_list) - 1)
        else:
            # Item dropped out of the filtered list, so its successor now
            # occupies the same position.
            next_pos = min(previous_pos, len(idx_list) - 1)

        current_index = idx_list[next_pos]
        load_review_item(current_index)

    def handle_navigation(button):
        """Move one item back or forward within the filtered list."""
        nonlocal current_index, idx_list
        if current_index not in idx_list:
            return
        current_pos = idx_list.index(current_index)
        if button.description == "Back" and current_pos > 0:
            current_index = idx_list[current_pos - 1]
        elif button.description == "Next" and current_pos < len(idx_list) - 1:
            current_index = idx_list[current_pos + 1]
        load_review_item(current_index)

    def handle_jump(button):
        """Jump to the given position in the currently displayed list."""
        nonlocal current_index, idx_list
        jump_to_idx = jump_to_input.value
        # Bounds are checked against the list that is actually indexed.
        if 0 <= jump_to_idx < len(idx_list):
            current_index = idx_list[jump_to_idx]
            load_review_item(current_index)
        else:
            status_label.value = f"<b>Index {jump_to_idx} is out of bounds.</b>"

    def handle_show_latest(button):
        """Go to the first item in the list that is still 'Not Review'."""
        nonlocal current_index, idx_list
        for idx in idx_list:
            if log_df.loc[idx, 'Status'] == 'Not Review':
                current_index = idx
                load_review_item(current_index)
                return
        status_label.value = "<b>No images left to review!</b>"

    def handle_filter_change(change):
        """Rebuild the item list for the new filter and show its first item."""
        nonlocal current_index, idx_list
        idx_list = update_idx_list()

        if status_filter.value == 'All':
            jump_box.layout.display = 'flex'
        else:
            jump_box.layout.display = 'none'

        if idx_list:
            current_index = idx_list[0]
            load_review_item(current_index)
        else:
            show_empty_state(f"<b>No images with '{status_filter.value}' status!</b>")

    accept_button.on_click(handle_button_click)
    decline_button.on_click(handle_button_click)
    # "Skip" matches neither status branch, so it only advances.
    skip_button.on_click(handle_button_click)
    back_button.on_click(handle_navigation)
    next_button.on_click(handle_navigation)
    jump_to_button.on_click(handle_jump)
    show_latest_button.on_click(handle_show_latest)
    save_button.on_click(save_html_changes)
    status_filter.observe(handle_filter_change, names='value')

    load_review_item(current_index)

    filter_box = widgets.HBox([status_filter], layout=widgets.Layout(margin='10px 0'))
    buttons_top = widgets.HBox([back_button, next_button, show_latest_button], layout=widgets.Layout(margin='10px auto', justify_content='space-around'))
    buttons_bottom = widgets.HBox([accept_button, decline_button, skip_button, jump_box, save_button], layout=widgets.Layout(margin='10px auto', justify_content='space-around'))

    display(widgets.VBox([
        filter_box,
        widgets.HTML(value="<b><u>Image Table</u></b><br>"),
        image_widget,
        widgets.HTML(value="<b><u>HTML Show</u></b><br>"),
        html_display,
        widgets.HTML(value="<b><u>HTML Edit (for modification)</u></b><br>"),
        html_edit_area,
        status_label,
        current_status_label,
        buttons_top,
        buttons_bottom,
        output
    ]))


In [4]:
# Launch the review UI; skip_reviewed=False starts the status filter on 'All'.
review_images_gui(user_name=user_name, log_path=log_path, skip_reviewed=False)

Created new OCR log for Ant at user/file_logs/ocr_logs\Ant_ocr_log.csv


VBox(children=(HBox(children=(Dropdown(description='Filter:', options=('All', 'Not Review', 'Accept', 'Decline…