In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
import time
import pandas as pd
from datasets import load_dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Configuration ---
# Address the browser session navigates to before any image is uploaded.
# (The constant is spelled SUBSA_URL; the scraping cell refers to it by that name.)
SUBSA_URL: str = "https://ocr.subasa.lk"

In [4]:
# Path to your webdriver.
# Set the CHROMEDRIVER_PATH environment variable to point at a different
# chromedriver binary; when it is unset the previously hard-coded location
# is used, so existing setups keep working without editing the notebook.
WEBDRIVER_PATH = os.environ.get("CHROMEDRIVER_PATH", "/opt/homebrew/bin/chromedriver")

In [5]:
# --- Load your dataset ---
# No split is selected here; the scraping cell indexes dataset['train'].
# The identifier looks like a Hugging Face Hub repo id (presumably fetched from there).
dataset = load_dataset(path="Ransaka/sinhala_synthetic_ocr-large")
# Accumulates one {'reference', 'generated_subasa'} record per processed image.
results = list()

In [6]:
# --- Initialize WebDriver ---
# Build the chromedriver service from the configured binary path, then open
# the Chrome session that the scraping cell drives (and quits when it finishes).
service = ChromeService(executable_path=WEBDRIVER_PATH)
driver = webdriver.Chrome(service=service)

In [7]:
# --- Run every training image through the Subasa OCR page ---
# For each dataset item: save the image to a temporary PNG, upload it, wait for
# the OCR text to appear, record (reference, generated) in `results`, then clear
# the upload so the next image starts from a clean page.
# The driver is always quit at the end, so re-run the WebDriver cell before
# running this cell again.

# Page selectors (inspect Subasa's HTML and update these if the layout changes).
UPLOAD_INPUT_CSS = "[data-testid='stDropzoneInput']"
DELETE_BUTTON_CSS = "[data-testid='fileDeleteBtn'] button"
# Absolute XPath of the <p> that holds the OCR output; position-based and therefore brittle.
RESULT_XPATH = """//*[@id="root"]/div[1]/div[1]/div/div/div/section/div[1]/div/div/div/div[13]/div/div/p"""
WAIT_TIMEOUT_S = 20  # upper bound, in seconds, for each explicit wait

try:
    driver.get(SUBSA_URL)
    # One wait object is reused for every explicit wait in the loop.
    wait = WebDriverWait(driver, WAIT_TIMEOUT_S)

    for i, item in enumerate(dataset['train']):
        image = item['image']
        reference_text = item['text']
        print(f"Reference: {reference_text}")
        temp_image_path = f"temp_image_{i}.png"
        uploaded = False  # becomes True once the page has been handed a file

        try:
            # Saved inside the per-image try so that one image that cannot be
            # written is reported and skipped instead of aborting the whole run.
            image.save(temp_image_path)

            # --- Locate upload element ---
            upload_input = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, UPLOAD_INPUT_CSS))
            )
            upload_input.send_keys(os.path.abspath(temp_image_path))
            uploaded = True

            # Add a small delay to allow file to be selected.
            # No button is clicked: the result is awaited after the upload alone.
            time.sleep(0.5)

            # --- Wait for result ---
            result_element = wait.until(
                EC.presence_of_element_located((By.XPATH, RESULT_XPATH))
            )

            # --- Extract text ---
            generated_text = result_element.text.strip()
            print(f"Generated: {generated_text}")
            results.append({'reference': reference_text, 'generated_subasa': generated_text})

        except Exception as processing_error:
            # Best effort: report the failure and continue with the next image.
            print(f"Error processing image: {processing_error}")

        finally:
            if uploaded:
                # Always clear the uploaded file, even after a failure, so a late
                # result from this image cannot be read as the next image's output.
                try:
                    delete_button = wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, DELETE_BUTTON_CSS))
                    )
                    delete_button.click()
                    time.sleep(1)  # give the page time to drop the previous result
                except Exception as cleanup_error:
                    print(f"Could not clear uploaded file: {cleanup_error}")
            if os.path.exists(temp_image_path):
                os.remove(temp_image_path)


except Exception as main_error:
    print(f"An error occurred: {main_error}")

finally:
    # Close the browser whether the run finished, failed, or was interrupted.
    driver.quit()

Reference: ‡∂î‡∑Ñ‡∑î ‡∑É‡∂≠‡∑î‡∑Ä ‡∂≠‡∑í‡∂∂‡∑ì ‡∑Ñ‡∑ô‡∂ª‡∑ú‡∂∫‡∑í‡∂±‡∑ä ‡∂ö‡∑í‡∂Ω‡∑ù‡∂ú‡∑ä‡∂ª‡∑ë‡∂∏‡∑ä 1‡∂ö‡∑ä ‡∑É‡∑ú‡∂∫‡∑è ‡∂ú‡∂≠‡∑ä ‡∂Ö‡∂≠‡∂ª ‡∂î‡∑Ñ‡∑î
Generated: ‡∂î‡∑Ñ‡∑î ‡∑É‡∂≠‡∑î‡∑Ä ‡∂≠‡∑í‡∂∂‡∑ì ‡∑Ñ‡∑ô‡∂ª‡∑ú‡∂∫‡∑í‡∂±‡∑ä ‡∂ö‡∑í‡∂Ω‡∑ù‡∂ú‡∑ä‡∂ª‡∑ë‡∂∏‡∑ä‚Äå 1‡∂ö‡∑ä ‡∑É‡∑ú‡∂∫‡∑è ‡∂ú‡∂≠‡∑ä ‡∂Ö‡∂≠‡∂ª ‡∂î‡∑Ñ‡∑î
Reference: ‡∂∏‡∑ö ‡∂≠‡∂ª‡∂∏‡∑ä ‡∂∏‡∑Ñ‡∂±‡∑ä‡∑É‡∑í ‡∑Ä‡∑ô‡∂Ω‡∑è, ‡∂Ω‡∑É‡∑ä‡∑É‡∂± ‡∂Ö‡∂≠‡∑ä‡∂Ø‡∑ê‡∂ö‡∑ì‡∂∏‡∂ö‡∑ä ‡∂Ö‡∂¥‡∑í‡∂ß‡∂≠‡∑ä ‡∂Ω‡∂ú‡∑è ‡∂ö‡∂ª‡∂Ω‡∑è ‡∂Ø‡∑î‡∂±‡∑ä‡∂±‡∂ß ‡∂Ø‡∑í‡∂±‡∑ö‡∑Å‡∑ä ‡∂Ö‡∂∫‡∑í‡∂∫‡∂ß ‡∂Ö‡∂±‡∑ö‡∂ö ‡∑Ä‡∑è‡∂ª‡∂∫‡∂ö‡∑ä
Generated: ‡∂∏‡∑ö ‡∂≠‡∂ª‡∂∏‡∑ä ‡∂∏‡∑Ñ‡∂±‡∑ä‡∑É‡∑í ‡∑Ä‡∑ô‡∂Ω‡∑è, ‡∂Ω‡∑É‡∑ä‡∑É‡∂± ‡∂Ö‡∂≠‡∑ä‡∂Ø‡∑ê‡∂ö‡∑ì‡∂∏‡∂ö‡∑ä ‡∂Ö‡∂ª‡∑í‡∂ß‡∂≠‡∑ä ‡∂Ω‡∂ú‡∑è ‡∂ö‡∂ª‡∂Ω‡∑è ‡∂Ø‡∑î‡∂±‡∑ä‡∂±‡∂ß ‡∂Ø‡∑í‡∂±‡∑ö‡∑Å‡∑ä ‡∂Ö‡∂∫‡∑í‡∂∫‡∂ß ‡∂Ö‡∂±‡∑ö‡∂ö ‡∑Ä‡∑è‡∂ª‡∂∫‡∂ö‡∑ä‚Äå
Reference: ‡∑É‡∑è‡∂∏‡∑è‡∂±‡∑ä‡∂∫‡∂∫‡∑ô‡∂±‡∑ä ‡∂ö‡∑è‡∂ª‡∑ä ‡∂∂‡∑ê‡∂ß‡∂ª‡∑í‡∂∫‡∂ö‡∑í‡∂±‡∑ä ‡∂∏‡∑è‡∑É ‡∂Ø‡∑ô‡∂ö‡∂ö‡∑ä ‡∂¥‡∂∏‡∂´ ‡∂ª‡∑ö‡∂©‡∑í‡∂∫‡∑ù‡∑Ä ‡∂Ö‡∑Ñ‡∂±‡∑ä‡∂± ‡∂¥‡∑î‡∑Ö‡∑î‡∑Ä‡∂±‡∑ä. ‡∑Ñ‡∑ê‡∂∂‡∑ê‡∂∫‡∑í
Gen

In [8]:
# --- Create DataFrame ---
# Columns are named explicitly so that an empty `results` list (every image
# failed, or the loop never ran) still yields a frame and a CSV with the expected
# header, instead of a header-less file. For non-empty results the output is
# identical, because each record carries exactly these two keys.
df_subasa = pd.DataFrame(results, columns=['reference', 'generated_subasa'])
print(df_subasa.head())
# Persist the reference/OCR pairs for later evaluation.
df_subasa.to_csv("subasa_selenium_ocr_results.csv", index=False, encoding='utf-8')

                                           reference  \
0  ‡∂î‡∑Ñ‡∑î ‡∑É‡∂≠‡∑î‡∑Ä ‡∂≠‡∑í‡∂∂‡∑ì ‡∑Ñ‡∑ô‡∂ª‡∑ú‡∂∫‡∑í‡∂±‡∑ä ‡∂ö‡∑í‡∂Ω‡∑ù‡∂ú‡∑ä‡∂ª‡∑ë‡∂∏‡∑ä 1‡∂ö‡∑ä ‡∑É‡∑ú‡∂∫‡∑è ‡∂ú‡∂≠‡∑ä...   
1  ‡∂∏‡∑ö ‡∂≠‡∂ª‡∂∏‡∑ä ‡∂∏‡∑Ñ‡∂±‡∑ä‡∑É‡∑í ‡∑Ä‡∑ô‡∂Ω‡∑è, ‡∂Ω‡∑É‡∑ä‡∑É‡∂± ‡∂Ö‡∂≠‡∑ä‡∂Ø‡∑ê‡∂ö‡∑ì‡∂∏‡∂ö‡∑ä ‡∂Ö‡∂¥‡∑í‡∂ß‡∂≠‡∑ä ‡∂Ω...   
2  ‡∑É‡∑è‡∂∏‡∑è‡∂±‡∑ä‡∂∫‡∂∫‡∑ô‡∂±‡∑ä ‡∂ö‡∑è‡∂ª‡∑ä ‡∂∂‡∑ê‡∂ß‡∂ª‡∑í‡∂∫‡∂ö‡∑í‡∂±‡∑ä ‡∂∏‡∑è‡∑É ‡∂Ø‡∑ô‡∂ö‡∂ö‡∑ä ‡∂¥‡∂∏‡∂´ ‡∂ª‡∑ö‡∂©‡∑í...   
3  ‡∂¢‡∂±‡∑è‡∂∞‡∑í‡∂¥‡∂≠‡∑í ‡∂∏‡∑Ñ‡∑í‡∂±‡∑ä‡∂Ø ‡∂ª‡∑è‡∂¢‡∂¥‡∂ö‡∑ä‡∑Ç‡∂ß ‡∂≠‡∂∏ ‡∑Ä‡∑É‡∂ª 4 ‡∂ö ‡∂±‡∑í‡∂Ω ‡∂ö‡∑è‡∂Ω‡∂∫ ‡∑É...   
4  ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∑ì‡∂∫ ‡∂á‡∂≥‡∑î‡∂∏‡∑ä, ‡∑Ñ‡∑Ä‡∑É‡∑ä‡∑Ä‡∂ª‡∑î‡∑Ä‡∑ö ‡∑É‡∑è‡∂Ø ‡∑É‡∂≥‡∑Ñ‡∑è ‡∂á‡∂Ø‡∑î‡∂∏‡∑ä, ‡∂Ω‡∑í‡∂±...   

                                    generated_subasa  
0  ‡∂î‡∑Ñ‡∑î ‡∑É‡∂≠‡∑î‡∑Ä ‡∂≠‡∑í‡∂∂‡∑ì ‡∑Ñ‡∑ô‡∂ª‡∑ú‡∂∫‡∑í‡∂±‡∑ä ‡∂ö‡∑í‡∂Ω‡∑ù‡∂ú‡∑ä‡∂ª‡∑ë‡∂∏‡∑ä‚Äå 1‡∂ö‡∑ä ‡∑É‡∑ú‡∂∫‡∑è ‡∂ú‡∂≠...  
1  ‡∂∏‡∑ö ‡∂≠‡∂ª‡∂∏‡∑ä ‡∂∏‡∑Ñ‡∂±‡∑ä‡∑É‡∑í ‡∑Ä‡∑ô‡∂Ω‡∑è, ‡∂Ω‡∑É‡∑ä‡∑É‡∂± ‡∂Ö‡∂≠‡∑ä‡∂Ø‡∑ê‡∂ö‡∑ì‡∂∏‡

In [9]:
# Rich display of the first five reference / OCR-output pairs.
df_subasa.head(n=5)

Unnamed: 0,reference,generated_subasa
0,‡∂î‡∑Ñ‡∑î ‡∑É‡∂≠‡∑î‡∑Ä ‡∂≠‡∑í‡∂∂‡∑ì ‡∑Ñ‡∑ô‡∂ª‡∑ú‡∂∫‡∑í‡∂±‡∑ä ‡∂ö‡∑í‡∂Ω‡∑ù‡∂ú‡∑ä‡∂ª‡∑ë‡∂∏‡∑ä 1‡∂ö‡∑ä ‡∑É‡∑ú‡∂∫‡∑è ‡∂ú‡∂≠‡∑ä...,‡∂î‡∑Ñ‡∑î ‡∑É‡∂≠‡∑î‡∑Ä ‡∂≠‡∑í‡∂∂‡∑ì ‡∑Ñ‡∑ô‡∂ª‡∑ú‡∂∫‡∑í‡∂±‡∑ä ‡∂ö‡∑í‡∂Ω‡∑ù‡∂ú‡∑ä‡∂ª‡∑ë‡∂∏‡∑ä‚Äå 1‡∂ö‡∑ä ‡∑É‡∑ú‡∂∫‡∑è ‡∂ú‡∂≠...
1,"‡∂∏‡∑ö ‡∂≠‡∂ª‡∂∏‡∑ä ‡∂∏‡∑Ñ‡∂±‡∑ä‡∑É‡∑í ‡∑Ä‡∑ô‡∂Ω‡∑è, ‡∂Ω‡∑É‡∑ä‡∑É‡∂± ‡∂Ö‡∂≠‡∑ä‡∂Ø‡∑ê‡∂ö‡∑ì‡∂∏‡∂ö‡∑ä ‡∂Ö‡∂¥‡∑í‡∂ß‡∂≠‡∑ä ‡∂Ω...","‡∂∏‡∑ö ‡∂≠‡∂ª‡∂∏‡∑ä ‡∂∏‡∑Ñ‡∂±‡∑ä‡∑É‡∑í ‡∑Ä‡∑ô‡∂Ω‡∑è, ‡∂Ω‡∑É‡∑ä‡∑É‡∂± ‡∂Ö‡∂≠‡∑ä‡∂Ø‡∑ê‡∂ö‡∑ì‡∂∏‡∂ö‡∑ä ‡∂Ö‡∂ª‡∑í‡∂ß‡∂≠‡∑ä ‡∂Ω..."
2,‡∑É‡∑è‡∂∏‡∑è‡∂±‡∑ä‡∂∫‡∂∫‡∑ô‡∂±‡∑ä ‡∂ö‡∑è‡∂ª‡∑ä ‡∂∂‡∑ê‡∂ß‡∂ª‡∑í‡∂∫‡∂ö‡∑í‡∂±‡∑ä ‡∂∏‡∑è‡∑É ‡∂Ø‡∑ô‡∂ö‡∂ö‡∑ä ‡∂¥‡∂∏‡∂´ ‡∂ª‡∑ö‡∂©‡∑í...,‡∑É‡∑è‡∂∏‡∑è‡∂±‡∑ä‡∂∫‡∂∫‡∑ô‡∂±‡∑ä ‡∂ö‡∑è‡∂ª‡∑ä ‡∂∂‡∑ê‡∂ß‡∂ª‡∑í‡∂∫‡∂ö‡∑í‡∂±‡∑ä ‡∂∏‡∑è‡∑É ‡∂Ø‡∑ô‡∂ö‡∂ö‡∑ä ‡∂¥‡∂∏‡∂´ ‡∂ª‡∑ö‡∂©‡∑í...
3,‡∂¢‡∂±‡∑è‡∂∞‡∑í‡∂¥‡∂≠‡∑í ‡∂∏‡∑Ñ‡∑í‡∂±‡∑ä‡∂Ø ‡∂ª‡∑è‡∂¢‡∂¥‡∂ö‡∑ä‡∑Ç‡∂ß ‡∂≠‡∂∏ ‡∑Ä‡∑É‡∂ª 4 ‡∂ö ‡∂±‡∑í‡∂Ω ‡∂ö‡∑è‡∂Ω‡∂∫ ‡∑É...,‡∂¢‡∂±‡∑è‡∂∞‡∑í‡∂¥‡∂≠‡∑í ‡∂∏‡∑Ñ‡∑í‡∂±‡∑ä‡∂Ø ‡∂ª‡∑è‡∂¢‡∂¥‡∂ö‡∑ä‡∑Ç‡∂ß ‡∂≠‡∂∏ ‡∑Ä‡∑É‡∂ª 4 ‡∂ö ‡∂±‡∑í‡∂Ω ‡∂ö‡∑è‡∂Ω‡∂∫ ‡∑É...
4,"‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∑ì‡∂∫ ‡∂á‡∂≥‡∑î‡∂∏‡∑ä, ‡∑Ñ‡∑Ä‡∑É‡∑ä‡∑Ä‡∂ª‡∑î‡∑Ä‡∑ö ‡∑É‡∑è‡∂Ø ‡∑É‡∂≥‡∑Ñ‡∑è ‡∂á‡∂Ø‡∑î‡∂∏‡∑ä, ‡∂Ω‡∑í‡∂±...","211[1181[:1 (128, (1):1(31(1/1‡∂©] (1‡∂Ω‡∑í (3[(201]..."
