In [1]:
from efficient_ocr import EffOCR

INFO:faiss.loader:Loading faiss with AVX512 support.
INFO:faiss.loader:Successfully loaded faiss with AVX512 support.


In [2]:
model = EffOCR(
  config={
      'Recognizer': {
          'char': {
              'model_backend': 'onnx',
              'model_dir': './models',
              'hf_repo_id': 'dell-research-harvard/effocr_en/char_recognizer',
          },
          'word': {
              'model_backend': 'onnx',
              'model_dir': './models',
              'hf_repo_id': 'dell-research-harvard/effocr_en/word_recognizer',
          },
      },
      'Localizer': {
          'model_dir': './models',
          'hf_repo_id': 'dell-research-harvard/effocr_en',
          'model_backend': 'onnx'
      },
      'Line': {
          'model_dir': './models',
          'hf_repo_id': 'dell-research-harvard/effocr_en',
          'model_backend': 'onnx',
      },
  }
)

GPU is available?: False
Initializing model...
Loading (HF) pretrained word recognizer model!


Initializing model...
Loading (HF) pretrained char recognizer model!


In [28]:
import os
from pathlib import Path
from tqdm import tqdm

def process_images_to_text(model, source_folder, destination_folder, supported_extensions=('.jpg', '.jpeg', '.png')):
    """
    Process all images in source folder and save inference results as text files.
    Skips files that have already been processed.
    
    Args:
        model: The inference model to use
        source_folder (str): Path to folder containing source images
        destination_folder (str): Path to folder where text files will be saved
        supported_extensions (tuple): Tuple of supported image file extensions
    """
    # Create destination folder if it doesn't exist
    os.makedirs(destination_folder, exist_ok=True)
    
    # Get list of already processed files
    existing_txt_files = {Path(f).stem for f in os.listdir(destination_folder) if f.endswith('.txt')}
    
    # Get list of all image files that need processing
    image_files = [
        f for f in os.listdir(source_folder) 
        if f.lower().endswith(supported_extensions) 
        and Path(f).stem not in existing_txt_files
    ]
    
    # Counter for errors
    errors = 0
    
    # Process files with progress bar
    for filename in tqdm(image_files, desc="Processing images"):
        try:
            # Construct full path for input image
            image_path = os.path.join(source_folder, filename)
            
            # Perform inference
            result = model.infer(image_path)
            
            # Create text file name (same name as image but with .txt extension)
            text_filename = Path(filename).stem + '.txt'
            text_path = os.path.join(destination_folder, text_filename)
            
            # Save the inference result to text file
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write(result[0].text)
                
        except Exception as e:
            errors += 1
            print(f"\nError processing {filename}: {str(e)}")
    
    # Final summary
    total_processed = len(image_files) - errors
    already_existing = len(existing_txt_files)
    
    print("\nProcessing complete!")
    print(f"Already existing files: {already_existing}")
    print(f"Newly processed files: {total_processed}")
    print(f"Errors encountered: {errors}")
    print(f"Total files in destination: {already_existing + total_processed}")

# Example usage:
# source_folder = "Images_jpg"
# destination_folder = "Text_output"
# process_images_to_text(model, source_folder, destination_folder)

In [25]:

result = model.infer('Images_jpg/3200797037.jpg')

result[0].text

TypeError: 'tuple' object does not support item assignment

In [30]:
process_images_to_text(model, 'Images_jpg', 'BLN_effocr', supported_extensions=('.jpg', '.png'))

Processing images:   0%|          | 1/577 [00:02<23:54,  2.49s/it]


Error processing 3200797037.jpg: 'tuple' object does not support item assignment


Processing images:   0%|          | 2/577 [00:05<25:29,  2.66s/it]


Error processing 3206254571.jpg: 'tuple' object does not support item assignment


Processing images:   1%|          | 3/577 [00:08<28:15,  2.95s/it]


Error processing 3200813282.jpg: 'tuple' object does not support item assignment


Processing images:   1%|          | 4/577 [00:11<28:20,  2.97s/it]


Error processing 3200811565.jpg: 'tuple' object does not support item assignment


Processing images:   2%|▏         | 9/577 [00:35<37:29,  3.96s/it]


Error processing 3206260965.jpg: 'tuple' object does not support item assignment


Processing images:   8%|▊         | 45/577 [04:08<40:02,  4.52s/it]  


Error processing 3206201313.jpg: 'tuple' object does not support item assignment


Processing images:   9%|▉         | 54/577 [05:07<50:10,  5.76s/it]  


Error processing 3206201960.jpg: 'tuple' object does not support item assignment


Processing images:  11%|█         | 63/577 [05:53<40:39,  4.75s/it]


Error processing 3200811765.jpg: 'tuple' object does not support item assignment


Processing images:  12%|█▏        | 67/577 [06:10<37:35,  4.42s/it]


Error processing 3206208471.jpg: 'tuple' object does not support item assignment


Processing images:  14%|█▎        | 79/577 [07:17<38:46,  4.67s/it]


Error processing 3206197609.jpg: 'tuple' object does not support item assignment


Processing images:  15%|█▌        | 88/577 [08:02<35:40,  4.38s/it]


Error processing 3206275818.jpg: 'tuple' object does not support item assignment


Processing images:  16%|█▋        | 94/577 [08:37<40:32,  5.04s/it]


Error processing 3200807902.jpg: 'tuple' object does not support item assignment


Processing images:  16%|█▋        | 95/577 [08:41<37:03,  4.61s/it]


Error processing 3207642541.jpg: 'tuple' object does not support item assignment


Processing images:  17%|█▋        | 97/577 [08:48<33:13,  4.15s/it]


Error processing 3200809739.jpg: 'tuple' object does not support item assignment


Processing images:  18%|█▊        | 104/577 [09:28<39:49,  5.05s/it]


Error processing 3206328941.jpg: 'tuple' object does not support item assignment


Processing images:  24%|██▍       | 138/577 [12:33<29:02,  3.97s/it]


Error processing 3206313644.jpg: 'tuple' object does not support item assignment


Processing images:  28%|██▊       | 162/577 [14:50<34:53,  5.05s/it]


Error processing 3200812698.jpg: 'tuple' object does not support item assignment


Processing images:  29%|██▉       | 170/577 [15:36<35:51,  5.29s/it]


Error processing 3206248855.jpg: 'tuple' object does not support item assignment


Processing images:  31%|███▏      | 181/577 [16:37<32:55,  4.99s/it]


Error processing 3200811421.jpg: 'tuple' object does not support item assignment


Processing images:  33%|███▎      | 191/577 [17:38<34:15,  5.33s/it]


Error processing 3206250097.jpg: 'tuple' object does not support item assignment


Processing images:  41%|████      | 237/577 [22:00<21:58,  3.88s/it]


Error processing 3206270362.jpg: 'tuple' object does not support item assignment


Processing images:  41%|████▏     | 239/577 [22:10<24:33,  4.36s/it]


Error processing 3200810641.jpg: 'tuple' object does not support item assignment


Processing images:  43%|████▎     | 247/577 [22:59<29:18,  5.33s/it]


Error processing 3206223532.jpg: 'tuple' object does not support item assignment


Processing images:  44%|████▎     | 252/577 [23:26<26:37,  4.92s/it]


Error processing 3206236091.jpg: list index out of range


Processing images:  45%|████▌     | 260/577 [24:13<24:17,  4.60s/it]


Error processing 3206187528.jpg: 'tuple' object does not support item assignment


Processing images:  46%|████▌     | 264/577 [24:30<20:47,  3.99s/it]


Error processing 3206274525.jpg: 'tuple' object does not support item assignment


Processing images:  53%|█████▎    | 305/577 [28:25<21:18,  4.70s/it]


Error processing 3206258241.jpg: 'tuple' object does not support item assignment


Processing images:  54%|█████▍    | 312/577 [29:03<21:47,  4.93s/it]


Error processing 3206222248.jpg: 'tuple' object does not support item assignment


Processing images:  55%|█████▍    | 315/577 [29:16<18:55,  4.34s/it]


Error processing 3200811448.jpg: 'tuple' object does not support item assignment


Processing images:  56%|█████▌    | 321/577 [29:42<16:47,  3.93s/it]


Error processing 3200812374.jpg: list index out of range


Processing images:  59%|█████▊    | 338/577 [31:16<19:24,  4.87s/it]


Error processing 3206304562.jpg: 'tuple' object does not support item assignment


Processing images:  60%|██████    | 349/577 [32:19<18:58,  4.99s/it]


Error processing 3200810766.jpg: 'tuple' object does not support item assignment


Processing images:  61%|██████    | 352/577 [32:31<15:57,  4.26s/it]


Error processing 3206277695.jpg: 'tuple' object does not support item assignment


Processing images:  62%|██████▏   | 359/577 [33:09<19:25,  5.35s/it]


Error processing 3200810690.jpg: 'tuple' object does not support item assignment


Processing images:  63%|██████▎   | 364/577 [33:36<17:35,  4.96s/it]


Error processing 3206232849.jpg: 'tuple' object does not support item assignment


Processing images:  64%|██████▍   | 372/577 [34:25<18:47,  5.50s/it]


Error processing 3206320990.jpg: 'tuple' object does not support item assignment


Processing images:  65%|██████▍   | 375/577 [34:36<13:47,  4.10s/it]


Error processing 3200810698.jpg: 'tuple' object does not support item assignment


Processing images:  66%|██████▌   | 380/577 [34:58<12:54,  3.93s/it]


Error processing 3206224251.jpg: 'tuple' object does not support item assignment


Processing images:  66%|██████▋   | 383/577 [35:13<14:13,  4.40s/it]


Error processing 3200812186.jpg: 'tuple' object does not support item assignment


Processing images:  67%|██████▋   | 384/577 [35:16<13:03,  4.06s/it]


Error processing 3200811332.jpg: 'tuple' object does not support item assignment


Processing images:  69%|██████▉   | 398/577 [36:31<13:02,  4.37s/it]


Error processing 3200810460.jpg: 'tuple' object does not support item assignment


Processing images:  72%|███████▏  | 416/577 [38:13<12:57,  4.83s/it]


Error processing 3206260890.jpg: 'tuple' object does not support item assignment


Processing images:  72%|███████▏  | 418/577 [38:22<11:41,  4.41s/it]


Error processing 3200810293.jpg: 'tuple' object does not support item assignment


Processing images:  73%|███████▎  | 419/577 [38:25<10:22,  3.94s/it]


Error processing 3200810261.jpg: 'tuple' object does not support item assignment


Processing images:  73%|███████▎  | 424/577 [38:51<11:51,  4.65s/it]


Error processing 3206232852.jpg: 'tuple' object does not support item assignment


Processing images:  74%|███████▍  | 428/577 [39:10<11:20,  4.57s/it]


Error processing 3200811895.jpg: 'tuple' object does not support item assignment


Processing images:  77%|███████▋  | 444/577 [40:34<09:54,  4.47s/it]


Error processing 3206262517.jpg: 'tuple' object does not support item assignment


Processing images:  79%|███████▉  | 456/577 [41:36<08:28,  4.20s/it]


Error processing 3200811234.jpg: 'tuple' object does not support item assignment


Processing images:  81%|████████▏ | 469/577 [42:41<08:29,  4.72s/it]


Error processing 3200812528.jpg: list index out of range


Processing images:  81%|████████▏ | 470/577 [42:43<07:18,  4.10s/it]


Error processing 3200810216.jpg: 'tuple' object does not support item assignment


Processing images:  82%|████████▏ | 475/577 [43:11<08:10,  4.81s/it]


Error processing 3200811366.jpg: 'tuple' object does not support item assignment


Processing images:  82%|████████▏ | 476/577 [43:14<07:08,  4.24s/it]


Error processing 3200811032.jpg: 'tuple' object does not support item assignment


Processing images:  83%|████████▎ | 478/577 [43:23<06:56,  4.20s/it]


Error processing 3200811894.jpg: 'tuple' object does not support item assignment


Processing images:  84%|████████▍ | 486/577 [44:10<08:01,  5.29s/it]


Error processing 3200810942.jpg: 'tuple' object does not support item assignment


Processing images:  85%|████████▌ | 491/577 [44:33<06:22,  4.45s/it]


Error processing 3200810957.jpg: 'tuple' object does not support item assignment


Processing images:  86%|████████▌ | 497/577 [44:59<05:21,  4.02s/it]


Error processing 3200808398.jpg: 'tuple' object does not support item assignment


Processing images:  90%|█████████ | 520/577 [47:06<05:04,  5.35s/it]


Error processing 3200813276.jpg: 'tuple' object does not support item assignment


Processing images:  92%|█████████▏| 533/577 [48:10<03:09,  4.32s/it]


Error processing 3206314316.jpg: 'tuple' object does not support item assignment


Processing images:  97%|█████████▋| 561/577 [50:41<01:08,  4.30s/it]


Error processing 3206234742.jpg: 'tuple' object does not support item assignment


Processing images:  98%|█████████▊| 564/577 [50:54<00:50,  3.92s/it]


Error processing 3206277737.jpg: 'tuple' object does not support item assignment


Processing images:  99%|█████████▊| 569/577 [51:23<00:42,  5.25s/it]


Error processing 3206212365.jpg: 'tuple' object does not support item assignment


Processing images:  99%|█████████▉| 574/577 [51:49<00:14,  4.85s/it]


Error processing 3200810145.jpg: 'tuple' object does not support item assignment


Processing images: 100%|██████████| 577/577 [52:07<00:00,  5.42s/it]


Processing complete!
Already existing files: 23
Newly processed files: 515
Errors encountered: 62
Total files in destination: 538





In [26]:
process_images_to_text(model, 'ncse_cropped_images', 'NCSE_effocr', supported_extensions=('.jpg', '.jpeg', '.png'))

Processed: TEC_1886-03-01_page_59_B0C1R2.png -> TEC_1886-03-01_page_59_B0C1R2.txt
Processed: TEC_1890-09-15_page_5_B0C2R10.png -> TEC_1890-09-15_page_5_B0C2R10.txt
Processed: NSS_1852-10-02_page_14_B0C2R13.png -> NSS_1852-10-02_page_14_B0C2R13.txt
Processed: EWJ_1858-09-01_page_25_B0C1R3.png -> EWJ_1858-09-01_page_25_B0C1R3.txt
Processed: NSS_1852-10-02_page_14_B0C1R7.png -> NSS_1852-10-02_page_14_B0C1R7.txt
Processed: EWJ_1859-05-01_page_66_B0C1R2.png -> EWJ_1859-05-01_page_66_B0C1R2.txt
Processed: CLD_1855-08-18_page_7_B0C1R6.png -> CLD_1855-08-18_page_7_B0C1R6.txt
Processed: TTW_1867-12-21_page_12_B0C4R44.png -> TTW_1867-12-21_page_12_B0C4R44.txt
Processed: MRP_1807-10-02_page_13_B0C1R2.png -> MRP_1807-10-02_page_13_B0C1R2.txt
Processed: TEC_1886-03-01_page_59_B0C1R9.png -> TEC_1886-03-01_page_59_B0C1R9.txt
Processed: TTW_1868-01-25_page_3_B0C1R2.png -> TTW_1868-01-25_page_3_B0C1R2.txt
Processed: TTW_1867-12-21_page_12_B0C4R45.png -> TTW_1867-12-21_page_12_B0C4R45.txt
Processed: TEC