In [34]:
# Libraries
from pathlib import Path
import os
import sys
import csv
import shutil
import itertools
import random
import PIL
import pytesseract
import numpy as np
import scipy.ndimage as nd
import pandas as pd
from collections import Counter
from PIL import Image, ImageChops, ImageStat
from random import sample
import re
import glob
from datetime import datetime
# Location of the Tesseract binary. The TESSERACT_CMD environment variable
# overrides the default so the notebook also runs on machines where Tesseract
# is installed somewhere else.
pytesseract.pytesseract.tesseract_cmd = os.environ.get("TESSERACT_CMD", r"/usr/bin/tesseract")

# Wall-clock start of the run; the final cell uses it to report timings.
start_time = datetime.now()

# Set the root directory containing subdirectories with images
root_dir = Path.cwd()

# Create an output directory
output_dir = root_dir / "output_full"
output_dir.mkdir(exist_ok=True)

# Directory that holds the local helper modules (cropfunctions.py, ocr_func.py).
# OCR_NOTEBOOKS_DIR overrides the default location. The membership test keeps
# sys.path from growing by one duplicate entry each time this cell is re-run.
helpers_dir = os.environ.get("OCR_NOTEBOOKS_DIR",
                             "/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)
from cropfunctions import rotation_angle, trim, get_bands, simp_bd, combine_bbox
from ocr_func import cutMarg, OCRtestImg, testList, tsvOCR, adjustImg

# Display a list of languages in their 3-letter codes supported by Tesseract.
print(pytesseract.get_languages(config=''))


['eng', 'fra', 'frm', 'lat', 'osd']


In [3]:
# Initialize a dictionary to store OCR results for each subdirectory
subdir_ocr_results = {}

# Prompt user for subdir name start string
start_string = input('Please enter the starting string for subdir names: ')

# Subdirectories whose names start with the user-supplied string, sorted alphabetically
sorted_subdirs = sorted([subdir for subdir in root_dir.iterdir() if subdir.is_dir() and subdir.name.startswith(start_string)])

# List each subdirectory's page images exactly once. The non-recursive '*.jpg'
# pattern is the same one every later cell iterates with, so 'limit' and 'side'
# always line up with the files that are actually processed.
images_by_subdir = {}
for subdir in sorted_subdirs:
    images_dir = subdir / "images"
    if images_dir.exists():
        images_by_subdir[subdir.name] = sorted(images_dir.glob('*.jpg'))

# Flat list of every image path (as strings), in processing order
all_images = [str(path) for images in images_by_subdir.values() for path in images]

# Prompt user for starting side
correct_sides = ['left', 'right']
while True:
    side_input = input('Please enter a starting side: ').strip().lower()
    if side_input in correct_sides:
        break
    print('Incorrect value. Please try again.')

# The first page gets the chosen side; pages then alternate
side_1 = side_input
side_2 = 'left' if side_input == 'right' else 'right'

# Assign 'side' values to images in each subdir. The alternation runs across
# subdirectory boundaries, so a running page offset decides the parity.
page_offset = 0
for doc_name, images in images_by_subdir.items():
    limit = len(images)

    # Number of pages to sample for the OCR adjustment tests
    if limit >= 50:
        sample_n = 5
    elif limit >= 20:
        sample_n = 2
    else:
        sample_n = 1

    subdir_ocr_results[doc_name] = {
        'doc_name': doc_name,
        'limit': limit,
        'sample_n': sample_n,
        'side': [side_1 if (page_offset + i) % 2 == 0 else side_2 for i in range(limit)]
    }
    page_offset += limit

# Print the first two entries in subdir_ocr_results
first_two_entries = dict(list(subdir_ocr_results.items())[:2])
print(first_two_entries)

Please enter the starting string for subdir names:  Démon
Please enter a starting side:  right


{'Démonomanie I.1': {'doc_name': 'Démonomanie I.1', 'limit': 12, 'sample_n': 1, 'side': ['right', 'left', 'right', 'left', 'right', 'left', 'right', 'left', 'right', 'left', 'right', 'left']}, 'Démonomanie I.2': {'doc_name': 'Démonomanie I.2', 'limit': 14, 'sample_n': 1, 'side': ['right', 'left', 'right', 'left', 'right', 'left', 'right', 'left', 'right', 'left', 'right', 'left', 'right', 'left']}}


In [4]:
import os
import itertools
import re

# Function to check if subdir name ends with a Roman numeral followed by '.1'
def ends_with_roman_numeral(subdir_name):
    """Return True if ``subdir_name`` ends with ``<Roman numeral>.1``.

    Examples: ``'Démonomanie I.1'`` and ``'Démonomanie XIV.1'`` match;
    ``'Démonomanie I.2'``, ``'Chapter.1'`` and ``'Book I.1 draft'`` do not.

    Parameters
    ----------
    subdir_name : str
        Directory name to test. Trailing whitespace is ignored.

    Returns
    -------
    bool
    """
    # (?=[MDCLXVI]) rejects the empty numeral that the optional groups would
    # otherwise accept (so 'Chapter.1' no longer matches), and '$' anchors the
    # pattern to the end of the name, as the function name promises.
    pattern = (r'\b(?=[MDCLXVI])M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})'
               r'(?:IX|IV|V?I{0,3})\.1$')
    return re.search(pattern, subdir_name.rstrip()) is not None

# Subdirectories whose names start with the user-supplied string, sorted alphabetically
sorted_subdirs = sorted([subdir for subdir in root_dir.iterdir() if subdir.is_dir() and subdir.name.startswith(start_string)])

# Becomes True once the very first image of the run has been registered
first_image_processed = False

# Prompt user for choice. Re-prompt until the answer is valid: any other value
# would leave 'find_top' unset on every row and break the trimming cell later.
valid_choices = ('1', '2')
while True:
    user_choice = input("Set find_top = True for the very first jpg of the very first subdir (1) or for the first jpg of every subdir whose name ends with a pattern like 'I.1' or 'II.1' (2)? Enter 1 or 2: ").strip()
    if user_choice in valid_choices:
        break
    print('Incorrect value. Please try again.')

# Build one metadata row per page image for every registered subdirectory
for subdir in sorted_subdirs:
    images_dir = subdir / "images"
    subdict = subdir_ocr_results.get(subdir.name)
    if subdict is None or not images_dir.exists():
        continue

    limit = subdict['limit']
    side = subdict['side']
    starts_numbered_section = ends_with_roman_numeral(subdir.name)

    # Initialize the meta list for each subdictionary
    subdict['meta'] = []

    # For each jpeg file in the images directory, do the following:
    for count, jpg in enumerate(itertools.islice(sorted(images_dir.glob('*.jpg')), limit)):
        if user_choice == '1':
            # Only the first image of the whole run
            find_top = not first_image_processed
        else:
            # First image of every subdir named like '... I.1', '... II.1'
            find_top = starts_numbered_section and count == 0

        subdict['meta'].append({
            "filename": os.path.relpath(jpg),
            "side": side[count],
            "start_section": False,
            "find_top": find_top,
            "background": 0,
            "bbox": [],
        })
        first_image_processed = True


In [9]:
# Combined rotation, greyscale (inverse), and trim workflow metadata
### Convert multiple jpegs into a set of greyscale images ###
buff = 10

sorted_subdirs = sorted([subdir for subdir in root_dir.iterdir() if subdir.is_dir() and subdir.name.startswith(start_string)])

# Iterate over each subdirectory in root_dir
for subdir in sorted_subdirs:
    images_dir = subdir / "images"
    subdict = subdir_ocr_results.get(subdir.name)
    if subdict is None or not images_dir.exists():
        continue

    # Index this document's meta rows by filename so each image is matched
    # with a single lookup instead of a scan over the whole list.
    meta_by_filename = {entry["filename"]: entry for entry in subdict['meta']}
    limit = subdict['limit']

    # For each jpeg file in the images directory, do the following:
    for jpg in itertools.islice(sorted(images_dir.glob('*.jpg')), limit):
        rel_name = os.path.relpath(jpg)
        print(f'Processing {rel_name}')
        img = Image.open(jpg)
        best_angle = rotation_angle(img)

        # Find the corresponding meta entry for the current image
        meta_entry = meta_by_filename.get(rel_name)
        find_top = meta_entry['find_top'] if meta_entry else False
        diff0, background0, bbox0 = trim(img, angle=best_angle, buff=buff, find_top=find_top)

        # Store the trim results directly on the image's meta row
        if meta_entry is not None:
            meta_entry.update({
                "filename": rel_name,
                "bbox": bbox0,
                "background": background0,
                "angle": best_angle,
                "diff_open": diff0,
            })

    # The original referenced the misspelled name 'subddir' here (NameError)
    print(f'{subdir.name} Images successfully processed')


Processing Démonomanie I.1/images/Démonomanie I.1_image_0000.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0001.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0002.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0003.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0004.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0005.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0006.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0007.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0008.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0009.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0010.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0011.jpg
Images successfully processed
Processing Démonomanie I.2/images/Démonomanie I.2_image_0000.jpg
Processing Démonomanie I.2/images/Démonomanie I.2_image_0001.jpg
Processing Démonomanie I.2/images/Démonomanie I.2_image_0002

In [19]:
# Tunable parameters for the band creation / application workflow
bheight = 50
pad = 10
allow = (0.1, 0.1)  # parameters for L'Hospital, Démonomanie, Théatre
# allow = (0.1, 0.3)  # parameters for Bodin
freq = 0.8
minfreq = 0.1

sorted_subdirs = sorted(
    entry for entry in root_dir.iterdir()
    if entry.is_dir() and entry.name.startswith(start_string)
)

# Walk the selected subdirectories; skip those without images or without a registry entry
for subdir in sorted_subdirs:
    images_dir = subdir / "images"
    if not images_dir.exists() or subdir.name not in subdir_ocr_results:
        continue

    key = subdir.name
    subdict = subdir_ocr_results[key]
    band_dict, cut_list = [], []
    limit = subdict['limit']

    # One pass per page image, capped at the registered page count
    for jpg in itertools.islice(sorted(images_dir.glob('*.jpg')), limit):
        rel_name = os.path.relpath(jpg)
        print(f'Processing {rel_name}')

        # Look up the meta row that holds this image's 'diff_open'
        meta_entry = next((row for row in subdict['meta'] if row['filename'] == rel_name), None)
        if not meta_entry:
            continue
        diff_open_img = meta_entry['diff_open']

        # Bands creation workflow
        bands = get_bands(diff_open_img, bheight=bheight)
        band_dict.append(bands)

        # Bands application workflow
        cut = simp_bd(band_dict=bands, diff=diff_open_img, side=meta_entry['side'],
                      width=diff_open_img.size[0], pad=pad, allow=allow,
                      freq=freq, minfreq=minfreq)
        cut_list.append(cut)

        # Record the cut and the bands on the meta row
        meta_entry["cut"] = cut
        meta_entry["band_dict"] = bands

print('Bands successfully created and applied')

Processing Démonomanie I.1/images/Démonomanie I.1_image_0000.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0001.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0002.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0003.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0004.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0005.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0006.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0007.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0008.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0009.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0010.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0011.jpg
Processing Démonomanie I.2/images/Démonomanie I.2_image_0000.jpg
Processing Démonomanie I.2/images/Démonomanie I.2_image_0001.jpg
Processing Démonomanie I.2/images/Démonomanie I.2_image_0002.jpg
Processing Démonomanie I.

In [20]:
sorted_subdirs = sorted(
    entry for entry in root_dir.iterdir()
    if entry.is_dir() and entry.name.startswith(start_string)
)

# Dict to know which bbox dimension to "cut" for each page side
side_dict = {"left": 0, "right": 2}

# Walk the selected subdirectories; skip those without images or without a registry entry
for subdir in sorted_subdirs:
    images_dir = subdir / "images"
    if not images_dir.exists() or subdir.name not in subdir_ocr_results:
        continue

    subdict = subdir_ocr_results[subdir.name]
    out_bbox, total_bbox = [], []
    limit = subdict['limit']

    # One pass per page image, capped at the registered page count
    for jpg in itertools.islice(sorted(images_dir.glob('*.jpg')), limit):
        rel_name = os.path.relpath(jpg)
        print(f'Processing {rel_name}')

        # Meta row for the current file
        meta_data = next((row for row in subdict['meta'] if row['filename'] == rel_name), None)

        # Outer box starts as the full 'diff_open' extent; the coordinate on
        # the page's side is then replaced by the computed cut
        out_bbox0 = [0, 0, *meta_data['diff_open'].size]
        out_bbox0[side_dict[meta_data['side']]] = meta_data['cut']
        out_bbox.append(out_bbox0)

        # Combine the trim bbox with the outer box
        total_bbox0 = list(combine_bbox(meta_data['bbox'], out_bbox0))
        total_bbox.append(total_bbox0)

        # Store both boxes on the meta row
        meta_data['out_bbox'] = out_bbox0
        meta_data['total_bbox'] = total_bbox0

print('Bbox margins successfully defined')
print('Bbox dimensions successfully adjusted')

Processing Démonomanie I.1/images/Démonomanie I.1_image_0000.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0001.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0002.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0003.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0004.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0005.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0006.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0007.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0008.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0009.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0010.jpg
Processing Démonomanie I.1/images/Démonomanie I.1_image_0011.jpg
Processing Démonomanie I.2/images/Démonomanie I.2_image_0000.jpg
Processing Démonomanie I.2/images/Démonomanie I.2_image_0001.jpg
Processing Démonomanie I.2/images/Démonomanie I.2_image_0002.jpg
Processing Démonomanie I.

In [22]:
sorted_subdirs = sorted([subdir for subdir in root_dir.iterdir() if subdir.is_dir() and subdir.name.startswith(start_string)])

# Column layout of the per-document metadata CSV
headers = ["filename", "angle", "side", "cut", "backR", "backG", "backB",
           "bbox1", "bbox2", "bbox3", "bbox4"]

# Iterate over each subdirectory in root_dir
for subdir in sorted_subdirs:
    images_dir = subdir / "images"
    subdict = subdir_ocr_results.get(subdir.name)
    if subdict is None or not images_dir.exists():
        continue

    # Build the rows from THIS document's meta entries. The previous version
    # read the globals `limit` and `total_bbox` left behind by an earlier cell,
    # which only describe the last subdirectory processed there, so every
    # other document got the wrong bboxes (or an IndexError / truncated file).
    meta_final = []
    for entry in subdict['meta'][:subdict['limit']]:
        meta_list = [entry['filename'], entry['angle'], entry['side'], entry['cut']]
        meta_list.extend(entry['background'])   # backR, backG, backB
        meta_list.extend(entry['total_bbox'])   # bbox1..bbox4
        meta_final.append(meta_list)

    # Metadata CSV. Written as UTF-8 explicitly so accented file names
    # round-trip through pd.read_csv (which defaults to UTF-8) on any platform.
    metadata_csv_path = output_dir / f'{subdir.name}_metadata.csv'
    with open(metadata_csv_path, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(headers)
        writer.writerows(meta_final)

    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(metadata_csv_path)
    print(df.head())

                                            filename  angle   side   cut  \
0  Démonomanie I.1/images/Démonomanie I.1_image_0...    0.0  right  3556   
1  Démonomanie I.1/images/Démonomanie I.1_image_0...    0.0   left     0   
2  Démonomanie I.1/images/Démonomanie I.1_image_0...    0.0  right  3556   
3  Démonomanie I.1/images/Démonomanie I.1_image_0...    0.0   left     0   
4  Démonomanie I.1/images/Démonomanie I.1_image_0...   -0.5  right  3556   

   backR  backG  backB  bbox1  bbox2  bbox3  bbox4  
0    221    207    177      0      0   3556   4987  
1    220    203    173      0      0   3556   5264  
2    222    207    178      0      0   3556   5011  
3    220    204    174      0      0   3556   5237  
4    222    208    178      0      0   3556   4987  
                                            filename  angle   side   cut  \
0  Démonomanie I.2/images/Démonomanie I.2_image_0...    0.0  right  3556   
1  Démonomanie I.2/images/Démonomanie I.2_image_0...    0.0   left     0 

In [38]:
sorted_subdirs = sorted([subdir for subdir in root_dir.iterdir() if subdir.is_dir() and subdir.name.startswith(start_string)])

# Iterate over each subdirectory in root_dir
for subdir in sorted_subdirs:
    folder_name = subdir.name
    print(f"Processing subdirectory: {folder_name}")

    # A subdirectory without an 'images' folder never gets a metadata CSV;
    # skip it instead of aborting the whole loop with FileNotFoundError.
    metadata_csv_path = output_dir / f'{folder_name}_metadata.csv'
    if not metadata_csv_path.exists():
        print(f"No metadata CSV for {folder_name}; skipping")
        continue

    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(metadata_csv_path)
    print(f"Loaded CSV for {folder_name}")

    # Initialize the subdictionary if it doesn't exist
    doc_results = subdir_ocr_results.setdefault(folder_name, {})

    # Get the value of sample_n from the subdictionary of subdir_ocr_results
    sample_n = doc_results.get("sample_n", 10)  # Default to 10 if not set

    # Candidate pages: every file listed in the metadata CSV
    csvf = df.set_index("filename")
    pool = [Path(root_dir, filename) for filename in csvf.index]

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more pages than exist.
    pool = sample(pool, min(sample_n, len(pool)))
    if not pool:
        print(f"No pages listed for {folder_name}; skipping")
        continue

    # Get images for files in sample, cut margins and make a test list
    imgs = []
    results = []

    for img_path in pool:
        # Get image name (the CSV index holds paths relative to root_dir)
        name = os.path.relpath(img_path, start=root_dir)
        print(f'Testing {name}')

        # Get values for cutting margins (one row lookup instead of nine)
        row = csvf.loc[name]
        bkgcol = (row["backR"], row["backG"], row["backB"])

        # Cut the margins; cutMarg receives the image path as a string
        img = cutMarg(img=str(img_path), rotate=row["angle"], left=row["bbox1"],
                      up=row["bbox2"], right=row["bbox3"], lower=row["bbox4"],
                      border=0, bkgcol=bkgcol)

        # Add the new image to the list
        imgs.append(img)

        # Perform an OCR test on the new image and add the results to the list
        results.append(OCRtestImg(img))

    # Create a testList object with the images and results
    testSample = testList(imgs, results)

    # Set up a dict of recommended adjustments and perform tests
    adjustments = {"volume": "default_volume", "color": 1.0, "invert": False,
                   "autocontrast": 0, "blur": False, "sharpen": False,
                   "smooth": False, "xsmooth": False}

    # Output testSample and adjustments to the subdictionary in subdir_ocr_results that corresponds to folder_name
    doc_results["testSample"] = testSample
    doc_results["adjustments"] = adjustments
    print(f"Updated subdir_ocr_results for {folder_name}")

Processing subdirectory: Démonomanie I.4
Loaded CSV for Démonomanie I.4
Sampled pool: [PosixPath('/home/lucas-jerusalimiec/Documents/OCR Bin/Split/Bodin Repair/Démonomanie/Démonomanie I.4/images/Démonomanie I.4_image_0001.jpg')]
Testing Démonomanie I.4/images/Démonomanie I.4_image_0001.jpg
Processed images: [<PIL.Image.Image image mode=RGB size=3556x5264 at 0x7C32C034E660>]
Created testSample: <ocr_func.testList object at 0x7c327c3577d0>
Updated subdir_ocr_results for Démonomanie I.4
Processing subdirectory: Démonomanie III.4
Loaded CSV for Démonomanie III.4
Sampled pool: [PosixPath('/home/lucas-jerusalimiec/Documents/OCR Bin/Split/Bodin Repair/Démonomanie/Démonomanie III.4/images/Démonomanie III.4_image_0009.jpg')]
Testing Démonomanie III.4/images/Démonomanie III.4_image_0009.jpg
Processed images: [<PIL.Image.Image image mode=RGB size=3556x5264 at 0x7C3277FB4DA0>]
Created testSample: <ocr_func.testList object at 0x7c3276d63fb0>
Updated subdir_ocr_results for Démonomanie III.4
Processi

In [48]:
# Adjustment tests, in the order they are run, and the levels tried for each
tests = ["color", "autocontrast", "blur", "sharpen", "smooth", "xsmooth"]
levels = {
    "color": [1, .75, .5, .25, 0],
    "autocontrast": [0, 2, 4, 6, 8],
    "blur": [True],
    "sharpen": [True],
    "smooth": [True],
    "xsmooth": [True]
}

# Run the routine on every document that has a testSample
for key, value in subdir_ocr_results.items():
    if not (isinstance(value, dict) and "testSample" in value):
        continue

    testSample = value["testSample"]
    adjustments = {}

    # Perform each test; a winning adjustment is applied before the next test runs
    for test in tests:
        print(f"{test} test")
        testRes = testSample.adjustTest(test, levels=levels.get(test, []))

        if test in ("color", "autocontrast"):
            # Numeric tests: the winner's label is the test name followed by the level
            neutral = 1.0 if test == "color" else 0.0
            best = float(testRes["best_adjustment"].replace(test, ""))
            if best != neutral:
                testSample = testSample.adjustImg(**{test: best})
                adjustments[test] = best
            else:
                adjustments[test] = neutral
        elif testRes["best_adjustment"] == f"{test}True":
            # On/off tests: switch the filter on only when it won
            testSample = testSample.adjustImg(**{test: True})
            adjustments[test] = True
        else:
            adjustments[test] = False

    # Update the adjustments in subdir_ocr_results
    value["adjustments"] = adjustments

    # Post-test adjustment levels
    print(adjustments)
    adjustments_list = list(adjustments.values())
    print(adjustments_list)

    # Write adjustments to CSV
    headers = ["color", "autocontrast","blur","sharpen","smooth", "xsmooth"]

    # Only write the file when a directory named like the key exists under root_dir
    for subdir in root_dir.iterdir():
        if not (subdir.is_dir() and subdir.name == key):
            continue
        adjustments_csv_path = output_dir/ f"{value.get('doc_name', 'default_doc')}_adjustments.csv"
        with open(adjustments_csv_path, "w", newline="") as outfile:
            writer = csv.writer(outfile)
            writer.writerow(headers)
            writer.writerow(adjustments_list)

color test



                                color1  color0.75  color0.5  color0.25  \
Démonomanie I.1_image_0001.jpg     115        121       113        118   

                                color0  totaltok  
Démonomanie I.1_image_0001.jpg     115       261  

TOTAL READABILITY
color1: 55.939
color0.75: 53.64
color0.5: 56.705
color0.25: 54.789
color0: 55.939
autocontrast test



                                autocontrast0  autocontrast2  autocontrast4  \
Démonomanie I.1_image_0001.jpg            113            111            113   

                                autocontrast6  autocontrast8  totaltok  
Démonomanie I.1_image_0001.jpg            108            110       272  

TOTAL READABILITY
autocontrast0: 58.456
autocontrast2: 59.191
autocontrast4: 58.456
autocontrast6: 60.294
autocontrast8: 59.559
blur test



                                blurTrue  blurFalse  totaltok
Démonomanie I.1_image_0001.jpg       114        108       261

TOTAL READABILITY
blurTrue: 56.322
blurFa

In [49]:
# Function to perform OCR on images
def perform_ocr(fulldata_path, output_dir, doc_name):
    """OCR every page listed in a combined metadata/adjustments CSV.

    For each row the image is passed through ``cutMarg`` (rotation and margin
    cut), ``adjustImg`` (image adjustments) and finally ``tsvOCR``.

    Parameters
    ----------
    fulldata_path : str or Path
        CSV with the columns ``filename, angle, bbox1..bbox4, backR, backG,
        backB, color, autocontrast, blur, sharpen, smooth, xsmooth``.
        ``filename`` is resolved against the module-level ``root_dir``.
    output_dir : str or Path
        Directory for the text output; ``<doc_name>.txt`` inside it is passed
        to ``tsvOCR`` as ``savpath``.
    doc_name : str
        Document name used to build the output file names.

    Returns
    -------
    None
    """
    # Accept plain strings as well as Path objects for the '/' operator below
    output_dir = Path(output_dir)
    fulldata = pd.read_csv(fulldata_path)

    # Loop through each row in fulldata
    for row in fulldata.itertuples():
        img = str(Path(root_dir, row.filename))
        print(datetime.now().strftime("%H:%M") + " Processing " + row.filename + "...")

        # Set up margin cutting
        cuts = {"rotate": row.angle,
                "left": row.bbox1,
                "up": row.bbox2,
                "right": row.bbox3,
                "lower": row.bbox4,
                "border": 0,
                "bkgcol": (row.backR, row.backG, row.backB)}

        # Set up image adjustment
        adjustments = {"color": row.color,
                       "autocontrast": row.autocontrast,
                       "blur": row.blur,
                       "sharpen": row.sharpen,
                       "smooth": row.smooth,
                       "xsmooth": row.xsmooth}

        # OCR the image
        # NOTE(review): tsvfile is passed as a bare file name; where tsvOCR
        # writes it is not visible from this notebook - confirm in ocr_func.
        tsvOCR((adjustImg(cutMarg(img, **cuts), **adjustments)),
               savpath=output_dir / f"{doc_name}.txt",
               tsvfile=f"{doc_name}_data.tsv")

    # Report completion once per document. This line used to sit inside the
    # loop and therefore printed after every single page.
    print(f'Done {doc_name} OCR job')

# Every per-document metadata CSV in the output directory, in name order
metadata_files = sorted(output_dir.glob("*_metadata.csv"))

# Combine each document's metadata with its adjustments and run the OCR job
for metadata_file in metadata_files:
    doc_name = metadata_file.stem.replace("_metadata", "")

    # Per-page margin/rotation data
    margcsv = pd.read_csv(metadata_file)

    # Documents without an adjustments file are skipped
    adjdata = output_dir / f"{doc_name}_adjustments.csv"
    if not adjdata.exists():
        continue

    # Adjustment settings for the document
    adjcsv = pd.read_csv(adjdata)

    # Cartesian product through a constant helper column: every page row is
    # paired with every adjustments row, then the helper column is dropped
    fulldata = (
        margcsv.assign(key=0)
        .merge(adjcsv.assign(key=0), on='key')
        .drop(columns='key')
    )

    # Persist the combined table next to the other outputs
    fulldata_path = output_dir / f"{doc_name}_fulldata.csv"
    fulldata.to_csv(fulldata_path, index=False)

    # Perform OCR on the combined data
    perform_ocr(fulldata_path, output_dir, doc_name)

19:21 Processing Démonomanie I.1/images/Démonomanie I.1_image_0000.jpg...
19:21 Processing Démonomanie I.1/images/Démonomanie I.1_image_0001.jpg...
19:21 Processing Démonomanie I.1/images/Démonomanie I.1_image_0002.jpg...
19:22 Processing Démonomanie I.1/images/Démonomanie I.1_image_0003.jpg...
19:22 Processing Démonomanie I.1/images/Démonomanie I.1_image_0004.jpg...
19:22 Processing Démonomanie I.1/images/Démonomanie I.1_image_0005.jpg...
19:22 Processing Démonomanie I.1/images/Démonomanie I.1_image_0006.jpg...
19:22 Processing Démonomanie I.1/images/Démonomanie I.1_image_0007.jpg...
19:23 Processing Démonomanie I.1/images/Démonomanie I.1_image_0008.jpg...
19:23 Processing Démonomanie I.1/images/Démonomanie I.1_image_0009.jpg...
19:23 Processing Démonomanie I.1/images/Démonomanie I.1_image_0010.jpg...
19:23 Processing Démonomanie I.1/images/Démonomanie I.1_image_0011.jpg...
Done OCR job
19:23 Processing Démonomanie I.2/images/Démonomanie I.2_image_0000.jpg...
19:24 Processing Démonoma

In [40]:
# Calculate the total limit (number of registered pages) from subdir_ocr_results
total_limit = sum(value['limit'] for value in subdir_ocr_results.values() if isinstance(value, dict) and 'limit' in value)

# Elapsed wall-clock time since the setup cell recorded start_time
end_time = datetime.now()
time_consumed = end_time - start_time
print(f'Time consumed by {start_string} notebook run: {time_consumed}')
print(f'{total_limit} pages')

# Dividing a timedelta by zero raises ZeroDivisionError, so only report the
# per-page average when at least one page was registered.
if total_limit:
    avg_time = time_consumed / total_limit
    print(f'{avg_time} per page')
else:
    print('No pages were processed; average time per page is undefined')

Time consumed by Harangue - lit de justice notebook run: 0:03:47.597763
16 pages
0:00:14.224860 per page
