## Tesseract Benchmark Code

In [1]:
import os
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import time

In [2]:
# Point pytesseract at the Tesseract executable it should invoke.
# NOTE(review): this path is machine-specific — confirm it matches the local install.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [3]:
def ocr_image(image_path):
    """Run Tesseract OCR on a single image file and return the recognized text.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    # The context manager closes the image file as soon as OCR has finished.
    with Image.open(image_path) as page:
        return pytesseract.image_to_string(page)

def ocr_pdf(pdf_path, output_dir):
    """OCR every page of a PDF and return the concatenated text.

    Each page is rendered to an image with ``convert_from_path``, written to
    a temporary PNG, and passed to ``ocr_image``.

    Args:
        pdf_path: Path of the PDF file to transcribe.
        output_dir: Directory under which the temporary scratch directory for
            page images is created. An empty string falls back to the
            system default temporary location.

    Returns:
        The text of all pages in page order, each page followed by a blank
        line (``"\\n\\n"``). An empty string if the PDF yields no pages.
    """
    import tempfile  # local import: only this function needs it

    # Render the PDF into one image object per page.
    pages = convert_from_path(pdf_path)
    page_texts = []

    # Stage the page images in a private scratch directory rather than
    # directly in output_dir: this cannot overwrite (and then delete) an
    # unrelated "page_N.png" that already lives there, and the context
    # manager removes the images even if OCR raises part-way through.
    with tempfile.TemporaryDirectory(dir=output_dir or None) as scratch_dir:
        for page_number, page in enumerate(pages, start=1):
            image_path = os.path.join(scratch_dir, f"page_{page_number}.png")
            page.save(image_path, 'PNG')
            page_texts.append(ocr_image(image_path))

    # Join once instead of growing a string with += inside the loop.
    return "".join(text + "\n\n" for text in page_texts)

def main(input_path, output_file):
    """OCR a single image or PDF and write the recognized text to a file.

    Args:
        input_path: Path of the file to transcribe. The type is chosen from
            the (case-insensitive) file extension: PNG/JPEG/TIFF images are
            handled by ``ocr_image``, ``.pdf`` files by ``ocr_pdf``.
        output_file: Path of the text file to write. For PDFs, its directory
            is also passed to ``ocr_pdf`` as the location for temporary
            page images.

    Raises:
        ValueError: If the extension is neither a supported image type nor
            ``.pdf``.
    """
    lowered = input_path.lower()

    if lowered.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.tif')):
        # Single image file ('.tif' is the common short form of '.tiff').
        text = ocr_image(input_path)
    elif lowered.endswith('.pdf'):
        # PDF file: page images are staged next to the output file.
        output_dir = os.path.dirname(output_file)
        text = ocr_pdf(input_path, output_dir)
    else:
        raise ValueError("Unsupported file type. Please provide a PDF or an image file.")

    # OCR output routinely contains non-ASCII characters; write UTF-8
    # explicitly so the result does not depend on the platform's locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
# Benchmark: OCR every file in the input directory and print the total
# elapsed time in seconds.
INPUT_DIR = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/LowResolutionMobyDick'
OUTPUT_DIR = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestTesseractTranscriptions'

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

# Sort the listing so files are processed in a reproducible order
# (os.listdir returns entries in arbitrary order).
for f in sorted(os.listdir(INPUT_DIR)):
    input_path = os.path.join(INPUT_DIR, f)
    name, ext = os.path.splitext(f)
    # One transcription per input file, named after the input's stem.
    output_file = os.path.join(OUTPUT_DIR, name + '.txt')
    main(input_path, output_file)

end_time = time.perf_counter()
duration = end_time - start_time
print(duration)