In [None]:
import functools
import os

from PIL import Image
from pdf2image import convert_from_path
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
from surya.ocr import run_ocr

def process_input(file_path):
    """Turn an input file into a list of page images.

    A PDF is rasterised with ``convert_from_path`` (second argument 300, the
    DPI) and its pages are returned; a single raster image is opened with
    ``Image.open`` and wrapped in a one-element list.

    Raises:
        ValueError: if the file extension is neither a supported image
            format nor ``.pdf``.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    lowered = file_path.lower()  # extension matching is case-insensitive

    if lowered.endswith('.pdf'):
        return convert_from_path(file_path, 300)

    if lowered.endswith(image_suffixes):
        return [Image.open(file_path)]

    raise ValueError("Unsupported file type. Please provide a valid image or PDF file.")

def save_ocr_output(file_path, ocr_result, output_folder):
    """Write OCR text to ``<output_folder>/<input base name>.txt``.

    Args:
        file_path: Path of the file that was OCR'd; only its base name
            (without extension) is used to name the output file.
        ocr_result: The recognised text to write.
        output_folder: Directory that receives the text file. It is created
            if it does not exist yet.
    """
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_text_file = os.path.join(output_folder, f"{base_name}.txt")

    os.makedirs(output_folder, exist_ok=True)

    # OCR output routinely contains non-ASCII characters; relying on the
    # platform default encoding can raise UnicodeEncodeError or produce
    # files that differ between machines, so always write UTF-8.
    with open(output_text_file, 'w', encoding='utf-8') as file:
        file.write(ocr_result)

    print(f"OCR result has been saved to {output_text_file}")

@functools.lru_cache(maxsize=None)
def _load_ocr_models():
    """Load the detection/recognition models and processors exactly once.

    Loading is expensive, so the result is cached and shared by every call
    to ``run_ocr_on_file`` instead of being repeated for each input file.

    Returns:
        Tuple ``(det_model, det_processor, rec_model, rec_processor)``.
    """
    det_processor, det_model = load_det_processor(), load_det_model()
    rec_model, rec_processor = load_rec_model(), load_rec_processor()
    return det_model, det_processor, rec_model, rec_processor


def run_ocr_on_file(file_path, output_folder, langs=None):
    """Run OCR on one image or PDF and save the recognised text.

    Args:
        file_path: Path to the image or PDF to process.
        output_folder: Directory that receives the resulting ``.txt`` file.
        langs: Language codes passed to ``run_ocr`` for every page.
            Defaults to ``["en"]``.

    Raises:
        ValueError: propagated from ``process_input`` for unsupported file
            types.
    """
    # Convert the input into a list of page images first, so an unsupported
    # file fails before any model is loaded.
    images = process_input(file_path)
    if langs is None:
        langs = ["en"]

    det_model, det_processor, rec_model, rec_processor = _load_ocr_models()

    # One language list per page image.
    predictions = run_ocr(images, [langs] * len(images), det_model, det_processor, rec_model, rec_processor)

    # One recognised line per output line, pages concatenated in order.
    full_text = "".join(
        text_line.text + "\n"
        for prediction in predictions
        for text_line in prediction.text_lines
    )

    save_ocr_output(file_path, full_text, output_folder)

def process_input_folder(input_folder_path):
    """OCR every supported file found under ``input_folder_path``.

    The folder is walked recursively. Results are written to an
    ``ocr_output`` directory created next to the input folder (i.e. inside
    its parent directory).

    Args:
        input_folder_path: Directory containing the images/PDFs to process.
            May be relative and may end with a path separator.
    """
    supported_suffixes = ('.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff')

    # dirname() of "folder/" or "." is the folder itself, which would put
    # ocr_output *inside* the input folder. abspath() strips the trailing
    # separator and resolves "." so the parent is computed correctly.
    parent_dir = os.path.dirname(os.path.abspath(input_folder_path))
    output_folder = os.path.join(parent_dir, "ocr_output")

    os.makedirs(output_folder, exist_ok=True)

    for root, _dirs, files in os.walk(input_folder_path):
        for file_name in files:
            if not file_name.lower().endswith(supported_suffixes):
                continue
            file_path = os.path.join(root, file_name)
            print(f"Processing file: {file_path}")
            run_ocr_on_file(file_path, output_folder)

# Cell entry point: set input_folder_path to the directory holding the images
# and PDFs to OCR, then run the batch. This executes at module level, so it
# runs as soon as the cell/script is executed.
input_folder_path = "path/to/input/folder"  # placeholder value; replace with a real directory
process_input_folder(input_folder_path)


  from .autonotebook import tqdm as notebook_tqdm


Processing file: /home/azureuser/S_OCR/OCR/Test_data/Forms_for_OCR_transscript-3 1.jpg
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.72it/s]
Recognizing Text: 100%|██████████| 1/1 [00:02<00:00,  2.31s/it]


OCR result has been saved to /home/azureuser/S_OCR/OCR/ocr_output/Forms_for_OCR_transscript-3 1.txt
Processing file: /home/azureuser/S_OCR/OCR/Test_data/Aies.pdf
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.85s/it]
Recognizing Text: 100%|██████████| 1/1 [00:04<00:00,  4.88s/it]

OCR result has been saved to /home/azureuser/S_OCR/OCR/ocr_output/Aies.txt



