In [5]:
# OCR the Royal Society PDFs with pytesseract and pdf2image so the text is easier to work with.
import os
import pytesseract
from pdf2image import convert_from_path
from concurrent.futures import ThreadPoolExecutor

# Input folder holding the Royal Society PDFs, and output folder for the OCR text files.
pdf_dir = 'D:/Fact_fiction_corpus/royal society/pdf'
txt_dir = 'D:/Fact_fiction_corpus/royal society/txt'

# Create the output folder if it is missing; no error is raised when it already exists.
os.makedirs(txt_dir, exist_ok=True)

# Tesseract options: LSTM-only engine (--oem 1), page treated as a single column
# of variable-size text (--psm 4), language model 'enm'.
# NOTE: in Tesseract, 'enm' is the Middle English (1100-1500) model, not Old English.
custom_config = r'--oem 1 --psm 4 -l enm'

def process_pdf(pdf_file):
    """OCR a single PDF from ``pdf_dir`` and save the text into ``txt_dir``.

    Every page is rendered to an image, converted to grayscale and passed to
    Tesseract with ``custom_config``; the page texts are joined with a blank
    line and written to ``<name>.txt``.

    Args:
        pdf_file: File name (not a full path) of a PDF inside ``pdf_dir``.
            Names that do not end in ``.pdf`` (case-insensitive) are ignored.

    Returns:
        None. The result is the text file written to ``txt_dir``.

    Raises:
        Whatever ``convert_from_path``, ``pytesseract.image_to_string`` or the
        file write raise; no partial ``.txt`` is left behind in that case.
    """
    if not pdf_file.lower().endswith('.pdf'):
        return

    pdf_path = os.path.join(pdf_dir, pdf_file)
    txt_file = os.path.splitext(pdf_file)[0] + '.txt'
    txt_path = os.path.join(txt_dir, txt_file)

    # An existing text file means this PDF was handled by an earlier run.
    if os.path.exists(txt_path):
        print(f"{txt_file} already exists, skipping OCR...")
        return

    # Announce before the (slow) rasterisation so progress is visible early.
    print(f"Processing {pdf_file}...")
    images = convert_from_path(pdf_path)

    # Grayscale each page before OCR; one text string per page.
    full_text = [
        pytesseract.image_to_string(image.convert('L'), config=custom_config)
        for image in images
    ]

    # Write to a temporary name and rename at the end, so an interrupted write
    # never leaves a truncated .txt that the skip check above would trust.
    tmp_path = txt_path + '.part'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(full_text))
        os.replace(tmp_path, txt_path)
    except Exception:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    print(f"Combined text saved as {txt_file}")

# Names in pdf_dir that end in '.pdf'; each one is handed to process_pdf.
pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]

# Concurrent processing with a thread pool.
# Each task is submitted explicitly and its result is collected, because an
# exception raised inside a worker only surfaces when the future's result is
# read; with an unconsumed executor.map() such failures would pass silently.
failed = []
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = [(name, executor.submit(process_pdf, name)) for name in pdf_files]
    for name, future in futures:
        try:
            future.result()
        except Exception as exc:
            # Keep going: one bad PDF should not stop the rest of the batch.
            failed.append(name)
            print(f"Failed to process {name}: {exc!r}")

if failed:
    print(f"{len(failed)} of {len(pdf_files)} PDFs failed.")
print("Conversion complete.")


rstl_1665_0051.txt already exists, skipping OCR...
rstl_1665_0052.txt already exists, skipping OCR...
rstl_1665_0053.txt already exists, skipping OCR...
rstl_1665_0054.txt already exists, skipping OCR...
rstl_1665_0055.txt already exists, skipping OCR...
rstl_1665_0056.txt already exists, skipping OCR...
rstl_1665_0067.txt already exists, skipping OCR...
rstl_1665_0068.txt already exists, skipping OCR...
rstl_1665_0012.txt already exists, skipping OCR...
rstl_1665_0034.txt already exists, skipping OCR...
rstl_1665_0018.txt already exists, skipping OCR...
rstl_1665_0025.txt already exists, skipping OCR...
rstl_1665_0042.txt already exists, skipping OCR...
rstl_1665_0057.txt already exists, skipping OCR...
rstl_1665_0041.txt already exists, skipping OCR...
rstl_1665_0050.txt already exists, skipping OCR...
rstl_1665_0064.txt already exists, skipping OCR...
rstl_1665_0061.txt already exists, skipping OCR...
rstl_1665_0063.txt already exists, skipping OCR...
rstl_1665_0059.txt already exis

In [None]:
"""
Page segmentation modes: (psm)
    0    Orientation and script detection (OSD) only.
    1    Automatic page segmentation with OSD.
    2    Automatic page segmentation, but no OSD, or OCR.
    3    Fully automatic page segmentation, but no OSD. (Default)
    4    Assume a single column of text of variable sizes.
    5    Assume a single uniform block of vertically aligned text.
    6    Assume a single uniform block of text.
    7    Treat the image as a single text line.
    8    Treat the image as a single word.
    9    Treat the image as a single word in a circle.
   10    Treat the image as a single character.
   11    Sparse text. Find as much text as possible in no particular order.
   12    Sparse text with OSD.
   13    Raw line. Treat the image as a single text line,
         bypassing hacks that are Tesseract-specific.


OCR Engine modes: (see https://github.com/tesseract-ocr/tesseract/wiki#linux)
  0    Legacy engine only.
  1    Neural nets LSTM engine only.
  2    Legacy + LSTM engines.
  3    Default, based on what is available.
"""
