# How to Remove OCR from PDF(s)

## Follow the steps:
1. Add your PDF(s) to the input folder
2. Run the code block below to install the required packages
3. Run the next code block, then scroll to the very bottom of the notebook and follow the directions shown there



# Things to know
This program offers several options for removing OCR (removal is done by converting PDF pages to images).
Removing OCR can cause the size of a PDF to double for large files.

In [1]:
!pip install PyPDF2 PyMuPDF Pillow tqdm
!pip install matplotlib
!pip install pdf2image

Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting Pillow
  Downloading pillow-11.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling coll

In [None]:
"""
PDF OCR Removal Script - Batch Processing Version (Size Optimized)
This script removes OCR text layers from PDF files while maintaining reasonable file sizes.
Supports processing single files or entire folders with size optimization options.
"""

import os
import sys
import traceback
from datetime import datetime
from pathlib import Path
from io import BytesIO

try:
    import PyPDF2
    import fitz
    from PIL import Image
    from tqdm import tqdm
except ImportError as import_error:
    print(f"Required library not installed: {import_error}")
    print("Please install required packages:")
    print("pip install PyPDF2 PyMuPDF Pillow tqdm")
    sys.exit(1)

class PDFOCRRemover:
    def __init__(self):
        """
        Initialize the PDFOCRRemover with empty lists for tracking processed, failed, and skipped files.
        Sets default input and output directories.
        """
        self.successfully_processed_files = []  # List to store info about successfully processed files
        self.failed_processing_files = []      # List to store info about files that failed to process
        self.skipped_processing_files = []     # List to store info about files that were skipped
        self.default_input_directory = Path('/workspace/Remove-OCR-from-PDF/input')
        self.default_output_dir = Path('/workspace/Remove-OCR-from-PDF/output')

    def get_input_directory(self):
        """
        Prompt the user for the input directory containing PDF files.
        If the default directory exists and contains PDFs, offer to use it.
        Otherwise, prompt for a custom directory path.
        Returns:
            Path: The selected input directory.
        """
        print(f"Default input directory: {self.default_input_directory}")

        # Check if the default input directory exists and contains PDF files
        if self.default_input_directory.exists() and self.default_input_directory.is_dir():
            pdf_file_count = len(list(self.default_input_directory.glob("*.pdf"))) + len(list(self.default_input_directory.glob("*.PDF")))
            if pdf_file_count > 0:
                print(f"Found {pdf_file_count} PDF files in the default directory.")
                use_default = input("Use the default input directory? (Y/n): ").strip().lower()
                if use_default in ['', 'y', 'yes']:
                    return self.default_input_directory

        # Prompt for a custom directory if default is not used
        while True:
            input_path = input("Enter the path to your input directory containing PDF files: ").strip().strip('"\'')
            if not input_path:
                print("Please enter a valid directory path.")
                continue

            input_path = Path(input_path)
            if not input_path.exists():
                print(f"Directory not found: {input_path}")
                continue
            if not input_path.is_dir():
                print(f"Path is not a directory: {input_path}")
                continue
            return input_path

    # Use the default output folder, or prompt for a custom output path
    def get_output_dir(self, input_directory):
        """Prompt the user for the output directory or use the default."""
        print(f"\nDefault output directory: {self.default_output_dir}")
        use_default = input("Use the default output directory? (Y/n): ").strip().lower()

        # defalt option
        if use_default in ['', 'y', 'yes']:
            output_dir = self.default_output_dir
        else:
            custom_output_dir = input("Enter the output directory path: ").strip().strip('"\'')
            output_dir = Path(custom_output_dir) if custom_output_dir else self.default_output_dir

        #custom path
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
            print(f"Output directory: {output_dir}")
            return output_dir
        except Exception as directory_error:
            print(f"Failed to create the output directory: {directory_error}")
            return None

    def get_pdf_file_list(self, directory_path):
        """Retrieve all PDF files from the specified directory."""
        all_files = directory_path.iterdir()
        pdf_files = [file for file in all_files if file.is_file() and file.suffix.lower() == '.pdf']

        if not pdf_files:
            print(f"No PDF files found in {directory_path}")
            return []

        pdf_files.sort()
        return pdf_files

    def select_files_for_processing(self, pdf_files):
        """
        Select which PDF files to process.
        Args:
            pdf_files (list): List of Path objects representing PDF files.
        Returns:
            list: List of Path objects selected for processing.
        """
        # get number of files in input folder
        print(f"\nFound {len(pdf_files)} PDF files:")
        print("-" * 50)

        # Calculate sizes of PDF file(s) then print name and file size
        for index, pdf_file in enumerate(pdf_files, 1):
            file_size_mb = pdf_file.stat().st_size / 1024 / 1024
            print(f"{index:2d}. {pdf_file.name} ({file_size_mb:.2f} MB)")

        print("-" * 50)
        print("Process all files or select specific files?")
        print("1. Process ALL files")
        print("2. Select specific files to process")

        # Choose all files or specific file(s)
        while True:
            user_choice = input("\nEnter your choice (1 or 2): ").strip()
            if user_choice == '1':
                return pdf_files
            elif user_choice == '2':
                return self.select_specific_pdf_files(pdf_files)
            else:
                print("Please enter 1 or 2")

    def select_specific_pdf_files(self, pdf_files):
        """Allow the user to select specific PDF files for processing."""
        print("\nEnter a number (e.g., 1 for the first PDF).")
        print("You can enter multiple numbers like '1,3,5' or ranges like '1-5' or combinations like '1,3-7,9':")

        while True:
            user_selection = input("File numbers: ").strip()
            if not user_selection:
                print("Please enter at least one file number.")
                continue

            try:
                selected_indices = self.parse_user_selection(user_selection, len(pdf_files))
                selected_pdf_files = [pdf_files[index - 1] for index in selected_indices]

                print(f"\nSelected {len(selected_pdf_files)} files:")
                for pdf_file in selected_pdf_files:
                    print(f"   - {pdf_file.name}")

                confirmation = input("\nProceed with these files? (y/N): ").strip().lower()
                if confirmation in ['y', 'yes']:
                    return selected_pdf_files
                else:
                    print("Selection cancelled. Choose again:")
                    continue
            except ValueError as selection_error:
                print(f"Invalid selection: {selection_error}")
                continue

    def parse_user_selection(self, selection_string, max_file_number):
        """Parse the user's selection string into a list of indices."""
        selected_indices = set()

        for selected in selection_string.split(','):
            selected = selected.strip()
            if '-' in selected:
                start_index, end_index = selected.split('-', 1)
                start_index, end_index = int(start_index.strip()), int(end_index.strip())
                if start_index < 1 or end_index > max_file_number or start_index > end_index:
                    raise ValueError(f"Invalid range {start_index}-{end_index}")
                selected_indices.update(range(start_index, end_index + 1))
            else:
                file_number = int(selected)
                if file_number < 1 or file_number > max_file_number:
                    raise ValueError(f"Number {file_number} out of range (1-{max_file_number})")
                selected_indices.add(file_number)

        return sorted(selected_indices)

    def get_quality_settings(self):
        """Prompt the user to choose quality/size trade-off settings."""
        print("\nChoose quality/size settings:")
        print("1. Small file size (lower quality, ~0.5-1x original size)")
        print("2. Balanced (medium quality, ~1-2x original size)")
        print("3. High quality (higher file size, ~2-4x original size)")
        print("4. Custom settings")

        while True:
            user_choice = input("Enter your choice (1-4): ").strip()
            if user_choice == '1':
                return {'dpi': 100, 'jpeg_quality': 70, 'img_format': 'jpeg'}
            elif user_choice == '2':
                return {'dpi': 150, 'jpeg_quality': 85, 'img_format': 'jpeg'}
            elif user_choice == '3':
                return {'dpi': 200, 'jpeg_quality': 95, 'img_format': 'png'}
            elif user_choice == '4':
                return self.get_custom_quality_settings()
            else:
                print("Please enter 1, 2, 3, or 4")

    def get_custom_quality_settings(self):
        """Prompt the user for custom quality settings."""
        print("\nCustom settings:")

        while True:
            try:
                dpi_value = int(input("DPI (72-300, recommended 100-200): "))
                if 72 <= dpi_value <= 300:
                    break
                print("DPI should be between 72 and 300")
            except ValueError:
                print("Please enter a valid number")

        while True:
            img_format = input("Image format (jpeg/png): ").strip().lower()
            if img_format in ['jpeg', 'jpg', 'png']:
                img_format = 'jpeg' if img_format in ['jpeg', 'jpg'] else 'png'
                break
            print("Please enter 'jpeg' or 'png'")

        jpeg_quality_val = 85
        if img_format == 'jpeg':
            while True:
                try:
                    jpeg_quality_val = int(input("JPEG quality (50-100, recommended 70-90): "))
                    if 50 <= jpeg_quality_val <= 100:
                        break
                    print("JPEG quality should be between 50 and 100")
                except ValueError:
                    print("Please enter a valid number")

        return {'dpi': dpi_value, 'jpeg_quality': jpeg_quality_val, 'img_format': img_format}

    def calculate_optimal_dpi(self, pdf_page, target_size_factor=1.5):
        """Calculate the optimal DPI based on page dimensions to control file size."""
        page_width_points = pdf_page.rect.width
        page_height_points = pdf_page.rect.height

        page_width_inches = page_width_points / 72
        page_height_inches = page_height_points / 72

        page_area_square_inches = page_width_inches * page_height_inches

        if page_area_square_inches > 100:
            base_dpi = 100
        elif page_area_square_inches > 60:
            base_dpi = 120
        else:
            base_dpi = 150

        return base_dpi

    def remove_ocr_layer_optimized(self, input_pdf_path, output_pdf_path, quality_settings, progress_bar=None):
        """Remove OCR layers from a PDF while optimizing for file size."""
        try:
            input_pdf_document = fitz.open(str(input_pdf_path))
            output_pdf_document = fitz.open()

            total_pages = len(input_pdf_document)
            dpi = quality_settings['dpi']
            jpeg_quality = quality_settings['jpeg_quality']
            img_format = quality_settings['img_format']

            for current_page_number in range(total_pages):
                current_page = input_pdf_document[current_page_number]

                zoom_factor = dpi / 72.0
                pixel_map = current_page.get_pixmap(matrix=fitz.Matrix(zoom_factor, zoom_factor), alpha=False)

                page_image = Image.frombytes("RGB", [pixel_map.width, pixel_map.height], pixel_map.samples)

                if img_format == 'jpeg':
                    image_bytes = self.compress_pil_image_to_bytes(page_image, 'JPEG', quality=jpeg_quality)
                else:
                    image_bytes = self.compress_pil_image_to_bytes(page_image, 'PNG')

                new_pdf_page = output_pdf_document.new_page(width=current_page.rect.width, height=current_page.rect.height)
                new_pdf_page.insert_image(new_pdf_page.rect, stream=image_bytes)

                if progress_bar:
                    progress_bar.update(1)
                    progress_bar.set_postfix({'Page': f'{current_page_number + 1}/{total_pages}'})

                pixel_map = None
                page_image = None

            output_pdf_document.save(
                str(output_pdf_path),
                garbage=4,
                deflate=True,
                clean=True
            )

            output_pdf_document.close()
            input_pdf_document.close()

            return True, None
        except Exception as processing_error:
            return False, str(processing_error)

    def compress_pil_image_to_bytes(self, pil_image, format_type, quality=85):
        """Compress a PIL Image to bytes with the specified format and quality."""
        image_buffer = BytesIO()

        if format_type == 'JPEG':
            if pil_image.mode in ('RGBA', 'LA', 'P'):
                background_image = Image.new('RGB', pil_image.size, (255, 255, 255))
                if pil_image.mode == 'P':
                    pil_image = pil_image.convert('RGBA')
                background_image.paste(pil_image, mask=pil_image.split()[-1] if pil_image.mode == 'RGBA' else None)
                pil_image = background_image

            pil_image.save(image_buffer, format='JPEG', quality=quality, optimize=True)
        else:
            pil_image.save(image_buffer, format='PNG', optimize=True)

        return image_buffer.getvalue()

    def get_pdf_page_count(self, pdf_file_path):
        """Get the number of pages in a PDF file."""
        try:
            with open(pdf_file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                return len(pdf_reader.pages)
        except:
            try:
                pdf_document = fitz.open(str(pdf_file_path))
                page_count = len(pdf_document)
                pdf_document.close()
                return page_count
            except:
                return 0

    def process_single_pdf_file(self, input_pdf_file, output_dir, quality_settings):
        """Process a single PDF file with progress tracking and size optimization."""
        try:
            if not input_pdf_file.exists():
                raise FileNotFoundError(f"Input file not found: {input_pdf_file}")
            if not os.access(input_pdf_file, os.R_OK):
                raise PermissionError(f"Cannot read input file: {input_pdf_file}")

            output_pdf_file = output_dir / f"{input_pdf_file.stem}_no_ocr.pdf"

            if output_pdf_file.exists():
                overwrite_confirmation = input(f"Output file exists: {output_pdf_file.name}. Overwrite? (y/N): ").strip().lower()
                if overwrite_confirmation not in ['y', 'yes']:
                    self.skipped_processing_files.append({
                        'file_name': input_pdf_file.name,
                        'reason': 'File already exists, user chose not to overwrite'
                    })
                    return False, "Skipped by user"

            print(f"   Processing: {input_pdf_file.name}")
            print(f"   Settings: {quality_settings['dpi']} DPI, {quality_settings['img_format'].upper()} format")

            total_pages = self.get_pdf_page_count(input_pdf_file)

            with tqdm(
                total=total_pages,
                desc=f"     Pages",
                unit="page",
                bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
            ) as progress_bar:
                processing_success, processing_error = self.remove_ocr_layer_optimized(
                    input_pdf_file,
                    output_pdf_file,
                    quality_settings,
                    progress_bar
                )

            if processing_success:
                if not output_pdf_file.exists():
                    raise FileNotFoundError("Output file was not created")
                if output_pdf_file.stat().st_size == 0:
                    raise ValueError("Output file is empty")

                original_file_size_mb = input_pdf_file.stat().st_size / 1024 / 1024
                new_file_size_mb = output_pdf_file.stat().st_size / 1024 / 1024
                size_ratio = new_file_size_mb / original_file_size_mb if original_file_size_mb > 0 else 0

                self.successfully_processed_files.append({
                    'input_file_name': input_pdf_file.name,
                    'output_file_name': output_pdf_file.name,
                    'original_file_size_mb': original_file_size_mb,
                    'new_file_size_mb': new_file_size_mb,
                    'size_change_mb': new_file_size_mb - original_file_size_mb,
                    'size_ratio': size_ratio
                })

                print(f"     ✓ Success: {input_pdf_file.name} -> {output_pdf_file.name}")
                print(f"     Size: {original_file_size_mb:.2f} MB -> {new_file_size_mb:.2f} MB ({size_ratio:.1f}x)")
                return True, None
            else:
                raise Exception(processing_error or "Unknown error during processing")
        except Exception as processing_error:
            error_message = str(processing_error)
            self.failed_processing_files.append({
                'file_name': input_pdf_file.name,
                'error': error_message,
                'traceback': traceback.format_exc()
            })
            print(f"     ✗ Failed: {input_pdf_file.name} - {error_message}")
            return False, error_message

    def process_pdf_files(self, pdf_files_to_process, output_dir, quality_settings):
        """
        Run process_single_pdf_file() on each file while showing overall progress.

        Args:
            pdf_files_to_process (list): Path objects of the PDFs to process.
            output_dir (Path): Directory that receives the output files.
            quality_settings (dict): Passed through unchanged for every file.
        """
        file_total = len(pdf_files_to_process)

        print(f"\nProcessing {file_total} files with size optimization...")
        print("=" * 60)

        overall_bar_options = {
            'total': file_total,
            'desc': "Overall Progress",
            'unit': "file",
            'position': 0,
            'leave': True,
        }
        with tqdm(**overall_bar_options) as overall_progress_bar:
            for file_number, pdf_path in enumerate(pdf_files_to_process, start=1):
                position_label = f"{file_number}/{file_total}"
                overall_progress_bar.set_description(f"File {position_label}")
                print(f"\n[{position_label}] {pdf_path.name}")

                # The per-file result is recorded on self by the callee, so
                # the returned tuple is not needed here.
                self.process_single_pdf_file(pdf_path, output_dir, quality_settings)
                overall_progress_bar.update(1)

    def print_processing_summary(self):
        """Print a summary of the processing results."""
        print("\n" + "=" * 60)
        print("PROCESSING SUMMARY")
        print("=" * 60)

        print(f"Successfully processed: {len(self.successfully_processed_files)} files")
        print(f"Failed to process: {len(self.failed_processing_files)} files")
        print(f"Skipped: {len(self.skipped_processing_files)} files")

        if self.successfully_processed_files:
            print("\n✓ SUCCESSFULLY PROCESSED FILES:")
            total_original_size_mb = 0
            total_new_size_mb = 0

            for file_info in self.successfully_processed_files:
                print(f"   {file_info['input_file_name']} -> {file_info['output_file_name']}")
                print(f"     Size: {file_info['original_file_size_mb']:.2f} MB -> {file_info['new_file_size_mb']:.2f} MB ({file_info['size_ratio']:.1f}x)")
                total_original_size_mb += file_info['original_file_size_mb']
                total_new_size_mb += file_info['new_file_size_mb']

            average_size_ratio = total_new_size_mb / total_original_size_mb if total_original_size_mb > 0 else 0
            print(f"\nTotal size change: {total_original_size_mb:.2f} MB -> {total_new_size_mb:.2f} MB ({average_size_ratio:.1f}x average)")

        if self.failed_processing_files:
            print("\n✗ FAILED FILES:")
            for file_info in self.failed_processing_files:
                print(f"   {file_info['file_name']}: {file_info['error']}")

        if self.skipped_processing_files:
            print("\n⊘ SKIPPED FILES:")
            for file_info in self.skipped_processing_files:
                print(f"   {file_info['file_name']}: {file_info['reason']}")

  

def main():
    """
    Run the interactive PDF OCR removal tool.

    Asks for the input directory, output directory and quality settings, lets
    the user choose which PDFs to process, processes them, and finally prints
    a summary of processed, failed and skipped files.
    """
    ocr_remover = PDFOCRRemover()

    print("PDF OCR Removal Tool - Size Optimized Version")
    print("==============================================")
    print("This tool removes OCR text layers while maintaining reasonable file sizes.")
    print("Default paths:")
    # Show the paths the remover actually uses rather than repeating them here
    print(f"   Input:  {ocr_remover.default_input_directory}")
    print(f"   Output: {ocr_remover.default_output_dir}")
    print()

    try:
        # Creating the default folders is best effort: the user can still
        # choose custom directories if the default location is not writable.
        for default_directory in (ocr_remover.default_input_directory, ocr_remover.default_output_dir):
            try:
                default_directory.mkdir(parents=True, exist_ok=True)
            except OSError as directory_error:
                print(f"Note: could not create default directory {default_directory}: {directory_error}")

        input_directory = ocr_remover.get_input_directory()
        print(f"Input directory: {input_directory}")

        output_dir = ocr_remover.get_output_dir(input_directory)
        if not output_dir:
            return

        quality_settings = ocr_remover.get_quality_settings()
        print(f"\nUsing settings: {quality_settings['dpi']} DPI, {quality_settings['img_format'].upper()} format")

        pdf_files = ocr_remover.get_pdf_file_list(input_directory)
        if not pdf_files:
            return

        pdf_files_to_process = ocr_remover.select_files_for_processing(pdf_files)
        if not pdf_files_to_process:
            print("No files selected for processing.")
            return

        ocr_remover.process_pdf_files(pdf_files_to_process, output_dir, quality_settings)

        # Report the results collected while processing
        ocr_remover.print_processing_summary()

    except KeyboardInterrupt:
        print("\n\nProcessing interrupted by user.")
        # Still report whatever was finished before the interruption
        if (
            ocr_remover.successfully_processed_files
            or ocr_remover.failed_processing_files
            or ocr_remover.skipped_processing_files
        ):
            ocr_remover.print_processing_summary()
    except Exception as unexpected_error:
        # Top-level boundary: report the problem instead of crashing the cell
        print(f"\nUnexpected error: {unexpected_error}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

PDF OCR Removal Tool - Size Optimized Version
This tool removes OCR text layers while maintaining reasonable file sizes.
Default paths:
   Input:  /workspace/Remove-OCR-from-PDF/input
   Output: /workspace/Remove-OCR-from-PDF/output

Default input directory: /workspace/Remove-OCR-from-PDF/input
Found 1 PDF files in the default directory.


Input directory: /workspace/Remove-OCR-from-PDF/input

Default output directory: /workspace/Remove-OCR-from-PDF/output
Output directory: /workspace/Remove-OCR-from-PDF/output

Choose quality/size settings:
1. Small file size (lower quality, ~0.5-1x original size)
2. Balanced (medium quality, ~1-2x original size)
3. High quality (higher file size, ~2-4x original size)
4. Custom settings

Using settings: 100 DPI, JPEG format

Found 1 PDF files:
--------------------------------------------------
 1. Eloquent_JavaScript.pdf (1.89 MB)
--------------------------------------------------
Process all files or select specific files?
1. Process ALL files
2. Select specific files to process

Processing 1 files with size optimization...


File 1/1:   0%|          | 0/1 [00:00<?, ?file/s]        


[1/1] Eloquent_JavaScript.pdf
   Processing: Eloquent_JavaScript.pdf
   Settings: 100 DPI, JPEG format


     Pages: 100%|██████████| 435/435 [00:07<00:00]
File 1/1: 100%|██████████| 1/1 [00:07<00:00,  7.41s/file]

     ✓ Success: Eloquent_JavaScript.pdf -> Eloquent_JavaScript_no_ocr.pdf
     Size: 1.89 MB -> 44.58 MB (23.6x)

PROCESSING SUMMARY
Successfully processed: 1 files
Failed to process: 0 files
Skipped: 0 files

✓ SUCCESSFULLY PROCESSED FILES:
   Eloquent_JavaScript.pdf -> Eloquent_JavaScript_no_ocr.pdf
     Size: 1.89 MB -> 44.58 MB (23.6x)

Total size change: 1.89 MB -> 44.58 MB (23.6x average)



