# How to Remove OCR from PDF(s)

## Follow the steps:
1. Add your PDF(s) to the input folder
2. Run the below code block to include the packages
3. Run the next code block, then scroll to the very bottom of the notebook and follow the prompts shown there



# Things to know
This program has different options to remove OCR (removal is done by converting PDF pages to images)
Removing OCR can cause the size of a PDF to double for large files.

In [1]:
!pip install PyPDF2 PyMuPDF Pillow tqdm
!pip install matplotlib
!pip install pdf2image

Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting Pillow
  Downloading pillow-11.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m126.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm, PyPD

In [None]:
"""
PDF OCR Removal Script - Batch Processing Version (Size Optimized)
This script removes OCR text layers from PDF files while maintaining reasonable file sizes.
Supports processing single files or entire folders with size optimization options.
"""

import os
import sys
from pathlib import Path
import traceback
from datetime import datetime
import time

try:
    import PyPDF2
    import fitz  
    from PIL import Image
    from tqdm import tqdm
except ImportError as e:
    print(f"Required library not installed: {e}")
    print("Please install required packages:")
    print("pip install PyPDF2 PyMuPDF Pillow tqdm")
    sys.exit(1)

class PDFProcessor:
    def __init__(self):
        self.processed_files = []
        self.failed_files = []
        self.skipped_files = []
        self.default_input = Path('/workspace/Remove-OCR-from-PDF/input')
        self.default_output = Path('/workspace/Remove-OCR-from-PDF/output')

    def get_input_folder(self):
        """Ask the user which folder holds the PDFs to process.

        When the default input folder is a directory containing at least one
        PDF, it is offered first. If it is declined (or not usable), the user
        is prompted repeatedly until they enter the path of an existing
        directory.

        Returns:
            Path: the chosen input directory.
        """
        print(f"Default input folder: {self.default_input}")

        if self.default_input.is_dir():
            # Count with the same case-insensitive suffix test that
            # get_pdf_files() applies. Globbing "*.pdf" plus "*.PDF" misses
            # mixed-case names such as "scan.Pdf" and can count a file twice
            # where glob matching is case-insensitive (e.g. Windows).
            pdf_count = sum(
                1
                for entry in self.default_input.iterdir()
                if entry.is_file() and entry.suffix.lower() == '.pdf'
            )
            if pdf_count > 0:
                print(f"Found {pdf_count} PDF files in default folder.")
                use_default = input("Use default input folder? (Y/n): ").strip().lower()
                if use_default in ['', 'y', 'yes']:
                    return self.default_input

        while True:
            folder_path = input("Enter the path to your input folder containing PDF files: ").strip()

            # Paths pasted from a file manager are often wrapped in quotes.
            folder_path = folder_path.strip('"\'')

            if not folder_path:
                print("Please enter a valid folder path.")
                continue

            folder_path = Path(folder_path)

            if not folder_path.exists():
                print(f"Folder not found: {folder_path}")
                continue

            if not folder_path.is_dir():
                print(f"Path is not a directory: {folder_path}")
                continue

            return folder_path

    def get_output_folder(self, input_folder):
        """Resolve the output directory and make sure it exists.

        Args:
            input_folder: kept in the signature for callers; not used here.

        Returns:
            The output directory as a Path, or None when it cannot be created.
        """
        print(f"\nDefault output folder: {self.default_output}")
        answer = input("Use default output folder? (Y/n): ").strip().lower()

        # Start from the default and replace it only when the user both
        # declines it and actually types a path.
        output_folder = self.default_output
        if answer not in ('', 'y', 'yes'):
            typed_path = input("Enter output folder path: ").strip().strip('"\'')
            if typed_path:
                output_folder = Path(typed_path)

        try:
            output_folder.mkdir(parents=True, exist_ok=True)
            print(f"Output folder: {output_folder}")
            return output_folder
        except Exception as e:
            print(f"Failed to create output folder: {e}")
            return None

    def get_pdf_files(self, folder_path):
        """Get all PDF files from the input folder."""

        all_files = folder_path.iterdir()
        pdf_files = [f for f in all_files if f.is_file() and f.suffix.lower() == '.pdf']

        if not pdf_files:
            print(f"No PDF files found in {folder_path}")
            return []

        pdf_files.sort()
        return pdf_files

    def choose_files_to_process(self, pdf_files):
        """List the found PDFs with their sizes and ask which ones to convert.

        Returns:
            pdf_files itself when the user picks "all", otherwise whatever
            select_specific_files() returns.
        """
        divider = "-" * 50

        print(f"\nFound {len(pdf_files)} PDF files:")
        print(divider)

        for position, pdf in enumerate(pdf_files, start=1):
            size_mb = pdf.stat().st_size / 1024 / 1024  # bytes -> MB
            print(f"{position:2d}. {pdf.name} ({size_mb:.2f} MB)")

        print(divider)
        for menu_line in (
            "Convert all or choose a specific file?",
            "1. Process ALL files",
            "2. Select specific files to process",
        ):
            print(menu_line)

        # Keep asking until one of the two menu entries is chosen.
        while True:
            answer = input("\nEnter your choice (1 or 2): ").strip()
            if answer == '1':
                return pdf_files
            if answer == '2':
                return self.select_specific_files(pdf_files)
            print("Please enter 1 or 2")

    def select_specific_files(self, pdf_files):
        """Prompt for file numbers until the user confirms a selection.

        The typed text is interpreted by parse_selection(); numbers are
        1-based positions in pdf_files.

        Returns:
            The list of selected entries from pdf_files.
        """
        print("\nEnter a number, ex) 1 represents the first PDF at the top")
        print("You can enter multiple numbers like '1,3,5' or ranges like '1-5' or combinations like '1,3-7,9':")

        while True:
            raw = input("File numbers: ").strip()
            if not raw:
                print("Please enter at least one file number.")
                continue

            try:
                numbers = self.parse_selection(raw, len(pdf_files))
                # Selection numbers are 1-based; list positions are 0-based.
                chosen = [pdf_files[number - 1] for number in numbers]

                print(f"\nSelected {len(chosen)} files:")
                for pdf in chosen:
                    print(f"   - {pdf.name}")

                reply = input("\nProceed with these files? (y/N): ").strip().lower()
                if reply in ('y', 'yes'):
                    return chosen

                print("Selection cancelled. Choose again:")
            except ValueError as e:
                print(f"Invalid selection: {e}")

    def parse_selection(self, selection, max_num):
        """Parse user selection string into list of indices."""
        indices = set()

        for part in selection.split(','):
            part = part.strip()
            if '-' in part:

                start, end = part.split('-', 1)
                start, end = int(start.strip()), int(end.strip())
                if start < 1 or end > max_num or start > end:
                    raise ValueError(f"Invalid range {start}-{end}")
                indices.update(range(start, end + 1))
            else:

                num = int(part)
                if num < 1 or num > max_num:
                    raise ValueError(f"Number {num} out of range (1-{max_num})")
                indices.add(num)

        return sorted(indices)

    def get_quality_settings(self):
        """Ask the user for a quality preset and return the matching settings.

        Returns:
            dict with keys 'dpi', 'jpeg_quality' and 'format'. Option 4 hands
            over to get_custom_settings().
        """
        for menu_line in (
            "\nChoose quality/size settings:",
            "1. Small file size (lower quality, ~0.5-1x original size)",
            "2. Balanced (medium quality, ~1-2x original size)",
            "3. High quality (higher file size, ~2-4x original size)",
            "4. Custom settings",
        ):
            print(menu_line)

        presets = {
            '1': {'dpi': 100, 'jpeg_quality': 70, 'format': 'jpeg'},
            '2': {'dpi': 150, 'jpeg_quality': 85, 'format': 'jpeg'},
            '3': {'dpi': 200, 'jpeg_quality': 95, 'format': 'png'},
        }

        while True:
            choice = input("Enter your choice (1-4): ").strip()
            if choice == '4':
                return self.get_custom_settings()
            if choice in presets:
                # Hand back a fresh dict so callers never share the table entry.
                return dict(presets[choice])
            print("Please enter 1, 2, 3, or 4")

    def get_custom_settings(self):
        """Prompt for DPI, image format and (for JPEG) quality.

        Each value is re-requested until it is valid: DPI in 72-300, format
        'jpeg'/'jpg'/'png', JPEG quality in 50-100. For PNG the quality is not
        asked and stays at 85.

        Returns:
            dict with keys 'dpi', 'jpeg_quality' and 'format'.
        """
        print("\nCustom settings:")

        dpi = None
        while dpi is None:
            try:
                candidate = int(input("DPI (72-300, recommended 100-200): "))
                if 72 <= candidate <= 300:
                    dpi = candidate
                else:
                    print("DPI should be between 72 and 300")
            except ValueError:
                print("Please enter a valid number")

        # 'jpg' is accepted as an alias and normalised to 'jpeg'.
        format_aliases = {'jpeg': 'jpeg', 'jpg': 'jpeg', 'png': 'png'}
        while True:
            typed_format = input("Image format (jpeg/png): ").strip().lower()
            if typed_format in format_aliases:
                format_choice = format_aliases[typed_format]
                break
            print("Please enter 'jpeg' or 'png'")

        jpeg_quality = 85
        if format_choice == 'jpeg':
            quality_ok = False
            while not quality_ok:
                try:
                    jpeg_quality = int(input("JPEG quality (50-100, recommended 70-90): "))
                    quality_ok = 50 <= jpeg_quality <= 100
                    if not quality_ok:
                        print("JPEG quality should be between 50 and 100")
                except ValueError:
                    print("Please enter a valid number")

        return {'dpi': dpi, 'jpeg_quality': jpeg_quality, 'format': format_choice}

    def calculate_optimal_dpi(self, page, target_size_factor=1.5):
        """Calculate optimal DPI based on page dimensions to control file size."""

        width_pts = page.rect.width
        height_pts = page.rect.height

        width_inches = width_pts / 72
        height_inches = height_pts / 72

        area_sq_inches = width_inches * height_inches

        if area_sq_inches > 100:  
            base_dpi = 100
        elif area_sq_inches > 60:  
            base_dpi = 120
        else:  
            base_dpi = 150

        return base_dpi

    def remove_ocr_optimized(self, input_path, output_path, quality_settings, progress_bar=None):
        """
        Optimized OCR removal method that maintains reasonable file sizes.
        """
        try:
            doc = fitz.open(str(input_path))
            new_doc = fitz.open()  

            total_pages = len(doc)
            dpi = quality_settings['dpi']
            jpeg_quality = quality_settings['jpeg_quality']
            img_format = quality_settings['format']

            for page_num in range(total_pages):
                page = doc[page_num]

                zoom_factor = dpi / 72.0
                mat = fitz.Matrix(zoom_factor, zoom_factor)

                pix = page.get_pixmap(matrix=mat, alpha=False)  

                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

                if img_format == 'jpeg':

                    img_bytes = self.compress_image_to_bytes(img, 'JPEG', quality=jpeg_quality)
                else:

                    img_bytes = self.compress_image_to_bytes(img, 'PNG')

                new_page = new_doc.new_page(width=page.rect.width, height=page.rect.height)
                new_page.insert_image(new_page.rect, stream=img_bytes)

                if progress_bar:
                    progress_bar.update(1)
                    progress_bar.set_postfix({'Page': f'{page_num + 1}/{total_pages}'})

                pix = None
                img = None

            new_doc.save(str(output_path), 
                        garbage=4,  
                        deflate=True,  
                        clean=True)  

            new_doc.close()
            doc.close()

            return True, None
        except Exception as e:
            return False, str(e)

    def compress_image_to_bytes(self, img, format_type, quality=85):
        """Encode a PIL image as JPEG or PNG and return the encoded bytes.

        Args:
            img: PIL Image to encode.
            format_type: 'JPEG' selects JPEG; any other value selects PNG.
            quality: JPEG quality setting; ignored for PNG.

        Returns:
            bytes: the encoded image.
        """
        from io import BytesIO

        img_buffer = BytesIO()

        if format_type == 'JPEG':
            if img.mode in ('RGBA', 'LA', 'P'):
                # These modes are flattened onto a white background before
                # JPEG encoding. Converting to RGBA first gives every one of
                # them an alpha band to use as the paste mask; previously
                # 'LA' images were pasted with no mask, so their transparency
                # was ignored.
                rgba = img.convert('RGBA')
                background = Image.new('RGB', img.size, (255, 255, 255))
                background.paste(rgba, mask=rgba.split()[-1])
                img = background

            img.save(img_buffer, format='JPEG', quality=quality, optimize=True)
        else:
            img.save(img_buffer, format='PNG', optimize=True)

        return img_buffer.getvalue()

    def get_page_count(self, pdf_path):
        """Get the number of pages in a PDF file."""
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                return len(reader.pages)
        except:
            try:
                doc = fitz.open(str(pdf_path))
                page_count = len(doc)
                doc.close()
                return page_count
            except:
                return 0

    def process_single_file(self, input_file, output_folder, quality_settings):
        """Process a single PDF file with progress bar and size optimization."""
        try:

            if not input_file.exists():
                raise FileNotFoundError(f"Input file not found: {input_file}")

            if not os.access(input_file, os.R_OK):
                raise PermissionError(f"Cannot read input file: {input_file}")

            output_file = output_folder / f"{input_file.stem}_no_ocr.pdf"

            if output_file.exists():
                overwrite = input(f"Output file exists: {output_file.name}. Overwrite? (y/N): ").strip().lower()
                if overwrite not in ['y', 'yes']:
                    self.skipped_files.append({
                        'file': input_file.name,
                        'reason': 'File already exists, user chose not to overwrite'
                    })
                    return False, "Skipped by user"

            print(f"   Processing: {input_file.name}")
            print(f"   Settings: {quality_settings['dpi']} DPI, {quality_settings['format'].upper()} format")

            page_count = self.get_page_count(input_file)

            with tqdm(total=page_count, desc=f"     Pages", unit="page",
                      bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]') as pbar:

                success, error = self.remove_ocr_optimized(input_file, output_file, quality_settings, pbar)

            if success:

                if not output_file.exists():
                    raise FileNotFoundError("Output file was not created")

                if output_file.stat().st_size == 0:
                    raise ValueError("Output file is empty")

                original_size = input_file.stat().st_size / 1024 / 1024
                new_size = output_file.stat().st_size / 1024 / 1024
                size_ratio = new_size / original_size if original_size > 0 else 0

                self.processed_files.append({
                    'input_file': input_file.name,
                    'output_file': output_file.name,
                    'original_size_mb': original_size,
                    'new_size_mb': new_size,
                    'size_change': new_size - original_size,
                    'size_ratio': size_ratio
                })

                print(f"     ✓ Success: {input_file.name} -> {output_file.name}")
                print(f"     Size: {original_size:.2f} MB -> {new_size:.2f} MB ({size_ratio:.1f}x)")
                return True, None
            else:
                raise Exception(error or "Unknown error during processing")

        except Exception as e:
            error_msg = str(e)
            self.failed_files.append({
                'file': input_file.name,
                'error': error_msg,
                'traceback': traceback.format_exc()
            })
            print(f"     ✗ Failed: {input_file.name} - {error_msg}")
            return False, error_msg

    def process_files(self, files_to_process, output_folder, quality_settings):
        """Run process_single_file() on every file under an overall progress bar.

        Individual results are not returned; process_single_file() records
        them on the instance.
        """
        file_total = len(files_to_process)

        print(f"\nProcessing {file_total} files with size optimization...")
        print("=" * 60)

        with tqdm(total=file_total, desc="Overall Progress", unit="file",
                  position=0, leave=True) as overall_pbar:
            for index, pdf in enumerate(files_to_process, start=1):
                overall_pbar.set_description(f"File {index}/{file_total}")
                print(f"\n[{index}/{file_total}] {pdf.name}")

                self.process_single_file(pdf, output_folder, quality_settings)
                overall_pbar.update(1)

                # Short pause between files.
                time.sleep(0.1)

    def print_summary(self):
        """Print processing summary with size analysis."""
        print("\n" + "=" * 60)
        print("PROCESSING SUMMARY")
        print("=" * 60)

        print(f"Successfully processed: {len(self.processed_files)} files")
        print(f"Failed to process: {len(self.failed_files)} files")
        print(f"Skipped: {len(self.skipped_files)} files")

        if self.processed_files:
            print("\n✓ SUCCESSFULLY PROCESSED FILES:")
            total_original = 0
            total_new = 0
            for file_info in self.processed_files:
                print(f"   {file_info['input_file']} -> {file_info['output_file']}")
                print(f"     Size: {file_info['original_size_mb']:.2f} MB -> {file_info['new_size_mb']:.2f} MB ({file_info['size_ratio']:.1f}x)")
                total_original += file_info['original_size_mb']
                total_new += file_info['new_size_mb']

            avg_ratio = total_new / total_original if total_original > 0 else 0
            print(f"\nTotal size change: {total_original:.2f} MB -> {total_new:.2f} MB ({avg_ratio:.1f}x average)")

        if self.failed_files:
            print("\n✗ FAILED FILES:")
            for file_info in self.failed_files:
                print(f"   {file_info['file']}: {file_info['error']}")

        if self.skipped_files:
            print("\n⊘ SKIPPED FILES:")
            for file_info in self.skipped_files:
                print(f"   {file_info['file']}: {file_info['reason']}")

    def save_log(self, output_folder):
        """Save processing log to file."""
        if not (self.processed_files or self.failed_files or self.skipped_files):
            return

        log_file = output_folder / f"processing_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

        try:
            with open(log_file, 'w', encoding='utf-8') as f:
                f.write(f"PDF OCR Removal Processing Log\n")
                f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write("=" * 60 + "\n\n")

                f.write(f"Summary:\n")
                f.write(f"   Successfully processed: {len(self.processed_files)} files\n")
                f.write(f"   Failed to process: {len(self.failed_files)} files\n")
                f.write(f"   Skipped: {len(self.skipped_files)} files\n\n")

                if self.processed_files:
                    f.write("SUCCESSFULLY PROCESSED FILES:\n")
                    f.write("-" * 40 + "\n")
                    for file_info in self.processed_files:
                        f.write(f"Input: {file_info['input_file']}\n")
                        f.write(f"Output: {file_info['output_file']}\n")
                        f.write(f"Size: {file_info['original_size_mb']:.2f} MB -> {file_info['new_size_mb']:.2f} MB ({file_info['size_ratio']:.1f}x)\n\n")

                if self.failed_files:
                    f.write("FAILED FILES:\n")
                    f.write("-" * 40 + "\n")
                    for file_info in self.failed_files:
                        f.write(f"File: {file_info['file']}\n")
                        f.write(f"Error: {file_info['error']}\n")
                        f.write(f"Traceback:\n{file_info['traceback']}\n\n")

                if self.skipped_files:
                    f.write("SKIPPED FILES:\n")
                    f.write("-" * 40 + "\n")
                    for file_info in self.skipped_files:
                        f.write(f"File: {file_info['file']}\n")
                        f.write(f"Reason: {file_info['reason']}\n\n")

            print(f"\nProcessing log saved: {log_file}")

        except Exception as e:
            print(f"Failed to save log file: {e}")

def main():
    """Run the interactive workflow from folder selection to the final log.

    The steps are: print the banner, create the default folders, ask for the
    input and output folders and the quality settings, list and select the
    PDFs, process them, then print the summary and save the log. Ctrl+C and
    unexpected errors are reported on stdout instead of propagating.
    """
    for banner_line in (
        "PDF OCR Removal Tool - Size Optimized Version",
        "==============================================",
        "This tool removes OCR text layers while maintaining reasonable file sizes.",
        "Default paths:",
        "   Input:  /workspace/Remove-OCR-from-PDF/input",
        "   Output: /workspace/Remove-OCR-from-PDF/output",
        "",
    ):
        print(banner_line)

    processor = PDFProcessor()

    try:
        # Create both default folders before any prompt is shown.
        for default_dir in (processor.default_input, processor.default_output):
            default_dir.mkdir(parents=True, exist_ok=True)

        input_folder = processor.get_input_folder()
        print(f"Input folder: {input_folder}")

        output_folder = processor.get_output_folder(input_folder)
        if not output_folder:
            return

        quality_settings = processor.get_quality_settings()
        print(f"\nUsing settings: {quality_settings['dpi']} DPI, {quality_settings['format'].upper()} format")

        pdf_files = processor.get_pdf_files(input_folder)
        if not pdf_files:
            return

        files_to_process = processor.choose_files_to_process(pdf_files)
        if not files_to_process:
            print("No files selected for processing.")
            return

        processor.process_files(files_to_process, output_folder, quality_settings)
        processor.print_summary()
        processor.save_log(output_folder)

    except KeyboardInterrupt:
        print("\n\nProcessing interrupted by user.")
    except Exception as e:
        print(f"\nUnexpected error: {e}")
        traceback.print_exc()

# Start the interactive tool only when this code runs as the main module
# (a script or a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

PDF OCR Removal Tool - Size Optimized Version
This tool removes OCR text layers while maintaining reasonable file sizes.
Default paths:
   Input:  /workspace/Remove-OCR-from-PDF/input
   Output: /workspace/Remove-OCR-from-PDF/output

Default input folder: /workspace/Remove-OCR-from-PDF/input
Found 1 PDF files in default folder.


Input folder: /workspace/Remove-OCR-from-PDF/input

Default output folder: /workspace/Remove-OCR-from-PDF/output
Output folder: /workspace/Remove-OCR-from-PDF/output

Choose quality/size settings:
1. Small file size (lower quality, ~0.5-1x original size)
2. Balanced (medium quality, ~1-2x original size)
3. High quality (higher file size, ~2-4x original size)
4. Custom settings
Please enter 1, 2, 3, or 4
Please enter 1, 2, 3, or 4
Please enter 1, 2, 3, or 4
Please enter 1, 2, 3, or 4
Please enter 1, 2, 3, or 4
Please enter 1, 2, 3, or 4


Processing interrupted by user.
