In [1]:
pip install easyocr pdf2image Pillow docx2txt colorama pdfplumber


Collecting easyocr
  Using cached easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting docx2txt
  Using cached docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pdfplumber
  Using cached pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting torch (from easyocr)
  Using cached torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting torchvision>=0.5 (from easyocr)
  Using cached torchvision-0.20.1-cp312-cp312-win_amd64.whl.metadata (6.2 kB)
Collecting opencv-python-headless (from easyocr)
  Using cached opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting python-bidi (from easyocr)
  Using cached python_bidi-0.6.3-cp312-none-win_amd64.whl.metadata (5.0 kB)
Collecting Shapely (from easyocr)
  Using cached shapely-2.0.6-cp312-cp312-win_amd64.whl.metadata (7


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import easyocr
import pdfplumber
import docx2txt
import os
import json
import numpy as np
from PIL import Image
from colorama import Fore, Style, init


In [4]:
# Set up colorama with autoreset so a colour used in one print does not carry over to the next
init(autoreset=True)

# Module-level EasyOCR reader shared by the OCR helper; the list names the languages to load ('en' = English)
reader = easyocr.Reader(lang_list=["en"])


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [5]:
def is_machine_readable(filepath):
    """Return True when the file extension (compared case-insensitively) is '.txt' or '.json'."""
    extension = os.path.splitext(filepath)[1].lower()
    return extension == '.txt' or extension == '.json'

def extract_text_from_pdf(filepath):
    """
    Read a PDF page by page and return all of its text as one string.

    Each page is first read with pdfplumber's text extraction. A page that
    yields no text is rendered to an image (resolution=300) and handed to
    ocr_image instead. The text of every page is followed by a newline.
    """
    print(f"{Fore.BLUE}Extracting text from PDF: {filepath}")
    chunks = []

    with pdfplumber.open(filepath) as document:
        for page_number, page in enumerate(document.pages, start=1):
            embedded_text = page.extract_text()
            if not embedded_text:
                print(f"{Fore.YELLOW}No text found on page {page_number}. Performing OCR...")
                # Nothing extractable on this page: render it and OCR the rendered picture
                rendered = page.to_image(resolution=300)
                chunks.append(ocr_image(rendered.original))
            else:
                print(f"{Fore.GREEN}Extracted text from page {page_number} using pdfplumber.")
                chunks.append(embedded_text)

    return "".join(chunk + "\n" for chunk in chunks)

In [6]:
def ocr_image(image):
    """Run the module-level EasyOCR reader on an image and return the recognised pieces joined by spaces."""
    # EasyOCR is given a NumPy array rather than the image object itself
    pixels = np.array(image)
    detections = reader.readtext(pixels)
    fragments = []
    for detection in detections:
        # Element 1 of each result entry is the piece that ends up in the output string
        fragments.append(detection[1])
    return " ".join(fragments)

def extract_text_from_docx(filepath):
    """Return the text docx2txt extracts from the DOCX file at `filepath`, printing start and completion messages."""
    print(f"{Fore.YELLOW}Extracting text from DOCX: {filepath}")
    extracted = docx2txt.process(filepath)
    print(f"{Fore.GREEN}Text extraction from DOCX complete.")
    return extracted

def transform_to_machine_readable(filepath):
    """
    Extract the text of a PDF or DOCX file and save it as JSON beside the source file.

    Args:
        filepath: Path to a '.pdf' or '.docx' file (extension matched case-insensitively).

    Returns:
        Path of the written JSON file: `filepath` with its extension swapped for '.json'.
        The file holds one object of the form {"content": <extracted text>}.

    Raises:
        ValueError: If the extension is neither '.pdf' nor '.docx'.
    """
    stem, ext = os.path.splitext(filepath)
    print(f"{Fore.BLUE}Starting transformation of {filepath} to machine-readable format...")

    kind = ext.lower()
    if kind == '.pdf':
        text = extract_text_from_pdf(filepath)
    elif kind == '.docx':
        text = extract_text_from_docx(filepath)
    else:
        raise ValueError(f"{Fore.RED}Unsupported file type for OCR: {ext}")

    # Structure extracted text into JSON
    data = {"content": text}

    # Build the output name from the stem. str.replace(ext, ".json") would also rewrite
    # any earlier occurrence of the extension text inside the path (for example a folder
    # named "scans.pdf"), sending the output to the wrong location.
    json_output = stem + ".json"
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump(data, f)

    print(f"{Fore.CYAN}Transformed and saved as JSON: {json_output}")
    return json_output


In [7]:
def process_document(filepath):
    """
    Entry point: report a document's readability status and convert it when needed.

    Files that is_machine_readable accepts are only reported. '.pdf' and '.docx'
    files (extension matched case-insensitively) are passed to
    transform_to_machine_readable. Anything else is reported as unsupported.
    Nothing is returned.
    """
    print(f"{Fore.MAGENTA}Processing document: {filepath}")

    if is_machine_readable(filepath):
        print(f"{Fore.GREEN}{filepath} is already machine-readable.")
        return

    extension = os.path.splitext(filepath)[1].lower()
    if extension not in ('.pdf', '.docx'):
        print(f"{Fore.RED}Unsupported file type: {filepath}")
        return

    print(f"{Fore.RED}{filepath} is not machine-readable. Transforming...")
    transform_to_machine_readable(filepath)

In [8]:
# Example usage: run the processing pipeline on a single document.
# NOTE(review): this is an absolute path on one user's machine — point it at your own document before running.
file_path = r"C:\Users\jayan\OneDrive\Desktop\Harshu Eamcet\College LIST.pdf"  # raw string keeps the backslashes literal
process_document(file_path)


Processing document: C:\Users\jayan\OneDrive\Desktop\Harshu Eamcet\College LIST.pdf
C:\Users\jayan\OneDrive\Desktop\Harshu Eamcet\College LIST.pdf is not machine-readable. Transforming...
Starting transformation of C:\Users\jayan\OneDrive\Desktop\Harshu Eamcet\College LIST.pdf to machine-readable format...
Extracting text from PDF: C:\Users\jayan\OneDrive\Desktop\Harshu Eamcet\College LIST.pdf
No text found on page 1. Performing OCR...
No text found on page 2. Performing OCR...
No text found on page 3. Performing OCR...
No text found on page 4. Performing OCR...
Transformed and saved as JSON: C:\Users\jayan\OneDrive\Desktop\Harshu Eamcet\College LIST.json
