In [None]:
%reset -f

In [None]:
import re
import cv2
import numpy as np
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
import pytesseract
from pathlib import Path
import time
import nltk
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
import gc  # Garbage collection

# Download necessary NLTK data
'''
nltk.download('words')
nltk.download('punkt')
nltk.download('stopwords')
'''

In [None]:
'''
I've been working on building out the text cleaning process because better text yields
better results. Tokenizing words is something I'm not terribly familiar with, but
filtering nonsense words is my specialty, especially with three teenagers at home
who speak nothing but nonsense. 
'''

# Lookup sets are built lazily and cached: loading them from the NLTK corpora on
# every call (once per OCR'd page) is wasted work.
_STOP_WORDS = None
_FALLBACK_VALID_WORDS = None


def _get_stop_words():
    """Return the NLTK English stop-word set, building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _get_valid_words():
    """Return the vocabulary used to decide whether a token is a real word."""
    global _FALLBACK_VALID_WORDS
    try:
        # Prefer the module-level `valid_words` when the notebook has defined it.
        return valid_words
    except NameError:
        # The global has not been created yet (e.g. cells run out of order);
        # load the NLTK word list ourselves instead of failing.
        if _FALLBACK_VALID_WORDS is None:
            _FALLBACK_VALID_WORDS = set(words.words())
        return _FALLBACK_VALID_WORDS


def clean_text(text, return_tokens=True, filter_nonsense=True):
    """
    Perform standard text pre-processing steps and optionally filter out nonsense words.

    Parameters:
    - text: Input text to clean.
    - return_tokens: If True, return a list of tokens instead of a string.
    - filter_nonsense: If True, remove words not found in the valid-words vocabulary
      (the module-level `valid_words` if defined, otherwise the NLTK `words` corpus).

    Returns:
    A list of cleaned tokens, or those tokens joined by single spaces when
    return_tokens is False.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove hyperlinks
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove special characters and digits (text is already lowercase)
    text = re.sub(r'[^a-z\s]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize text
    tokens = word_tokenize(text)
    # Optionally filter out nonsense words
    if filter_nonsense:
        vocabulary = _get_valid_words()
        tokens = [word for word in tokens if word in vocabulary]
    # Remove stop words
    stop_words = _get_stop_words()
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Return based on the return_tokens flag
    return filtered_tokens if return_tokens else ' '.join(filtered_tokens)

In [None]:
'''
Prior to running the PDF images through OCR, images are enhanced for improved OCR accuracy.
'''

def enhance_image(image):
    """
    Prepare a page image for OCR: grayscale, denoise, binarize, then deskew.

    Parameters:
    - image: Page image that np.array() can convert. pdf2image yields PIL images,
      whose channel order is RGB (not OpenCV's BGR).

    Returns:
    The binarized, deskewed page as a 2-D array. If any step fails, the error is
    printed and the original `image` is returned unchanged so OCR can still be
    attempted on it.
    """
    try:
        pixels = np.array(image)
        # Pick the conversion matching the PIL channel layout; single-channel
        # input is already grayscale.
        if pixels.ndim == 2:
            gray = pixels
        elif pixels.shape[2] == 4:
            gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
        else:
            gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
        # Apply Gaussian blur to reduce noise
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        # Apply thresholding to binarize the image
        _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Estimate skew from the ink pixels. This assumes dark text on a light
        # background, so ink is 0 after thresholding; using the white pixels
        # would only measure the page rectangle itself.
        # minAreaRect needs 32-bit points, while np.where yields platform ints.
        coords = np.column_stack(np.where(thresh == 0)).astype(np.float32)
        if len(coords) == 0:
            return thresh  # Blank page: nothing to deskew against.

        angle = cv2.minAreaRect(coords)[-1]
        # Newer OpenCV releases report the angle in (0, 90] rather than the
        # legacy [-90, 0); shift it so one normalisation handles both.
        if angle > 0:
            angle -= 90
        # Map to the small correction angle in (-45, 45].
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle

        (h, w) = thresh.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(thresh, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated
    except Exception as e:
        # Deliberately best-effort: an enhancement failure must not stop OCR.
        print(f"Error in image enhancement: {e}")
        return image  # Return original image if enhancement fails

In [None]:
'''
Little function to get the number of pages in the document.
It isn't technically necessary at all, but it's nice to have.
'''

def get_num_pages(pdf_path):
    """
    Count the pages in a PDF.

    Parameters:
    - pdf_path: Path to the PDF file; it is opened in binary mode.

    Returns:
    The number of pages reported by PyPDF2's PdfReader.
    """
    with open(pdf_path, mode='rb') as pdf_stream:
        page_count = len(PdfReader(pdf_stream).pages)
    return page_count

In [None]:
"""
Convert a multi-page PDF to images in chunks, enhance each image, perform OCR,
clean the text, and write results to two text files: one with raw OCR output
and another with cleaned text.
"""

def process_pdf_and_ocr(pdf_path, output_text_file, cleaned_output_file, chunk_size):
    """
    OCR a PDF chunk by chunk, appending raw and cleaned text to two files.

    Each chunk of pages is rendered to images, every image is enhanced and run
    through Tesseract, and one line per page is appended to each output file.

    Parameters:
    - pdf_path: Path to the PDF. If it is not an existing file, an error is
      printed and the function returns without doing anything.
    - output_text_file: File that receives the raw OCR text (opened in append mode).
    - cleaned_output_file: File that receives the cleaned text (opened in append mode).
    - chunk_size: Number of pages rendered to images at a time; must be >= 1.

    Raises:
    ValueError if chunk_size is smaller than 1.
    """
    if not Path(pdf_path).is_file():
        print(f"Error: The file {pdf_path} does not exist.")
        return
    if chunk_size < 1:
        # A non-positive chunk size could never make progress through the document.
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    # Determine the total number of pages
    total_pages = get_num_pages(pdf_path)
    print(f"Total number of pages: {total_pages}")

    # Chunk boundaries come from the range, not from a counter advanced per
    # image, so a failure part-way through a chunk (or a short conversion
    # result) can neither skip pages of the next chunk nor stall the loop.
    for first_page in range(1, total_pages + 1, chunk_size):
        last_page = min(first_page + chunk_size - 1, total_pages)
        page_number = first_page
        try:
            images = convert_from_path(pdf_path, first_page=first_page, last_page=last_page)
            # Explicit UTF-8: OCR output may contain characters that the
            # platform default encoding cannot represent.
            with open(output_text_file, "a", encoding="utf-8") as raw_file, \
                    open(cleaned_output_file, "a", encoding="utf-8") as cleaned_file:
                for page_number, image in enumerate(images, start=first_page):
                    print(f"Processing page {page_number}...")
                    text = pytesseract.image_to_string(enhance_image(image))
                    raw_file.write(text + "\n")
                    # Ask clean_text for the joined string directly.
                    cleaned_text = clean_text(text, return_tokens=False, filter_nonsense=True)
                    cleaned_file.write(cleaned_text + "\n")
        except Exception as e:
            # Best-effort: report the failure and carry on with the next chunk.
            print(f"Error during OCR processing on pages {page_number}-{last_page}: {e}")
        finally:
            # Drop the rendered pages before the next chunk is converted; the
            # list is what keeps the page images alive.
            images = None
            gc.collect()

In [None]:
'''
Send it.

Make sure to update the pdf_path, output_text_file, and cleaned_output_file variables
'''

# Start the clock before any work so vocabulary loading is included in the timing.
start_time = time.time()
print("Reading PDF and performing OCR...")

# Vocabulary that clean_text reads as a module-level global when filtering nonsense words.
valid_words = set(words.words())

# Fill these in before running: the input PDF and the two output text files.
pdf_path = r""
output_text_file = r""
cleaned_output_file = r""

# Number of pages converted to images per batch.
chunk_size = 50

process_pdf_and_ocr(
    pdf_path,
    output_text_file,
    cleaned_output_file,
    chunk_size,
)

for message in (
    f"Processing completed. Raw OCR text written to {output_text_file}",
    f"Cleaned text written to {cleaned_output_file}",
):
    print(message)

end_time = time.time()

print(f"Processing complete in {end_time - start_time:.2f} seconds.")