In [59]:
import pytesseract
from PIL import Image
import os
import fitz  # PyMuPDF
import cv2
import numpy as np
import pandas as pd
import csv
import re



In [None]:
def pdf_to_images_pymupdf(pdf_path, output_folder, zoom=2.0):
    """
    Converts each page of a PDF into an image and saves them to the specified folder.

    Args:
        pdf_path (str): Path to the input PDF file.
        output_folder (str): Folder where images will be saved.
        zoom (float): Zoom factor for rendering quality (default: 2.0).

    Returns:
        list: List of paths to the saved image files.
    """
    # Ensure the output folder and 'page_img' subfolder exist
    page_img_folder = os.path.join(output_folder, 'page_img')
    if not os.path.exists(page_img_folder):
        os.makedirs(page_img_folder)

    # Open the PDF
    pdf_document = fitz.open(pdf_path)
    image_paths = []

    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document[page_num]

        # Set zoom factor and render page to a pixmap
        matrix = fitz.Matrix(zoom, zoom)
        pixmap = page.get_pixmap(matrix=matrix)

        # Save the pixmap as an image in the 'page_img' subfolder
        image_path = os.path.join(page_img_folder, f"page_{page_num + 1}.png")
        pixmap.save(image_path)
        image_paths.append(image_path)

    pdf_document.close()
    print(f"Converted {len(image_paths)} pages into images.")
    return image_paths

# Example Usage
if __name__ == "__main__":
    pdf_path = r"D:\project\netz_work\OCR\PublicWaterMassMailing.pdf"
    output_folder = r"D:\project\netz_work\OCR\output"  # Main output folder
    image_files = pdf_to_images_pymupdf(pdf_path, output_folder)
    print("Images saved at:", image_files)


Converted 8 pages into images.
Images saved at: ['D:\\project\\netz_work\\OCR\\output\\page_img\\page_1.png', 'D:\\project\\netz_work\\OCR\\output\\page_img\\page_2.png', 'D:\\project\\netz_work\\OCR\\output\\page_img\\page_3.png', 'D:\\project\\netz_work\\OCR\\output\\page_img\\page_4.png', 'D:\\project\\netz_work\\OCR\\output\\page_img\\page_5.png', 'D:\\project\\netz_work\\OCR\\output\\page_img\\page_6.png', 'D:\\project\\netz_work\\OCR\\output\\page_img\\page_7.png', 'D:\\project\\netz_work\\OCR\\output\\page_img\\page_8.png']


In [11]:


def extract_text_from_images(image_folder, output_text_file):
    """
    Extracts text from all PNG images in the specified folder and saves it to a text file.

    Args:
        image_folder (str): Folder containing the PNG images.
        output_text_file (str): Path to the output text file.
    """
    with open(output_text_file, "w", encoding="utf-8") as text_file:
        # Iterate through all PNG files in the folder
        for image_name in os.listdir(image_folder):
            if image_name.endswith(".png"):
                image_path = os.path.join(image_folder, image_name)
                
                # Open the image
                img = Image.open(image_path)
                
                # Extract text using pytesseract
                text = pytesseract.image_to_string(img)
                
                # Write extracted text to the file
                text_file.write(f"--- Text from {image_name} ---\n")
                text_file.write(text)
                text_file.write("\n\n")
    
    print(f"Text extracted and saved to {output_text_file}")

# Example Usage
if __name__ == "__main__":
    image_folder = r"D:\project\netz_work\OCR\output"  # Folder where PNG images are saved
    output_text_file = r"D:\project\netz_work\OCR\output\extracted_text.txt"  # Path to save the text file
    extract_text_from_images(image_folder, output_text_file)


Text extracted and saved to D:\project\netz_work\OCR\output\extracted_text.txt


In [None]:


def extract_text_from_images(image_folder, output_folder):
    """
    Extracts text from all PNG images in the specified folder and saves each page's text
    in a separate text file inside the 'txtoutput' folder.

    Args:
        image_folder (str): Folder containing the PNG images.
        output_folder (str): Folder where the text files will be saved.
    """
    # Ensure the output folder exists
    txt_output_folder = os.path.join(output_folder, 'txtoutput')
    if not os.path.exists(txt_output_folder):
        os.makedirs(txt_output_folder)
    
    # Iterate through all PNG files in the folder
    for page_num, image_name in enumerate(os.listdir(image_folder), start=1):
        if image_name.endswith(".png"):
            image_path = os.path.join(image_folder, image_name)
            
            # Open the image
            img = Image.open(image_path)
            
            # Extract text using pytesseract
            text = pytesseract.image_to_string(img)
            
            # Save the text to a separate file for each page
            text_file_path = os.path.join(txt_output_folder, f"page_{page_num}.txt")
            with open(text_file_path, "w", encoding="utf-8") as text_file:
                text_file.write(text)
    
    print(f"Text extracted and saved in {txt_output_folder}")

# Example Usage
if __name__ == "__main__":
    image_folder = r"D:\project\netz_work\OCR\output\page_img"  # Folder where PNG images are saved
    output_folder = r"D:\project\netz_work\OCR\output"
    extract_text_from_images(image_folder, output_folder)


Text extracted and saved in D:\project\netz_work\OCR\output\txtoutput


In [22]:
def extract_and_save_images_from_png(png_folder, output_folder):
    """
    Detects and extracts individual images (diagrams) from PNG files and saves them
    in the 'images' subfolder inside the output folder.

    Args:
        png_folder (str): Folder containing the PNG images.
        output_folder (str): Folder where the extracted images will be saved.
    """
    # Ensure the 'images' subfolder exists inside output_folder
    images_folder = os.path.join(output_folder, 'images')
    if not os.path.exists(images_folder):
        os.makedirs(images_folder)

    # Iterate through all PNG files in the folder
    for page_num, image_name in enumerate(os.listdir(png_folder), start=1):
        if image_name.endswith(".png"):
            image_path = os.path.join(png_folder, image_name)
            
            # Open the image using OpenCV
            img = cv2.imread(image_path)
            
            # Convert the image to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            
            # Apply thresholding or edge detection to detect images/diagrams
            _, thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV)
            
            # Find contours (potential diagrams)
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            
            for i, contour in enumerate(contours):
                # Get the bounding box for each contour
                x, y, w, h = cv2.boundingRect(contour)
                
                # Filter out small contours that might not be actual images (diagrams)
                if w > 50 and h > 50:  # Adjust these values based on your images
                    # Crop the image based on the bounding box
                    cropped_img = img[y:y+h, x:x+w]
                    
                    # Save the cropped image as a separate file
                    diagram_img_path = os.path.join(images_folder, f"{image_name}_diagram_{i+1}.png")
                    cv2.imwrite(diagram_img_path, cropped_img)
                    
                    print(f"Saved diagram from {image_name} as {diagram_img_path}")

# Example Usage
if __name__ == "__main__":
    png_folder = r"D:\project\netz_work\OCR\output\page_img"  # Folder with PNG images
    output_folder = r"D:\project\netz_work\OCR\output"  # Main output folder
    extract_and_save_images_from_png(png_folder, output_folder)


Saved diagram from page_1.png as D:\project\netz_work\OCR\output\images\page_1.png_diagram_5095.png
Saved diagram from page_4.png as D:\project\netz_work\OCR\output\images\page_4.png_diagram_1163.png
Saved diagram from page_4.png as D:\project\netz_work\OCR\output\images\page_4.png_diagram_1534.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images\page_5.png_diagram_949.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images\page_5.png_diagram_1753.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images\page_5.png_diagram_2136.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images\page_5.png_diagram_2268.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images\page_5.png_diagram_3429.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images\page_5.png_diagram_3443.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images\page_5.png_diagram_3448.png
S

In [None]:
def extract_and_save_images_from_png(png_folder, output_folder):
    """
    Extracts and saves diagrams from PNG images in the specified folder.
    """
    images_folder = os.path.join(output_folder, 'images2')
    if not os.path.exists(images_folder):
        os.makedirs(images_folder)

    for page_num, image_name in enumerate(os.listdir(png_folder), start=1):
        if image_name.endswith(".png"):
            image_path = os.path.join(png_folder, image_name)
            img = cv2.imread(image_path)

            # Convert the image to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # Apply adaptive thresholding to handle varying lighting conditions
            thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 11, 2)

            # Find contours in the thresholded image
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            for i, contour in enumerate(contours):
                x, y, w, h = cv2.boundingRect(contour)
                aspect_ratio = w / float(h)

                # Filter out small contours and those with unusual aspect ratios
                if w > 50 and h > 50 and aspect_ratio > 0.5 and aspect_ratio < 3.0:
                    cropped_img = img[y:y+h, x:x+w]
                    diagram_img_path = os.path.join(images_folder, f"{image_name}_diagram_{i+1}.png")
                    cv2.imwrite(diagram_img_path, cropped_img)
                    print(f"Saved diagram from {image_name} as {diagram_img_path}")

# Usage example
png_folder = r"D:\project\netz_work\OCR\output\page_img"
output_folder = r"D:\project\netz_work\OCR\output"
extract_and_save_images_from_png(png_folder, output_folder)


Saved diagram from page_1.png as D:\project\netz_work\OCR\output\images9\page_1.png_diagram_5813.png
Saved diagram from page_1.png as D:\project\netz_work\OCR\output\images9\page_1.png_diagram_5852.png
Saved diagram from page_4.png as D:\project\netz_work\OCR\output\images9\page_4.png_diagram_395.png
Saved diagram from page_4.png as D:\project\netz_work\OCR\output\images9\page_4.png_diagram_3162.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images9\page_5.png_diagram_1202.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images9\page_5.png_diagram_2649.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images9\page_5.png_diagram_3941.png
Saved diagram from page_5.png as D:\project\netz_work\OCR\output\images9\page_5.png_diagram_3948.png
Saved diagram from page_6.png as D:\project\netz_work\OCR\output\images9\page_6.png_diagram_997.png
Saved diagram from page_6.png as D:\project\netz_work\OCR\output\images9\page_6.png_diagram_1

In [65]:
# Set the path to the Tesseract executable (update as needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update path if required

def extract_table_from_image(image_path):
    """
    Extracts table-like structures from an image using OCR.

    Args:
        image_path (str): Path to the PNG image.

    Returns:
        list: A list of rows, where each row is a list of cell contents.
    """
    # Load the image using OpenCV
    image = cv2.imread(image_path)

    # Perform OCR on the image
    ocr_text = pytesseract.image_to_string(image)

    # Split the OCR result into lines
    lines = ocr_text.split('\n')
    
    table_data = []
    
    # Process each line and detect rows and columns (simple space-based logic for table extraction)
    for line in lines:
        # Use a regular expression to split columns based on spaces
        columns = re.split(r'\s{2,}', line.strip())  # Assumes columns are separated by multiple spaces
        
        # If there are columns, add them as a row
        if len(columns) > 1:
            table_data.append(columns)

    return table_data

def save_table_to_csv(table_data, table_index, output_folder):
    """
    Saves the extracted table data to a CSV file.

    Args:
        table_data (list): The table data (rows and columns).
        table_index (int): The index of the table to differentiate CSV files.
        output_folder (str): The folder where the CSV will be saved.
    """
    table_folder = os.path.join(output_folder, "tables")
    if not os.path.exists(table_folder):
        os.makedirs(table_folder)

    csv_file = os.path.join(table_folder, f"table_{table_index + 1}.csv")
    
    with open(csv_file, mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(table_data)

def extract_tables_from_images(image_folder, output_folder):
    """
    Extracts all tables from images and saves them as CSV files.

    Args:
        image_folder (str): Folder containing the PNG images.
        output_folder (str): Folder where the CSV files will be saved.
    """
    table_index = 0
    for image_name in os.listdir(image_folder):
        image_path = os.path.join(image_folder, image_name)

        # Ensure the file is a PNG image
        if image_name.lower().endswith('.png'):
            table_data = extract_table_from_image(image_path)

            if table_data:
                save_table_to_csv(table_data, table_index, output_folder)
                table_index += 1

    print(f"Extracted and saved {table_index} tables as CSV files.")

# Example Usage
# Define paths (update these as per your folder structure)
image_folder = r"D:\project\netz_work\OCR\output\page_img"  # Folder with PNG images
output_folder = r"D:\project\netz_work\OCR\output"  # Main output folder

# Run the table extraction
extract_tables_from_images(image_folder, output_folder)


Extracted and saved 0 tables as CSV files.


In [None]:
pip install camelot-py[cv] pdfplumber