In [5]:
from pathlib import Path
import pathlib

import mss
import mss.tools

def get_screenshot(path_input):
        with mss.mss() as sct:
            # The screen part to capture
            monitor = {"top": 50, "left": 50, "width": 500, "height": 800}
    
            sct_img = sct.grab(monitor) # Grab the data
            # Save to the picture file
            mss.tools.to_png(sct_img.rgb, sct_img.size, output=path_input)

path_input = r"C:/Users/rpa.uat/Downloads/pdf_screenshot.JPG"

from PIL import Image
import pytesseract
import re
import time

def extract_text(image_path):
        # Set the path to the Tesseract executable (only required on Windows)
        pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
        image = Image.open(image_path) # Open the image using PIL

        # Preprocess the image (convert to grayscale and apply thresholding)
        image = image.convert('L')  # Convert to grayscale
        image = image.point(lambda x: 0 if x < 128 else 255, '1')  # Apply thresholding

        text = pytesseract.image_to_string(image) # Extract text from the image
        # Use regex to remove special characters like '+' and newline characters
        cleaned_text = re.sub(r'[+\n]', '', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        print(f"text extracted from {image_path}: {cleaned_text}")        
        return cleaned_text


import fitz  # PyMuPDF
import os

def convert_pdf_pages_to_images_pymupdf(pdf_path, output_folder="C:/Users/rpa.uat/Downloads/pdf_output_images", num_pages_to_convert=3, dpi=300):
    """
    Converts the first few pages of a PDF to images using PyMuPDF.

    Args:
        pdf_path (str): The path to the input PDF file.
        output_folder (str): The folder where output images will be saved.
        num_pages_to_convert (int): The number of pages from the beginning of the PDF to convert.
        dpi (int): Dots per inch, affecting the resolution of the output image.
    """
    try:
        # Create the output folder if it doesn't exist
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
            print(f"Created output folder: {output_folder}")

        doc = fitz.open(pdf_path) # Open the PDF file

        # Determine the number of pages to actually convert
        # It will be the minimum of num_pages_to_convert and the total number of pages in the PDF
        pages_to_process = min(num_pages_to_convert, doc.page_count)

        if pages_to_process == 0:
            print(f"No pages found in the PDF: {pdf_path}")
            doc.close()
            return

        print(f"Converting the first {pages_to_process} page(s) of '{pdf_path}'...")

        # Iterate through the specified number of pages
        for page_num in range(pages_to_process):
            # Load the page
            page = doc.load_page(page_num)  # Page numbers are 0-indexed in PyMuPDF

            # Render page to an image (pixmap)
            # The matrix determines the zoom factor. Higher zoom means higher resolution.
            # Default matrix is fitz.Matrix(1, 1) which corresponds to 72 DPI.
            # To achieve a desired DPI, we scale by (dpi/72).
            zoom = dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)

            # Define the output image path
            image_filename = f"page_{page_num + 1}.png"  # Save as PNG (supports transparency)
            image_path = os.path.join(output_folder, image_filename)

            # Save the pixmap as an image file
            pix.save(image_path)
            print(f"Saved: {image_path} (Resolution: {pix.width}x{pix.height})")
        
        doc.close() # Close the PDF document
        print("Conversion complete.")

    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'")
    except Exception as e:
        print(f"An error occurred: {e}")
        print("Please ensure PyMuPDF is installed correctly ('pip install PyMuPDF')")

import PyPDF2
import os # For path joining

def check_and_rearrange_pdf_pages(input_pdf_file_path, output_pdf_path):
    """
    Checks the first two pages of a PDF for specific keywords or images and moves them to the end.

    Pages to be moved are identified if they are within the first two pages AND
    their text content contains "Field modification", "Table line insertion", or "Table line deletion".
    The function also prints if image resources are detected on the first two pages.

    Args:
        input_pdf_path (str): The file path of the input PDF.
        output_pdf_path (str): The file path where the rearranged PDF will be saved.

    Returns:
        bool: True if the rearrangement was successful, False otherwise.
    """
    keywords_to_check = [
        "susan",  # PyPDF2 text extraction is often lowercase
        "audit trail",
        "modification",
        "field modification",
        "insertion",
        "lineinsertionTable",
        "linedeletionTable"
    ]
    
    def get_list_image_files(folder_dir):
        list_image_files = [file for file in os.listdir(folder_dir) 
                        if file.endswith(('.jpg', '.JPG', '.png', '.PNG'))]
        return list_image_files

    folder_dir_images = r"C:/Users/rpa.uat/Downloads/pdf_output_images/"
    list_image_files = get_list_image_files(folder_dir_images)

    try:
        with open(input_pdf_file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            writer = PyPDF2.PdfWriter()

            num_pages = len(reader.pages)
            print(f"Total pages in PDF: {num_pages}")

            pages_to_move_indices = []
            
            # Examine only the first two pages (or fewer if the PDF is shorter)
            pages_to_examine_count = min(num_pages, 2)

            for i in range(pages_to_examine_count):
                page = reader.pages[i]
                page_text = ""
                contains_keywords = False
                has_images = False

                # Extract text
                try:
                    extracted_text = page.extract_text()
                    if extracted_text:
                        page_text = extracted_text.lower() # Convert to lowercase for case-insensitive search
                        for keyword in keywords_to_check:
                            if keyword in page_text:
                                contains_keywords = True
                                break
                    else:
                        print(f"Page {i+1}: No text could be extracted.")
                except Exception as e:
                    print(f"Page {i+1}: Error extracting text: {e}")

                # Check for text in image files (presence of image XObjects)
                # PyPDF2's page.images attribute is a more direct way if available and populated
                for image_file in list_image_files:
                    print(f"\n--- processing image: {image_file} ---")
                    extracted_text = extract_text("C:/Users/rpa.uat/Downloads/pdf_output_images/"+image_file)
                    
                    if not extracted_text:
                        print("No text extracted or extraction failed, skipping keyword check.")                        
                        continue
                    if any(keyword.lower() in extracted_text.lower() for keyword in keywords_to_check):
                        has_images = True
                        print(f"Found one or more keywords in '{image_file}'.")                        
                    else:
                        print('not text exist')                 
                
                """
                if hasattr(page, 'images') and page.images:
                    has_images = True
                else:
                    # Fallback to checking XObjects for image types
                    if '/XObject' in page['/Resources']:
                        x_objects = page['/Resources']['/XObject']
                        for obj in x_objects:
                            if x_objects[obj]['/Subtype'] == '/Image':
                                has_images = True
                                break
                """                
                print(f"Page {i+1} analysis:")
                print(f"  - Contains target keywords: {contains_keywords}")
                print(f"  - Contains image resources: {has_images}")

                if contains_keywords or has_images:
                    pages_to_move_indices.append(i)
                    print(f"  - Marked to be moved to the end.")
            
            # Collect pages in their new order
            final_page_order_objects = []
            moved_page_objects = []

            # Add pages that are not moved
            for i in range(num_pages):
                if i not in pages_to_move_indices:
                    final_page_order_objects.append(reader.pages[i])
            
            # Collect the pages that need to be moved
            for i in pages_to_move_indices: # Iterate in original order to preserve relative order
                moved_page_objects.append(reader.pages[i])

            # Add all pages to the writer
            for page_obj in final_page_order_objects:
                writer.add_page(page_obj)
            
            for page_obj in moved_page_objects: # Add moved pages at the end
                writer.add_page(page_obj)

            # Write the output PDF file
            with open(output_pdf_path, 'wb') as output_file:
                writer.write(output_file)
            
            print(f"\nSuccessfully rearranged PDF saved to: {output_pdf_path}")
            print(f"Pages moved to the end (original indices): {pages_to_move_indices}")
            return True

    except FileNotFoundError:
        print(f"Error: Input PDF file not found at {input_pdf_file_path}")
        return False
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

import glob, os
import shutil

def remove_image_files(file_path_image):
    # Iterate over all files in the given directory
    for filename in os.listdir(file_path_image):
        # Check if the file is an image
        if filename.lower().endswith(('.jpg', '.png')):
            try:
                os.remove(os.path.join(file_path_image, filename))
            except Exception as e:
                print(e)


if __name__ == '__main__':    
    # If the script is in the same directory as the PDF, the filename alone might be sufficient.
    
    # Assuming the PDF is in the same directory as the script:
    # For this example, ensure 'ISLAND RECOVERY SERVICES PTE. LTD._IRS34547_08052025.pdf'
    # is in the same directory as this script, or provide the full path.
    def get_list_pdf_files(folder_dir):
        list_pdf_files = [file for file in os.listdir(folder_dir) 
                        if file.lower().endswith(('.pdf'))]
        return list_pdf_files
    
    input_files = get_list_pdf_files(r"C:/Users/rpa.uat/Downloads/esker_merged/")
    for input_file in input_files:
        print(f"Processing file: {input_file}")
        input_filename = input_file
        #input_filename = 'ISLAND RECOVERY SERVICES PTE. LTD._IRS34547_08052025.pdf' ##
            
        # Construct full path assuming the PDF is in the same directory as the script
        # This makes the script more robust if run from a different working directory.
        #script_dir = os.path.dirname(os.path.abspath(__file__))
        script_dir = "C:/Users/rpa.uat/Downloads/esker_merged/"
        input_pdf_file_path = os.path.join(script_dir, input_filename)
    
        output_pdf_file_path = os.path.join(script_dir, 'rearranged '+input_filename)
        output_folder=r"C:/Users/rpa.uat/Downloads/pdf_output_images/"

        convert_pdf_pages_to_images_pymupdf(input_pdf_file_path, output_folder=output_folder, num_pages_to_convert=3, dpi=300)
    
        print(f"Attempting to rearrange PDF: {input_pdf_file_path}")
        if not os.path.exists(input_pdf_file_path):
            print(f"hereERROR: Input PDF not found at {input_pdf_file_path}")
            print("Please ensure the PDF file is in the same directory as the script, or update the path.")
        else:
            success = check_and_rearrange_pdf_pages(input_pdf_file_path, output_pdf_file_path)
            if success:
                print("PDF rearrangement complete.")
            else:
                print("PDF rearrangement failed.")
    
        #file_path_image = r"C:/Users/rpa.uat/Downloads/pdf_output_images/"
        remove_image_files(output_folder) #remove all image files


#run

Processing file: ECONO GREEN PTE. LTD._INV14859_14052025.pdf
Converting the first 3 page(s) of 'C:/Users/rpa.uat/Downloads/esker_merged/ECONO GREEN PTE. LTD._INV14859_14052025.pdf'...
Saved: C:/Users/rpa.uat/Downloads/pdf_output_images/page_1.png (Resolution: 2480x3509)
Saved: C:/Users/rpa.uat/Downloads/pdf_output_images/page_2.png (Resolution: 2480x3509)
Saved: C:/Users/rpa.uat/Downloads/pdf_output_images/page_3.png (Resolution: 2480x3509)
Conversion complete.
Attempting to rearrange PDF: C:/Users/rpa.uat/Downloads/esker_merged/ECONO GREEN PTE. LTD._INV14859_14052025.pdf
Total pages in PDF: 12

--- processing image: page_1.png ---
text extracted from C:/Users/rpa.uat/Downloads/pdf_output_images/page_1.png: Audit Trail - Invoice id: 1269716053587459587Vendor Information1000373067 - ECONO GREEN PTE. LTD.3 ANG MO KIO STREET 62, #08-07, LINInvoice ProcessingInvoice source: EmailDigital signature: No signatureFrom: Allison,Ng@sh-cogent.com.sgInvoice DetailsInvoice numbe'r: INV14859Net amou

In [None]:
import fitz  # PyMuPDF
import os

def convert_pdf_pages_to_images_pymupdf(pdf_path, output_folder="pdf_images", num_pages_to_convert=3, dpi=300):
    """
    Converts the first few pages of a PDF to images using PyMuPDF.

    Args:
        pdf_path (str): The path to the input PDF file.
        output_folder (str): The folder where output images will be saved.
        num_pages_to_convert (int): The number of pages from the beginning of the PDF to convert.
        dpi (int): Dots per inch, affecting the resolution of the output image.
    """
    try:
        # Create the output folder if it doesn't exist
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
            print(f"Created output folder: {output_folder}")

        doc = fitz.open(pdf_path) # Open the PDF file

        # Determine the number of pages to actually convert
        # It will be the minimum of num_pages_to_convert and the total number of pages in the PDF
        pages_to_process = min(num_pages_to_convert, doc.page_count)

        if pages_to_process == 0:
            print(f"No pages found in the PDF: {pdf_path}")
            doc.close()
            return

        print(f"Converting the first {pages_to_process} page(s) of '{pdf_path}'...")

        # Iterate through the specified number of pages
        for page_num in range(pages_to_process):
            # Load the page
            page = doc.load_page(page_num)  # Page numbers are 0-indexed in PyMuPDF

            # Render page to an image (pixmap)
            # The matrix determines the zoom factor. Higher zoom means higher resolution.
            # Default matrix is fitz.Matrix(1, 1) which corresponds to 72 DPI.
            # To achieve a desired DPI, we scale by (dpi/72).
            zoom = dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)

            # Define the output image path
            image_filename = f"page_{page_num + 1}.png"  # Save as PNG (supports transparency)
            image_path = os.path.join(output_folder, image_filename)

            # Save the pixmap as an image file
            pix.save(image_path)
            print(f"Saved: {image_path} (Resolution: {pix.width}x{pix.height})")
        
        doc.close() # Close the PDF document
        print("Conversion complete.")

    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'")
    except Exception as e:
        print(f"An error occurred: {e}")
        print("Please ensure PyMuPDF is installed correctly ('pip install PyMuPDF')")
        

if __name__ == "__main__":
    # IMPORTANT: Replace with the actual path to YOUR PDF file
    pdf_file_path = "C:/Users/john.tan/Downloads/ISLAND RECOVERY SERVICES PTE. LTD._IRS34547_08052025.pdf"
        
    if pdf_file_path == None:
        print("--------------------------------------------------------------------")
        print("PLEASE UPDATE 'pdf_file_path' with the actual path to your PDF file.")
        print("--------------------------------------------------------------------")
    else:
        # Specify the output folder (optional, defaults to "pdf_images")
        custom_output_folder = "C:/Users/john.tan/Downloads/pdf_output_images/"
        
        # Convert the first 3 pages with a DPI of 300
        convert_pdf_pages_to_images_pymupdf(pdf_file_path, 
                                            output_folder=custom_output_folder, 
                                            num_pages_to_convert=3, 
                                            dpi=300)
        
        # Example: Convert first 1 page with default DPI (72) and default output folder
        # convert_pdf_pages_to_images_pymupdf(pdf_file_path, num_pages_to_convert=1)



In [4]:
from PIL import Image
import pytesseract
import re
import time

def extract_text(image_path):
        # Set the path to the Tesseract executable (only required on Windows)
        pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
        image = Image.open(image_path) # Open the image using PIL

        # Preprocess the image (convert to grayscale and apply thresholding)
        image = image.convert('L')  # Convert to grayscale
        image = image.point(lambda x: 0 if x < 128 else 255, '1')  # Apply thresholding

        text = pytesseract.image_to_string(image) # Extract text from the image
        # Use regex to remove special characters like '+' and newline characters
        cleaned_text = re.sub(r'[+\n]', '', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        print(f"text extracted from {image_path}: {cleaned_text}")
        return cleaned_text

keywords_to_check = [
        "susan",  # PyPDF2 text extraction is often lowercase
        "audit trail",
        "lineinsertionTable",
        "linedeletionTable"
    ]

list_image_files = [
        "C:/Users/rpa.uat/Downloads/pdf_output_images/page_1.png",
        "C:/Users/rpa.uat/Downloads/pdf_output_images/page_2.png",
        "C:/Users/rpa.uat/Downloads/pdf_output_images/page_3.png"
    ]
for image_file in list_image_files:
    print(f"\n--- processing image: {image_file} ---")
    extracted_text = extract_text(image_file)
    has_images=False
    if not extracted_text:
        print("No text extracted or extraction failed, skipping keyword check.")
        print("not text exist") # Matching your original output for this case
        continue
    if any(keyword.lower() in extracted_text.lower() for keyword in keywords_to_check):
        has_images = True
        print(f"Found one or more keywords in '{image_file}'.")
        print('text in image')
    else:
        print('not text exist')





--- processing image: C:/Users/rpa.uat/Downloads/pdf_output_images/page_1.png ---
text extracted from C:/Users/rpa.uat/Downloads/pdf_output_images/page_1.png: Audit Trail - Invoice id: 1269188288005791365Vendor Information3000701608 - ISLAND RECOVERY SERVICESPTE. LTD.26 CHIA PING ROAD619977 SGInvoice ProcessingInvoice source: Email Digital signature: No signatureFrom: susan.chua@sh-cogent.com.sgInvoice DetailsInvoice number: IRS34547 Invoice date: 19/03/2025Net amount: 6,100.00 Tax amount: 549.00Invoice currency: SGDArchiving DetailsArchive duration: 7 yearsDigital fingerprint:725185 1bdf0965c33cb527b0815721beRelated Documents1269188288005791365.pdfWorkflowInvoice status: Pending paymentArchive date: 10/05/2025, 09:13ERP invoice number: 5105625716Invoice amount: 6,649.00Digital fingerprint: [5b33f8fa1002a09b6fc5bf285d3ab28CCAP Specialists AP Specialist 08/05/2025, 09:55$G79 CAL Verifier Team Reviewer 08/05/2025, 10:30SG79 CAL HOB Group Reviewer 08/05/2025, 12:37AP Specialists AP Speci