In [4]:
import os
import datetime

def get_filtered_files(directory, cutoff_date, excluded_extensions):
    """
    Get a list of files in the specified directory that are newer than the cutoff_date
    and do not have an extension in excluded_extensions.

    Args:
    directory (str): The path to the directory to scan for files.
    cutoff_date (datetime.datetime): The date to compare file modification times against.
    excluded_extensions (list of str): The list of file extensions to exclude.

    Returns:
    list of str: The list of filtered file paths.
    """
    filtered_files = []

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            file_extension = os.path.splitext(file)[1].lower()
            file_mod_time = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))

            if file_mod_time > cutoff_date and file_extension not in excluded_extensions:
                filtered_files.append(file_path)

    return filtered_files

# Example usage
directory_path = "C:\\Users\\keith\\Dropbox\\Embrace"
cutoff_date = datetime.datetime(2024, 7, 11)  # Specify your cutoff date
excluded_filetypes = ['.txt', '.log', '.xlsx']  # Specify file types to exclude

filtered_files = get_filtered_files(directory_path, cutoff_date, excluded_filetypes)

for file in filtered_files:
    print(file)


C:\Users\keith\Dropbox\Embrace\ccc_429078.pdf
C:\Users\keith\Dropbox\Embrace\ccc_429080.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1524360381.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1529967397.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1530771607.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1531541127.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1536612634.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1542282860.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1548973566.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1553301591.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1555118251.pdf
C:\Users\keith\Dropbox\Embrace\jasper-poe-insulin-2024-07-10.png
C:\Users\keith\Dropbox\Embrace\ww_0016642904.pdf
C:\Users\keith\Dropbox\Embrace\ww_0016916045.pdf
C:\Users\keith\Dropbox\Embrace\ww_0017143357.pdf


In [5]:
filtered_files.remove("C:\\Users\\keith\\Dropbox\\Embrace\\jasper-poe-insulin-2024-07-10.png")
for file in filtered_files:
    print(file)

C:\Users\keith\Dropbox\Embrace\ccc_429078.pdf
C:\Users\keith\Dropbox\Embrace\ccc_429080.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1524360381.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1529967397.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1530771607.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1531541127.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1536612634.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1542282860.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1548973566.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1553301591.pdf
C:\Users\keith\Dropbox\Embrace\chewy_1555118251.pdf
C:\Users\keith\Dropbox\Embrace\ww_0016642904.pdf
C:\Users\keith\Dropbox\Embrace\ww_0016916045.pdf
C:\Users\keith\Dropbox\Embrace\ww_0017143357.pdf


In [3]:
%pip install PyMuPDF pytesseract


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
print(filtered_files[0])

C:\Users\keith\Dropbox\Embrace\ccc_429078.pdf


In [5]:
%pip install pdf2image

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
%pip install opencv-python

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
TESSDATA_PREFIX='C:/Program Files/Tesseract-OCR/tessdata'

In [11]:
# import the required library
import cv2
from PIL import Image
import fitz 
import numpy as np

# define a function to display the coordinates of

# of the points clicked on the image
def click_event(event, x, y, flags, params):
    if event == cv2.EVENT_LBUTTONDOWN:
        print(f'({x},{y})')
      
    # put coordinates as text on the image
#     cv2.putText(img, f'({x},{y})',(x,y),
#     cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
      
#     # draw point on the image
#     cv2.circle(img, (x,y), 3, (0,255,255), -1)

# read the input image
# img = cv2.imread('back2school.jpg')
page_num = 0
pdf_path = 'C:/Users/keith/Dropbox/Embrace/chewy_1524360381.pdf'
# pdf_path = 'C:/Users/keith/Dropbox/Embrace/ccc_429078.pdf'
# pdf_path = 'C:/Users/keith/Dropbox/Embrace/ww_0016642904.pdf'
document = fitz.open(pdf_path)
page = document[page_num]
pix = page.get_pixmap()
image = Image.frombuffer("RGB", [pix.width, pix.height], pix.samples, "raw", "RGB", 0, 1)
img = image

# create a window
cv2.namedWindow('Point Coordinates')

# bind the callback function to window
cv2.setMouseCallback('Point Coordinates', click_event)

# display the image
while True:
    cv2.imshow('Point Coordinates',np.asarray(img))
    k = cv2.waitKey(1) & 0xFF
    if k == 27:
        break
        
cv2.destroyAllWindows()

(11,112)
(11,112)
(131,139)
(131,139)
(10,109)
(237,272)
(415,407)
(507,487)
(530,513)
(482,564)
(239,249)
(411,359)
(388,372)


KeyboardInterrupt: 

: 

In [2]:
import cv2
from PIL import Image
import fitz 
page_num = 0
pdf_path = 'C:/Users/keith/Dropbox/Embrace/chewy_1411848544.pdf'
document = fitz.open(pdf_path)
page = document[page_num]
pix = page.get_pixmap()
image = Image.frombuffer("RGB", [pix.width, pix.height], pix.samples, "raw", "RGB", 0, 1)
img = image
print(type(img))
print(img.size)
print(type(img.size))

<class 'PIL.Image.Image'>
(612, 792)
<class 'tuple'>


In [9]:
from PIL import Image
from PIL import ImageFilter
from PIL import ImageEnhance
import fitz 
import pytesseract
import cv2
import numpy as np                                    
import pandas as pd

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update the path as needed

chy = [[97, 133, 97+55, 133+10], 
       [217, 324, 217+100, 324+20], 
       [507, 487, 507+60, 487+15]]

ccc = [[472, 151, 472+55, 151+15], 
       [471, 184, 471+75, 184+15], 
       [519, 626, 519+60, 626+15]]

wwp = [[459, 158, 459+65, 158+20], 
       [32, 347, 32+350, 347+20], 
       [494, 416, 494+100, 416+20]]
    
page_num = 0

col_names = ["File", "Date", "Pet", "Total"]

temp = {col_names[0]:["dummy","dummy"],
       col_names[1]:["dummy","dummy"],
       col_names[2]:["dummy","dummy"],
       col_names[3]:["dummy","dummy"]}
ocr_txt = temp

df = pd.DataFrame(temp)

for file in filtered_files:
    # Simple image to string
    # print(pytesseract.image_to_string(Image.open('C:/Users/keith/Dropbox/Insurance/KKrenek_BCBSofMA_Front.png')))
    
    pdf_path = file
    ocr_txt['File'] = pdf_path

    document = fitz.open(pdf_path)
    page = document[page_num]
    pix = page.get_pixmap()
    image = Image.frombuffer("RGB", [pix.width, pix.height], pix.samples, "raw", "RGB", 0, 1)

    if "chewy" in pdf_path:
        crop_rect = chy
    elif "cc" in pdf_path:
        crop_rect = ccc
    elif "ww" in pdf_path:
        crop_rect = wwp
    else:
        crop_rect = []

    upsize = 5
    threshold = 200


    for idk in range(0,3):

        img_crop = image.crop(crop_rect[idk][:])

        img = img_crop
        img = img.resize((upsize*img.size[0], upsize*img.size[1]))
    #     display(img)
        img = img.filter(ImageFilter.EDGE_ENHANCE_MORE)
    #     img_rsz_filt = ImageEnhance.Sharpness(img_rsz)
    #     img_rsz_filt.enhance(50).show()
    #     display(img)
        img = img.point( lambda p: 255 if p > threshold else 0 )
    #     display(img)
    #     print(img_crop.size)
    #     norm_img = np.zeros((img_crop.size[0], img_crop.size[1]))
    #     img = cv2.normalize(np.asarray(img_crop), norm_img, 0, 255, cv2.NORM_MINMAX)
    #     ret, otsu = cv2.threshold(np.asarray(img_crop),0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
    #     display(ret)
    #     img = cv2.threshold(np.asarray(img), 127, 255, cv2.THRESH_BINARY)


    #     display(img)

        txt = pytesseract.image_to_string(img)
        txt = txt.rstrip("\n")
#         print((txt))
        ocr_txt[col_names[idk+1]] = txt

    # print(type(image))
    # display(image)
    # image.show()

    # print(pytesseract.image_to_string(image))

    # print(len(crop_rect))

    df = df._append(ocr_txt, ignore_index = True)

display(df)

Unnamed: 0,File,Date,Pet,Total
0,dummy,dummy,dummy,dummy
1,dummy,dummy,dummy,dummy
2,C:\Users\keith\Dropbox\Embrace\ccc_429078.pdf,09-10-2024\n\nmae tra,Handsome,220.4\n\noS a Se
3,C:\Users\keith\Dropbox\Embrace\ccc_429080.pdf,09-10-2024\n\nmae tra,Handsome,.\n\noS a Se
4,C:\Users\keith\Dropbox\Embrace\chewy_152436038...,,"Perervoe, Vat",$S28E
5,C:\Users\keith\Dropbox\Embrace\chewy_152996739...,,"SOME, Call",$35.
6,C:\Users\keith\Dropbox\Embrace\chewy_153077160...,,"Perervoe, Vat",$S28E
7,C:\Users\keith\Dropbox\Embrace\chewy_153154112...,,"Perervoe, Vat",$65.€
8,C:\Users\keith\Dropbox\Embrace\chewy_153661263...,,"Perervoe, Vat",$S28E
9,C:\Users\keith\Dropbox\Embrace\chewy_154228286...,,"SOME, Call",$63.


In [9]:
print(crop_rect[0][:])

[472, 151, 527, 166]


In [10]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from pdf2image import convert_from_path 

def extract_text_from_image(image):
    """Extract text from a PIL image using pytesseract."""
    text = pytesseract.image_to_string(image)
    return text

def extract_information(text):
    """Extract specific information from the text. Modify this function based on your needs."""
    # Example: Extract email addresses and phone numbers
    import re
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone_numbers = re.findall(r'\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b', text)
    
    return {'emails': emails, 'phone_numbers': phone_numbers}

def process_pdf(pdf_path):
    """Process a single PDF to extract text from images and then extract specific information."""
    document = fitz.open(pdf_path)
    all_info = []

    for page_num in range(len(document)):
        page = document[page_num]
#         page = document.load_page(page_num)
        # make the TextPage object. It does all the OCR.
        full_tp = page.get_textpage_ocr(flags=0, full=True)
        print(page.get_text(textpage=full_tp))
#         image_list = page.get_images(full=True)
#         image_list = convert_from_path(pdf_path) 
#         image = page.get_pixmap()
#         image = image.getImageData("png")
#         image = image.tobytes(output='png', jpg_quality=95)
#         print(type(image))
        
#         for img_index, img in enumerate(image_list):
#             xref = img[0]
#             base_image = document.extract_image(xref)
#             image_bytes = base_image["image"]
#             image = Image.open(io.BytesIO(image_bytes))
            
        text = extract_text_from_image(image)
        info = extract_information(text)
        all_info.append(info)
    
    return all_info

def main(pdf_list):
    for pdf in pdf_list:
        info = process_pdf(pdf)
        print(f"Information extracted from {pdf}: {info}")

if __name__ == "__main__":
    # List of PDF files to process
    pdf_list = filtered_files[:1]
#     pdf_list = [
#         'path/to/your/pdf1.pdf',
#         'path/to/your/pdf2.pdf',
#         # Add more PDFs as needed
#     ]
    main(pdf_list)


RuntimeError: No OCR support: TESSDATA_PREFIX not set