In [242]:
from pathlib import Path
import os
import cv2
import fitz
from PIL import Image, ImageDraw, ImageFont
from PyPDF2 import PdfMerger
import pytesseract
pytesseract.pytesseract.tesseract_cmd = (
        r"C:\Users\jdr\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
    )

In [243]:
def draw_white_rect(img_path: str):
    img = cv2.imread(img_path)
 
    # Specify the coordinates for the redaction
    top_left_x = 1337 #4444
    top_left_y = 2424 #8050
    bottom_right_x = 1637 #5450
    bottom_right_y = 2453 #8170
    x, y, width, height = top_left_x, top_left_y, (bottom_right_x - top_left_x), (bottom_right_y - top_left_y)

    # Create a white rectangle to cover the desired portion of the image
    white = (255, 255, 255)
    img[y:y + height, x:x + width] = white
    cv2.imwrite(img_path, img)

In [244]:
def draw_drawing_num(pngs_path: str):
    '''Takes the name of the images,
    sorts them and writes it in a formatted way in the tegningnummer section of the image.

    Example:
    First file in the dir named SB-1001.png will be named RIG-TEG-SB-1001-010
    second file named SB-1002 will be named RIG-TEG-SB-1002-011 and so on.
    '''
    path = Path(pngs_path)
    list_pngs = [f for f in path.glob('*.png')]
    list_pngs.sort()
    print(type(list_pngs))

    for index, picture in enumerate(list_pngs):
        img_path = os.path.join(pngs_path, picture.stem + '.png')
        draw_white_rect(img_path)
        image_name = img_path.split("\\")[-1]
        image = Image.open(img_path)
        draw = ImageDraw.Draw(image)
        font_dir = r'C:\Users\jdr\OneDrive - Multiconsult\Dokumenter\Fonts\isocpeur.ttf'
        colorblack = 'black'
        font = ImageFont.truetype(font_dir, size=35)
        drawing_number = '010' #10 + index
        text = "RIG-TEG-"+ image_name.replace('.png', '') + '-' + drawing_number #str(drawing_number).zfill(3)
        #draw.text((4450, 8080), text, font=font, fill=colorblack)
        draw.text((1337, 2424), text, font=font, fill=colorblack)
        image.save(img_path)

In [245]:
def find_bp_names_and_rename_imgs(img_folder: str):
    '''Uses OCR to find the name of the boring to rename the image with that name.
    With the font used in AutoCAD 7 is interpreted as / so logic is made to correct the name.
    Made for borings named like: SB-1000.
    If a boring is split in two pages, the second page will be named "SB-X" plus the index of the file. 
    '''
    path = Path(img_folder)
    list_pngs = [f for f in path.glob('*.png')]
    bp_names = []

    for i, picture in enumerate(list_pngs):
        img_path = os.path.join(img_folder, picture.stem + '.png')
        text = str(((pytesseract.image_to_string(Image.open(img_path)))))
        text.strip()
        bp_name = text.partition('\n')[0].replace(' ', '')
        fix_bp_name = bp_name.replace('/', '7')
        print(fix_bp_name)
        if 'SB' in fix_bp_name:
            file_name = fix_bp_name[:-1] if len(fix_bp_name) > 7 else fix_bp_name
            print('Justert til: ', file_name)
        else:
            file_name = 'SB-X' + str(i)
        bp_pic_path = img_folder + '\\' + file_name + '.png'
        os.rename(img_path, bp_pic_path)

In [246]:
def pdf2img(pdf_path: str):
    """
    pdf_path: path of a pdf file.
    
    Creates images from pages of a pdf file.
    """
    #img_path = pdf_path + r'\images'
    #Path(img_path).mkdir(parents=True, exist_ok=True)  # create the 'images' folder if it doesn't exist
    img_path = os.path.dirname(pdf_path) + r'\images\\'
    Path(img_path).mkdir(parents=True, exist_ok=True) 
    doc = fitz.open(pdf_path)
    for index, page in enumerate(doc.pages()):
        zoom = 3
        zz = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=zz)
        pix.save(img_path + str(index) + r'.png')

In [247]:
# Open the image file
def convert_images_to_pdf(img_folder: str, pdf_filename: str):
    """Converts pngs to pdf files.
    Input: image folder, and name of the resulting pdf.¨
    PDF will be named SB-XXXX_pdf_filname.pdf
    """
    path = Path(img_folder)
    list_pngs = [f for f in path.glob('*.png')]

    for picture in list_pngs:
        img_path = os.path.join(img_folder, picture.stem + '.png')
        image = Image.open(img_path)
        # Save the image as a PDF file
        image.save(img_folder + '\\' + picture.stem + '_' + pdf_filename, "PDF")


In [248]:
def merge_pdfs(pdf_folder: str):
    """Merges all pdfs in a folder.
    Input: directory of pdf files to merge.
    Resulting pdf is named Samlet.pdf
    """ 
    path = Path(pdf_folder)
    list_pdfs = [f for f in path.glob('*.pdf')]
    
    merger = PdfMerger()
    
    for pdf in list_pdfs:
        merger.append(open(pdf, 'rb'))
    
    with open(pdf_folder + "\\Samlet.pdf", "wb") as fout:
        merger.write(fout)

In [253]:
img_folder = r'C:\Users\jdr\OneDrive - Multiconsult\Skrivebord\Totsonderinger\CPT\images'
pdf_file = r'C:\Users\jdr\OneDrive - Multiconsult\Skrivebord\Totsonderinger\CPT\Lay_totalsonderinger A4_cpt_new1.pdf'

#pdf2img(pdf_file)
#find_bp_names_and_rename_imgs(img_folder=img_folder)

# NB!!: Run the following functions AFTER controlling file names = boring name!
#draw_drawing_num(img_folder)
#convert_images_to_pdf(img_folder, 'RIG-TEG-TOT.pdf')
merge_pdfs(img_folder)



In [250]:
img_path = r'C:\Users\jdr\OneDrive - Multiconsult\Skrivebord\Totsonderinger\images\SB-X19.png'
text = str(((pytesseract.image_to_string(Image.open(img_path)))))
print(text)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jdr\\OneDrive - Multiconsult\\Skrivebord\\Totsonderinger\\images\\SB-X19.png'