In [2]:
from pathlib import Path
import os
import io
import cv2
import fitz
from PIL import Image, ImageDraw, ImageFont
from PyPDF2 import PdfMerger, PdfWriter, PdfReader
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import pytesseract
# Point pytesseract at the Tesseract executable. This is a machine-specific
# absolute path; adjust it (or remove it if tesseract is on PATH) when running
# the notebook elsewhere.
pytesseract.pytesseract.tesseract_cmd = (
        r"C:\Users\jdr\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
    )

In [3]:
def write_drawing_num(pdfs_path, drawing_number='010'):
    """Stamp a drawing number onto the first page of every PDF in a folder.

    For each ``*.pdf`` in ``pdfs_path`` (processed in sorted order) the text
    ``RIG-TEG-<file stem>-<drawing_number>`` is drawn, rotated 270 degrees, on
    an A4 overlay using the ``isocpeur`` font and merged onto the first page.
    The result overwrites the original file.

    NOTE: only the first page is written back, so any further pages of a
    multi-page PDF are dropped, exactly as before.

    Args:
        pdfs_path: Folder containing the PDF files to stamp.
        drawing_number: Suffix appended to the stamped text. Defaults to
            ``'010'``, the value that used to be hard-coded.

    Returns:
        None. Files are modified in place.
    """
    list_pdfs = sorted(Path(pdfs_path).glob('*.pdf'))
    if not list_pdfs:
        # Nothing to stamp; avoid loading the font for no reason.
        return

    # Font registration does not depend on the file, so do it once.
    pdfmetrics.registerFont(TTFont('isocpeur', 'isocpeur.ttf'))

    for pdf in list_pdfs:
        text = "RIG-TEG-" + pdf.stem + '-' + str(drawing_number)

        # Build a one-page in-memory overlay PDF holding only the stamp text.
        packet = io.BytesIO()
        can = canvas.Canvas(packet, pagesize=A4)
        can.setFont('isocpeur', 11)
        can.rotate(270)
        can.drawString(-150, 25, text, direction='ltr')
        can.save()
        # Rewind so the reader starts at the beginning of the buffer.
        packet.seek(0)
        new_pdf = PdfReader(packet)

        # Read the whole source file into memory first: the same path is
        # overwritten below, and a reader that still points at the file on
        # disk could see truncated data (and would leak the file handle).
        existing_pdf = PdfReader(io.BytesIO(pdf.read_bytes()))

        # Merge the overlay onto the first page of the existing document.
        output = PdfWriter()
        page = existing_pdf.pages[0]
        page.merge_page(new_pdf.pages[0])
        output.add_page(page)

        # Overwrite the original file with the stamped page.
        with open(pdf, "wb") as output_stream:
            output.write(output_stream)

In [4]:
def find_bp_names_and_rename_imgs_and_pdfs(img_folder: str):
    """Rename each PNG and its same-named PDF after the boring name found by OCR.

    The first non-blank line of the OCR text is taken as the boring name
    (spaces removed). With the font used in AutoCAD, "7" is read as "/", so
    every "/" is replaced by "7". Made for borings named like ``SB-1000``:
    a name containing ``SB`` that is longer than 7 characters has its last
    character dropped. Pages where no ``SB`` name is found (for example the
    second page of a boring split over two pages) are named ``SB-X`` plus the
    index of the file in the folder listing.

    Args:
        img_folder: Folder holding the ``<n>.png`` images and the matching
            ``<n>.pdf`` files.

    Returns:
        None. Files are renamed on disk; the detected names are printed.

    Raises:
        OSError: If a PNG has no matching PDF or a rename fails.
    """
    list_pngs = list(Path(img_folder).glob('*.png'))

    for i, picture in enumerate(list_pngs):
        img_path = os.path.join(img_folder, picture.stem + '.png')
        pdf_path = os.path.join(img_folder, picture.stem + '.pdf')

        # Close the image before renaming it; an open handle blocks the
        # rename on Windows.
        with Image.open(img_path) as image:
            text = str(pytesseract.image_to_string(image))

        # str.strip() returns a new string; the previous code discarded the
        # result, so leading blank lines made the "first line" empty.
        bp_name = text.strip().partition('\n')[0].replace(' ', '')
        fix_bp_name = bp_name.replace('/', '7')
        print(fix_bp_name)

        if 'SB' in fix_bp_name:
            # Names longer than "SB-1000" carry one trailing OCR artefact.
            file_name = fix_bp_name[:-1] if len(fix_bp_name) > 7 else fix_bp_name
            print('Justert til: ', file_name)
        else:
            file_name = 'SB-X' + str(i)

        os.rename(img_path, os.path.join(img_folder, file_name + '.png'))
        os.rename(pdf_path, os.path.join(img_folder, file_name + '.pdf'))

In [5]:
def pdf2img(pdf_path: str, zoom: float = 3):
    """Render every page of a PDF to a PNG image.

    The images are written to an ``images`` folder next to the PDF (created
    if missing) and named ``<page index>.png``, starting at 0.

    Args:
        pdf_path: Path of the PDF file to render.
        zoom: Scale factor applied in both directions when rendering.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        None. Images are written to disk.
    """
    # os.path.join avoids the doubled trailing separator of the old string
    # concatenation and works when pdf_path has no directory component.
    img_dir = os.path.join(os.path.dirname(pdf_path), 'images')
    Path(img_dir).mkdir(parents=True, exist_ok=True)

    # The scaling matrix is the same for every page, so build it once.
    zz = fitz.Matrix(zoom, zoom)

    # Use the document as a context manager so the file is closed afterwards.
    with fitz.open(pdf_path) as doc:
        for index, page in enumerate(doc.pages()):
            pix = page.get_pixmap(matrix=zz)
            pix.save(os.path.join(img_dir, str(index) + '.png'))

In [6]:
def pdf_splitter(path):
    """Split a PDF into one single-page PDF per page.

    The pages are written to an ``images`` folder next to the input file
    (created if missing) and named ``<page index>.pdf``, starting at 0.

    Args:
        path: Path of the PDF file to split.

    Returns:
        None. The single-page PDFs are written to disk.
    """
    # Derive the folder with os.path instead of splitting on '\\', which
    # pointed at the drive root when the path had no directory part.
    new_path = os.path.join(os.path.dirname(path), 'images')
    # Do not rely on pdf2img having created the folder already.
    os.makedirs(new_path, exist_ok=True)

    pdf = PdfReader(path)
    for page_index, page in enumerate(pdf.pages):
        pdf_writer = PdfWriter()
        pdf_writer.add_page(page)
        output_filename = os.path.join(new_path, '{}.pdf'.format(page_index))
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)

In [7]:
# Convert every PNG in a folder to its own PDF file.
def convert_images_to_pdf(img_folder: str, pdf_filename: str):
    """Convert each PNG in a folder to a PDF file.

    For a picture named ``SB-XXXX.png`` the result is written to
    ``SB-XXXX_<pdf_filename>`` in the same folder. ``pdf_filename`` is used
    verbatim, so it should include the ``.pdf`` extension.

    Args:
        img_folder: Folder containing the PNG images.
        pdf_filename: Suffix (including extension) of the resulting PDF names.

    Returns:
        None. The PDF files are written to disk.
    """
    for picture in Path(img_folder).glob('*.png'):
        img_path = os.path.join(img_folder, picture.stem + '.png')
        target = os.path.join(img_folder, picture.stem + '_' + pdf_filename)
        # The context manager releases the image's file handle after saving.
        with Image.open(img_path) as image:
            image.save(target, "PDF")


In [8]:
def merge_pdfs(pdf_folder: str):
    """Merge all PDFs in a folder into a single file.

    The files are appended in sorted name order and the result is written to
    ``Samlet.pdf`` inside ``pdf_folder``. A ``Samlet.pdf`` left over from an
    earlier run is not used as input, so re-running does not duplicate pages.

    Args:
        pdf_folder: Directory of PDF files to merge.

    Returns:
        None. The merged PDF is written to disk.
    """
    output_name = "Samlet.pdf"
    output_path = os.path.join(pdf_folder, output_name)

    # Sort for a deterministic page order and skip the previous output file.
    list_pdfs = sorted(
        pdf for pdf in Path(pdf_folder).glob('*.pdf')
        if os.path.normcase(pdf.name) != os.path.normcase(output_name)
    )

    merger = PdfMerger()
    handles = []
    try:
        for pdf in list_pdfs:
            # Keep the handle so it can be closed once the merge is written.
            handle = open(pdf, 'rb')
            handles.append(handle)
            merger.append(handle)

        with open(output_path, "wb") as fout:
            merger.write(fout)
    finally:
        merger.close()
        for handle in handles:
            handle.close()

In [24]:
# Machine-specific input locations for this run. Adjust before running elsewhere.
# img_folder: folder holding the per-page PNG/PDF files.
img_folder = r'C:\Users\jdr\OneDrive - Multiconsult\Skrivebord\Totsonderinger\CPT\images'
# pdf_file: the multi-page source PDF that gets rendered and split.
pdf_file = r'C:\Users\jdr\OneDrive - Multiconsult\Skrivebord\Totsonderinger\CPT\Lay_totalsonderinger A4_medcpt_etter_KS.pdf'
# samlet: folder whose PDFs are merged into Samlet.pdf.
samlet = r'C:\Users\jdr\OneDrive - Multiconsult\Skrivebord\Totsonderinger\Samlet'

# Pipeline steps, enabled one at a time by uncommenting:
# 1) render pages to PNG, 2) split into single-page PDFs, 3) OCR-rename both.
#pdf2img(pdf_file)
#pdf_splitter(pdf_file)
#find_bp_names_and_rename_imgs_and_pdfs(img_folder=img_folder)

# NB!!: Run the following functions AFTER checking that each PDF file name equals the boring name!
#write_drawing_num(pdfs_path=img_folder)
# Only the merge step is active in this cell.
merge_pdfs(samlet)



In [10]:
def rename_pdf(img_folder: str):
    """Append ``I`` to the name of every PDF in a folder (only for long borings).

    ``SB-1000.pdf`` becomes ``SB-1000I.pdf``. The function is not idempotent:
    running it again appends another ``I``.

    Args:
        img_folder: Folder containing the PDF files to rename.

    Returns:
        None. Files are renamed on disk.
    """
    # Materialise the listing first so renamed files are not picked up again.
    list_pdfs = list(Path(img_folder).glob('*.pdf'))

    for pdf in list_pdfs:
        # with_name keeps the file in its own folder on every platform,
        # unlike the previous '\\' string concatenation.
        os.rename(pdf, pdf.with_name(pdf.stem + 'I.pdf'))

In [11]:
# Machine-specific folder for the long borings. Note that this rebinds the
# img_folder name used by the earlier driver cell.
img_folder = r'C:\Users\jdr\OneDrive - Multiconsult\Skrivebord\Totsonderinger\Lange\images'
# Uncomment to append "I" to every PDF name in that folder (not idempotent).
#rename_pdf(img_folder)