# Search-Convert-Select (SCS): A pipeline to locate all PDF documents containing the desired information

In [1]:
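# Repos and commands to install the required libraries:
# Tesseract: https://github.com/madmaze/pytesseract
#         https://github.com/tesseract-ocr/tesseract
            
# pip install PyPDF2
# pip install pdfminer
# pip install PyMuPDF      (provides the `fitz` module)
# pip install pytesseract
# pip install Pillow
# pip install pandas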

import PyPDF2
from PyPDF2 import PdfFileMerger
import fitz
import pdfminer
import shutil
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os
import pandas as pd
from PIL import Image
import PIL
import pytesseract
import pathlib

# User inputs:

In [2]:
# Root directory under which every working folder of the pipeline is created
root_Dir = r"C:\Users\aamini\Work\NRCan\Automatic data capture\Analysis\Phase_1_pdf_search_and_select"
# Raw input data lives in the "data" folder inside the root directory
raw_data_Dir = root_Dir + r"\data"
# Option string handed to pytesseract so Tesseract can find its tessdata directory
tessdata_dir_config = r'--tessdata-dir "C:/Program Files/Tesseract-OCR/tessdata"'

# Search parameters used by the select step:
#   keyword - list of keywords to look for
#             (other candidates: 'azimuth', 'inclination', 'drillhole', 'depth',
#              'borehole', 'overburden', 'ovb')
#   c       - how many of the keywords must be found for a pdf to be selected
keyword = ['Final Environmental Impact Statement']
c = 1

### Let's create the necessary folders and set up their paths

In [5]:
def directories (root_Dir):
    """Create the working folders of the pipeline under ``root_Dir``.

    The folders are Image_pdf_Dir, Text_pdf_Dir, Temp_image_Dir, Temp_pdf_Dir,
    Converted_Dir and Output_Dir. Folders that already exist are left untouched,
    so the function can be called repeatedly.

    Parameters
    ----------
    root_Dir : str
        Directory in which the working folders are created. It is created as
        well (including missing parents) if it does not exist yet.
    """
    folder_names = (
        'Image_pdf_Dir',
        'Text_pdf_Dir',
        'Temp_image_Dir',
        'Temp_pdf_Dir',
        'Converted_Dir',
        'Output_Dir',
    )
    for folder in folder_names:
        # parents=True: a fresh root_Dir must not make the first mkdir fail
        pathlib.Path(root_Dir, folder).mkdir(parents=True, exist_ok=True)

    
# Make sure every working folder exists on disk before its path is used
directories(root_Dir)

# Paths of the working folders, in the order the pipeline uses them
Image_pdf_Dir = f"{root_Dir}/Image_pdf_Dir"    # pdfs without searchable text
Text_pdf_Dir = f"{root_Dir}/Text_pdf_Dir"      # pdfs with searchable text
Temp_image_Dir = f"{root_Dir}/Temp_image_Dir"  # page images rendered for OCR
Temp_pdf_Dir = f"{root_Dir}/Temp_pdf_Dir"      # one OCR'd pdf per page image
Converted_Dir = f"{root_Dir}/Converted_Dir"    # merged OCR'd documents
Output_Dir = f"{root_Dir}/Output_Dir"          # pdfs selected by keyword search

# 1- Search:

## This function gives a summary of pdf file types and separates them into two groups of:
####    - pdfs with searchable text (Text_pdf_Dir)
####    - pdfs without searchable text (Image_pdf_Dir)

### Input: Directory of raw data

In [56]:
# This function summarises the pdf file types (selectable text or not) found in the input directory and separates the files into two folders
def Separate_pdf_type(raw_data_Dir):
    """Sort every pdf under ``raw_data_Dir`` by whether its text is selectable.

    The directory tree is walked recursively, including ``raw_data_Dir`` itself.
    A page counts as searchable when its resources contain a 'Font' entry.

    * all pages searchable            -> copied to ``Text_pdf_Dir``
    * no page searchable              -> copied to ``Image_pdf_Dir``
    * some pages searchable (mixed)   -> copied to ``Image_pdf_Dir``
    * no pages at all                 -> counted as unvalid, not copied

    ``Text_pdf_Dir`` and ``Image_pdf_Dir`` are the notebook-level folder paths.
    A summary of the counts is printed; nothing is returned.

    Parameters
    ----------
    raw_data_Dir : str
        Root folder of the raw input data.
    """
    Total_pdf = 0
    count_mix = 0
    count_text = 0
    count_image = 0
    count_unvalid = 0

    # Iterate over the *files* of every visited folder. Listing only the
    # sub-directories of each folder would skip pdfs stored directly in
    # raw_data_Dir.
    for root, dirs, files in os.walk(raw_data_Dir, topdown=False):
        for filename in files:
            # Case-insensitive so that "REPORT.PDF" is picked up as well
            if not filename.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(root, filename)
            Total_pdf += 1
            page_num = 0
            searchable_count = 0
            with open(pdf_path, 'rb') as infile:
                for page in PDFPage.get_pages(infile):
                    page_num += 1
                    if 'Font' in page.resources:
                        searchable_count += 1

            # NOTE(review): files are copied into flat folders, so two source
            # pdfs with the same file name overwrite each other there.
            if page_num == 0:
                count_unvalid += 1
            elif searchable_count == page_num:
                count_text += 1
                shutil.copy(pdf_path, Text_pdf_Dir)
            else:
                # Image-only and mixed documents both need OCR later on
                if searchable_count == 0:
                    count_image += 1
                else:
                    count_mix += 1
                shutil.copy(pdf_path, Image_pdf_Dir)

    print ("Total Number of pdfs:", Total_pdf)
    print ("Number of pdfs containing only Text:", count_text)
    print ("Number of pdfs containing only Image:", count_image)
    print ("Number of Mixed Text/Image pdfs:", count_mix)
    print ("Number of Unvalid pdfs:", count_unvalid)

In [57]:
# Step 1: classify the raw pdfs and copy them into Text_pdf_Dir / Image_pdf_Dir
Separate_pdf_type(raw_data_Dir)

The PDF <_io.BufferedReader name='C:\\Users\\aamini\\Work\\NRCan\\Automatic data capture\\Analysis\\Phase_1_pdf_search_and_select\\data\\Data\\NTGSref_2023February10 (5)\\034725_034725\\034725_Report\\034725_Appendices/034725_Appendix_G_Dril_Core_Assay_Certificates/COA_YW17034549_122387-38289151.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='C:\\Users\\aamini\\Work\\NRCan\\Automatic data capture\\Analysis\\Phase_1_pdf_search_and_select\\data\\Data\\NTGSref_2023February10 (5)\\034725_034725\\034725_Report\\034725_Appendices/034725_Appendix_G_Dril_Core_Assay_Certificates/COA_YW17039051_122387-38357039.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReade

Total Number of pdfs: 703
Number of pdfs containing only Text: 385
Number of pdfs containing only Image: 275
Number of Mixed Text/Image pdfs: 43
Number of Unvalid pdfs: 0


# 2- Convert:

### This function converts pdfs without selectable text into pdfs with selectable text using OCR.

### Input: Directory of image pdfs created by previous function (Image_pdf_Dir).

In [25]:
# This function converts image pdfs to image files and then uses OCR to convert them back to pdfs with selectable text.
def To_selectable_text_convertor(Image_pdf_Dir):
    """OCR every pdf under ``Image_pdf_Dir`` into a pdf with selectable text.

    For each input pdf:

    1. every page is rendered to a PNG at 200 dpi in ``Temp_image_Dir``;
    2. each PNG is run through Tesseract, giving a one-page pdf in
       ``Temp_pdf_Dir``;
    3. the one-page pdfs are merged, in page order, into a single pdf in
       ``Converted_Dir`` that keeps the input file name;
    4. both temp folders are emptied again.

    Once all inputs are processed, everything in ``Converted_Dir`` is moved to
    ``Text_pdf_Dir``. The folder names above and ``tessdata_dir_config`` are the
    notebook-level settings.

    Parameters
    ----------
    Image_pdf_Dir : str
        Folder holding the pdfs that have no (or only partial) selectable text.
    """
    zoom = 200 / 72  # pdf user space is 72 dpi; scale it up to 200 dpi
    mat = fitz.Matrix(zoom, zoom)

    for src_root, _src_dirs, src_files in os.walk(Image_pdf_Dir, topdown=False):
        for filename in src_files:
            try:
                # --- 1. render the pages -------------------------------------
                # The produced paths are remembered in page order. Re-listing
                # the temp folder would give no guaranteed order and would
                # pick up files left behind by an interrupted run.
                image_paths = []
                doc = fitz.open(os.path.join(src_root, filename))
                try:
                    for page in doc:
                        pix = page.get_pixmap(matrix=mat)
                        img_filename = "page-%04i.png" % page.number
                        image_path = os.path.join(Temp_image_Dir, img_filename)
                        pix.pil_save(image_path, format="PNG", dpi=(200, 200))
                        image_paths.append(image_path)
                finally:
                    doc.close()

                # --- 2. OCR each page image into a one-page pdf --------------
                page_pdf_paths = []
                for image_path in image_paths:
                    pdf = pytesseract.image_to_pdf_or_hocr(
                        image_path, extension='pdf', config=tessdata_dir_config
                    )
                    # Use the full stem, so page numbers with more than four
                    # digits do not collide after truncation.
                    stem = os.path.splitext(os.path.basename(image_path))[0]
                    page_pdf_path = os.path.join(Temp_pdf_Dir, stem + '.pdf')
                    with open(page_pdf_path, 'w+b') as f:
                        f.write(pdf)  # pdf type is bytes by default
                    page_pdf_paths.append(page_pdf_path)

                # --- 3. merge the pages into one document --------------------
                # NOTE(review): PdfFileMerger is the legacy PyPDF2 name; newer
                # releases call it PdfMerger - confirm the installed version.
                merger = PdfFileMerger()
                try:
                    for page_pdf_path in page_pdf_paths:
                        merger.append(page_pdf_path)
                    merger.write(os.path.join(Converted_Dir, filename))
                finally:
                    merger.close()
            finally:
                # --- 4. always leave the temp folders empty ------------------
                _clear_directory(Temp_image_Dir)
                _clear_directory(Temp_pdf_Dir)

    # move all merged pdfs to text_pdf_dir
    for conv_root, _conv_dirs, conv_files in os.walk(Converted_Dir):
        for converted in conv_files:
            shutil.move(os.path.join(conv_root, converted), Text_pdf_Dir)


def _clear_directory(directory):
    """Delete every file below ``directory`` (sub-folders themselves are kept)."""
    for current_root, _sub_dirs, file_names in os.walk(directory):
        for file_name in file_names:
            os.remove(os.path.join(current_root, file_name))

In [27]:
# Step 2: OCR the image pdfs; the converted files end up in Text_pdf_Dir
To_selectable_text_convertor(Image_pdf_Dir)

# 3- Select

### Finds PDFs containing the defined keywords and copies them into the output folder

In [3]:
def select_pdf_by_keywords(Text_pdf_Dir, keyword, c, Output_Dir):
    """Copy every pdf that contains enough of the keywords into ``Output_Dir``.

    The folder is searched recursively and only ``*.pdf`` files are opened.
    To skip steps 1 and 2, pass the raw data folder as the first argument
    (pdfs without selectable text will then simply not match).

    The text of all pages of a document is concatenated, and each keyword is
    looked up in it with a plain, case-sensitive substring test. Keywords must
    therefore be spelled exactly as they appear in the documents. For every
    copied file the reached keyword count and the file name are printed, and a
    summary is printed at the end.

    Parameters
    ----------
    Text_pdf_Dir : str
        Directory of the pdf files with selectable text.
    keyword : list of str
        Keywords to look for.
    c : int
        Number of keywords that must be found for a pdf to be selected.
        If all keywords are required: ``c = len(keyword)``.
        If only one keyword is enough: ``c = 1``.
    Output_Dir : str
        Output folder for the pdf files containing the keywords.
    """
    Total_checked_pdfs = 0
    Total_copied_pdfs = 0

    for root, dirs, files in os.walk(Text_pdf_Dir, topdown=False):
        for file in files:
            if not file.lower().endswith('.pdf'):
                continue

            # Join with the walked folder, not the top folder, so files in
            # sub-directories can be opened too.
            pdf_path = os.path.join(root, file)
            Total_checked_pdfs += 1

            doc = fitz.open(pdf_path)  # open document
            try:
                # get all text in pdf document
                text = "".join(page.get_text() for page in doc)
            finally:
                doc.close()

            counter = 0  # keywords found in this document so far
            for k in keyword:  # search for keywords
                if k in text:
                    counter += 1
                    if counter == c:
                        Total_copied_pdfs += 1
                        shutil.copy(pdf_path, Output_Dir)
                        print(counter)
                        print(file)
                        break  # criterion met; no need to test more keywords

    print('Checked pdf files:', Total_checked_pdfs)
    print('copied pdf files:', Total_copied_pdfs)

In [6]:
# Step 3: copy the pdfs that contain at least `c` of the keywords into Output_Dir
select_pdf_by_keywords(Text_pdf_Dir, keyword, c, Output_Dir)

1
140506-11MN034-Golder Rpt Vol 7-Freshwater Envir-Pt 13-IA2E.pdf
Checked pdf files: 1
copied pdf files: 1
