In [92]:
# Load packages and define functions

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import re
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import os
import numpy
import glob

# Function for removing unwanted clutter from the pdf outputs.
def clean(x):
    """Filter and normalise the text strings extracted from a PDF.

    Each entry is processed as follows:

    1. Any character repeated three or more times in a row is shortened to
       two occurrences (e.g. "Whooooosh" becomes "Whoosh").
    2. The entry is kept only if it has more than two whitespace-separated
       words and, ignoring digits and the punctuation ``+ = , . ! ?``, its
       longest word is longer than two characters.
    3. Hyphens become spaces, and the degree and diameter symbols are
       replaced with " DEGREE" and " DIAMETER".

    Entries that are not text (or that cannot be encoded) are skipped.

    Args:
        x: Iterable of text strings.

    Returns:
        List of cleaned strings, in input order. On Python 2 the strings are
        UTF-8 encoded byte strings; on Python 3 they are ``str``.
    """
    # Characters ignored when measuring word length.
    noise = re.compile(r'[+=,\.!?0-9]')
    # A run of three or more identical characters (newlines included).
    repeated = re.compile(r'(.)\1{2,}', flags=re.DOTALL)
    # Only Python 2 needs the result encoded to a UTF-8 byte string.
    encode_result = str is bytes

    kept = []
    for y in x:
        try:
            # Shorten any run of the same character to two characters.
            y = repeated.sub(r'\1\1', y)

            # Require a word longer than 2 characters and more than 2 words.
            words = noise.sub('', y).split()
            if not words or len(max(words, key=len)) <= 2:
                continue
            if len(y.split()) <= 2:
                continue

            # Replace hyphens and the degree / diameter symbols with text.
            y = (y.replace(u"-", u" ")
                  .replace(u"\u00b0", u" DEGREE")
                  .replace(u"\u00d8", u" DIAMETER"))
            if encode_result:
                y = y.encode('utf-8')
        except (TypeError, UnicodeError):
            # Not a text string (or not encodable): treat as clutter.
            continue
        kept.append(y)
    return kept

# Function for extracting the pdf textboxes as list of strings
def parse_obj(lt_objs):
    """Collect the text of every horizontal text box in a layout tree.

    Args:
        lt_objs: Iterable of pdfminer layout objects.

    Returns:
        List of strings, one per ``LTTextBoxHorizontal`` found, with newlines
        replaced by spaces. Text boxes nested inside ``LTFigure`` containers
        are included, in the order they are encountered.
    """
    x = []
    # loop over the object list
    for obj in lt_objs:

        # if it's a textbox, keep its text on a single line
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            x.append(obj.get_text().replace('\n', ' '))

        # if it's a container, recurse and keep what the recursion finds
        elif isinstance(obj, pdfminer.layout.LTFigure):
            x.extend(parse_obj(obj._objs))
    return x

# Function for extracting a list of strings from a pdf
def convert(fname):
    """Extract the cleaned text-box strings from every page of a PDF.

    Args:
        fname: Path of the PDF file to read.

    Returns:
        List of strings produced by ``clean`` from the text boxes that
        ``parse_obj`` finds on all pages (empty for a PDF with no pages).

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Open the PDF file; the context manager closes it even on errors.
    with open(fname, 'rb') as fp:

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)

        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()

        # BEGIN LAYOUT ANALYSIS
        # Set parameters for analysis.
        # Other parameters you can use: char_margin=0.01, word_margin=0.2, line_margin=0.3
        laparams = LAParams(all_texts=True, detect_vertical=False)

        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Accumulate text across pages so no page overwrites another.
        x = []

        # loop over all pages in the document
        for page in PDFPage.create_pages(document):

            # read the page into a layout object
            interpreter.process_page(page)
            layout = device.get_result()

            # extract text from this object
            x.extend(parse_obj(layout._objs))

    # Clean output data
    return clean(x)

# Function for extracting all the drawings and date produced from a given job area
def drawlist(Area,
             jobs_root="\\\\global\\europe\\Cardiff\\Jobs\\",
             plots_subdir="\\4 Internal Project Data\\4-30 Drawings\\4-31 Issue Drawings\\Plots"):
    """List every issued-drawing PDF in a job area with its creation date.

    Each immediate sub-folder of ``jobs_root + Area`` is treated as a job
    folder; PDFs are looked for in ``<job folder> + plots_subdir``. Job
    folders without that sub-folder are skipped. The process working
    directory is not changed.

    Args:
        Area: Name of the job-area folder below ``jobs_root``.
        jobs_root: Root folder holding the job areas, including its trailing
            path separator.
        plots_subdir: Path of the plots folder relative to a job folder,
            including its leading path separator.

    Returns:
        pandas DataFrame with columns "Filename" (full path of the PDF) and
        "Date Created" (``datetime.date`` taken from ``os.path.getctime``).
        The frame is empty if the area folder does not exist or holds no PDFs.
    """
    import fnmatch

    area_dir = jobs_root + Area

    # Find all the job folders in the job area. os.walk yields nothing for a
    # missing folder, so fall back to an empty list instead of StopIteration.
    top = next(os.walk(area_dir), None)
    job_dirs = top[1] if top is not None else []

    # Find all the pdfs in the issue drawings plot area for each job number
    DL = []
    for job in job_dirs:
        plots_dir = area_dir + os.sep + job + plots_subdir
        try:
            names = os.listdir(plots_dir)
        except OSError:
            # This job has no (readable) plots folder.
            continue
        for name in names:
            # Same selection as glob("*.pdf"): hidden dot-files are excluded.
            if fnmatch.fnmatch(name, "*.pdf") and not name.startswith("."):
                DL.append(plots_dir + os.sep + name)

    # Find the date each pdf was produced
    DT = [os.path.getctime(c) for c in DL]

    # Transform lists into a dataframe
    df = pd.DataFrame({"Filename": DL, "Date Created": DT},
                      columns=["Filename", "Date Created"])
    # The explicit float dtype keeps the conversion valid for an empty frame.
    created = pd.to_datetime(df['Date Created'].astype('float64'), unit='s')
    df['Date Created'] = created.dt.date

    return df

In [None]:
# Create a dataframe of textbox values and write to csv
OUTPUT_CSV = 'C:\\Users\\james.runnalls\\Documents\\Jupyter\\Output.csv'

df = drawlist('241000')

# Load the drawings already recorded in the csv. A missing or freshly cleared
# (empty) file means nothing has been processed yet; read_csv cannot parse an
# empty file, so that case is handled explicitly.
if os.path.exists(OUTPUT_CSV) and os.path.getsize(OUTPUT_CSV) > 0:
    dk = pd.read_csv(OUTPUT_CSV, names=['FID', 'Text', 'Filename', 'Date Created'])
    dk = dk.drop(['FID', 'Text'], axis=1).drop_duplicates()
    dk['Date Created'] = pd.to_datetime(dk['Date Created']).dt.date
else:
    dk = pd.DataFrame(columns=['Filename', 'Date Created'])

# Keep only the drawings that are not in the csv yet.
merge = df.merge(dk, how='left', on=['Filename', 'Date Created'], indicator=True)
df = merge[merge['_merge'] == 'left_only']
#df = df.head(50)

for index, row in df.iterrows():
    try:
        xn = convert(row['Filename'])
        # One row per distinct text string found in this drawing.
        xx = dict(zip(xn, [row['Filename']] * len(xn)))
        dn = pd.DataFrame(list(xx.items()), columns=["Text", "Filename"])
        dn['Date Created'] = row['Date Created']
        # Append without a header; the frame index is written as the FID column.
        dn.to_csv(OUTPUT_CSV, mode='a', header=False)
    except Exception as exc:
        # Best effort: report the failing drawing and carry on with the rest.
        # Catching Exception (not everything) keeps Ctrl-C working.
        print("Skipping %s: %s" % (row['Filename'], exc))

In [None]:
# Clear the csv for starting again
#f = open('C:\\Users\\james.runnalls\\Documents\\Jupyter\\Output.csv', "w+")
#f.close()