In [12]:
from importlib import reload
import pdfquery

import incident
reload(incident)
from incident import Incident
import timing
reload(timing)
import utils
reload(utils)

<module 'utils' from '/Users/neel/Git/harvardcrime/utils.py'>

In [13]:
def get_text_from_curve(ltcurve):
    # text can be within LTTextBoxHorizontal or LTTextLineHorizontal
    # the problem is that these are interleaved
    # so we select EITHER as they come
    # if we chose the Boxes first and the Lines second then merged those lists,
    # the resulting list would be out of order!
    # e.g. if the true order is B1 L1 B2 B3 L2, the approach we are using
    # gives you the right order... but choosing Boxes and Lines separately
    # gives you B1 B2 B3 L1 L2!!!!
    textual_elements = ltcurve.cssselect("LTTextBoxHorizontal, LTTextLineHorizontal")
    texts = [t.text.strip() for t in textual_elements]
    
    # remove empty lines
    cleaned_texts = [t for t in texts if t != '']
    
    # PROBLEM with this approach: in rare cases some text from this falls way
    # outside the ltcurve. Might it still be within the bounding box though?
    
    # UPDATE: try gathering all data
    # for 11/28 consider these bboxes
    #
    # 10:08am [247.08, 80.197, 275.587, 90.18]
    # hp laptop [6.0, 65.784, 737.868, 82.318]
    # bounding box [3.36, 77.7, 754.86, 98.64]
    
    return cleaned_texts

In [14]:
def get_comments(pdf):
    # filtering function
    def comment_filter():
        return this.get('word_margin',0) == "0.1" and this.get('x0', 0) == "6.0" and this.get('x1', 0) == "737.868"

    comment_lines = pdf.pq('LTTextLineHorizontal').filter(comment_filter)

    def extract_text(comment_line):
        # sometimes text lives directly in the LTTextLine (which is passed),
        # sometimes it lives within a LTTextBox within the LTTextLine.
        # This standardizes that.
        if comment_line.text != '':
            # Contains text directly
            return comment_line.text
        else:
            text_box = comment_line.find("LTTextBoxHorizontal")

            if text_box is not None:
                # This is the Box inside the Line
                return text_box.text

            else:
                return None

    comments = [extract_text(line) for line in comment_lines]

    # Now smoosh the lines together. Lines beginning with
    # "Officer" are probably a new entry. Everything else
    # should be lumped with the previous line.

    cleaned_comments = []
    for comment in comments:
        if comment.startswith("Officer"):
            # proper new comment. add to the list!
            cleaned_comments.append(comment)
        else:
            # it's the next line of the previous comment.
            # add it to the previous comment.
            # these lines usually have spaces between them already
            # so no need to add a new one
            cleaned_comments[-1] += comment

    return cleaned_comments

In [18]:
def incidents_of_pdf(pdf):
    """
    Pass this function the result of a pdfquery.PDFQuery() function.
    This will read through the pdf file and return a list of 
    Incident objects contained in there!
    
    Make sure the PDF is load()'ed before you pass it.
    """
    
    # TODO watch out for things like 11/24/17 where there were no incidents. there's a specific tag for those.
    
    # so each individual report, as well as headers, is filed inside 
    # its own <LTCurve>. The text fields are inside <LTTextLineHorizontal>s and <LTTextBoxHorizontal>s
    # inside the <LTCurve>.
    reports_plus_heads = pdf.tree.findall(".//LTCurve")
    
    # extract raw incidents
    raw_incidents = [get_text_from_curve(lt) for lt in reports_plus_heads]

    # remove headers of tables
    HEADER_ROW_TEXT = ['Reported', 'Incident Type', 'Occurred', 'Location', 'Disposition']
    incidents_without_headers = [i for i in raw_incidents if i != HEADER_ROW_TEXT]
    
    # convert incidents to proper objects
    # 9 = proper length of report; anything less is malformed
    # TODO clean up — extract error checking into its own make_incident_objects() function
    incident_objects = [incident.Incident(i) for i in incidents_without_headers if len(i) == 9]
    
    # get the comments from this pdf
    comments = get_comments(pdf)
    # there should be as many comments as incidents, so attach one to each
    # (in order)
    if len(comments) == len(incident_objects):
        for i in range(0, len(comments)):
            incident_objects[i].comments = comments[i]
    else:
        # PROBLEM: if the number of comments != the number of incidents,
        # we currently don't attach any comments to anyone.
        # is there a way to match comments individually to incidents
        # (like the LTCurves)? perhaps by x/y coords? that might help
        print("Wrong length comments!")
        print(comments)
        print(len(comments))
        print(incident_objects)
        print(len(incident_objects))
    
    return incident_objects

In [19]:
pdf = pdfquery.PDFQuery("data/112817.pdf")
pdf.load()
all_incidents = incidents_of_pdf(pdf)


Wrong length comments!
['Officer dispatched to take a report of a stolen U.S. currency valued at $620.00. ', 'Officer dispatched to take a report of a stolen blue Canada Goose jacket valued at $1,000.00. ', 'Officer dispatched to take a report of a stolen green Subaru Forrester valued at $12,000.00. ', 'Officer dispatched to take a report of threats. ', 'Officer dispatched to a report of an unwanted guest sitting in on a class. Officer arrived, located individual and conducted a field interview. The individual was run for wants/warrants with negative results. The individual was then advised that they need to get permission to sit in on future classes. The individual was then sent on their way. ', 'Officer dispatched to take a report of a stolen package containing an HP laptop valued at $821.00. ', 'Officers dispatched to a report of an individual wandering in and out of the library for the last 20 minutes. Officers arrived and report individual gone on arrival. ', 'Officer dispatched t

In [20]:
# try another date
pdf = pdfquery.PDFQuery("data/113017.pdf")
pdf.load()
new_incidents = incidents_of_pdf(pdf)
all_incidents += new_incidents
utils.dump_csv(all_incidents)

In [None]:
# REMAINING TODOS:
# - have a function to programmatically download HUPD crime logs
# - have another function to run through all downloaded crime logs in the `data` folder
#   (requires us to read the file system?)
# - Extract the descriptive test along with the metadata. This is somewhat harder but still
#   very important!

In [40]:
# Go through EVERY pdf in our data folder!
import glob

all_incidents = []

for filename in glob.iglob('data/*.pdf'):
    # filename will be like `data/xxxxxx.pdf`
    # extract incidents from this file
    pdf = pdfquery.PDFQuery(filename)
    pdf.load()
    new_incidents = incidents_of_pdf(pdf)
    all_incidents += new_incidents
    
    print("Done {}".format(filename))
    
# dump to csv
utils.dump_csv(all_incidents)
print("Dumped!")

Done data/012018.pdf
Done data/010418.pdf
Done data/020518.pdf
Done data/110817.pdf
Done data/020718.pdf
Done data/010618.pdf
Done data/012218.pdf
Done data/112817.pdf
Done data/122917.pdf
Done data/012618.pdf
Done data/123017.pdf
Done data/010218.pdf
Done data/020318.pdf
Done data/020118.pdf
Done data/011918.pdf
Done data/012418.pdf
Done data/020618.pdf
Done data/010718.pdf
Done data/012118.pdf
Done data/010518.pdf
Done data/020418.pdf
Done data/010118.pdf
Done data/012518.pdf
Done data/011818.pdf
Done data/012718.pdf
Done data/122817.pdf
Done data/010318.pdf
Done data/020218.pdf
Done data/113017.pdf
Done data/123117.pdf
Done data/122417.pdf
Done data/121917.pdf
Done data/011618.pdf
Done data/011418.pdf
Done data/012918.pdf
Done data/122617.pdf
Done data/013018.pdf
Done data/010918.pdf
Done data/020818.pdf
Done data/011018.pdf
Done data/021118.pdf
Done data/122217.pdf
Done data/122017.pdf
Done data/021318.pdf
Done data/011218.pdf
Done data/122717.pdf
Done data/012818.pdf
Done data/021