In [1]:
import pdfquery


In [7]:
# Open the police-log PDF and parse it; later cells query the result via pdf.tree.
pdf = pdfquery.PDFQuery("data/112817.pdf")
pdf.load()

In [8]:
# see what happens when we dump it
filename = "tmp.xml"
pdf.tree.write(filename, pretty_print=True)

In [13]:
[t.text for t in pdf.tree.findall(".//LTTextLineHorizontal")]

['Francis D. Riley ',
 'Chief Of Police ',
 '',
 '1033 Massachusetts Avenue ',
 'Sixth Floor ',
 'Cambridge, Massachusetts 02138 ',
 'Phone: 617-495-1215 ',
 'Fax: 617-495-7782 ',
 'President and Fellows of Harvard College ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Officer dispatched to a report of an unwanted guest sitting in on a class. Officer arrived, located individual and ',
 'conducted a field interview. The individual was run for wants/warrants with negative results. The individual was then ',
 'advised that they need to get permission to sit in on future classes. The individual was then sent on their way. ',
 '11/28/17 ',
 '3:14 PM ',
 '',
 '11/16/17 ',
 '10:08 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '8:26 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '9:08 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '10:17 AM ',
 '',
 '11/27/17 - 5:45 PM ',
 '11/28/17 - 9:00 AM ',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '12:07 PM ',
 '',


In [14]:
[t.text for t in pdf.tree.findall(".//LTTextBoxHorizontal")]

['',
 'Harvard University Police Department ',
 '',
 u'Copyright \xa9 ',
 '2017 ',
 'PUBLIC POLICE LOG ',
 '11/28/2017 ',
 'Date & Time ',
 'Date & Time ',
 'Officer dispatched to take a report of a stolen U.S. currency valued at $620.00. ',
 'Officer dispatched to take a report of a stolen blue Canada Goose jacket valued at $1,000.00. ',
 'Officer dispatched to take a report of a stolen green Subaru Forrester valued at $12,000.00. ',
 'Officer dispatched to take a report of threats. ',
 '',
 '',
 'Officer dispatched to take a report of a stolen package containing an HP laptop valued at $821.00. ',
 '',
 '1 WESTERN AVE ',
 'Reported ',
 'Incident Type ',
 'Occurred ',
 'Location ',
 'Disposition ',
 '',
 'THEFT REPORT ',
 '11/28/17 ',
 '7:30 AM - 8:00 AM ',
 'MALKIN ATHLETIC CENTER ',
 '39 HOLYOKE ST ',
 'CAMBRIDGE ',
 'CLOSED ',
 ' ',
 '',
 'THEFT REPORT ',
 '11/21/17 ',
 '2:00 PM - 6:00 PM ',
 'NEW DENTAL RESEARCH BUILDING ',
 '190 LONGWOOD AVE ',
 'BOSTON ',
 'CLOSED ',
 ' ',
 '',
 

In [15]:
# So each individual report, as well as each header row, is stored inside
# its own <LTCurve>. The text fields are inside <LTTextLineHorizontal> and <LTTextBoxHorizontal> elements.

In [17]:
# Collect every <LTCurve> element in the parsed tree (report rows and header rows alike).
reports_plus_heads = pdf.tree.findall(".//LTCurve")

In [18]:
report2 = reports_plus_heads[1]

In [19]:
report2

<Element LTCurve at 0x1083e5788>

In [28]:
cleaned_texts

['11/28/17',
 '8:26 AM',
 'THEFT REPORT',
 '11/28/17',
 '7:30 AM - 8:00 AM',
 'MALKIN ATHLETIC CENTER',
 '39 HOLYOKE ST',
 'CAMBRIDGE',
 'CLOSED']

In [32]:
def get_text_from_curve(ltcurve):
    """
    Collect the non-empty text strings found beneath one <LTCurve> element.

    Every <LTTextLineHorizontal> descendant is looked up first, then every
    <LTTextBoxHorizontal> descendant; their stripped text is returned in that
    order (all lines, then all boxes).

    Args:
        ltcurve: an element supporting ``findall`` whose matches expose ``.text``.

    Returns:
        A list of stripped strings. Elements whose text is missing, empty, or
        whitespace-only are left out.
    """
    textual_elements = (ltcurve.findall(".//LTTextLineHorizontal") +
                        ltcurve.findall(".//LTTextBoxHorizontal"))

    # An element without a text node reports `text` as None rather than '',
    # so fall back to '' before stripping instead of raising AttributeError.
    texts = [(t.text or '').strip() for t in textual_elements]

    # remove empty lines
    cleaned_texts = [t for t in texts if t != '']

    # PROBLEM with this approach: in rare cases some text from this falls way
    # outside the ltcurve. Might it still be within the bounding box though?

    # UPDATE: try gathering all data
    # for 11/28 consider these bboxes
    #
    # 10:08am [247.08, 80.197, 275.587, 90.18]
    # hp laptop [6.0, 65.784, 737.868, 82.318]
    # bounding box [3.36, 77.7, 754.86, 98.64]

    return cleaned_texts

In [35]:
# The text list that a header-row curve produces; compared against each
# extracted row so header rows can be filtered out of the incidents.
HEADER_ROW_TEXT = ['Reported', 'Incident Type', 'Occurred', 'Location', 'Disposition']

In [36]:
# Extract the cleaned text list for every curve (report rows and header rows).
incidents = [get_text_from_curve(lt) for lt in reports_plus_heads]

# remove headers: drop every row whose text is exactly the header row
incidents_without_headers = [i for i in incidents if i != HEADER_ROW_TEXT]

In [37]:
incidents_without_headers

[['11/28/17',
  '8:26 AM',
  'THEFT REPORT',
  '11/28/17',
  '7:30 AM - 8:00 AM',
  'MALKIN ATHLETIC CENTER',
  '39 HOLYOKE ST',
  'CAMBRIDGE',
  'CLOSED'],
 ['11/28/17',
  '9:08 AM',
  'THEFT REPORT',
  '11/21/17',
  '2:00 PM - 6:00 PM',
  'NEW DENTAL RESEARCH BUILDING',
  '190 LONGWOOD AVE',
  'BOSTON',
  'CLOSED'],
 ['11/28/17',
  '10:17 AM',
  '11/27/17 - 5:45 PM',
  '11/28/17 - 9:00 AM',
  'MOTOR VEHICLE THEFT',
  'FACULTY ROW CAR PORT',
  '46 LINNAEAN ST',
  'CAMBRIDGE',
  'CLOSED'],
 ['11/28/17',
  '12:07 PM',
  '11/27/17 - 4:53 PM',
  '11/28/17 - 11:02 AM',
  'THREAT(S)',
  'DUDLEY HOUSE - LEHMAN HALL',
  '8 HARVARD YARD',
  'CAMBRIDGE',
  'OPEN'],
 ['11/28/17',
  '1:48 PM',
  'FIELD INTERVIEW',
  '11/28/17',
  '1:48 PM - 2:31 PM',
  'ALDRICH HALL',
  '35 HARVARD WAY',
  'ALLSTON',
  'CLOSED'],
 ['THEFT REPORT', 'CLOSED', 'ALLSTON'],
 ['11/28/17',
  '4:20 PM',
  '11/28/17',
  '4:20 PM',
  'EMERSON HALL',
  '26 HARVARD YARD',
  'SUSPICIOUS ACTIVITY',
  'CAMBRIDGE',
  'CLOSED'],


In [39]:
class Incident(object):
    """
    A single police-log entry built from one extracted row of strings.

    Each attribute is taken from a fixed position in the row; the values are
    stored exactly as given (plain strings, no parsing).
    """

    def __init__(self, data_row):
        # `data_row` is expected to look like:
        #
        #  ['11/28/17', '1:48 PM', 'FIELD INTERVIEW', '11/28/17',
        #   '1:48 PM - 2:31 PM', 'ALDRICH HALL', '35 HARVARD WAY',
        #   'ALLSTON', 'CLOSED']
        #
        # Attribute names, listed in the order their values appear in the row.
        field_order = (
            "date_reported",   # 1. Date Reported
            "time_reported",   # 2. Time Reported
            "incident_type",   # 3. Incident Type
            "date_occurred",   # 4. Date Occurred
            "time_occurred",   # 5. Time Occurred
            "location",        # 6. Location
            "street_address",  # 7. Street Address
            "city",            # 8. City (in Massachusetts)
            "disposition",     # 9. "Disposition Type", per HUPD (open or closed)
        )

        # Index by position (rather than zip) so that a row with fewer than
        # nine entries raises IndexError at the first missing position.
        for position, field_name in enumerate(field_order):
            setattr(self, field_name, data_row[position])

        # TODO: convert date/time reported into datetime objects

In [40]:
# TODO: watch out for days like 11/24/17 where there were no incidents. There's a specific tag for those.

In [47]:
def element_within_box(element, box_coords):
    """
    Debugging helper: print an element's coordinates next to a box's.

    Args:
        element: an element whose ``x0``, ``y0``, ``x1`` and ``y1`` attributes
            (read with ``element.get``) hold numeric strings.
        box_coords: the box to compare against; must be [x0, y0, x1, y1].

    Returns:
        None. The two coordinate lists are only printed, element first.
        TODO: return whether the element actually lies within the box.
    """
    # get element coords
    element_coords = [float(element.get(name)) for name in ("x0", "y0", "x1", "y1")]

    # The parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print() function.
    print(element_coords)
    print(box_coords)

In [48]:
element_within_box(textual_elements[0], box)

[4.5, 545.197, 51.496, 555.18]
[3.36, 77.7, 754.86, 98.64]


In [85]:
class BoundingBox(object):
    """
    Axis-aligned rectangle described by two corners, (x0, y0) and (x1, y1).
    """

    def __init__(self, x0, y0, x1, y1):
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1

    def is_inside_box(self, other_box, fuzz=5):
        """
        Returns true if this box lies within the other box, allowing every
        edge to stick out of the other box by at most `fuzz` units.

        Args:
            other_box: the BoundingBox that should contain this one.
            fuzz: tolerance applied to all four edges. Defaults to 5;
                pass 0 for strict containment.
        """
        return (self.x0 >= other_box.x0 - fuzz and
                self.x1 <= other_box.x1 + fuzz and
                self.y0 >= other_box.y0 - fuzz and
                self.y1 <= other_box.y1 + fuzz)

    def __str__(self):
        return "({},{}) to ({},{})".format(self.x0, self.y0, self.x1, self.y1)
    
        

In [101]:
# find everything within this box


# get all textual elements
# Both lookups use the relative ".//" form so each searches all descendants
# of the tree root the same way.
textual_elements = (pdf.tree.findall(".//LTTextLineHorizontal") +
                    pdf.tree.findall(".//LTTextBoxHorizontal"))

In [87]:
test_elt = textual_elements[0]

In [88]:
def box_of_element(element):
    """
    Build a BoundingBox from an element's x0, y0, x1 and y1 attributes.

    Each attribute is read with ``element.get`` and converted to float.
    """
    x0, y0, x1, y1 = [float(element.get(attr)) for attr in ("x0", "y0", "x1", "y1")]
    return BoundingBox(x0, y0, x1, y1)

In [89]:
str(box_of_element(test_elt))

'(4.5,545.197) to (51.496,555.18)'

In [90]:
# What's in here?
test_box = BoundingBox(3.36, 77.7, 754.86, 98.64)

In [91]:
# get boxes for all elts

def is_in_box(element):
    """Return whether the element's bounding box is inside the global `test_box`."""
    return box_of_element(element).is_inside_box(test_box)

In [92]:
# Keep only the text elements that is_in_box accepts, i.e. those inside test_box.
valid_elts = [e for e in textual_elements if is_in_box(e)]

In [93]:
[v.text for v in valid_elts]

['11/28/17 ',
 '3:14 PM ',
 '11/16/17 ',
 '10:08 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '1 WESTERN AVE ',
 'THEFT REPORT ',
 'CLOSED ',
 'ALLSTON ']

In [None]:
"""

Summary of where we are as of 11/30/17

Through the "text-extraction" approach, we get 95% of the data. 
Sometimes, however, some text is put in the wrong order in the
document so we miss it. See the 10:08 AM report on 11/28/17.

The approach of "finding everything within the box" is quite
messy and difficult. But it might be able to get all the info.
The main problem is that I can't figure out how to properly
interleave the TextLines and TextBoxes. It's close — might need
to extract the TextLines within the TextBoxes. But it's such a messy operation
that I'd rather go with the text-extraction approach for the MVP.

"""