In [1]:
import pdfquery


In [2]:
# Open the police-log PDF and load it; later cells read the parsed
# element tree through pdf.tree.
pdf = pdfquery.PDFQuery("data/112817.pdf")
pdf.load()

In [3]:
# Dump the parsed element tree to a pretty-printed XML file so that the
# structure pdfquery produced can be inspected by hand.
filename = "tmp.xml"
pdf.tree.write(filename, pretty_print=True)

In [4]:
# Exploration: the text of every <LTTextLineHorizontal> element in the tree.
# Many of them are empty strings, as the output below shows.
[t.text for t in pdf.tree.findall(".//LTTextLineHorizontal")]




['Francis D. Riley ',
 'Chief Of Police ',
 '',
 '1033 Massachusetts Avenue ',
 'Sixth Floor ',
 'Cambridge, Massachusetts 02138 ',
 'Phone: 617-495-1215 ',
 'Fax: 617-495-7782 ',
 'President and Fellows of Harvard College ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Officer dispatched to a report of an unwanted guest sitting in on a class. Officer arrived, located individual and ',
 'conducted a field interview. The individual was run for wants/warrants with negative results. The individual was then ',
 'advised that they need to get permission to sit in on future classes. The individual was then sent on their way. ',
 '11/28/17 ',
 '3:14 PM ',
 '',
 '11/16/17 ',
 '10:08 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '8:26 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '9:08 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '10:17 AM ',
 '',
 '11/27/17 - 5:45 PM ',
 '11/28/17 - 9:00 AM ',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '12:07 PM ',
 '',


In [5]:
# Exploration: the text of every <LTTextBoxHorizontal> element in the tree.
# Together with the <LTTextLineHorizontal> texts these make up the report fields.
[t.text for t in pdf.tree.findall(".//LTTextBoxHorizontal")]

['',
 'Harvard University Police Department ',
 '',
 u'Copyright \xa9 ',
 '2017 ',
 'PUBLIC POLICE LOG ',
 '11/28/2017 ',
 'Date & Time ',
 'Date & Time ',
 'Officer dispatched to take a report of a stolen U.S. currency valued at $620.00. ',
 'Officer dispatched to take a report of a stolen blue Canada Goose jacket valued at $1,000.00. ',
 'Officer dispatched to take a report of a stolen green Subaru Forrester valued at $12,000.00. ',
 'Officer dispatched to take a report of threats. ',
 '',
 '',
 'Officer dispatched to take a report of a stolen package containing an HP laptop valued at $821.00. ',
 '',
 '1 WESTERN AVE ',
 'Reported ',
 'Incident Type ',
 'Occurred ',
 'Location ',
 'Disposition ',
 '',
 'THEFT REPORT ',
 '11/28/17 ',
 '7:30 AM - 8:00 AM ',
 'MALKIN ATHLETIC CENTER ',
 '39 HOLYOKE ST ',
 'CAMBRIDGE ',
 'CLOSED ',
 ' ',
 '',
 'THEFT REPORT ',
 '11/21/17 ',
 '2:00 PM - 6:00 PM ',
 'NEW DENTAL RESEARCH BUILDING ',
 '190 LONGWOOD AVE ',
 'BOSTON ',
 'CLOSED ',
 ' ',
 '',
 

In [6]:
# Each individual report, as well as each header row, is filed inside its own
# <LTCurve>. The text fields live in the <LTTextLineHorizontal> and
# <LTTextBoxHorizontal> elements nested inside that curve.

reports_plus_heads = pdf.tree.findall(".//LTCurve")

In [7]:
def get_text_from_curve(ltcurve):
    """
    Return the non-empty text fields found inside one <LTCurve> element.

    ltcurve: an element that supports cssselect(); its nested
        LTTextBoxHorizontal / LTTextLineHorizontal elements hold the text.

    Returns a list of whitespace-stripped strings, in the order the elements
    are selected, with empty entries removed.
    """
    # Text can be within LTTextBoxHorizontal or LTTextLineHorizontal, and the
    # two kinds are interleaved, so we select EITHER kind as they come.
    # If we chose the Boxes first and the Lines second and then merged those
    # lists, the result would be out of order: e.g. if the true order is
    # B1 L1 B2 B3 L2, the combined selector gives the right order, but
    # selecting Boxes and Lines separately gives B1 B2 B3 L1 L2.
    textual_elements = ltcurve.cssselect("LTTextBoxHorizontal, LTTextLineHorizontal")

    # An element without any text can report its text as None instead of ''
    # (lxml does this for elements that have no text node), so fall back to
    # '' before stripping rather than raising AttributeError.
    texts = [(t.text or '').strip() for t in textual_elements]

    # remove empty lines
    cleaned_texts = [t for t in texts if t != '']

    # PROBLEM with this approach: in rare cases some text belonging to a
    # report falls way outside the ltcurve. Might it still be within the
    # bounding box though?
    #
    # UPDATE: try gathering all data.
    # For 11/28 consider these bboxes:
    #
    # 10:08am      [247.08, 80.197, 275.587, 90.18]
    # hp laptop    [6.0, 65.784, 737.868, 82.318]
    # bounding box [3.36, 77.7, 754.86, 98.64]

    return cleaned_texts

In [8]:
# The text fields of the log's column-header row. A curve whose extracted
# text equals this list is a header, not an incident report.
HEADER_ROW_TEXT = ['Reported', 'Incident Type', 'Occurred', 'Location', 'Disposition']

In [9]:
# One list of text fields per <LTCurve> (incident reports and header rows alike).
incidents = [get_text_from_curve(lt) for lt in reports_plus_heads]

# remove headers: drop every row that is exactly the column-header text
incidents_without_headers = [i for i in incidents if i != HEADER_ROW_TEXT]

In [10]:
# Inspect the extracted rows. Most have 9 fields, but the output below also
# contains a partial row (['THEFT REPORT', 'CLOSED', 'ALLSTON']).
incidents_without_headers

[['11/28/17',
  '8:26 AM',
  'THEFT REPORT',
  '11/28/17',
  '7:30 AM - 8:00 AM',
  'MALKIN ATHLETIC CENTER',
  '39 HOLYOKE ST',
  'CAMBRIDGE',
  'CLOSED'],
 ['11/28/17',
  '9:08 AM',
  'THEFT REPORT',
  '11/21/17',
  '2:00 PM - 6:00 PM',
  'NEW DENTAL RESEARCH BUILDING',
  '190 LONGWOOD AVE',
  'BOSTON',
  'CLOSED'],
 ['11/28/17',
  '10:17 AM',
  'MOTOR VEHICLE THEFT',
  '11/27/17 - 5:45 PM',
  '11/28/17 - 9:00 AM',
  'FACULTY ROW CAR PORT',
  '46 LINNAEAN ST',
  'CAMBRIDGE',
  'CLOSED'],
 ['11/28/17',
  '12:07 PM',
  'THREAT(S)',
  '11/27/17 - 4:53 PM',
  '11/28/17 - 11:02 AM',
  'DUDLEY HOUSE - LEHMAN HALL',
  '8 HARVARD YARD',
  'CAMBRIDGE',
  'OPEN'],
 ['11/28/17',
  '1:48 PM',
  'FIELD INTERVIEW',
  '11/28/17',
  '1:48 PM - 2:31 PM',
  'ALDRICH HALL',
  '35 HARVARD WAY',
  'ALLSTON',
  'CLOSED'],
 ['THEFT REPORT', 'CLOSED', 'ALLSTON'],
 ['11/28/17',
  '4:20 PM',
  'SUSPICIOUS ACTIVITY',
  '11/28/17',
  '4:20 PM',
  'EMERSON HALL',
  '26 HARVARD YARD',
  'CAMBRIDGE',
  'CLOSED'],


In [38]:
# Import the local helper modules and reload them, so edits made to
# incident.py / timing.py while the kernel is running are picked up.
# Incident is imported from the module AFTER the reload so the name refers
# to the freshly loaded class.
# NOTE(review): the bare reload() builtin only exists on Python 2.
import incident
reload(incident)
from incident import Incident
import timing
reload(timing)

<module 'timing' from 'timing.pyc'>

In [39]:
# TODO: watch out for logs like 11/24/17, where there were no incidents. There's a specific tag for those.

In [40]:
# Number of text fields in a well-formed report; anything else is malformed.
INCIDENT_FIELD_COUNT = 9


def make_incident_objects(raw_incidents, expected_length=INCIDENT_FIELD_COUNT):
    """
    Convert raw incident rows into Incident objects, setting malformed rows aside.

    raw_incidents: iterable of lists of text fields (one list per report).
    expected_length: number of fields a well-formed report must have.

    Returns a (well_formed, malformed) tuple: a list of incident.Incident
    objects built from the rows that have exactly expected_length fields, and
    a list of the raw rows that were skipped, so they can be inspected
    instead of silently disappearing.
    """
    well_formed = []
    malformed = []
    for fields in raw_incidents:
        if len(fields) == expected_length:
            well_formed.append(incident.Incident(fields))
        else:
            malformed.append(fields)
    return well_formed, malformed


# convert incidents to proper objects; keep the rejected rows around for review
incident_objects, malformed_incidents = make_incident_objects(incidents_without_headers)

In [41]:
# Spot-check the CSV representation of the first incident.
# NOTE(review): in the recorded output the 'reported' value is the repr of a
# tuple of two datetimes, unlike 'occurred_start' / 'occurred_end', which are
# formatted timestamps. This looks like a bug in Incident.to_dict_for_csv;
# verify in incident.py.
incident_objects[0].to_dict_for_csv()

{'city': 'CAMBRIDGE',
 'disposition': 'CLOSED',
 'incident_type': 'THEFT REPORT',
 'location': 'MALKIN ATHLETIC CENTER',
 'occurred_end': '2017-11-28 08:00:00',
 'occurred_start': '2017-11-28 07:30:00',
 'reported': '(datetime.datetime(2017, 11, 28, 8, 26), datetime.datetime(2017, 11, 28, 8, 26))',
 'street_address': '39 HOLYOKE ST'}

In [31]:
import csv
import json

def dump_csv(incidents, path='harvard_crime_incidents.csv', fieldnames=None):
    """
    Dumps a list of Incident objects to CSV.

    incidents: iterable of objects exposing to_dict_for_csv(), which must
        return a dict keyed by the CSV field names.
    path: file to write; defaults to 'harvard_crime_incidents.csv' in the
        current working directory. An existing file is overwritten.
    fieldnames: column names, in output order; defaults to Incident.CSV_FIELDS.

    Writes a header row followed by one row per incident. Returns None.
    """
    import sys  # local import: only needed to choose the file mode below

    if fieldnames is None:
        fieldnames = Incident.CSV_FIELDS

    # The csv writer emits its own '\r\n' row terminators, so the file must be
    # opened with newline translation disabled; otherwise rows end in
    # '\r\r\n' on Windows. Python 2 does this with binary mode, Python 3 with
    # newline=''.
    if sys.version_info[0] < 3:
        csvfile = open(path, 'wb')
    else:
        csvfile = open(path, 'w', newline='')

    with csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()

        # Named `record` rather than `incident` so the loop variable does not
        # shadow the imported `incident` module inside this function.
        for record in incidents:
            writer.writerow(record.to_dict_for_csv())

In [32]:
# Write every parsed incident to the CSV file.
# NOTE(review): this cell's execution count (32) is lower than that of the
# cell that builds incident_objects (40), so the notebook was run out of
# order; re-check with Restart & Run All.
dump_csv(incident_objects)

In [33]:
# Pick out a single <LTCurve> (the third one found) for closer inspection.
curve = reports_plus_heads[2]