In [1]:
import pdfquery


In [2]:
pdf = pdfquery.PDFQuery("data/112817.pdf")
pdf.load()

In [3]:
# see what happens when we dump it
filename = "tmp.xml"
pdf.tree.write(filename, pretty_print=True)

In [18]:
[t.text for t in pdf.tree.findall(".//LTTextLineHorizontal")]




['Francis D. Riley ',
 'Chief Of Police ',
 '',
 '1033 Massachusetts Avenue ',
 'Sixth Floor ',
 'Cambridge, Massachusetts 02138 ',
 'Phone: 617-495-1215 ',
 'Fax: 617-495-7782 ',
 'President and Fellows of Harvard College ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Officer dispatched to a report of an unwanted guest sitting in on a class. Officer arrived, located individual and ',
 'conducted a field interview. The individual was run for wants/warrants with negative results. The individual was then ',
 'advised that they need to get permission to sit in on future classes. The individual was then sent on their way. ',
 '11/28/17 ',
 '3:14 PM ',
 '',
 '11/16/17 ',
 '10:08 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '8:26 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '9:08 AM ',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '10:17 AM ',
 '',
 '11/27/17 - 5:45 PM ',
 '11/28/17 - 9:00 AM ',
 '',
 '',
 '',
 '',
 '',
 '11/28/17 ',
 '12:07 PM ',
 '',


In [19]:
[t.text for t in pdf.tree.findall(".//LTTextBoxHorizontal")]

['',
 'Harvard University Police Department ',
 '',
 u'Copyright \xa9 ',
 '2017 ',
 'PUBLIC POLICE LOG ',
 '11/28/2017 ',
 'Date & Time ',
 'Date & Time ',
 'Officer dispatched to take a report of a stolen U.S. currency valued at $620.00. ',
 'Officer dispatched to take a report of a stolen blue Canada Goose jacket valued at $1,000.00. ',
 'Officer dispatched to take a report of a stolen green Subaru Forrester valued at $12,000.00. ',
 'Officer dispatched to take a report of threats. ',
 '',
 '',
 'Officer dispatched to take a report of a stolen package containing an HP laptop valued at $821.00. ',
 '',
 '1 WESTERN AVE ',
 'Reported ',
 'Incident Type ',
 'Occurred ',
 'Location ',
 'Disposition ',
 '',
 'THEFT REPORT ',
 '11/28/17 ',
 '7:30 AM - 8:00 AM ',
 'MALKIN ATHLETIC CENTER ',
 '39 HOLYOKE ST ',
 'CAMBRIDGE ',
 'CLOSED ',
 ' ',
 '',
 'THEFT REPORT ',
 '11/21/17 ',
 '2:00 PM - 6:00 PM ',
 'NEW DENTAL RESEARCH BUILDING ',
 '190 LONGWOOD AVE ',
 'BOSTON ',
 'CLOSED ',
 ' ',
 '',
 

In [36]:
# so each individual report, as well as headers, is filed inside 
# its own <LTCurve>. The text fields are inside <LTTextLineHorizontal>s and <LTTextBoxHorizontal>s.

reports_plus_heads = pdf.tree.findall(".//LTCurve")

In [37]:
def get_text_from_curve(ltcurve):
    # text can be within LTTextBoxHorizontal or LTTextLineHorizontal
    # the problem is that these are interleaved
    # so we select EITHER as they come
    # if we chose the Boxes first and the Lines second then merged those lists,
    # the resulting list would be out of order!
    # e.g. if the true order is B1 L1 B2 B3 L2, the approach we are using
    # gives you the right order... but choosing Boxes and Lines separately
    # gives you B1 B2 B3 L1 L2!!!!
    textual_elements = ltcurve.cssselect("LTTextBoxHorizontal, LTTextLineHorizontal")
    texts = [t.text.strip() for t in textual_elements]
    
    # remove empty lines
    cleaned_texts = [t for t in texts if t != '']
    
    # PROBLEM with this approach: in rare cases some text from this falls way
    # outside the ltcurve. Might it still be within the bounding box though?
    
    # UPDATE: try gathering all data
    # for 11/28 consider these bboxes
    #
    # 10:08am [247.08, 80.197, 275.587, 90.18]
    # hp laptop [6.0, 65.784, 737.868, 82.318]
    # bounding box [3.36, 77.7, 754.86, 98.64]
    
    return cleaned_texts

In [38]:
HEADER_ROW_TEXT = ['Reported', 'Incident Type', 'Occurred', 'Location', 'Disposition']

In [39]:
incidents = [get_text_from_curve(lt) for lt in reports_plus_heads]

# remove headers
incidents_without_headers = [i for i in incidents if i != HEADER_ROW_TEXT]

In [40]:
incidents_without_headers

[['11/28/17',
  '8:26 AM',
  'THEFT REPORT',
  '11/28/17',
  '7:30 AM - 8:00 AM',
  'MALKIN ATHLETIC CENTER',
  '39 HOLYOKE ST',
  'CAMBRIDGE',
  'CLOSED'],
 ['11/28/17',
  '9:08 AM',
  'THEFT REPORT',
  '11/21/17',
  '2:00 PM - 6:00 PM',
  'NEW DENTAL RESEARCH BUILDING',
  '190 LONGWOOD AVE',
  'BOSTON',
  'CLOSED'],
 ['11/28/17',
  '10:17 AM',
  'MOTOR VEHICLE THEFT',
  '11/27/17 - 5:45 PM',
  '11/28/17 - 9:00 AM',
  'FACULTY ROW CAR PORT',
  '46 LINNAEAN ST',
  'CAMBRIDGE',
  'CLOSED'],
 ['11/28/17',
  '12:07 PM',
  'THREAT(S)',
  '11/27/17 - 4:53 PM',
  '11/28/17 - 11:02 AM',
  'DUDLEY HOUSE - LEHMAN HALL',
  '8 HARVARD YARD',
  'CAMBRIDGE',
  'OPEN'],
 ['11/28/17',
  '1:48 PM',
  'FIELD INTERVIEW',
  '11/28/17',
  '1:48 PM - 2:31 PM',
  'ALDRICH HALL',
  '35 HARVARD WAY',
  'ALLSTON',
  'CLOSED'],
 ['THEFT REPORT', 'CLOSED', 'ALLSTON'],
 ['11/28/17',
  '4:20 PM',
  'SUSPICIOUS ACTIVITY',
  '11/28/17',
  '4:20 PM',
  'EMERSON HALL',
  '26 HARVARD YARD',
  'CAMBRIDGE',
  'CLOSED'],


In [41]:
class Incident(object):
    
    def __init__(self, data_row):
        # `data_row` is something like:
        #
        #  ['11/28/17',
        #   '1:48 PM',
        #   'FIELD INTERVIEW',
        #   '11/28/17',
        #   '1:48 PM - 2:31 PM',
        #   'ALDRICH HALL',
        #   '35 HARVARD WAY',
        #   'ALLSTON',
        #   'CLOSED'],
        #
        # The rows correspond to:
        # 1. Date Reported
        # 2. Time Reported
        # 3. Incident Type
        # 4. Date Occurred
        # 5. Time Occurred
        # 6. Location
        # 7. Street Address
        # 8. City (in Massachusetts)
        # 9. "Disposition Type", per HUPD (whether the case is open or closed)
        
        self.date_reported = data_row[0]
        self.time_reported = data_row[1]
        self.incident_type = data_row[2]
        self.location = data_row[5]
        self.street_address = data_row[6]
        self.city = data_row[7]
        self.disposition = data_row[8]
        
        # TODO: there are 3 ways for date and time occurred to be 
        # represented! (left of /// is data_row[3], right of /// is data_row[4])
        """
        11/28/17 /// 4:20 PM
        11/21/17 /// 2:00 PM - 6:00 PM
        11/27/17 - 5:45 PM /// 11/28/17 - 9:00 AM
        """
        # The first & second ways are used if the start and end of the incident
        # are on the same date. The third way is used if it spans multiple
        # dates.
        # TODO PARSE
#         self.date_occurred = data_row[3]
#         self.time_occurred = data_row[4]
        
        # TODO: convert date/time reported into datetime objects
        
        

    # static
    # this is the list of fields that are exported to CSV
    # see to_dict_for_csv()
    CSV_FIELDS = [
        'date_reported',
        'time_reported',
        'incident_type',
        'date_occurred',
        'time_occurred',
        'location',
        'street_address',
        'city',
        'disposition'
    ]
    
    def to_dict_for_csv(self):
        # returns a nicer-formatted dict ready for insertion into a csv
        # so that means any arrays need to be flattened to scalars
        # also everything needs to be converted to ascii

        # TODO: extract to utils module
        def to_ascii(unicode_str):
            if unicode_str is None:
                return None
            return unicode_str.encode("ascii","replace")

        return dict(
            date_reported=to_ascii(self.date_reported),
            time_reported=to_ascii(self.time_reported),
            incident_type=to_ascii(self.incident_type),
            date_occurred=to_ascii(self.date_occurred),
            time_occurred=to_ascii(self.time_occurred),
            location=to_ascii(self.location),
            street_address=to_ascii(self.street_address),
            city=to_ascii(self.city),
            disposition=to_ascii(self.disposition)
        )

In [42]:
# TODO watch out for things like 11/24/17 where there were no incidents. there's a specific tag for those.

In [43]:
# convert incidents to proper objects
# 9 = proper length of report; anything less is malformed
# TODO clean up — extract error checking into its own make_incident_objects() function
incident_objects = [Incident(i) for i in incidents_without_headers if len(i) == 9]

In [44]:
incident_objects[0].to_dict_for_csv()

{'city': 'CAMBRIDGE',
 'date_occurred': '11/28/17',
 'date_reported': '11/28/17',
 'disposition': 'CLOSED',
 'incident_type': 'THEFT REPORT',
 'location': 'MALKIN ATHLETIC CENTER',
 'street_address': '39 HOLYOKE ST',
 'time_occurred': '7:30 AM - 8:00 AM',
 'time_reported': '8:26 AM'}

In [45]:
import csv
import json

def dump_csv(incidents):
    """
    Dumps a list of Incident objects to CSV.
    """
    with open('harvard_crime_incidents.csv', 'w') as csvfile:
        fieldnames = Incident.CSV_FIELDS
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()

        for incident in incidents:
            writer.writerow(incident.to_dict_for_csv())

In [46]:
dump_csv(incident_objects)

In [49]:
curve = reports_plus_heads[2]

In [54]:
import re

In [78]:
data = [
    ["11/28/17","4:20 PM"],
    ["11/21/17","2:00 PM - 6:00 PM"],
    ["11/27/17 - 5:45 PM","11/28/17 - 9:00 AM"]
]

In [89]:
def hour_to_24(hour, meridian):
    """
    Converts a 12-hour representation to a 24-hour representation.
    e.g.
        12, "AM" => 0
         6, "AM" => 6
        12, "PM" => 12
         3, "PM" => 15
        11, "PM" => 23
    """
    
    # first, if it's 12 whatever, subtract the 12 because it's actually 0
    if hour == 12:
        hour -= 12
        
    # if it's PM, add 12
    if meridian == "PM":
        hour += 12
        
    return hour

In [108]:
import datetime

    
def parse_datetime_tuple(dt_tuple):
    """
    Converts a tuple like ('11', '27', '17', '5', '45', 'PM')
    into a proper Python datetime object.

    It MUST be 6 elements long and have exactly the right
    order of elements.
    """

    if len(dt_tuple) != 6:
        raise ValueError

    # abbreviate for simplicity
    R = dt_tuple

    # convert this to a datetime
    # year is encoded as 2-digit
    year = 2000 + int(R[2])
    month = int(R[0])
    day = int(R[1])
    # need to convert this 12-hour representation to 24-hours
    hour = hour_to_24(int(R[3]), R[5])
    minute = int(R[4])
    # no seconds

    # TODO: set timezone as EST; see
    # https://docs.python.org/2/library/datetime.html#datetime-objects
    dt = datetime.datetime(year, month, day, hour, minute)
    return dt

def parse_occurrence_string(occurrence_array):
    """
    Parses a list of 2 elements that represents 
    when an incident occurred. The following formats
    are all supported:
    
    ["11/28/17","4:20 PM"],
    ["11/21/17","2:00 PM - 6:00 PM"],
    ["11/27/17 - 5:45 PM","11/28/17 - 9:00 AM"]
     
    Transforms this into a timestamp of when it started.
    """
    # concatenate and trim whitespace
    occurrence = (occurrence_array[0] + " | " + occurrence_array[1]).strip()
    
    print occurrence
    
    start = None
    end = None

    
    # CASE I: simplest, occurred at a constant time and not a range
    simple_matcher = re.compile(
        "^(\d{1,2})\/(\d{1,2})\/(\d{2}) \| (\d{1,2}):(\d{2}) ([A|P]M)$")
    simple_result = simple_matcher.findall(occurrence)
    if len(simple_result) > 0:
        # this matches Case I
        # return group length = 6
        # abbreviate this for simplicity
        R = simple_result[0]
        
        start = parse_datetime_tuple(R)
        # end time is same as start time
        end = start
    
    # CASE II: range of times on the SAME day
    range_matcher = re.compile(
        "^(\d{1,2})\/(\d{1,2})\/(\d{2}) \| (\d{1,2}):(\d{2}) ([A|P]M) - (\d{1,2}):(\d{2}) ([A|P]M)$")
    range_result = range_matcher.findall(occurrence)
    if len(range_result) > 0:
        # this matches Case II
        # return group length = 9
        # abbreviate this for simplicity
        R = range_result[0]
        
        # convert to datetime
        # start has R[0, 1, 2, 3, 4, 5]
        # end has R[0, 1, 2, 6, 7, 8]
        
        start_tuple = R[0:6]
        start = parse_datetime_tuple(start_tuple)
        end_tuple = (R[0], R[1], R[2], R[6], R[7], R[8])
        end = parse_datetime_tuple(end_tuple)
    
    # CASE III: range of times ACROSS days
    # TODO: break this down across lines
    across_day_matcher = re.compile(
        "^(\d{1,2})\/(\d{1,2})\/(\d{2}) - (\d{1,2}):(\d{2}) ([A|P]M) \| (\d{1,2})\/(\d{1,2})\/(\d{2}) - (\d{1,2}):(\d{2}) ([A|P]M)$")
    across_day_result = across_day_matcher.findall(occurrence)
    if len(across_day_result) > 0:
        # this matches Case III
        # return group length = 12

        # abbreviate this for simplicity
        R = across_day_result[0]
        
        # convert start and end for datetime
        # start is encoded as elements [0,6); end is [6, 12)
        start = parse_datetime_tuple(R[0:6])
        end = parse_datetime_tuple(R[6:12])
        
    print start
    print end
    return (start, end)

        
print parse_occurrence_string(data[0])
print ""
print parse_occurrence_string(data[1])
print ""
print parse_occurrence_string(data[2])

11/28/17 | 4:20 PM
2017-11-28 16:20:00
2017-11-28 16:20:00
(datetime.datetime(2017, 11, 28, 16, 20), datetime.datetime(2017, 11, 28, 16, 20))

11/21/17 | 2:00 PM - 6:00 PM
2017-11-21 14:00:00
2017-11-21 18:00:00
(datetime.datetime(2017, 11, 21, 14, 0), datetime.datetime(2017, 11, 21, 18, 0))

11/27/17 - 5:45 PM | 11/28/17 - 9:00 AM
2017-11-27 17:45:00
2017-11-28 09:00:00
(datetime.datetime(2017, 11, 27, 17, 45), datetime.datetime(2017, 11, 28, 9, 0))
