In [2]:
import os, sys
# Put the parent directory first on the import path (presumably so the local
# `impute` package imported later in this notebook resolves — confirm).
sys.path.insert(0, os.path.abspath(".."))

# Chicago data

We have already done this somewhat in `open_cp`, but let's do it again for completeness.

In [3]:
import os, lzma, csv, collections, datetime
import pyproj
import numpy as np

In [19]:
# Location of the xz-compressed CSV export read by `gen()` by default.
datadir = os.path.join("/media", "disk", "Data")
filename = os.path.join(datadir, "chicago_all.csv.xz")

def gen(path=None):
    """Yield every row (header included) of an xz-compressed CSV file.

    Each row is a list of strings, exactly as produced by `csv.reader`.

    path -- file to read; when omitted, the module-level `filename` is used
            (looked up at call time, so later rebinding is honoured).
    """
    source = filename if path is None else path
    # newline="" hands line endings to the csv module untouched, as its
    # documentation requires; otherwise "\r\n" inside quoted fields is
    # silently rewritten to "\n".
    with lzma.open(source, "rt", encoding="UTF8", newline="") as f:
        yield from csv.reader(f)
        
def gen_data():
    """Yield the data rows only: everything `gen()` produces after its first (header) row."""
    reader = gen()
    # Discard the header row. The default argument matters: a bare next() on
    # an empty file raises StopIteration inside this generator, which
    # Python 3.7+ (PEP 479) converts into a RuntimeError instead of simply
    # yielding nothing.
    next(reader, None)
    yield from reader

# Peek at the header and the first data row.
rows = gen()
try:
    header = next(rows)
    print(header)
    print(next(rows))
finally:
    # The generator is suspended inside its `with` block; closing it releases
    # the file handle now rather than whenever `rows` is garbage-collected.
    rows.close()

['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']
['5189091', 'HM779248', '10/12/2006 10:58:00 AM', '108XX S AVENUE G', '1750', 'OFFENSE INVOLVING CHILDREN', 'CHILD ABUSE', 'RESIDENCE', 'false', 'false', '0432', '004', '10', '52', '20', '1203183', '1833703', '2006', '04/15/2016 08:55:02 AM', '41.698387427', '-87.531655723', '(41.698387427, -87.531655723)']


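Same again, but for the "old" file. Note that this cell rebinds `filename`, `gen` and `gen_data`, so when the notebook is run top to bottom the sections below read the old file.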

In [15]:
# Location of the uncompressed "old" CSV export read by `gen()` by default.
datadir = os.path.join("/media", "disk", "Data")
filename = os.path.join(datadir, "chicago_all_old.csv")

def gen(path=None):
    """Yield every row (header included) of a plain-text CSV file.

    Each row is a list of strings, exactly as produced by `csv.reader`.

    path -- file to read; when omitted, the module-level `filename` is used
            (looked up at call time, so later rebinding is honoured).
    """
    source = filename if path is None else path
    # newline="" hands line endings to the csv module untouched, as its
    # documentation requires; otherwise "\r\n" inside quoted fields is
    # silently rewritten to "\n".
    with open(source, "rt", encoding="UTF8", newline="") as f:
        yield from csv.reader(f)
        
def gen_data():
    """Yield the data rows only: everything `gen()` produces after its first (header) row."""
    reader = gen()
    # Discard the header row. The default argument matters: a bare next() on
    # an empty file raises StopIteration inside this generator, which
    # Python 3.7+ (PEP 479) converts into a RuntimeError instead of simply
    # yielding nothing.
    next(reader, None)
    yield from reader

# Peek at the header and the first data row of the old file.
rows = gen()
try:
    header = next(rows)
    print(header)
    print(next(rows))
finally:
    # The generator is suspended inside its `with` block; closing it releases
    # the file handle now rather than whenever `rows` is garbage-collected.
    rows.close()

['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']
['9625135', 'HX275356', '05/24/2014 11:53:00 PM', '087XX S KINGSTON AVE', '0486', 'BATTERY', 'DOMESTIC BATTERY SIMPLE', 'APARTMENT', 'true', 'true', '0423', '004', '7', '46', '08B', '1194619', '1847489', '2014', '05/31/2014 12:40:12 PM', '41.736431989689265', '-87.56256025376493', '(41.736431989689265, -87.56256025376493)']


# ID Numbers

- `ID` is unique
- `Case Number` is nearly unique.  No obvious pattern in the repeats.  Quite a few are "HOMICIDE".

In [61]:
# Collect the first two columns of every data row; per the printed header
# these are `ID` and `Case Number`.
ids = []
cases = []
for record in gen_data():
    ids.append(record[0])
    cases.append(record[1])

In [62]:
# Total number of IDs next to the number of distinct IDs; equal values mean no ID repeats.
len(ids), len(set(ids))

(5536001, 5536001)

In [63]:
# Total number of case numbers next to the number of distinct ones; the gap is the repeats.
len(cases), len(set(cases))

(5536001, 5535722)

In [64]:
# Case numbers that occur more than once, mapped to how often they occur.
counter = {
    case: count
    for case, count in collections.Counter(cases).items()
    if count > 1
}

In [65]:
# Second pass over the file: gather the full rows for every repeated case number.
case_repeats = collections.defaultdict(list)
for record in gen_data():
    case_number = record[1]
    if case_number in counter:
        case_repeats[case_number].append(record)

In [66]:
# For each repeated case number, list the columns in which its rows disagree.
for key, repeated_rows in case_repeats.items():
    # Drop column 0 (the ID) before comparing, so rows that differ only in ID collapse.
    uniques = {tuple(row)[1:] for row in repeated_rows}
    if len(uniques) <= 1:
        continue
    # header[1:] lines up with the ID-stripped tuples, so position i means the same column in both.
    titles = [
        title
        for i, title in enumerate(header[1:])
        if len({row[i] for row in uniques}) > 1
    ]
    print(key, titles)

HX258236 ['Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'District', 'Ward', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Updated On', 'Latitude', 'Longitude', 'Location']
HX238885 ['Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']
HX230593 ['Updated On']
HX229324 ['Updated On']
HV394360 ['Date', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'FBI Code', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']
HX219632 ['Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']
HX191030 ['Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Descr

# Types and Descriptions

In [67]:
# Map every primary crime type to the set of descriptions seen alongside it.
type_index = header.index('Primary Type')
desc_index = header.index('Description')
types = collections.defaultdict(set)
for record in gen_data():
    primary_type = record[type_index]
    description = record[desc_index]
    types[primary_type].add(description)
types

defaultdict(set,
            {'ARSON': {'AGGRAVATED',
              'ATTEMPT ARSON',
              'BY EXPLOSIVE',
              'BY FIRE',
              'POS: CHEMICAL/DRY-ICE DEVICE',
              'POS: EXPLOSIVE/INCENDIARY DEV'},
             'ASSAULT': {'AGG PO HANDS NO/MIN INJURY',
              'AGG PRO.EMP: HANDGUN',
              'AGG PRO.EMP: OTHER DANG WEAPON',
              'AGG PRO.EMP: OTHER FIREARM',
              'AGG PRO.EMP:KNIFE/CUTTING INST',
              'AGGRAVATED PO: HANDGUN',
              'AGGRAVATED PO: OTHER DANG WEAP',
              'AGGRAVATED PO: OTHER FIREARM',
              'AGGRAVATED PO:KNIFE/CUT INSTR',
              'AGGRAVATED: HANDGUN',
              'AGGRAVATED: OTHER DANG WEAPON',
              'AGGRAVATED: OTHER FIREARM',
              'AGGRAVATED:KNIFE/CUTTING INSTR',
              'PRO EMP HANDS NO/MIN INJURY',
              'SIMPLE'},
             'BATTERY': {'AGG PO HANDS ETC SERIOUS INJ',
              'AGG PO HANDS NO/MIN INJURY',
      

# Locations

In [6]:
# Distinct values of the `Location Description` column.
index = header.index('Location Description')
locations = {record[index] for record in gen_data()}
print(len(locations))
locations

174


{'',
 'ABANDONED BUILDING',
 'AIRCRAFT',
 'AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA',
 'AIRPORT BUILDING NON-TERMINAL - SECURE AREA',
 'AIRPORT EXTERIOR - NON-SECURE AREA',
 'AIRPORT EXTERIOR - SECURE AREA',
 'AIRPORT PARKING LOT',
 'AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA',
 'AIRPORT TERMINAL LOWER LEVEL - SECURE AREA',
 'AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA',
 'AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA',
 'AIRPORT TERMINAL UPPER LEVEL - SECURE AREA',
 'AIRPORT TRANSPORTATION SYSTEM (ATS)',
 'AIRPORT VENDING ESTABLISHMENT',
 'AIRPORT/AIRCRAFT',
 'ALLEY',
 'ANIMAL HOSPITAL',
 'APARTMENT',
 'APPLIANCE STORE',
 'ATHLETIC CLUB',
 'ATM (AUTOMATIC TELLER MACHINE)',
 'AUTO',
 'AUTO / BOAT / RV DEALERSHIP',
 'BANK',
 'BANQUET HALL',
 'BAR OR TAVERN',
 'BARBER SHOP/BEAUTY SALON',
 'BARBERSHOP',
 'BASEMENT',
 'BOAT/WATERCRAFT',
 'BOWLING ALLEY',
 'BRIDGE',
 'CAR WASH',
 'CEMETARY',
 'CHA APARTMENT',
 'CHA BREEZEWAY',
 'CHA ELEVATOR',
 'CHA GROUNDS',
 'CHA HALLWAY',
 '

# Geocoding

- `Location` is just `Latitude` and `Longitude` combined
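- The projected coordinate system is EPSG:2790: http://spatialreference.org/ref/epsg/2790/
- Projecting `Latitude`/`Longitude` with EPSG:2790 agrees exactly with the `X Coordinate`/`Y Coordinate` columns, provided we convert to feet using the US survey foot (1200/3937 m) rather than the international foot (0.3048 m).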

In [69]:
# Check that `Location` is exactly "(<Latitude>, <Longitude>)" and count the
# rows in which all three fields are blank.
lat_index = header.index('Latitude')
lon_index = header.index('Longitude')
location_index = header.index('Location')
no_location_count = 0
for row in gen_data():
    lat = row[lat_index]
    lon = row[lon_index]
    location = row[location_index]
    fields = (lat, lon, location)
    if all(field == "" for field in fields):
        no_location_count += 1
        continue
    # Either all three are blank (handled above) or none may be.
    assert not any(field == "" for field in fields)
    assert location.startswith("(") and location.endswith(")")
    parts = [part.strip() for part in location[1:-1].split(",")]
    assert parts == [lat, lon]
no_location_count

43252

In [70]:
# Gather latitude/longitude (as floats) and X/Y coordinates (as ints) for
# every row that has them. Relies on `lat_index` and `lon_index` defined in
# the preceding cell.
x_index = header.index('X Coordinate')
y_index = header.index('Y Coordinate')
longs, lats = [], []
xcoords, ycoords = [], []
for row in gen_data():
    lat, lon = row[lat_index], row[lon_index]
    x, y = row[x_index], row[y_index]
    blank = [field == "" for field in (lat, lon, x, y)]
    if all(blank):
        continue
    # Partially filled rows are not expected.
    assert not any(blank)
    lat, lon = float(lat), float(lon)
    # The X/Y strings must be plain integers that survive an int round trip.
    x_value = int(x)
    assert x == str(x_value)
    y_value = int(y)
    assert y == str(y_value)
    longs.append(lon)
    lats.append(lat)
    xcoords.append(x_value)
    ycoords.append(y_value)
longs, lats, xcoords, ycoords = (
    np.asarray(values) for values in (longs, lats, xcoords, ycoords)
)
longs.shape, longs.dtype, xcoords.shape, xcoords.dtype

((5492749,), dtype('float64'), (5492749,), dtype('int64'))

In [71]:
# Projection object for EPSG:2790, built from a PROJ "init" parameter dictionary.
# NOTE(review): newer pyproj releases are believed to deprecate the "init" form — confirm before upgrading.
proj = pyproj.Proj({"init":"epsg:2790"})
# Project the longitude/latitude arrays. The cells that follow divide the result
# by a metres-per-foot factor, so the output is presumably in metres — confirm.
xcs, ycs = proj(longs, lats)

In [72]:
# International foot: 0.3048 metres.
conversion = 0.3048
# Scale the projected coordinates to feet, round half up, and subtract the recorded integers.
xd, yd = (
    np.floor(projected / conversion + 0.5) - recorded
    for projected, recorded in ((xcs, xcoords), (ycs, ycoords))
)
# Largest absolute disagreement on each axis.
np.max(np.abs(xd)), np.max(np.abs(yd))

(2.0, 4.0)

In [73]:
# US survey foot: 1200/3937 metres.
conversion = 1200 / 3937
# Scale the projected coordinates to feet and round half up.
x_feet = np.floor(xcs / conversion + 0.5)
y_feet = np.floor(ycs / conversion + 0.5)
xd = x_feet - xcoords
yd = y_feet - ycoords
# Largest absolute disagreement on each axis.
np.abs(xd).max(), np.abs(yd).max()

(0.0, 0.0)

In [74]:
# Go the other way: scale the recorded X/Y by `conversion`, invert the
# projection, and report the worst squared deviation from the recorded
# longitude/latitude.
x, y = proj(xcoords * conversion, ycoords * conversion, inverse=True)
diff = np.square(x - longs) + np.square(y - lats)
np.max(diff)

4.8213663914170217e-24

# Timestamps

In [20]:
# Parse the `Date` column of every row, counting the rows where it is blank.
fmt = "%m/%d/%Y %I:%M:%S %p"

date_index = header.index("Date")
missing_date_count = 0
dates = []
for row in gen_data():
    stamp = row[date_index]
    if stamp == "":
        missing_date_count += 1
    else:
        dates.append(datetime.datetime.strptime(stamp, fmt))
missing_date_count

0

In [21]:
# Earliest and latest parsed timestamps.
min(dates), max(dates)

(datetime.datetime(2001, 1, 1, 0, 0), datetime.datetime(2017, 8, 9, 23, 59))

In [22]:
# At least one timestamp must carry a non-zero seconds component.
assert any(dt.second != 0 for dt in dates)

# Use the library

In [4]:
import impute.chicago

In [5]:
# Name the xz archive explicitly instead of relying on `filename`: the
# "old file" cell earlier in the notebook rebinds `filename` to the
# uncompressed CSV, which lzma.open cannot read when cells run top to bottom.
xz_filename = os.path.join(datadir, "chicago_all.csv.xz")
with lzma.open(xz_filename, "rt", encoding="utf8") as file:
    # Take just the first record produced by the library loader.
    row = next(impute.chicago.load(file))
row

Row(id='5189091', crime_type='OFFENSE INVOLVING CHILDREN', crime_subtype='CHILD ABUSE', location='RESIDENCE', address='108XX S AVENUE G', datetime=datetime.datetime(2006, 10, 12, 10, 58), point=(-87.531655723, 41.698387427))

In [6]:
# Count the records the library loader yields from the xz archive. The path is
# spelled out rather than taken from `filename`, which the "old file" cell
# earlier in the notebook rebinds to the uncompressed CSV.
xz_filename = os.path.join(datadir, "chicago_all.csv.xz")
count = 0
with lzma.open(xz_filename, "rt", encoding="utf8") as file:
    for row in impute.chicago.load(file):
        count += 1
count

6405860

In [7]:
def filter(row):
    """Return True for rows whose crime type starts with "BURG" and whose datetime is 2016-01-01 or later."""
    if not row.crime_type.startswith("BURG"):
        return False
    cutoff = datetime.datetime(2016, 1, 1)
    return row.datetime >= cutoff

# Build the frame from the xz archive, keeping only rows accepted by `filter`.
# The path is spelled out rather than taken from `filename`, which the
# "old file" cell earlier in the notebook rebinds to the uncompressed CSV.
xz_filename = os.path.join(datadir, "chicago_all.csv.xz")
with lzma.open(xz_filename, "rt", encoding="utf8") as file:
    frame = impute.chicago.to_geoframe(file, filter)

In [8]:
# Show the first few rows of the filtered frame.
frame.head()

Unnamed: 0,id,crime_type,crime_subtype,location,address,datetime,geometry
0,10511547,BURGLARY,FORCIBLE ENTRY,RESTAURANT,068XX N SHERIDAN RD,2016-05-05 00:50:00,POINT (-87.660994398 42.005849323)
1,10511583,BURGLARY,UNLAWFUL ENTRY,APARTMENT,014XX W LELAND AVE,2016-05-05 08:30:00,POINT (-87.665255481 41.967109706)
2,10511668,BURGLARY,UNLAWFUL ENTRY,APARTMENT,075XX S MAY ST,2016-05-05 07:45:00,POINT (-87.652689305 41.756801414)
3,10511676,BURGLARY,ATTEMPT FORCIBLE ENTRY,APARTMENT,052XX S JUSTINE ST,2016-05-05 08:20:00,POINT (-87.663496806 41.798797735)
4,10511681,BURGLARY,FORCIBLE ENTRY,APARTMENT,029XX N RACINE AVE,2016-05-05 19:15:00,POINT (-87.65881512199999 41.935549294)


In [9]:
# Count the records the library loader yields for the old, uncompressed file.
# Unlike the cells above, the loader is handed a path here, not an open file.
old_filename = os.path.join(datadir, "chicago_all_old.csv")
count = sum(1 for _ in impute.chicago.load(old_filename))
count

5536001