In [1]:
from invisibleroads_macros.disk import make_folder
from os.path import expanduser
# Working folder under the user's home directory; the later cells read and
# write all of their files here.
# NOTE(review): make_folder presumably creates the folder when it is missing
# and returns its path -- confirm against invisibleroads_macros.
experiment_folder = expanduser('~/Experiments/spatiotemporal')
target_folder = make_folder(experiment_folder)

In [2]:
import requests
from os.path import getsize, join

def download(target_path, source_url):
    """Fetch ``source_url`` and save the response body at ``target_path``.

    Prints the URL followed by the size in bytes of the saved file.

    Args:
        target_path: Filesystem path the response body is written to.
        source_url: URL to fetch with an HTTP GET.

    Returns:
        ``target_path``, unchanged, so the call can be used in an assignment.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(source_url)
    # Fail loudly instead of saving an HTTP error page as if it were data.
    response.raise_for_status()
    # response.content is raw bytes, so the file must be opened in binary
    # mode; the with-block closes (and flushes) the handle before getsize()
    # inspects the file.
    with open(target_path, 'wb') as target_file:
        target_file.write(response.content)
    print('{} {:,}'.format(source_url, getsize(target_path)))
    return target_path

# Fetch both published formats of the monthly injury data set from nyc.gov:
# first the GeoJSON, then the zipped shapefile.
geojson_path = download(
    target_path=join(target_folder, 'nyc-traffic-injuries.json'),
    source_url='http://www.nyc.gov/html/dot/downloads/misc/injury_all_monthly.json')
shapefile_path = download(
    target_path=join(target_folder, 'nyc-traffic-injuries.shp.zip'),
    source_url='http://www.nyc.gov/html/dot/downloads/misc/injury_all_monthly_shapefile.zip')

http://www.nyc.gov/html/dot/downloads/misc/injury_all_monthly.json 46,306,835
http://www.nyc.gov/html/dot/downloads/misc/injury_all_monthly_shapefile.zip 5,053,838


In [4]:
import fiona
from os.path import join
# Open the downloaded GeoJSON with fiona and report its bounding box and
# coordinate reference system.
# NOTE(review): `collection` is also rebound by the shapefile cell, so the
# table-building cell reads whichever of the two cells ran last.
geojson_path = join(target_folder, 'nyc-traffic-injuries.json')
collection = geojson_collection = fiona.open(geojson_path)
# print() with one pre-formatted string gives the same output on Python 2
# and Python 3; bounds is a tuple, hence the explicit repr() before `%`.
print('geojson_collection.bounds = %s' % repr(geojson_collection.bounds))
print('geojson_collection.crs_wkt = %s' % geojson_collection.crs_wkt)
print('geojson_collection.crs = %s' % geojson_collection.crs)
# Show the first feature as the cell's output.
geojson_collection[0]

geojson_collection.bounds = (-74.2539230306024, 40.49947769792743, -73.70059800086655, 40.91246913562538)
geojson_collection.crs_wkt = GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]
geojson_collection.crs = {'init': u'epsg:4326'}


{'geometry': {'coordinates': (-73.7917447266822, 40.72578884918672),
  'type': 'Point'},
 'id': '0',
 'properties': OrderedDict([(u'MVOInjurie', 1),
              (u'MN', u'1'),
              (u'Injuries', 2),
              (u'BikeInjuri', 0),
              (u'YR', u'2009'),
              (u'PedInjurie', 1)]),
 'type': 'Feature'}

In [3]:
import fiona
from os.path import join
# Open the zipped shapefile in place through fiona's zip:// virtual file
# system and report its bounding box and coordinate reference system.
# NOTE(review): `collection` is also rebound by the GeoJSON cell, so the
# table-building cell reads whichever of the two cells ran last.
shapefile_path = join(target_folder, 'nyc-traffic-injuries.shp.zip')
collection = shapefile_collection = fiona.open('/', vfs='zip://' + shapefile_path)
# print() with one pre-formatted string gives the same output on Python 2
# and Python 3; bounds is a tuple, hence the explicit repr() before `%`.
print('shapefile_collection.bounds = %s' % repr(shapefile_collection.bounds))
print('shapefile_collection.crs_wkt = %s' % shapefile_collection.crs_wkt)
print('shapefile_collection.crs = %s' % shapefile_collection.crs)
# Show the first feature as the cell's output.
shapefile_collection[0]

shapefile_collection.bounds = (-74.2539230306024, 40.49947769792743, -73.70059800086655, 40.91246913562538)
shapefile_collection.crs_wkt = GEOGCS["GCS_WGS_1984",DATUM["WGS_1984",SPHEROID["WGS_84",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
shapefile_collection.crs = {'init': u'epsg:4326'}


{'geometry': {'coordinates': (-73.7917447266822, 40.72578884918672),
  'type': 'Point'},
 'id': '0',
 'properties': OrderedDict([(u'Injuries', 2),
              (u'PedInjurie', 1),
              (u'BikeInjuri', 0),
              (u'MVOInjurie', 1),
              (u'MN', u'1'),
              (u'YR', u'2009')]),
 'type': 'Feature'}

In [5]:
from pandas import DataFrame
# Flatten every feature of `collection` into one table row: the point
# coordinates as floats followed by the year, month and injury counts as
# integers. The feature id becomes the row index.
rows, indices = [], []
for feature in collection:
    indices.append(int(feature['id']))
    x, y = feature['geometry']['coordinates']
    attributes = feature['properties']
    # Property names are the truncated field names seen in the source data.
    rows.append([float(x), float(y)] + [
        int(attributes[key])
        for key in (
            'YR', 'MN',
            'Injuries', 'PedInjurie', 'BikeInjuri', 'MVOInjurie',
        )
    ])
column_names = [
    'Longitude', 'Latitude', 'Year', 'Month',
    'Total', 'Pedestrian', 'Bike', 'Vehicle',
]
nyc_traffic_injury_table = DataFrame(
    rows, index=indices, columns=column_names)
# Preview the first three rows as the cell's output.
nyc_traffic_injury_table.head(3)

Unnamed: 0,Longitude,Latitude,Year,Month,Total,Pedestrian,Bike,Vehicle
0,-73.791745,40.725789,2009,1,2,1,0,1
1,-73.882429,40.844981,2009,1,1,1,0,0
2,-73.979058,40.744444,2009,1,2,2,0,0


In [7]:
from os.path import join
# Save the table three ways (CSV, msgpack+zlib, msgpack+blosc) so that the
# formats can be compared on file size and load time.
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in 1.0, so this cell needs an older pandas -- confirm the pinned version.
csv_path = join(target_folder, 'nyc-traffic-injuries.csv')
zlib_path = join(target_folder, 'nyc-traffic-injuries.msg-zlib')
blosc_path = join(target_folder, 'nyc-traffic-injuries.msg-blosc')
# The CSV is written without the index column.
nyc_traffic_injury_table.to_csv(csv_path, index=False)
nyc_traffic_injury_table.to_msgpack(zlib_path, compress='zlib')
nyc_traffic_injury_table.to_msgpack(blosc_path, compress='blosc')

In [8]:
ls -lh -S $target_folder

total 103M
-rw-rw-r--. 1 rhh rhh  45M Dec 17 19:22 nyc-traffic-injuries.json
-rw-rw-r--. 1 rhh rhh  26M Dec 10 20:02 nyc-traffic-injuries-by-month.pkl
-rw-rw-r--. 1 rhh rhh  15M Dec 10 20:02 nyc-traffic-injuries.pkl
-rw-rw-r--. 1 rhh rhh 8.5M Dec 17 19:23 nyc-traffic-injuries.csv
-rw-rw-r--. 1 rhh rhh 4.9M Dec 17 19:22 nyc-traffic-injuries.shp.zip
-rw-rw-r--. 1 rhh rhh 2.9M Dec 17 19:23 nyc-traffic-injuries.msg-zlib
-rw-rw-r--. 1 rhh rhh 2.7M Dec 17 19:23 nyc-traffic-injuries.msg-blosc
drwxr-xr-x. 2 rhh rhh 4.0K Nov 27 16:51 nyc-traffic-injuries.shp/


In [9]:
from pandas import read_csv, read_msgpack
%timeit nyc_traffic_injury_table = read_csv(join(target_folder, 'nyc-traffic-injuries.csv'))
%timeit nyc_traffic_injury_table = read_msgpack(join(target_folder, 'nyc-traffic-injuries.msg-zlib'))
%timeit nyc_traffic_injury_table = read_msgpack(join(target_folder, 'nyc-traffic-injuries.msg-blosc'))

10 loops, best of 3: 104 ms per loop
10 loops, best of 3: 36.6 ms per loop
100 loops, best of 3: 9.52 ms per loop
