In [1]:
import sys, os
# Put the parent directory first on the import path, presumably so the
# `opencrimedata` imports below resolve to the local checkout.
sys.path.insert(0, os.path.abspath(".."))

# Library code to load data

Here we demonstrate how to use the `opencrimedata` Python package to load data.

In [2]:
import opencrimedata.san_francisco as sf
import opencrimedata.chicago as chicago
import opencrimedata.dallas as dallas
import bz2, collections, lzma

# Root directory of the downloaded data files, four levels above this notebook.
# An absolute alternative, kept for reference:
#     os.path.join("/media", "disk", "Data")
datadir = os.path.join(*[".."] * 4, "Data")

## Chicago

https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2

We have similar code in `open_cp`, but this library is more flexible.

In [27]:
# `filename` is the xz-compressed extract read by the cells below;
# `filename_old` is presumably an earlier, uncompressed download.
filename_old = os.path.join(datadir, "chicago_all_old.csv")
filename = os.path.join(datadir, "chicago_all.csv.xz")

In [28]:
# Pull only the first record from the loader and display it.
with lzma.open(filename, mode="rt", encoding="utf8") as csv_file:
    gen = chicago.load(csv_file)
    row = next(gen)
row

Row(id='5189091', crime_type='OFFENSE INVOLVING CHILDREN', crime_subtype='CHILD ABUSE', location='RESIDENCE', address='108XX S AVENUE G', datetime=datetime.datetime(2006, 10, 12, 10, 58), point=(-87.531655723, 41.698387427))

In [29]:
# Count every record in the file.  The rows are counted as they stream past
# rather than collected into a list first, so several million Row objects
# never have to be held in memory at once.
with lzma.open(filename, "rt", encoding="utf8") as f:
    print(sum(1 for _ in chicago.load(f)))

6405860


Or we can load only the rows which are geocoded correctly, skipping the rest.

In [30]:
# Count only the records yielded by `load_only_with_point`.  The rows are
# counted as they stream past rather than collected into a list first, so the
# whole data set never has to be held in memory at once.
with lzma.open(filename, "rt", encoding="utf8") as f:
    print(sum(1 for _ in chicago.load_only_with_point(f)))

6323451


In [35]:
def fil(row):
    """Row predicate: True when the record's `crime_type` is "BURGLARY"."""
    is_burglary = row.crime_type == "BURGLARY"
    return is_burglary

# Build a geo-frame from the records accepted by `fil`, then preview it.
with lzma.open(filename, mode="rt", encoding="utf8") as csv_file:
    frame = chicago.to_geoframe(csv_file, fil)
frame.head()

Unnamed: 0,id,crime_type,crime_subtype,location,address,datetime,geometry
0,5189119,BURGLARY,FORCIBLE ENTRY,RESIDENCE,110XX S ASHLAND AVE,2006-12-15 03:40:00,POINT (-87.661870593 41.693088458)
1,5189120,BURGLARY,HOME INVASION,RESIDENCE,043XX W CERMAK RD,2006-12-16 13:00:00,POINT (-87.733721827 41.851409307)
2,5189166,BURGLARY,FORCIBLE ENTRY,RESIDENCE,055XX S THROOP ST,2006-12-17 06:18:00,POINT (-87.657274157 41.793119033)
3,5189200,BURGLARY,FORCIBLE ENTRY,APARTMENT,033XX N HAMLIN AVE,2006-12-17 16:50:00,POINT (-87.722373658 41.941879259)
4,5189201,BURGLARY,FORCIBLE ENTRY,APARTMENT,008XX E 62ND ST,2006-12-11 01:00:00,POINT (-87.60552953200001 41.782237362)


### Street network

In [33]:
# Path to the Chicago street centre-line data; show the first street loaded.
street_lines_path = os.path.join(datadir, "Chicago_Street Center Lines")
gen = chicago.load_street_centre_lines(street_lines_path)
next(gen)

Street(street_id=1782, street_name='S YALE AVE', length=67.22865483352807, source=StreetNode(street_id=2208, node_id=10809, street_address='245|W|ENGLEWOOD|AVE|'), destination=StreetNode(street_id=0, node_id=16581, street_address='6250|S|||'), left=StreetAddress(start=0, end=0, parity='O'), right=StreetAddress(start=6228, end=6248, parity='E'), line=array([[-87.63187018,  41.78080975],
       [-87.63184648,  41.78038243],
       [-87.63184505,  41.78035651],
       [-87.63184151,  41.78020488]]))

## San Francisco

https://data.sfgov.org/Public-Safety/Police-Department-Incidents/tmnf-yvry

In [5]:
filename = os.path.join(datadir, "SF_Police_Department_Incidents.csv.bz2")

def gen(path=filename):
    """Yield every San Francisco incident row from the bz2-compressed CSV.

    The path is captured as a default argument when this cell runs, so the
    generator keeps reading the San Francisco file even if the shared
    `filename` variable is later rebound (the same name is used for the
    Chicago data in this notebook).

    A fresh file handle is opened on every call, so `gen()` can be iterated
    as many times as needed.

    Parameters
    ----------
    path : str, optional
        Location of the compressed CSV; defaults to `filename` as it was when
        this cell was executed.
    """
    with bz2.open(path, "rt", encoding="UTF8") as f:
        yield from sf.load(f)

next(gen())

Row(category='NON-CRIMINAL', description='LOST PROPERTY', datetime=datetime.datetime(2015, 1, 19, 14, 0), block='18TH ST / VALENCIA ST', point=(-122.42158168137, 37.7617007179518), idd='15006027571000', incident='150060275')

In [6]:
# Count the rows as they stream past instead of building a multi-million
# element list just to take its length.
sum(1 for _ in gen())

2119901

In [7]:
# Keep the burglary rows in memory; the following cells reuse this list.
burg = list(filter(lambda r: r.category == "BURGLARY", gen()))
len(burg)

87536

In [8]:
# Earliest and latest timestamps among the burglary rows.
burg_times = [x.datetime for x in burg]
min(burg_times), max(burg_times)

(datetime.datetime(2003, 1, 1, 0, 1), datetime.datetime(2017, 9, 13, 23, 0))

In [9]:
# Number of burglary rows whose timestamp falls in 2016.
sum(1 for x in burg if x.datetime.year == 2016)

5806

You need to be careful, as there are repeated events: a single incident can appear in several rows, each with a different description.

In [14]:
# Group the burglary rows by their `incident` identifier.
burg_by_incident = collections.defaultdict(list)
for record in burg:
    rows_for_incident = burg_by_incident[record.incident]
    rows_for_incident.append(record)

In [17]:
# Show the first incident that has more than one row attached to it.
repeated = [col for col in burg_by_incident.values() if len(col) > 1]
repeated[0]

[Row(category='BURGLARY', description='BURGLARY OF RESIDENCE, UNLAWFUL ENTRY', datetime=datetime.datetime(2015, 2, 6, 1, 41), block='0 Block of BEAUMONT AV', point=(-122.455197430222, 37.7807110127869), idd='15011317505043', incident='150113175'),
 Row(category='BURGLARY', description='BURGLARY, HOT PROWL, UNLAWFUL ENTRY', datetime=datetime.datetime(2015, 2, 6, 1, 41), block='0 Block of BEAUMONT AV', point=(-122.455197430222, 37.7807110127869), idd='15011317505083', incident='150113175')]

In [18]:
# Load the burglary rows straight into a geo-frame.
with bz2.open(filename, mode="rt", encoding="UTF8") as csv_file:
    frame = sf.to_geoframe(
        csv_file,
        lambda row: row.category == 'BURGLARY',
    )

In [19]:
# Preview the first few rows of `frame`.
frame.head()

Unnamed: 0,category,description,datetime,block,idd,incident,geometry
0,BURGLARY,"BURGLARY OF STORE, UNLAWFUL ENTRY",2015-01-31 16:09:00,200 Block of STOCKTON ST,15009825405053,150098254,POINT (-122.40656817787 37.7878092959561)
1,BURGLARY,"BURGLARY, UNLAWFUL ENTRY",2015-01-30 12:30:00,2600 Block of 18TH ST,15009883405073,150098834,POINT (-122.40867573409 37.761791195706)
2,BURGLARY,"BURGLARY OF APARTMENT HOUSE, FORCIBLE ENTRY",2015-02-01 13:15:00,1800 Block of WAWONA ST,15009930905011,150099309,POINT (-122.486177330467 37.7368129188976)
3,BURGLARY,"BURGLARY OF STORE, FORCIBLE ENTRY",2015-02-02 00:50:00,400 Block of VALENCIA ST,15009954705051,150099547,POINT (-122.42207304894 37.7661259454801)
4,BURGLARY,"BURGLARY OF RESIDENCE, UNLAWFUL ENTRY",2015-02-01 21:00:00,300 Block of LEXINGTON ST,15009980405043,150099804,POINT (-122.420489911989 37.7577269207659)


### Load street data

https://data.sfgov.org/Geographic-Locations-and-Boundaries/San-Francisco-Basemap-Street-Centerlines/7hfy-8sz8

In [4]:
# Use a dedicated name rather than `gen`: in this notebook `gen` is the
# incident generator *function*, and rebinding it to a generator object would
# make re-running any cell that calls `gen()` fail with a TypeError.
sf_streets = sf.load_street_centre_lines(
    os.path.join(datadir, "San Francisco Basemap Street Centerlines")
)
next(sf_streets)

Street(street_id=15145000, layer='PRIVATE', nhood='Twin Peaks', oneway='B', name='CROWN CT', left=None, right=None, line=array([[-122.44694245,   37.75722892],
       [-122.44664129,   37.7572054 ],
       [-122.44586702,   37.75693649]]))

## Dallas data

https://www.dallasopendata.com/Public-Safety/Police-Incidents/tbnj-w5hb

There are multiple rows per crime event (coding for witnesses etc.). Both `load` and `load_full` return just one `Row` object per crime. `load` takes a guess as to the entry with the most useful information; `load_full` loads the entire file and selects the best entry (so is slower, but better).

In [3]:
# Single definition of the Dallas extract's location, shared by both loaders.
dallas_filename = os.path.join(datadir, "Dallas_11_Sept_2017.csv.xz")

def gen():
    """Yield Dallas rows via `dallas.load`.

    The encoding is given explicitly, as in the Chicago and San Francisco
    cells, so decoding does not depend on the platform's default locale.
    """
    with lzma.open(dallas_filename, "rt", encoding="utf8") as file:
        yield from dallas.load(file)

def gen_full():
    """Yield Dallas rows via `dallas.load_full`.

    Same file and explicit encoding as `gen`; only the loader differs.
    """
    with lzma.open(dallas_filename, "rt", encoding="utf8") as file:
        yield from dallas.load_full(file)

In [4]:
# Display the first record yielded by `gen()`.
next(gen())

Row(code='276285-2016', crime_type='BURGLARY', crime_subtype='BURGLARY-RESIDENCE', start_time=datetime.datetime(2016, 11, 16, 11, 0), end_time=datetime.datetime(2016, 11, 18, 11, 0), call_time=datetime.datetime(2016, 11, 18, 11, 42, 26), address='5850 BELT LINE RD', city='DALLAS 75254', lonlat=(-96.807131, 32.953948), xy=(758206.7262484236, 2144003.9338816064))

In [5]:
# Display the first record yielded by `gen_full()`.
next(gen_full())

Row(code='276285-2016', crime_type='BURGLARY', crime_subtype='BURGLARY-RESIDENCE', start_time=datetime.datetime(2016, 11, 16, 11, 0), end_time=datetime.datetime(2016, 11, 18, 11, 0), call_time=datetime.datetime(2016, 11, 18, 11, 42, 26), address='5850 BELT LINE RD', city='DALLAS 75254', lonlat=(-96.807131, 32.953948), xy=(758206.7262484236, 2144003.9338816064))

In [6]:
# Load the Dallas extract into a geo-frame and preview it.  The encoding is
# given explicitly, as in the Chicago and San Francisco cells, so decoding
# does not depend on the platform's default locale.
with lzma.open(os.path.join(datadir, "Dallas_11_Sept_2017.csv.xz"), "rt", encoding="utf8") as file:
    frame = dallas.to_geoframe(file)
frame.head()

Unnamed: 0,code,crime_type,crime_subtype,start_time,end_time,call_time,address,city,x,y,lon,lat,geometry
0,276285-2016,BURGLARY,BURGLARY-RESIDENCE,2016-11-16 11:00:00,2016-11-18 11:00:00,2016-11-18 11:42:26,5850 BELT LINE RD,DALLAS 75254,231101.872364,653493.706035,-96.807131,32.953948,POINT (231101.8723642642 653493.7060345256)
1,277474-2016,MOTOR VEHICLE ACCIDENT,ACCIDENT MV,2016-11-20 00:40:00,2016-11-20 00:45:00,2016-11-20 00:50:30,4400 W DAVIS ST,DALLAS 75211,228822.734527,646550.220798,-96.892956,32.749608,POINT (228822.7345272984 646550.2207975964)
2,276593-2016,FOUND PROPERTY,FOUND,2016-11-18 23:28:00,2016-11-18 23:28:00,2016-11-18 23:28:52,600 E COLORADO BLVD,DALLAS 75203,230981.469742,646893.413264,-96.815154,32.758751,POINT (230981.4697415552 646893.4132635808)
3,276430-2016,ROBBERY,ROBBERY-BUSINESS,2016-11-18 18:32:00,2016-11-18 18:32:00,2016-11-18 18:44:57,2231 SINGLETON BLVD,DALLAS 75212,229763.705344,647562.311509,-96.858248,32.778774,POINT (229763.7053437693 647562.3115085037)
4,277049-2016,THEFT,OTHER THEFTS,2016-11-18 01:30:00,2016-11-18 07:30:00,2016-11-19 14:08:16,400 CRESCENT CT,DALLAS 75201,231301.600446,648089.852187,-96.80388,32.794004,POINT (231301.600446244 648089.8521874177)


### Load street data

https://www.dallasopendata.com/Geography-Boundaries/Streets-Shapefile-Polyline/cvgm-fp24

Export the dataset as a shapefile.

In [8]:
# Read every street record into memory so it can be indexed below.
dallas_streets_path = os.path.join(datadir, "Dallas Streets Shapefile - Polyline")
streets = [street for street in dallas.load_street_lines(dallas_streets_path)]

In [13]:
# Show one street record as an example; the index 543 looks arbitrary
# (NOTE(review): confirm there is nothing special about this entry).
streets[543]

Street(street_id=17331, clazz='MINOR ARTERIAL', name='RIDGEVIEW CIR', oneway=0, left=StreetAddress(start=6539, end=6555), right=StreetAddress(start=6538, end=6554), line=array([[-96.79318822,  32.92671109],
       [-96.79306999,  32.927066  ],
       [-96.79276672,  32.92781496]]))