In [1]:
import numpy as np
import pandas as pd
import geopandas as gp
from shapely.geometry import Point
from src.geom_helper import pts2segs_by_chunk
from src.constants import (index_pt, index_seg, epsg_dc, fn_segments_dc, dir_data, 
                           fn_311_dc, fn_crash_dc, fns_crime_dc, fn_vision0_dc,
                          fn_feature_crash_dc, fn_feature_311_dc, fn_feature_vision0_dc, fn_feature_crime_dc,
                          fn_oepndc_bk_dc, fn_feature_seg_attr_dc, fn_feature_oepndc_bk_dc)


In [2]:
# Load the DC street-segment layer that every feature below is keyed on.
segments_path = dir_data + fn_segments_dc
seg_dc = gp.read_file(segments_path)

# segment attributes features

In [None]:
# Build the static per-segment attribute table.
# The original overwrote seg_dc.DIRECTIONALITY in place, so running the cell a
# second time relabelled every row 'one-way' (the labels no longer equal 2).
# Working on a copy leaves seg_dc untouched and makes the cell re-runnable.
ftr_segs_col = ['DIRECTIONALITY', 'STREETTYPE', 'SHAPE_Length', 'SEGMENTTYPE']
ftr_segs = seg_dc[ftr_segs_col].copy()
# DIRECTIONALITY code 2 is labelled 'Bi-direction'; any other value (including
# missing ones) is labelled 'one-way', exactly as before.
ftr_segs['DIRECTIONALITY'] = np.where(
    seg_dc['DIRECTIONALITY'] == 2, 'Bi-direction', 'one-way'
)
# The row index of seg_dc is the segment key written to the CSV.
ftr_segs.index.name = index_seg

In [None]:
# Persist the segment-attribute features next to the other data files.
seg_attr_csv = dir_data + fn_feature_seg_attr_dc
ftr_segs.to_csv(seg_attr_csv)

# bike lanes features in opendc

In [None]:
# Count bike-lane records per street segment: a grand total plus one count per
# FACILITY value.
bk_raw = pd.read_csv(dir_data + fn_oepndc_bk_dc)

# assign() returns a new frame, so the counter column is not written into a
# slice of the raw table (the original triggered a chained-assignment warning).
bk = pd.concat(
    [bk_raw[['STREETSEGID']].assign(bkdc_total=1), pd.get_dummies(bk_raw.FACILITY)],
    axis=1,
)

ftr_bk = bk.groupby('STREETSEGID').sum().reset_index()

# Translate STREETSEGID into the row index of seg_dc, which is the segment key
# used by the other feature files. The shared `index_seg` constant replaces the
# hard-coded column name so all feature tables agree on it.
slice_str = seg_dc[['STREETSEGID']].reset_index()
slice_str.columns = [index_seg, 'STREETSEGID']
# Join explicitly on STREETSEGID instead of relying on "all common columns".
ftr_bk = ftr_bk.merge(slice_str, on='STREETSEGID').set_index(index_seg)
ftr_bk.drop('STREETSEGID', axis=1).to_csv(dir_data + fn_feature_oepndc_bk_dc)

# 311 features

In [3]:
# Read the raw 311 service requests.
# NOTE(review): the truncated warning captured under this cell looks like
# pandas' mixed-type DtypeWarning; low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
csr311 = pd.read_csv(dir_data + fn_311_dc, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


### match 311 to segments

In [4]:
def _lonlat_point(record):
    """Build a shapely Point from one 311 record's LONGITUDE/LATITUDE fields."""
    return Point(record['LONGITUDE'], record['LATITUDE'])


# One point per 311 request, then hand the points to the segment matcher.
pts = csr311.apply(_lonlat_point, axis=1)
gpdf = gp.GeoDataFrame(pts, columns=['geometry'])
pts_has_ln, pts_no_ln = pts2segs_by_chunk(gpdf, seg_dc, epsg_dc)
# The matched table has two columns: the point key and the segment key.
pts_has_ln.columns = [index_pt, index_seg]

size of data: 738322 # chunks: 7.38322
matching chunk: 0 100000


  outputs = ufunc(*inputs)


matching chunk: 100000 200000
matching chunk: 200000 300000
matching chunk: 300000 400000
matching chunk: 400000 500000
matching chunk: 500000 600000
matching chunk: 600000 700000
matching chunk: 700000 738322


### clean 311 data: get YEAR, MONTH and dummy types

In [8]:
# Raw 311 table shape next to the number of distinct requests that matched a segment.
(csr311.shape, pts_has_ln['index_pt'].nunique())

((738322, 52), 733930)

In [9]:
# Clean csr311: derive calendar fields and per-request indicator columns.
date = pd.to_datetime(csr311.INITIATEDDATE)
csr311['MONTH'] = date.dt.month
csr311['YEAR'] = date.dt.year

# One indicator column per DESCRIPTION value, named '311_<description>'.
dummies = pd.get_dummies(csr311.DESCRIPTION, prefix='311')

csr311['311_total'] = 1
# Fix: this flag must mark requests that are NOT parking-meter requests. The
# original compared with `==`, which counted exactly the requests the column
# name says to exclude. Stored as 0/1 so it sums like the other counters.
# NOTE(review): the literal looks truncated; confirm it matches the
# DESCRIPTION values in the data.
csr311['311_total_not_parking_meter'] = (
    csr311.DESCRIPTION != 'PARKING METER REQUES'
).astype(int)

# Keep only the columns that are aggregated later, then attach the dummies by row index.
csr311 = csr311[['MONTH', 'YEAR', '311_total', '311_total_not_parking_meter']]
csr311 = csr311.merge(dummies, left_index=True, right_index=True)


### get features for each seg per month

In [16]:
# Attach every matched 311 request to its segment, then total all counters per
# (segment, year, month).
# The frame was previously named segs_with_vision0 (copy-paste from the Vision
# Zero section); it holds 311 data, so it is named accordingly here.
segs_with_311 = pts_has_ln.merge(csr311, left_on=index_pt, right_index=True, how='left')
segs_with_311 = segs_with_311.drop(index_pt, axis=1)
# print() with parentheses runs under both Python 2 and Python 3.
print('to group')
ftr_311 = segs_with_311.groupby([index_seg, 'YEAR', 'MONTH']).sum()
ftr_311.to_csv(dir_data + fn_feature_311_dc)

---

# Features for crashes

In [7]:
# Read the raw crash records.
# NOTE(review): the truncated warning captured under this cell looks like
# pandas' mixed-type DtypeWarning; low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
crashes = pd.read_csv(dir_data + fn_crash_dc, low_memory=False)
# clean crashes: calendar fields come from SOURCEADDTIME
date = pd.to_datetime(crashes.SOURCEADDTIME)
crashes['MONTH'] = date.dt.month
crashes['YEAR'] = date.dt.year


  interactivity=interactivity, compiler=compiler, result=result)


### match crashes to segments:
1. apply pts2seg
2. for those (~70K) without coordinates or without a match, use the STREETSEGID column in crashes

In [8]:
# Match crashes to street segments in two passes.

# Pass 1: crashes that have coordinates go through the spatial matcher.
no_xy = crashes.X.isnull() | crashes.Y.isnull()
crashes_with_xy = crashes[~no_xy]
pts = crashes_with_xy.apply(lambda x: Point(x.X, x.Y), axis=1)
# Keep the crash row labels on the points; the fallback below compares the
# matched point keys against crashes.index.
gpdf = gp.GeoDataFrame(pts, columns=['geometry'], index=crashes_with_xy.index)
pts_has_ln, pts_no_ln = pts2segs_by_chunk(gpdf, seg_dc, epsg_dc)
# rename columns
pts_has_ln.columns = [index_pt, index_seg]

# Pass 2: crashes WITHOUT a spatial match (no coordinates, or no segment found)
# fall back to their own STREETSEGID column when it is filled in.
crash_no_ln = crashes[~crashes.index.isin(set(pts_has_ln[index_pt]))]
crash_using_segid = crash_no_ln.loc[crash_no_ln.STREETSEGID.notnull(), 'STREETSEGID'].reset_index()
# Look up the seg_dc row index of each STREETSEGID; after dropping the join key
# the remaining columns are (crash row label, segment row label).
crash_using_segid = crash_using_segid.merge(
    seg_dc[['STREETSEGID']].reset_index(), on='STREETSEGID'
).drop('STREETSEGID', axis=1)
# rename columns
crash_using_segid.columns = [index_pt, index_seg]

# Stack both passes. pd.concat replaces DataFrame.append, which was deprecated
# and later removed from pandas.
pts_segs_index = pd.concat([pts_has_ln, crash_using_segid], ignore_index=True)

size of data: 124278 # chunks: 1.24278
matching chunk: 0 100000
matching chunk: 100000 124278


### clean crashes data: get YEAR, MONTH and dummy types

In [9]:
def _event_flags(events):
    """Map a comma-separated CRASHEVENTTYPES value to {event name: 1}.

    A missing value yields an empty dict (no flags) instead of raising on
    `.split`, which is what the original lambda did.
    """
    if pd.isnull(events):
        return {}
    return {evt.strip(): 1 for evt in events.split(',')}


# get dummies for types
# A crash can list several event types, so each listed type gets a 1 and the
# rest stay missing.
CRASHEVENTTYPES_dummies = crashes.CRASHEVENTTYPES.apply(_event_flags).apply(pd.Series)
CRASHEVENTTYPES_dummies.columns = 'crash_evt_' + CRASHEVENTTYPES_dummies.columns

FIRSTHARMFULEVENTSPECIFICS_dummies = pd.get_dummies(crashes.FIRSTHARMFULEVENTSPECIFICS)
FIRSTHARMFULEVENTSPECIFICS_dummies.columns = 'crash_1stharm_' + FIRSTHARMFULEVENTSPECIFICS_dummies.columns
# Zeros are turned into missing values, matching the 1-or-missing layout of the
# event-type columns above.
FIRSTHARMFULEVENTSPECIFICS_dummies = FIRSTHARMFULEVENTSPECIFICS_dummies.replace(0, np.nan)

# .copy() gives an independent frame, so adding crash_total does not write into
# a slice of the raw table (the original triggered a chained-assignment warning).
crashes = crashes[['MONTH', 'YEAR']].copy()
crashes['crash_total'] = 1
crashes = crashes.merge(CRASHEVENTTYPES_dummies, left_index=True, right_index=True)
crashes = crashes.merge(FIRSTHARMFULEVENTSPECIFICS_dummies, left_index=True, right_index=True)

### get features for each seg per month

In [1]:
# Attach every matched crash to its segment and total the counters per
# (segment, year, month). Needs pts_segs_index and crashes from the cells above.
segs_with_crashes = (
    pts_segs_index
    .merge(crashes, how='left', left_on=index_pt, right_index=True)
    .drop('index_pt', axis=1)
)
ftr_crashes = segs_with_crashes.groupby(['index_seg', 'YEAR', 'MONTH']).sum()
ftr_crashes.to_csv(dir_data + fn_feature_crash_dc)

NameError: name 'pts_segs_index' is not defined

---

# Vision zero features

In [12]:
# Read the raw Vision Zero requests.
vision0_path = dir_data + fn_vision0_dc
vision0 = pd.read_csv(vision0_path)

### match v0 to segments

In [13]:
def _xy_point(record):
    """Build a shapely Point from one Vision Zero record's X/Y fields."""
    return Point(record['X'], record['Y'])


# One point per Vision Zero request, then hand the points to the segment matcher.
pts = vision0.apply(_xy_point, axis=1)
gpdf = gp.GeoDataFrame(pts, columns=['geometry'])
pts_has_ln, pts_no_ln = pts2segs_by_chunk(gpdf, seg_dc, epsg_dc)
# The matched table has two columns: the point key and the segment key.
pts_has_ln.columns = [index_pt, index_seg]

pts2segs ing...


### clean vision zero data: get YEAR, MONTH and dummy types

In [14]:
# clean vision zero requests: calendar fields come from REQUESTDATE
date = pd.to_datetime(vision0.REQUESTDATE)
vision0['MONTH'] = date.dt.month
vision0['YEAR'] = date.dt.year

# get dummies for types: one indicator column per USERTYPE / REQUESTTYPE value
USERTYPE_dummies = pd.get_dummies(vision0.USERTYPE)
USERTYPE_dummies.columns = 'v0ur_' + USERTYPE_dummies.columns

REQUESTTYPE_dummies = pd.get_dummies(vision0.REQUESTTYPE)
REQUESTTYPE_dummies.columns = 'v0rq_' + REQUESTTYPE_dummies.columns

# .copy() gives an independent frame, so adding v0_total does not write into a
# slice of the raw table (the original triggered a chained-assignment warning).
vision0 = vision0[['MONTH', 'YEAR']].copy()
vision0['v0_total'] = 1
vision0 = vision0.merge(USERTYPE_dummies, left_index=True, right_index=True)
vision0 = vision0.merge(REQUESTTYPE_dummies, left_index=True, right_index=True)


### get features for each seg per month

In [15]:
# Attach every matched Vision Zero request to its segment and total the
# counters per (segment, year, month).
segs_with_vision0 = (
    pts_has_ln
    .merge(vision0, how='left', left_on=index_pt, right_index=True)
    .drop('index_pt', axis=1)
)
ftr_v0 = segs_with_vision0.groupby(['index_seg', 'YEAR', 'MONTH']).sum()
ftr_v0.to_csv(dir_data + fn_feature_vision0_dc)

---

# crimes

In [32]:
# Read one crime layer per year and stack them into a single frame with a
# fresh consecutive row index.
ys = [2014, 2015, 2016, 2017]
crimes = []
for y in ys:
    crimes.append(gp.read_file(dir_data + fns_crime_dc[y]))
crime = pd.concat(crimes, ignore_index=True)

### match crimes to segments

In [33]:
# crime was read with geopandas, so it is passed to the matcher directly.
# Both join-distance arguments get the same value here.
crime_jn_dist = 150
pts_has_ln, pts_no_ln = pts2segs_by_chunk(
    crime, seg_dc, epsg_dc,
    close_jn_dist=crime_jn_dist, far_jn_dist=crime_jn_dist,
)
pts_has_ln.columns = [index_pt, index_seg]
# Number of (point, segment) pairs produced by the matcher.
pts_has_ln.shape

size of data: 112912 # chunks: 1.12912
matching chunk: 0 100000
matching chunk: 100000 112912


(1465899, 2)

pts_has_ln.shape

close_jn_dist=5, far_jn_dist=20: (126773, 2)

close_jn_dist=50, far_jn_dist=20: (250848, 2)

close_jn_dist=50, far_jn_dist=50: (250848, 2)

close_jn_dist=100, far_jn_dist=100 :(758803, 2)

close_jn_dist=150, far_jn_dist=150 :(1465899, 2)


### clean crime data: get YEAR, MONTH and dummy types

In [34]:
# clean crime records: calendar fields come from START_DATE
date = pd.to_datetime(crime.START_DATE)
crime['MONTH'] = date.dt.month
crime['YEAR'] = date.dt.year

# get dummies for types: one indicator column per METHOD / OFFENSE value,
# with zeros turned into missing values
METHOD_dummies = pd.get_dummies(crime.METHOD, prefix='crime_mtd')
METHOD_dummies = METHOD_dummies.replace(0, np.nan)

OFFENSE_dummies = pd.get_dummies(crime.OFFENSE, prefix='crime_ofn')
OFFENSE_dummies = OFFENSE_dummies.replace(0, np.nan)

# .copy() gives an independent frame, so adding crime_total does not write into
# a slice of the raw table (the original triggered a chained-assignment warning).
crime = crime[['MONTH', 'YEAR']].copy()
crime['crime_total'] = 1
crime = crime.merge(METHOD_dummies, left_index=True, right_index=True)
crime = crime.merge(OFFENSE_dummies, left_index=True, right_index=True)

### get features for each seg per month

In [35]:
# Attach every matched crime to its segment and total the counters per
# (segment, year, month).
segs_with_crime = (
    pts_has_ln
    .merge(crime, how='left', left_on=index_pt, right_index=True)
    .drop('index_pt', axis=1)
)

ftr_crime = segs_with_crime.groupby(['index_seg', 'YEAR', 'MONTH']).sum()
ftr_crime.to_csv(dir_data + fn_feature_crime_dc)