# Data Generator

Takes a standard-format CSV and does the one-time date parsing up front, so the result loads really fast afterwards. This lets you (the researcher) modify the data and then hand it off to me (the program).

- Make sure your data is at `crime.csv` relative to this file
- After running the cells below, put the generated `updatedcrimedata` file in the `data/` directory to use it in TrustTeaming

Thanks!

In [39]:
import pandas as pd

# Load the raw CSV and parse the three timestamp columns once, up front.
# `infer_datetime_format` is intentionally no longer passed: it is deprecated
# since pandas 2.0 (format inference is the default behaviour there) and only
# triggers a warning. The parsed values are unchanged.
test = pd.read_csv(
    "crime.csv",
    parse_dates=['FIRST_OCCURRENCE_DATE', 'LAST_OCCURRENCE_DATE', 'REPORTED_DATE'],
)

In [47]:
# Build the exported frame in one pipeline:
#   1. drop the four columns that are not exported,
#   2. order rows by first occurrence,
#   3. discard rows that have no longitude/latitude,
#   4. where the last-occurrence timestamp is missing, reuse the first one,
#   5. renumber the rows (the previous row labels are kept as an `index` column).
test2 = (
    test
    .drop(columns=['PRECINCT_ID', 'NEIGHBORHOOD_ID', 'GEO_X', 'GEO_Y'])
    .sort_values(by="FIRST_OCCURRENCE_DATE")
    .dropna(subset=["GEO_LON", "GEO_LAT"])
    .assign(
        LAST_OCCURRENCE_DATE=lambda frame: frame['LAST_OCCURRENCE_DATE'].fillna(
            frame['FIRST_OCCURRENCE_DATE']
        )
    )
    .reset_index()
)

# Persist in feather format next to this notebook, then show the result.
test2.to_feather("updatedcrimedata")
test2

Unnamed: 0,index,INCIDENT_ID,OFFENSE_ID,OFFENSE_CODE,OFFENSE_CODE_EXTENSION,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,FIRST_OCCURRENCE_DATE,LAST_OCCURRENCE_DATE,REPORTED_DATE,INCIDENT_ADDRESS,GEO_LON,GEO_LAT,DISTRICT_ID,IS_CRIME,IS_TRAFFIC
0,351573,20156000026,20156000026230500,2305,0,theft-items-from-vehicle,theft-from-motor-vehicle,2015-01-02 00:00:00,2015-01-02 12:00:00,2015-01-02 13:47:00,4904 N JULIAN ST,-105.030728,39.785783,1,1,0
1,238911,201513644,201513644230800,2308,0,theft-from-bldg,larceny,2015-01-02 00:00:00,2015-01-08 09:20:00,2015-01-08 10:53:00,6750 E STAPLETON S DR,-104.910120,39.777321,2,1,0
2,87955,20156000023,20156000023230500,2305,0,theft-items-from-vehicle,theft-from-motor-vehicle,2015-01-02 00:00:00,2015-01-02 07:00:00,2015-01-02 13:21:00,2121 DELGANY ST,-104.997312,39.757979,6,1,0
3,351572,20156000022,20156000022230500,2305,0,theft-items-from-vehicle,theft-from-motor-vehicle,2015-01-02 00:00:00,2015-01-02 07:00:00,2015-01-02 12:00:00,4455 N WOLFF ST,-105.048907,39.777578,1,1,0
4,65429,20151811,20151811549900,5499,0,traf-other,all-other-crimes,2015-01-02 01:03:00,2015-01-02 01:03:00,2015-01-02 03:27:00,N CENTRAL PARK BLVD / E NORTHFIELD BLVD,-104.883070,39.785728,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456952,60089,202045112,202045112549900,5499,0,traf-other,all-other-crimes,2020-01-21 03:33:00,2020-01-21 03:33:00,2020-01-21 04:39:00,E 4TH AVE / N GRANT ST,-104.983804,39.722545,3,1,0
456953,60078,202045132,202045132739902,7399,2,public-order-crimes-other,all-other-crimes,2020-01-21 03:56:00,2020-01-21 03:56:00,2020-01-21 03:56:00,100 W COLFAX AVE,-104.988833,39.740354,6,1,0
456954,60082,202045137,202045137549900,5499,0,traf-other,all-other-crimes,2020-01-21 03:59:00,2020-01-21 03:59:00,2020-01-21 04:34:00,4400 BLOCK MORRISON RD,-105.043983,39.703058,4,1,0
456955,60062,202045149,202045149739902,7399,2,public-order-crimes-other,all-other-crimes,2020-01-21 04:17:00,2020-01-21 04:17:00,2020-01-21 04:17:00,1499 N BROADWAY ST,-104.987485,39.739897,6,1,0


## Developer Tests

Trying out things with pandas here. There is no need to touch this section, unless you want to.

In [50]:

import pandas as pd
import datetime as datetime

# Load the pre-parsed frame written by the generator cell above.
crimeData = pd.read_feather("updatedcrimedata")

# One-day window, treated as half-open [start, end): an incident stamped
# exactly at midnight on the start day belongs to that day and is kept
# (a strict `>` on the lower bound would silently drop it).
start = pd.Timestamp(2017, 10, 1)
end = pd.Timestamp(2017, 10, 2)

# Offense categories to keep.
categories = ['theft-from-motor-vehicle', 'larceny', 'all-other-crimes']

# Other ad-hoc queries tried here, kept for reference:
# crimeData.loc[(crimeData['FIRST_OCCURRENCE_DATE'] > start) & (crimeData['FIRST_OCCURRENCE_DATE'] < end)].loc[crimeData['OFFENSE_CATEGORY_ID'] == 'larceny']
# crimeData[crimeData['INCIDENT_ADDRESS'].str.contains("COLFAX", na=False)]

first_seen = crimeData['FIRST_OCCURRENCE_DATE']
in_window = (first_seen >= start) & (first_seen < end)
in_categories = crimeData['OFFENSE_CATEGORY_ID'].isin(categories)

# Despite the name, this is a filtered subset; row order is whatever order
# `crimeData` already has.
sortedCD = crimeData.loc[in_window & in_categories]

sortedCD

Unnamed: 0,index,INCIDENT_ID,OFFENSE_ID,OFFENSE_CODE,OFFENSE_CODE_EXTENSION,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,FIRST_OCCURRENCE_DATE,LAST_OCCURRENCE_DATE,REPORTED_DATE,INCIDENT_ADDRESS,GEO_LON,GEO_LAT,DISTRICT_ID,IS_CRIME,IS_TRAFFIC
249026,173937,2017658158,2017658158230400,2304,0,theft-parts-from-vehicle,theft-from-motor-vehicle,2017-10-01 00:01:00,2017-10-01 09:45:00,2017-10-01 10:26:00,4310 N AIRPORT WAY,-104.795056,39.774673,5,1,0
249028,321441,2017657605,2017657605239900,2399,0,theft-other,larceny,2017-10-01 00:04:00,2017-10-01 00:04:00,2017-10-01 01:05:00,2945 N IVY ST,-104.920256,39.759181,2,1,0
249039,297494,20176009262,20176009262239901,2399,1,theft-bicycle,larceny,2017-10-01 01:10:00,2017-10-01 06:00:00,2017-10-01 15:22:00,1620 N FILLMORE ST,-104.953431,39.742429,2,1,0
249065,173958,2017658578,2017658578230500,2305,0,theft-items-from-vehicle,theft-from-motor-vehicle,2017-10-01 03:14:00,2017-10-01 03:14:00,2017-10-01 16:15:00,3000 BLK S WILLOW ST,-104.891277,39.660948,3,1,0
249075,177010,20178045111,20178045111239902,2399,2,theft-gas-drive-off,larceny,2017-10-01 04:19:00,2017-10-01 04:19:00,2017-10-01 04:19:00,23970 E 78TH AVE,-104.706429,39.837989,7,1,0
249094,367951,2017735508,2017735508239900,2399,0,theft-other,larceny,2017-10-01 08:00:00,2017-11-02 08:00:00,2017-11-02 10:05:00,4720 N URAVAN ST,-104.779951,39.782151,5,1,0
249095,373828,2017764687,2017764687239900,2399,0,theft-other,larceny,2017-10-01 08:00:00,2017-11-14 08:00:00,2017-11-14 13:05:00,390 N VALLEJO ST,-105.012259,39.721541,4,1,0
249097,2008,2018110497,2018110497239900,2399,0,theft-other,larceny,2017-10-01 08:00:00,2018-02-15 16:44:00,2018-02-15 16:44:00,4300 E 9TH AVE,-104.936661,39.730872,2,1,0
249099,388584,20175004542,20175004542230400,2304,0,theft-parts-from-vehicle,theft-from-motor-vehicle,2017-10-01 09:00:00,2017-10-01 11:00:00,2017-10-02 13:30:00,3100 BLK W LOUISIANA AVE,-105.026475,39.692639,4,1,0
249102,177630,20176009335,20176009335230500,2305,0,theft-items-from-vehicle,theft-from-motor-vehicle,2017-10-01 09:30:00,2017-10-01 18:00:00,2017-10-03 08:12:00,1701 N BRYANT ST,-105.020108,39.743922,1,1,0


In [37]:
import datetime as datetime

# Re-read the feather file produced by the generator cell.
crimeData = pd.read_feather(path="updatedcrimedata")



Timestamp('2015-01-02 00:00:00')