In [1]:
import pandas as pd
import holidays
import json
us_holidays = holidays.US()

In [2]:
dv = '../data/denver_crime.csv'
weather = '../data/selected_weather_data.csv'
event = '../data/event_data.csv'
dv_json = '../data/denver_related.json'

In [3]:
parse_dates = ['FIRST_OCCURRENCE_DATE', 'LAST_OCCURRENCE_DATE', 'REPORTED_DATE']
useful_cols =['OFFENSE_TYPE_ID','OFFENSE_CATEGORY_ID', 'FIRST_OCCURRENCE_DATE', 'LAST_OCCURRENCE_DATE', 'REPORTED_DATE', 
            'INCIDENT_ADDRESS','GEO_LON','GEO_LAT', 'NEIGHBORHOOD_ID', 'IS_TRAFFIC']
dvdf = pd.read_csv(dv, parse_dates=parse_dates, usecols=useful_cols,nrows=100)

dvdf.columns = map(str.lower, dvdf.columns)
dvdf.rename(columns={"offense_type_id": "crime_type", "offense_category_id": "crime_category"}, inplace=True)

In [4]:
weather_df = pd.read_csv(weather, parse_dates = ['datetime'])

In [5]:
dvdf['tmp'] = pd.DatetimeIndex(dvdf['first_occurrence_date']).round('H')
dvdf['city']= 'Denver'
dvdf = dvdf.merge(weather_df, left_on=['city', 'tmp'], right_on =['city_name', 'datetime'],how='left')
dvdf.drop(columns=['tmp', 'datetime', 'city_name'], inplace=True)

In [6]:
# Crime rate
dvdf['crime_rate'] = (len(dvdf.index) / 6) * (10000 / 2723000)

In [7]:
dvdf.head()

Unnamed: 0,crime_type,crime_category,first_occurrence_date,last_occurrence_date,reported_date,incident_address,geo_lon,geo_lat,neighborhood_id,is_traffic,city,temperature,humidity,weather_main,weather_description,crime_rate
0,weapon-unlawful-discharge-of,all-other-crimes,2016-06-15 23:31:00,NaT,2016-06-15 23:31:00,,-104.809881,39.773188,montbello,0,Denver,32.05,18,Clouds,few clouds,0.062431
1,theft-other,larceny,2017-10-11 12:30:00,2017-10-11 16:55:00,2018-01-29 17:53:00,,-104.781434,39.785649,gateway-green-valley-ranch,0,Denver,4.87,55,Clear,sky is clear,0.062431
2,theft-items-from-vehicle,theft-from-motor-vehicle,2016-03-04 20:00:00,2016-04-25 08:00:00,2016-04-26 21:02:00,2932 S JOSEPHINE ST,-104.957381,39.66349,wellshire,0,Denver,13.28,28,Clouds,broken clouds,0.062431
3,theft-other,larceny,2018-01-30 19:20:00,NaT,2018-01-30 22:29:00,705 S COLORADO BLVD,-104.94144,39.702698,belcaro,0,Denver,14.38,14,Clouds,broken clouds,0.062431
4,theft-shoplift,larceny,2017-06-22 20:53:00,NaT,2017-06-23 16:09:00,2810 E 1ST AVE,-104.95537,39.717107,cherry-creek,0,Denver,26.59,30,Clouds,broken clouds,0.062431


In [8]:
dvdf.columns

Index(['crime_type', 'crime_category', 'first_occurrence_date',
       'last_occurrence_date', 'reported_date', 'incident_address', 'geo_lon',
       'geo_lat', 'neighborhood_id', 'is_traffic', 'city', 'temperature',
       'humidity', 'weather_main', 'weather_description', 'crime_rate'],
      dtype='object')

In [9]:
dvdf['crime_date'] = pd.DatetimeIndex(dvdf['first_occurrence_date']).date
dvdf['day_of_week'] = pd.DatetimeIndex(dvdf['first_occurrence_date']).dayofweek
dvdf['week_of_year'] = pd.DatetimeIndex(dvdf['first_occurrence_date']).week
dvdf['quarter'] = pd.DatetimeIndex(dvdf['first_occurrence_date']).quarter

# holiday
weekend = pd.DataFrame({'day_of_week':[0,1,2,3,4,5,6],'weekend':[0,0,0,0,0,1,1]})
dvdf = dvdf.merge(weekend, how='left', left_on=['day_of_week'], right_on = ['day_of_week'])
dvdf['holiday'] = dvdf['crime_date'].apply(lambda x: us_holidays.get(x) is not None)
dvdf['holiday_name'] = dvdf['crime_date'].apply(lambda x: us_holidays.get(x))

In [10]:
# crime severity index
with open(dv_json) as f:
    dv_dict = json.load(f)

dvdf['crime_severity_c'] = dvdf['crime_category'].apply(lambda x: True if x in dv_dict['violent-crime'] else False)
dvdf['crime_severity_t'] = dvdf['crime_type'].apply(lambda x: True if x in dv_dict['violent-crime'] else False)
dvdf['crime_severity'] = dvdf.crime_severity_c | dvdf.crime_severity_t
dvdf.drop(columns=['crime_severity_t', 'crime_severity_c'], inplace=True)
dvdf['crime_severity'].replace([True, False], ['violent', 'non-violent'], inplace = True)

In [11]:
# is nighttime
dvdf['is_nighttime'] = pd.DatetimeIndex(dvdf['first_occurrence_date']).hour
dvdf['is_nighttime'] = dvdf['is_nighttime'].apply(lambda x: True if x >= 21 or x<=5 else False)

In [12]:
# is fatal
dvdf['is_fatal_c'] = dvdf['crime_category'].apply(lambda x: True if x in dv_dict['fatal-crime'] else False)
dvdf['is_fatal_t'] = dvdf['crime_type'].apply(lambda x: True if x in dv_dict['fatal-crime'] else False)
dvdf['is_fatal'] = dvdf.is_fatal_c | dvdf.is_fatal_t
dvdf.drop(columns=['is_fatal_c', 'is_fatal_t'], inplace=True)

In [None]:
def out(path:str, df:pd.DataFrame):
    df.to_csv(path)