In [None]:
import pandas as pd
import numpy as np
import holidays
import json
import utm
from time import time

In [None]:
# Holiday calendar built by the `holidays` package's CA class; it is queried
# with `.get(date)` further down to flag records reported on a holiday.
ca_holidays = holidays.CA()
def xy_2_lonlat(x, y):
    """Convert a UTM zone 10U easting/northing pair to ``(longitude, latitude)``.

    ``utm.to_latlon`` returns its result as ``(latitude, longitude)``. The pair
    is reordered here so the output matches this function's name and the
    ``['longitude', 'latitude']`` columns its result is unpacked into.

    Parameters
    ----------
    x : float
        UTM easting.
    y : float
        UTM northing.

    Returns
    -------
    tuple
        ``(longitude, latitude)``.
    """
    latitude, longitude = utm.to_latlon(x, y, 10, 'U')
    return longitude, latitude

In [None]:
# Input file locations, relative to the notebook's working directory:
# the crime CSV, the weather CSV, and the JSON file holding the type mapping
# and the violent/fatal crime lists.
vc, weather, vc_json = (
    '../data/van_crime.csv',
    '../data/selected_weather_data.csv',
    '../data/van_related.json',
)

In [None]:
# Load the crime CSV and normalise every column label to lower case.
dvvc = pd.read_csv(vc)
dvvc.columns = [label.lower() for label in dvvc.columns]
# Keep the source 'type' column under an explicit name until it has been
# mapped to a category/type pair.
dvvc = dvvc.rename(columns={'type': 'original_crime_type'})

In [None]:
# Take subset between 2015 and 2020 (both bounds inclusive).
# A boolean mask is used rather than passing index *labels* to the positional
# `.iloc`, which is only correct while the index is a default RangeIndex.
# The index is reset afterwards so that frames built positionally from lists
# later in the notebook line up row-for-row with `dvvc`.
in_range = dvvc['year'].between(2015, 2020)
dvvc = dvvc.loc[in_range].reset_index(drop=True)

In [None]:
# Mapping crime category/type
# The JSON file's 'type-mapping' table maps each original crime type to a
# sequence whose element 0 is used as the category and element 1 as the type.
with open(vc_json) as json_file:
    vc_dict = json.load(json_file)
_type_mapping_dict = vc_dict['type-mapping']

# Plain indexing is kept so that an unmapped crime type still raises KeyError.
original_types = dvvc['original_crime_type']
dvvc['crime_category'] = original_types.map(lambda name: _type_mapping_dict[name][0])
dvvc['crime_type'] = original_types.map(lambda name: _type_mapping_dict[name][1])
dvvc = dvvc.drop(columns=['original_crime_type'])

In [None]:
# City
# Tag every record with the same constant city name.
dvvc = dvvc.assign(city='Vancouver')

In [None]:
# Hundred block -> address
dvvc = dvvc.rename(columns={'hundred_block': 'address'})

In [None]:
# X, Y -> longitude, latitude
start = time()

# Convert every x/y pair that lies inside the accepted range; anything else
# (including missing values, for which the comparisons are False) becomes (0, 0).
# The two elements returned by xy_2_lonlat are stored, in order, as the
# 'longitude' and 'latitude' columns.
converted = [
    xy_2_lonlat(easting, northing)
    if (100000 <= easting <= 999999 and 0 <= northing <= 10000000)
    else (0, 0)
    for easting, northing in zip(dvvc['x'], dvvc['y'])
]

# The frame must be built on dvvc's own index: a frame created from a plain
# list gets a fresh 0..n-1 index, and column assignment aligns on index labels,
# so on a filtered (non-contiguous) index the coordinates would be attached to
# the wrong rows or silently turned into NaN.
coords = pd.DataFrame(converted, index=dvvc.index, columns=['longitude', 'latitude'])
dvvc[['longitude', 'latitude']] = coords.fillna(0)
dvvc = dvvc.drop(columns=['x', 'y'])
print('Time elapsed: %s' % (time()-start))

In [None]:
# Assemble one timestamp per record from its separate date and time columns.
datetime_parts = ['year', 'month', 'day', 'hour', 'minute']
dvvc['reported_datetime'] = pd.to_datetime(dvvc[datetime_parts])

In [None]:
# Weather
# Left-join the weather rows whose city name and timestamp match each record,
# using the reported timestamp rounded to the nearest hour as the join key.
weather_df = pd.read_csv(weather, parse_dates=['datetime'])
dvvc = dvvc.assign(
    tmp=dvvc['reported_datetime'].dt.round('H'),
    city='Vancouver',
)
merged = dvvc.merge(
    weather_df,
    how='left',
    left_on=['city', 'tmp'],
    right_on=['city_name', 'datetime'],
)
# The helper key and the weather-side join columns are no longer needed.
dvvc = merged.drop(columns=['tmp', 'datetime', 'city_name'])

In [None]:
# Crime rate
# Mean number of records per distinct year, scaled by 10000 / population.
# 651416 is the average population from 2014 to 2017.
population = 651416
records_per_year = len(dvvc) / dvvc['year'].nunique(dropna=False)
dvvc['crime_rate'] = records_per_year * (10000 / population)

In [None]:
# Calendar features derived from the reported timestamp.
reported = dvvc['reported_datetime']
dvvc['crime_date'] = reported.dt.date
dvvc['day_of_week'] = reported.dt.dayofweek
# `DatetimeIndex.week` was deprecated in pandas 1.1 and removed in 2.0;
# `.dt.isocalendar().week` provides the ISO week number instead (pandas >= 1.1).
dvvc['week_of_year'] = reported.dt.isocalendar().week
dvvc['quarter'] = reported.dt.quarter

# Weekend, holiday
# dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday. A direct
# membership test replaces the merge against a seven-row lookup frame.
dvvc['weekend'] = dvvc['day_of_week'].isin([5, 6])
# Query the holiday calendar once per row and derive both columns from that
# single result; `.get` returns None for dates that are not holidays.
_holiday_lookup = dvvc['crime_date'].map(ca_holidays.get)
dvvc['holiday'] = _holiday_lookup.notna()
dvvc['holiday_name'] = _holiday_lookup.fillna('NOT APPLICABLE')

In [None]:
# Crime severity index
# assumes vc_dict['violent-crime'] is a list of category/type names - TODO confirm
_violent_crime_lst = vc_dict['violent-crime']

# A record counts as violent when either its category or its type is listed.
is_violent = (
    dvvc['crime_category'].isin(_violent_crime_lst)
    | dvvc['crime_type'].isin(_violent_crime_lst)
)
# Assign the labels directly. The previous
# `dvvc['crime_severity'].replace(..., inplace=True)` mutated a column
# selection in place, which warns on recent pandas and has no effect on `dvvc`
# once Copy-on-Write is enabled.
dvvc['crime_severity'] = np.where(is_violent, 'violent', 'non-violent')

In [None]:
# is nighttime
# True when the reported hour is 21 or later, or 5 or earlier.
reported_hour = dvvc['hour']
dvvc['is_nighttime'] = (reported_hour >= 21) | (reported_hour <= 5)

In [None]:
# is fatal
# assumes vc_dict['fatal-crime'] is a list of category/type names - TODO confirm
_fatal_crime_lst = vc_dict['fatal-crime']

# A record is fatal when either its category or its type appears in the list.
fatal_by_category = dvvc['crime_category'].isin(_fatal_crime_lst)
fatal_by_type = dvvc['crime_type'].isin(_fatal_crime_lst)
dvvc['is_fatal'] = fatal_by_category | fatal_by_type

In [None]:
# is traffic
# True only for records whose category is exactly 'traffic-accident'.
dvvc['is_traffic'] = dvvc['crime_category'].eq('traffic-accident')

In [None]:
# Drop useless cols
# The separate date/time component columns are removed here.
obsolete_columns = ['year', 'month', 'day', 'hour', 'minute']
dvvc = dvvc.drop(columns=obsolete_columns)

In [None]:
# reported_datetime -> reported_date, reported_time
reported = dvvc['reported_datetime']
# normalize() truncates each timestamp to midnight of the same day.
# round('D') rounds to the *nearest* midnight, which moved every record
# reported at or after noon onto the following date.
dvvc['reported_date'] = reported.dt.normalize()
dvvc['reported_time'] = reported.dt.time
dvvc = dvvc.drop(columns=['reported_datetime'])

In [None]:
# Empty first_occurrence_date/time, last_occurrence_date/time
# All four columns are filled with the same placeholder: 1970-01-01 for the
# dates and midnight (00:00:00) for the times.
n_rows = len(dvvc.index)
empty_date = pd.to_datetime(['1970-01-01'] * n_rows)
empty_time = pd.to_datetime(['1970-01-01 00:00:00'] * n_rows).time

placeholders = {
    'first_occurrence_date': empty_date,
    'first_occurrence_time': empty_time,
    'last_occurrence_date': empty_date,
    'last_occurrence_time': empty_time,
}
for column, values in placeholders.items():
    dvvc[column] = values

In [None]:
# Preview the first rows of the transformed frame.
dvvc.head()

In [None]:
# List the final column labels before export.
dvvc.columns

In [None]:
# Write the transformed frame to CSV without the row index.
output_path = '../out/' + 'vc.csv'
dvvc.to_csv(output_path, index=False)