# Crime data pre-proccessing 

## 



In [156]:
# import libraries

import pandas as pd
import numpy as np
import utm
import re

In [157]:
# read csv

df = pd.read_csv('downloaded_data/new_crimedata_csv_AllNeighbourhoods_2022.csv')


In [158]:
# find number of unique values

df.nunique()

TYPE                11
YEAR                 1
MONTH               12
DAY                 31
HOUR                24
MINUTE              60
HUNDRED_BLOCK     7290
NEIGHBOURHOOD       24
X                14652
Y                14659
dtype: int64

## Cleaning data

In [159]:
#  for certain crimes like homicides, some of the data is unlabelled in order to respect the privacy of victims. 

df.isnull().sum()

TYPE             0
YEAR             0
MONTH            0
DAY              0
HOUR             0
MINUTE           0
HUNDRED_BLOCK    0
NEIGHBOURHOOD    5
X                1
Y                1
dtype: int64

In [160]:
# creating column with datetime object

dt = df.columns[1:5]
df['datetime'] = pd.to_datetime(df[dt])


In [161]:
# This is not important, recorded time the crime occcured does not accurately reflect the actual time the crime occurred
# There is always a slight delay in reporting of the crime

df = df.drop(columns=['MINUTE'])


In [164]:
# count of types of crime in 2021 and 2022

df[['YEAR', 'TYPE']].groupby(['TYPE']).count()


Unnamed: 0_level_0,YEAR
TYPE,Unnamed: 1_level_1
Break and Enter Commercial,1983
Break and Enter Residential/Other,1265
Homicide,11
Mischief,5606
Offence Against a Person,3872
Other Theft,10726
Theft from Vehicle,7260
Theft of Bicycle,1516
Theft of Vehicle,911
Vehicle Collision or Pedestrian Struck (with Fatality),18


In [163]:
# set only to relevant years

df=df[df['YEAR']==2022]

In [165]:
# dropping null 

df=df.dropna()


In [166]:
# The lattitude and longitude in the database are in UTM coordinate system
# In order to create a tableau dashboard, the UTM coordiantes need to be converted to lat/long WGS84 coordinate

# this function takes the UTM coordinates in that region and turns them into WGS84 coordinates
def rule(row):
    try:
        lat, long = utm.to_latlon(easting=row["X"], northing=row["Y"], zone_number=10, zone_letter='N')
    except:
        lat, long = None, None
        pd.Series({"lat": lat, "long": long})
    return pd.Series({"lat": lat, "long": long})


In [167]:
# run function

df=df.merge(df.apply(rule, axis=1), left_index=True, right_index=True)


In [168]:
# extract weekday from datetime object

# this could be useful to know as there could be more thefts during certain days of the week (like the weekend)
# thefts could also be less likely to occur during specific days when at-risk populations recieve income and disability assistance
# This is only speculation and would require a proper study to make any explicit claims. What I am doing here can be seen as pre-study research

# day name
df['weekday']=df['datetime'].dt.day_name()

# day number
df['weekday_date']=df['datetime'].dt.weekday

# month
df['month']=df['datetime'].dt.month_name()


Upon further expirmentation in tableau I found the best way to feature-engineer the data by weekdays, while still having enough data points is to split the days by weekend and weekday. If the user decides they want all the data points they can still choose the "All" button

In [169]:
# function that converts the value from 'weekday_date' column and turns it 'weekend' or 'weekday'

def time_of_week_func(x):
    if x>4 or x==0:
        w='Weekend'
    else:
        w='Weekday'
    return w

In [170]:
# run time_of_week_func function

df['time_of_week']=df['weekday_date'].apply(time_of_week_func)

In [171]:
# turning months into seasons

# I found the best way to have enough data points while still having relevent information is by splitting the months into warmer and colder months

# label numbered month into warmer/colder season
def season_func(row):
    if row['MONTH'] >=5 and row['MONTH'] <=10:
        season='May - October (Warmer Months)'
    else:
        season='November - April (Colder Months)'
    return season


In [172]:
# run function

df['season']=df.apply(season_func, axis=1, raw=False)

In [173]:
#this is our new dataset

df.head(5)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,datetime,lat,long,weekday,weekday_date,month,time_of_week,season
0,Other Theft,2022,5,10,12,48XX CAMBIE ST,Riley Park,491398.1033,5454383.0,2022-05-10 12:00:00,49.242165,-123.118179,Tuesday,1,May,Weekday,May - October (Warmer Months)
1,Other Theft,2022,7,7,22,48XX CAMBIE ST,Riley Park,491398.1033,5454383.0,2022-07-07 22:00:00,49.242165,-123.118179,Thursday,3,July,Weekday,May - October (Warmer Months)
2,Other Theft,2022,9,18,9,48XX CAMBIE ST,Riley Park,491398.1033,5454383.0,2022-09-18 09:00:00,49.242165,-123.118179,Sunday,6,September,Weekend,May - October (Warmer Months)
3,Other Theft,2022,3,31,15,48XX CAMBIE ST,Riley Park,491398.1095,5454383.0,2022-03-31 15:00:00,49.242165,-123.118179,Thursday,3,March,Weekday,November - April (Colder Months)
4,Other Theft,2022,11,25,12,48XX CLARENDON ST,Renfrew-Collingwood,496025.8227,5454175.0,2022-11-25 12:00:00,49.240337,-123.054598,Friday,4,November,Weekday,November - April (Colder Months)


In [174]:
# drop the columns which are not needed

df=df.drop(columns=['X','Y', 'datetime','weekday_date'])

In [175]:
df

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,lat,long,weekday,month,time_of_week,season
0,Other Theft,2022,5,10,12,48XX CAMBIE ST,Riley Park,49.242165,-123.118179,Tuesday,May,Weekday,May - October (Warmer Months)
1,Other Theft,2022,7,7,22,48XX CAMBIE ST,Riley Park,49.242165,-123.118179,Thursday,July,Weekday,May - October (Warmer Months)
2,Other Theft,2022,9,18,9,48XX CAMBIE ST,Riley Park,49.242165,-123.118179,Sunday,September,Weekend,May - October (Warmer Months)
3,Other Theft,2022,3,31,15,48XX CAMBIE ST,Riley Park,49.242165,-123.118179,Thursday,March,Weekday,November - April (Colder Months)
4,Other Theft,2022,11,25,12,48XX CLARENDON ST,Renfrew-Collingwood,49.240337,-123.054598,Friday,November,Weekday,November - April (Colder Months)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34176,Other Theft,2022,2,8,0,47XX INVERNESS ST,Kensington-Cedar Cottage,49.241522,-123.080249,Tuesday,February,Weekday,November - April (Colder Months)
34177,Other Theft,2022,2,23,20,47XX KNIGHT ST,Kensington-Cedar Cottage,49.242847,-123.076304,Wednesday,February,Weekday,November - April (Colder Months)
34178,Other Theft,2022,4,20,19,47XX MANOR ST,Renfrew-Collingwood,49.242581,-123.038761,Wednesday,April,Weekday,November - April (Colder Months)
34179,Other Theft,2022,8,11,19,47XX NW MARINE DR,West Point Grey,49.276167,-123.215744,Thursday,August,Weekday,May - October (Warmer Months)


# Thefts from Vehicles Project

For the next part of the project I created a map which includes the thefts from vehicles.

Value statement: As someone who frequently parks their car in Vancouver, it always is a bit nerve-racking leaving my car parked on the street/in a parking lot, as I may come back to find a window smashed and the contents in my car stolen. I wanted to create a dashboard which could visualize where and when these thefts occur, so that the user could chooose safer spots to park their vehicle.

In [176]:
# first I will drop all the other types of crimes

df_car_thefts=df[df['TYPE']=='Theft from Vehicle']

In [177]:
# occurences of breakins in 2021 and 2022 by neighbourhood

# creat filter for previous year (its the only full year you can get real data from)
filt=df_car_thefts['YEAR']>=2021

# groupby+filt
df_car_thefts[filt][['NEIGHBOURHOOD', 'TYPE']].groupby(['NEIGHBOURHOOD']).count()

Unnamed: 0_level_0,TYPE
NEIGHBOURHOOD,Unnamed: 1_level_1
Arbutus Ridge,71
Central Business District,2313
Dunbar-Southlands,93
Fairview,300
Grandview-Woodland,302
Hastings-Sunrise,260
Kensington-Cedar Cottage,332
Kerrisdale,56
Killarney,165
Kitsilano,256


In [178]:
# count of breakings during each day
# we can see that break-ins happen more often during the weekend and monday and decreases during the week

# groupby+filt
df_car_thefts[filt][['weekday', 'TYPE']].groupby(['weekday']).count()

Unnamed: 0_level_0,TYPE
weekday,Unnamed: 1_level_1
Friday,1099
Monday,1061
Saturday,1157
Sunday,1054
Thursday,1014
Tuesday,985
Wednesday,890


In [179]:
# how many breakings happen every month

# groupby+filt
df_car_thefts[filt][['MONTH', 'TYPE']].groupby(['MONTH']).count()

Unnamed: 0_level_0,TYPE
MONTH,Unnamed: 1_level_1
1,593
2,541
3,697
4,667
5,601
6,640
7,574
8,621
9,676
10,654


In [180]:
# how many car thefts happen during the colder/warmer months

# groupby+filt
df_car_thefts[filt][['season', 'TYPE']].groupby(['season']).count()

Unnamed: 0_level_0,TYPE
season,Unnamed: 1_level_1
May - October (Warmer Months),3766
November - April (Colder Months),3494


In [181]:
df_car_thefts=df_car_thefts.drop(columns=['TYPE'])

In [182]:
# dataset

df_car_thefts.head()

Unnamed: 0,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,lat,long,weekday,month,time_of_week,season
4126,2022,10,4,20,0_NK LOC ST,Strathcona,49.281851,-123.099466,Tuesday,October,Weekday,May - October (Warmer Months)
4127,2022,3,21,15,10XX ALBERNI ST,West End,49.285153,-123.123494,Monday,March,Weekend,November - April (Colder Months)
4128,2022,5,13,9,10XX ALBERNI ST,West End,49.285,-123.123259,Friday,May,Weekday,May - October (Warmer Months)
4129,2022,1,17,19,10XX ALBERNI ST,West End,49.284955,-123.123013,Monday,January,Weekend,November - April (Colder Months)
4130,2022,4,10,12,10XX ALBERNI ST,West End,49.284955,-123.123013,Sunday,April,Weekend,November - April (Colder Months)


### Car-thefts pre-processsing: keeping only the previous 12-months

## UPDATE: The database changed and VPD stopped including the most recent year. So I didn't have a need for the following code.

Including the same months for both 2021 and 2022 will skew the data and make it appear that more car breakins are occurring that month.

Although more data points are generally a good thing, it will be bad data if they are included in the data-set.
Any sort of exploratory analysis, or statistical analysis will get spoiled if we use the bad data.

As the old adage goes "garbage in, garbage out"

Therefore the overlapping months need to get deleted removed from the previous year

In [154]:
'''# dropping the months of of 2021 which overlap with 2022


# first we need to find what the most recent month is in 2022

# create filter for 2022
year_filt=df_car_thefts['YEAR']==2022


# create filter to find the most recent month in 2022
month_max=df_car_thefts[year_filt]['MONTH'].max()
month_filt=df_car_thefts['MONTH']==month_max


# It would be bad practice to include a month that has only partially passed, however I still wanted to include it if the month had mostly passed. 
# I compromised by choosing to include the month  only if at least 24 days (around 80%) of that month had passed

#find the greatest day for the most recent month
day_max=df_car_thefts[month_filt][year_filt]['DAY'].max()

# function which finds out whether approx. 80% of the month has passed
def month_func(x):
    if x>=24:
        ret=month_max
    else:
        ret=month_max-1
    return ret

# run function
actual_month_max=month_func(day_max)
actual_month_max #this is the month which will be included'''

  day_max=df_car_thefts[month_filt][year_filt]['DAY'].max()


12

In [155]:
'''# a filter which gets rid of the month(s) in the previous year

sel_rows=df_car_thefts[(df_car_thefts['MONTH']<=actual_month_max) & (df_car_thefts['YEAR']==2021)].index
df_car_thefts=df_car_thefts.drop(sel_rows, axis=0)

# dataframe
df_car_thefts.head()'''

Unnamed: 0,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,lat,long,weekday,month,time_of_week,season
4126,2022,10,4,20,0_NK LOC ST,Strathcona,49.281851,-123.099466,Tuesday,October,Weekday,May - October (Warmer Months)
4127,2022,3,21,15,10XX ALBERNI ST,West End,49.285153,-123.123494,Monday,March,Weekend,November - April (Colder Months)
4128,2022,5,13,9,10XX ALBERNI ST,West End,49.285,-123.123259,Friday,May,Weekday,May - October (Warmer Months)
4129,2022,1,17,19,10XX ALBERNI ST,West End,49.284955,-123.123013,Monday,January,Weekend,November - April (Colder Months)
4130,2022,4,10,12,10XX ALBERNI ST,West End,49.284955,-123.123013,Sunday,April,Weekend,November - April (Colder Months)


# Parking meter data set

In [183]:
# load parking meter data set

'''this is a dataset I found which includes the coordinates of all city pay parkins'''

park_df=pd.read_excel('downloaded_data/parking-meters.xls')
park_df.head()

Unnamed: 0,METERHEAD,R_MF_9A_6P,R_MF_6P_10,R_SA_9A_6P,R_SA_6P_10,R_SU_9A_6P,R_SU_6P_10,RATE_MISC,TIMEINEFFE,T_MF_9A_6P,...,T_SA_9A_6P,T_SA_6P_10,T_SU_9A_6P,T_SU_6P_10,TIME_MISC,CREDITCARD,PAY_PHONE,Geom,Geo Local Area,METERID
0,Twin,$1.00,$1.00,$1.00,$1.00,$1.00,$1.00,,METER IN EFFECT: 9:00 AM TO 10:00 PM,3 Hr,...,3 Hr,4 Hr,3 Hr,4 Hr,,No,56533,"{""coordinates"": [-123.03362137823906, 49.23285...",Killarney,993358
1,Twin,$1.00,$1.00,$1.00,$1.00,$1.00,$1.00,,METER IN EFFECT: 9:00 AM TO 10:00 PM,3 Hr,...,3 Hr,4 Hr,3 Hr,4 Hr,,No,56469,"{""coordinates"": [-123.03313623855607, 49.23286...",Renfrew-Collingwood,993371
2,Twin,$1.00,$1.00,$1.00,$1.00,$1.00,$1.00,,METER IN EFFECT: 9:00 AM TO 10:00 PM,2 Hr,...,2 Hr,4 Hr,2 Hr,4 Hr,,No,57554,"{""coordinates"": [-123.10096093278823, 49.25907...",Mount Pleasant,512904
3,Twin,$1.00,$4.00,$1.00,$4.00,$1.00,$4.00,,METER IN EFFECT: 9:00 AM TO 10:00 PM,2 Hr,...,2 Hr,4 Hr,2 Hr,4 Hr,,No,56215,"{""coordinates"": [-123.10131362071866, 49.25781...",Mount Pleasant,513019
4,Twin,$1.00,$4.00,$1.00,$4.00,$1.00,$4.00,,METER IN EFFECT: 9:00 AM TO 10:00 PM,2 Hr,...,2 Hr,4 Hr,2 Hr,4 Hr,,No,57150,"{""coordinates"": [-123.10103668736426, 49.25781...",Mount Pleasant,513018


In [184]:
# only need the longitude/lattitude column

park_df=park_df[['Geom']]

In [185]:
# number of long/lat points on the graph

park_df.nunique()

Geom    4715
dtype: int64

In [186]:
# dropping duplicates

park_df=park_df.drop_duplicates(subset=['Geom'], keep=False)

In [187]:
# null count

park_df.isnull().sum()

Geom    1
dtype: int64

In [188]:
# drop the nulls

park_df=park_df.dropna()

In [189]:
# pre-processing longitude/lattitude vales

# the data value describing the data for the long and lat is messy and requires some data cleaning
# the data is stored as a set of dictionaries. Pandas cannot interpret dictionaries, so they must be turned into a string and pre-processed further

# turn data into string to prepare it for pre-processing
park_df=park_df[['Geom']].astype(str)

# function to extract longitude from data
def long_func(x):
    a=x.split()
    if len(a)>1:
        ret=a[1]
        ret = re.sub(r'[^-\d*\.\d*]', "", ret)
    else:
        ret=None
    return ret

# function to extract lattitude from data
def lat_func(x):
    a=x.split()
    if len(a)>1:
        ret=a[2]
        ret = re.sub(r'[^-\d*\.\d*]', "", ret)
    else:
        ret=None
    return ret

In [190]:
# run functions

park_df['park_lat']=[lat_func(x) for x in park_df['Geom']]
park_df['park_long']=[long_func(x) for x in park_df['Geom']]

# dataset
park_df.head()

Unnamed: 0,Geom,park_lat,park_long
5,"{""coordinates"": [-123.10249423430041, 49.27706...",49.27706523628811,-123.1024942343004
6,"{""coordinates"": [-123.1029445090042, 49.277403...",49.27740316288875,-123.1029445090042
7,"{""coordinates"": [-123.0898869035528, 49.270338...",49.27033821755311,-123.0898869035528
8,"{""coordinates"": [-123.10556622620037, 49.27691...",49.27691853858081,-123.10556622620037
9,"{""coordinates"": [-123.10376713040408, 49.27681...",49.27681160535016,-123.10376713040408


In [191]:
# drop the columns that are not needed

park_df=park_df.drop(columns=['Geom'])

In [192]:
# turn dtypes from bojects into floats

park_df=park_df[['park_lat','park_long']].astype(float)

park_df.dtypes


park_lat     float64
park_long    float64
dtype: object

In [193]:
# dataset

park_df.head()

Unnamed: 0,park_lat,park_long
5,49.277065,-123.102494
6,49.277403,-123.102945
7,49.270338,-123.089887
8,49.276919,-123.105566
9,49.276812,-123.103767


# Export

In [194]:
# export the data sets

park_df.to_excel('cleaned_data/city_parking_dataset.xlsx')
df_car_thefts.to_excel('cleaned_data/car_thefts_dataset.xlsx')