# Crime data pre-proccessing 

## 



In [138]:
# import libraries

import pandas as pd
import numpy as np
import utm
import re

In [139]:
# read csv

data = pd.read_csv('crimedata_csv_AllNeighbourhoods_2021+2022.csv')


In [140]:
# create copy for safety

df = data.copy()

In [141]:
# dataframe

df

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,Break and Enter Commercial,2021,11,21,6,33,10XX ALBERNI ST,West End,491015.9434,5459166.140
1,Break and Enter Commercial,2021,11,26,13,58,10XX BARCLAY ST,West End,490833.8455,5458886.535
2,Break and Enter Commercial,2021,12,14,2,2,10XX BEACH AVE,Central Business District,490252.3815,5458162.723
3,Break and Enter Commercial,2021,7,17,5,0,10XX BEACH AVE,Central Business District,490255.3982,5458158.788
4,Break and Enter Commercial,2021,6,15,4,0,10XX BEACH AVE,Central Business District,490258.4148,5458154.853
...,...,...,...,...,...,...,...,...,...,...
35454,Vehicle Collision or Pedestrian Struck (with I...,2022,1,7,14,1,W 7TH AVE / ALDER ST,Fairview,490448.0000,5456949.000
35455,Vehicle Collision or Pedestrian Struck (with I...,2022,1,18,20,4,WILLOW ST / W BROADWAY AVE,Fairview,491136.0000,5456735.000
35456,Vehicle Collision or Pedestrian Struck (with I...,2022,2,8,19,52,WINDSOR ST / E 49TH AVE,Sunset,493792.0000,5452516.000
35457,Vehicle Collision or Pedestrian Struck (with I...,2022,1,4,14,56,X BLOCK E 33RD AVE,Riley Park,492354.0000,5454205.000


In [142]:
# find number of unique values

df.nunique()

TYPE                11
YEAR                 2
MONTH               12
DAY                 31
HOUR                24
MINUTE              60
HUNDRED_BLOCK     7901
NEIGHBOURHOOD       24
X                15691
Y                15587
dtype: int64

## Cleaning data

In [143]:
#  for certain crimes like homicides, some of the data is unlabelled in order to respect the privacy of victims. 

df.isnull().sum()

TYPE             0
YEAR             0
MONTH            0
DAY              0
HOUR             0
MINUTE           0
HUNDRED_BLOCK    0
NEIGHBOURHOOD    9
X                2
Y                2
dtype: int64

In [144]:
# creating column with datetime object

dt = df.columns[1:5]
df['datetime'] = pd.to_datetime(df[dt])


In [145]:
# This is not important, recorded time the crime occcured does not accurately reflect the actual time the crime occurred
# There is always a slight delay in reporting of the crime

df = df.drop(columns=['MINUTE'])


In [146]:
# count of types of crime in 2021 and 2022

df[['YEAR', 'TYPE']].groupby(['TYPE']).count()


Unnamed: 0_level_0,YEAR
TYPE,Unnamed: 1_level_1
Break and Enter Commercial,2322
Break and Enter Residential/Other,1649
Homicide,18
Mischief,5996
Offence Against a Person,4117
Other Theft,9595
Theft from Vehicle,8212
Theft of Bicycle,1508
Theft of Vehicle,927
Vehicle Collision or Pedestrian Struck (with Fatality),21


In [147]:
# there is a poor correlation between any of these variables

df.corr()

Unnamed: 0,YEAR,MONTH,DAY,HOUR,X,Y
YEAR,1.0,-0.434988,-0.065721,0.012028,0.000793,0.000612
MONTH,-0.434988,1.0,0.023954,-0.002784,-0.001674,-0.001526
DAY,-0.065721,0.023954,1.0,0.002544,-0.011597,-0.011679
HOUR,0.012028,-0.002784,0.002544,1.0,0.537629,0.537564
X,0.000793,-0.001674,-0.011597,0.537629,1.0,0.999877
Y,0.000612,-0.001526,-0.011679,0.537564,0.999877,1.0


In [148]:
# dropping null UTM-coordinates in the X and Y columns
# I decided to drop this they wouldn't work with my tableau dashboard

df=df.dropna(subset=['X', 'Y'])


In [149]:
# The lattitude and longitude in the database are in UTM coordinate system
# In order to create a tableau dashboard, the UTM coordiantes need to be converted to lat/long WGS84 coordinate

# this function takes the UTM coordinates in that region and turns them into WGS84 coordinates
def rule(row):
    try:
        lat, long = utm.to_latlon(easting=row["X"], northing=row["Y"], zone_number=10, zone_letter='N')
    except:
        lat = None
        long = None
        pd.Series({"lat": lat, "long": long})
    return pd.Series({"lat": lat, "long": long})


In [150]:
# run function

df=df.merge(df.apply(rule, axis=1), left_index=True, right_index=True)


In [151]:
# extract weekday from datetime object

# this could be useful to know as there could be more thefts during certain days of the week (like the weekend)
# thefts could also be less likely to occur during specific days when at-risk populations recieve income and disability assistance
# This is only speculation and would require a proper study to make any explicit claims. What I am doing here can be seen as pre-study research

# day name
df['weekday']=df['datetime'].dt.day_name()

# day number
df['weekday_date']=df['datetime'].dt.weekday

In [470]:
'''Upon further expirmentation in tableau I found the best way to feature-engineer the data by weekdays, while still having enough data points is to split the days by weekend and weekday. If the user decides they want all the data points they can still choose the "All" button'''

'Upon further expirmentation in tableau I found the best way to feature-engineer the data by weekdays, while still having enough data points to draw a conclusion is to split the days by weekend and weekday. If the user decides to want all the data points they can still choose the "All" button'

In [495]:
# function that converts the value from 'weekday_date' column and turns it 'weekend' or 'weekday'

def time_of_week_func(x):
    if x>4 or x==0:
        w='Weekend'
    else:
        w='Weekday'
    return w

In [496]:
# run time_of_week_func function

df['time_of_week']=df['weekday_date'].apply(time_of_week_func)

In [152]:
# turning months into seasons

# I found the best way to have enough data points while still having relevent information is by splitting the months into warmer and colder months
# Months correspond with daylight savings times

# label numbered month into warmer/colder season
def season_func(row):
    if row['MONTH'] >=4 and row['MONTH'] <=9:
        season='May - October (Warmer Months)'
    else:
        season='November - April (Colder Months)'
    return season


In [153]:
# run function
df['season']=df.apply(season_func, axis=1, raw=False)

In [494]:
#this is our new dataset

df.head(5)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,datetime,lat,long,weekday,Season,weekday_date,time_of_week
0,Break and Enter Commercial,2021,11,21,6,10XX ALBERNI ST,West End,491015.9434,5459166.14,2021-11-21 06:00:00,49.285181,-123.123536,Sunday,November - April (Colder Months),6,Weekend
1,Break and Enter Commercial,2021,11,26,13,10XX BARCLAY ST,West End,490833.8455,5458886.535,2021-11-26 13:00:00,49.282663,-123.126034,Friday,November - April (Colder Months),4,Weekday
2,Break and Enter Commercial,2021,12,14,2,10XX BEACH AVE,Central Business District,490252.3815,5458162.723,2021-12-14 02:00:00,49.276144,-123.134011,Tuesday,November - April (Colder Months),1,Weekday
3,Break and Enter Commercial,2021,7,17,5,10XX BEACH AVE,Central Business District,490255.3982,5458158.788,2021-07-17 05:00:00,49.276108,-123.13397,Saturday,May - October (Warmer Months),5,Weekend
4,Break and Enter Commercial,2021,6,15,4,10XX BEACH AVE,Central Business District,490258.4148,5458154.853,2021-06-15 04:00:00,49.276073,-123.133928,Tuesday,May - October (Warmer Months),1,Weekday


# Thefts from Vehicles Project

For the next part of the project I created a map which includes the thefts from vehicles.

Value statement: As someone who frequently parks their car in Vancouver, it always is a bit nerve-racking leaving my car parked on the street/in a parking lot, as I may come back to find a window smashed and the contents in my car stolen. I wanted to create a dashboard which could visualize where and when these thefts occur, so that the user could chooose safer spots to park their vehicle.

In [506]:
# first I will drop all the other types of crimes

df_car_thefts=df[df['TYPE']=='Theft from Vehicle']

In [507]:
# occurences of breakins in 2021 and 2022 by neighbourhood

df_car_thefts[['NEIGHBOURHOOD', 'TYPE']].groupby(['NEIGHBOURHOOD']).count()

Unnamed: 0_level_0,TYPE
NEIGHBOURHOOD,Unnamed: 1_level_1
Arbutus Ridge,86
Central Business District,2192
Dunbar-Southlands,100
Fairview,400
Grandview-Woodland,392
Hastings-Sunrise,404
Kensington-Cedar Cottage,439
Kerrisdale,140
Killarney,150
Kitsilano,363


In [508]:
# count of breakings during each day
# we can see that breakins happen more often during the weekend and monday and decreases during the week

df_car_thefts[['weekday', 'TYPE']].groupby(['weekday']).count()

Unnamed: 0_level_0,TYPE
weekday,Unnamed: 1_level_1
Friday,1295
Monday,1217
Saturday,1248
Sunday,1218
Thursday,1067
Tuesday,1099
Wednesday,1068


In [509]:
df_car_thefts[['Season', 'TYPE']].groupby(['Season']).count()

Unnamed: 0_level_0,TYPE
Season,Unnamed: 1_level_1
May - October (Warmer Months),3819
November - April (Colder Months),4393


In [502]:
df_car_thefts[['MONTH', 'TYPE']].groupby(['MONTH']).count()

Unnamed: 0_level_0,TYPE
MONTH,Unnamed: 1_level_1
1,1291
2,722
3,546
4,525
5,600
6,559
7,689
8,732
9,714
10,626


In [510]:
# drop the columns we don't need

df_car_thefts=df_car_thefts.drop(columns=['TYPE','X','Y', 'datetime','weekday_date'])

In [511]:
# this is how our dataset looks like

df_car_thefts

Unnamed: 0,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,lat,long,weekday,Season,time_of_week
21218,2021,10,28,17,0X KEEFER ST,Central Business District,49.279185,-123.104351,Thursday,November - April (Colder Months),Weekday
21219,2021,12,13,17,10XX ALBERNI ST,West End,49.284871,-123.123063,Monday,November - April (Colder Months),Weekend
21220,2021,2,4,17,10XX ALBERNI ST,West End,49.284981,-123.123053,Thursday,November - April (Colder Months),Weekday
21221,2021,2,15,14,10XX ALBERNI ST,West End,49.284794,-123.122946,Monday,November - April (Colder Months),Weekend
21222,2021,4,11,13,10XX ALBERNI ST,West End,49.284794,-123.122946,Sunday,May - October (Warmer Months),Weekend
...,...,...,...,...,...,...,...,...,...,...,...
35223,2022,1,20,16,X E CORDOVA ST,Central Business District,49.282458,-123.104155,Thursday,November - April (Colder Months),Weekday
35224,2022,1,3,21,X NK_LOC ST,Strathcona,49.281851,-123.099466,Monday,November - April (Colder Months),Weekend
35225,2022,1,25,21,X NK_LOC ST,Strathcona,49.281851,-123.099466,Tuesday,November - April (Colder Months),Weekday
35226,2022,2,2,14,X NK_LOC ST,Strathcona,49.281851,-123.099466,Wednesday,November - April (Colder Months),Weekday


In [436]:
# load parking meter data set

park_df=pd.read_csv('parking-meters.csv', ';')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [437]:
park_df=park_df[['Geom']]

In [438]:
# number of points on the graph

park_df.nunique()

Geom    4715
dtype: int64

In [439]:
# turn data into string

park_df=park_df[['Geom']].astype(str)

In [440]:
# CREATE FUNCTION FOR LONGITUDE 
# the data value describing the data for the long and lat is messy and requires some data cleaning

# create function to extract longitude from data
def long_func(x):
    a=x.split()
    if len(a)>1:
        ret=a[1]
        ret = re.sub(r'[^-\d]', "", ret)
    else:
        ret=None
    return ret

# create function to extract lattitude from data
def lat_func(x):
    a=x.split()
    if len(a)>1:
        ret=a[2]
        ret = re.sub(r'[^-\d]', "", ret)
    else:
        ret=None
    return ret

In [441]:
# run functions

park_df['park_long']=[long_func(x) for x in park_df['Geom']]
park_df['park_lat']=[lat_func(x) for x in park_df['Geom']]

In [449]:
# null count

park_df.isnull().sum()

Geom         0
park_long    0
park_lat     0
dtype: int64

In [447]:
# drop the nulls

park_df=park_df.dropna()

In [457]:
# drop the columns that we don't need

park_df=park_df.drop(columns=['Geom'])

In [461]:
# join both data frames
# i am doing this as it might work in tableau not sure

test_df=df.join(park_df)

In [None]:
# export the data frames

df.to_excel('dataframe_version2.xlsx')
park_df.to_excel('parking_df_version2.xlsx')
test_df.to_excel('test_df_version2.xlsx')

In [463]:
test_df

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,datetime,lat,long,weekday,Season,weekday_date,park_long,park_lat
0,Break and Enter Commercial,2021,11,21,6,10XX ALBERNI ST,West End,491015.9434,5459166.140,2021-11-21 06:00:00,49.285181,-123.123536,Sunday,November - April (Colder Months),6,-12303362137823906,4923285999158738
1,Break and Enter Commercial,2021,11,26,13,10XX BARCLAY ST,West End,490833.8455,5458886.535,2021-11-26 13:00:00,49.282663,-123.126034,Friday,November - April (Colder Months),4,-12303313623855607,4923286768777498
2,Break and Enter Commercial,2021,12,14,2,10XX BEACH AVE,Central Business District,490252.3815,5458162.723,2021-12-14 02:00:00,49.276144,-123.134011,Tuesday,November - April (Colder Months),1,-12310096093278823,4925907758288844
3,Break and Enter Commercial,2021,7,17,5,10XX BEACH AVE,Central Business District,490255.3982,5458158.788,2021-07-17 05:00:00,49.276108,-123.133970,Saturday,May - October (Warmer Months),5,-12310131362071866,4925781812412518
4,Break and Enter Commercial,2021,6,15,4,10XX BEACH AVE,Central Business District,490258.4148,5458154.853,2021-06-15 04:00:00,49.276073,-123.133928,Tuesday,May - October (Warmer Months),1,-12310103668736426,49257812429792175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35454,Vehicle Collision or Pedestrian Struck (with I...,2022,1,7,14,W 7TH AVE / ALDER ST,Fairview,490448.0000,5456949.000,2022-01-07 14:00:00,49.265229,-123.131293,Friday,November - April (Colder Months),4,,
35455,Vehicle Collision or Pedestrian Struck (with I...,2022,1,18,20,WILLOW ST / W BROADWAY AVE,Fairview,491136.0000,5456735.000,2022-01-18 20:00:00,49.263314,-123.121832,Tuesday,November - April (Colder Months),1,,
35456,Vehicle Collision or Pedestrian Struck (with I...,2022,2,8,19,WINDSOR ST / E 49TH AVE,Sunset,493792.0000,5452516.000,2022-02-08 19:00:00,49.225396,-123.085261,Tuesday,November - April (Colder Months),1,,
35457,Vehicle Collision or Pedestrian Struck (with I...,2022,1,4,14,X BLOCK E 33RD AVE,Riley Park,492354.0000,5454205.000,2022-01-04 14:00:00,49.240573,-123.105042,Tuesday,November - April (Colder Months),1,,
