# Crime data pre-processing

## 



In [52]:
# import libraries

import pandas as pd
import numpy as np
import utm


In [53]:
# load the raw crime export into a dataframe

csv_path = 'crimedata_csv_AllNeighbourhoods_2021+2022.csv'
data = pd.read_csv(csv_path)


In [54]:
# work on a deep copy so the raw frame in `data` stays untouched

df = data.copy(deep=True)

In [55]:
# preview the working dataframe (the copy of the raw crime data)

df

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,Break and Enter Commercial,2021,11,21,6,33,10XX ALBERNI ST,West End,491015.9434,5459166.140
1,Break and Enter Commercial,2021,11,26,13,58,10XX BARCLAY ST,West End,490833.8455,5458886.535
2,Break and Enter Commercial,2021,12,14,2,2,10XX BEACH AVE,Central Business District,490252.3815,5458162.723
3,Break and Enter Commercial,2021,7,17,5,0,10XX BEACH AVE,Central Business District,490255.3982,5458158.788
4,Break and Enter Commercial,2021,6,15,4,0,10XX BEACH AVE,Central Business District,490258.4148,5458154.853
...,...,...,...,...,...,...,...,...,...,...
35454,Vehicle Collision or Pedestrian Struck (with I...,2022,1,7,14,1,W 7TH AVE / ALDER ST,Fairview,490448.0000,5456949.000
35455,Vehicle Collision or Pedestrian Struck (with I...,2022,1,18,20,4,WILLOW ST / W BROADWAY AVE,Fairview,491136.0000,5456735.000
35456,Vehicle Collision or Pedestrian Struck (with I...,2022,2,8,19,52,WINDSOR ST / E 49TH AVE,Sunset,493792.0000,5452516.000
35457,Vehicle Collision or Pedestrian Struck (with I...,2022,1,4,14,56,X BLOCK E 33RD AVE,Riley Park,492354.0000,5454205.000


In [56]:
# number of distinct values in each column

df.nunique()

TYPE                11
YEAR                 2
MONTH               12
DAY                 31
HOUR                24
MINUTE              60
HUNDRED_BLOCK     7901
NEIGHBOURHOOD       24
X                15691
Y                15587
dtype: int64

## Cleaning data

In [57]:
# count the missing values in each column
# for certain crimes like homicides, some of the data is unlabelled in order to respect the privacy of victims

df.isna().sum()

TYPE             0
YEAR             0
MONTH            0
DAY              0
HOUR             0
MINUTE           0
HUNDRED_BLOCK    0
NEIGHBOURHOOD    9
X                2
Y                2
dtype: int64

In [58]:
# build a single datetime column from the date/hour parts
# the parts are selected by name instead of by position (df.columns[1:5]),
# so the cell keeps working if the column order of the CSV changes
# MINUTE is not included (same as the original selection), so every timestamp is truncated to the hour

dt = ['YEAR', 'MONTH', 'DAY', 'HOUR']
df['datetime'] = pd.to_datetime(df[dt])


In [59]:
# MINUTE is dropped: the recorded time does not accurately reflect the time the crime actually occurred
# there is always a slight delay in the reporting of a crime

df = df.drop('MINUTE', axis=1)


In [60]:
# number of recorded crimes per crime type across 2021 and 2022

df.groupby(['TYPE'])[['YEAR']].count()


Unnamed: 0_level_0,YEAR
TYPE,Unnamed: 1_level_1
Break and Enter Commercial,2322
Break and Enter Residential/Other,1649
Homicide,18
Mischief,5996
Offence Against a Person,4117
Other Theft,9595
Theft from Vehicle,8212
Theft of Bicycle,1508
Theft of Vehicle,927
Vehicle Collision or Pedestrian Struck (with Fatality),21


In [61]:
# correlation matrix of the numeric columns
# most pairs are only weakly correlated; the exceptions in the output are
# X vs Y (~1.0), HOUR vs X/Y (~0.54) and YEAR vs MONTH (~-0.43)
# NOTE(review): the strong X/Y/HOUR values are presumably an artifact of records whose
# location and time were zeroed for privacy - confirm before interpreting them
# non-numeric columns are filtered out explicitly because DataFrame.corr() raises on them in pandas >= 2.0

df.select_dtypes(include='number').corr()

Unnamed: 0,YEAR,MONTH,DAY,HOUR,X,Y
YEAR,1.0,-0.434988,-0.065721,0.012028,0.000793,0.000612
MONTH,-0.434988,1.0,0.023954,-0.002784,-0.001674,-0.001526
DAY,-0.065721,0.023954,1.0,0.002544,-0.011597,-0.011679
HOUR,0.012028,-0.002784,0.002544,1.0,0.537629,0.537564
X,0.000793,-0.001674,-0.011597,0.537629,1.0,0.999877
Y,0.000612,-0.001526,-0.011679,0.537564,0.999877,1.0


In [62]:
# remove the rows that have no UTM coordinate in the X or Y column
# I decided to drop these rows because they wouldn't work with my tableau dashboard

coordinate_columns = ['X', 'Y']
df = df.dropna(subset=coordinate_columns, how='any')


In [63]:
# The X/Y coordinates in the dataset are in the UTM coordinate system
# In order to create a tableau dashboard, the UTM coordinates need to be converted to lat/long WGS84 coordinates

# this function takes the UTM coordinates in that region and turns them into WGS84 coordinates
def rule(row):
    """Convert one row's UTM easting/northing into a latitude/longitude pair.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the easting under "X" and the northing under "Y".

    Returns
    -------
    pandas.Series
        Two entries labelled "lat" and "long". Both are None when the
        coordinates cannot be read or converted.
    """
    try:
        easting, northing = row["X"], row["Y"]
        # zone number 10 and zone letter 'N' are hard-coded for this dataset
        lat, long = utm.to_latlon(easting=easting, northing=northing, zone_number=10, zone_letter='N')
    except Exception:
        # best effort: a row that cannot be converted gets empty coordinates.
        # `Exception` instead of a bare `except` lets KeyboardInterrupt/SystemExit
        # propagate, so the kernel can still be interrupted during df.apply
        lat = None
        long = None
    return pd.Series({"lat": lat, "long": long})


In [64]:
# convert every row's UTM coordinates and attach the resulting lat/long columns
# lat/long columns left over from an earlier run of this cell are removed first;
# otherwise the merge would silently produce suffixed lat_x/lat_y/long_x/long_y columns

df = df.drop(columns=['lat', 'long'], errors='ignore')
df = df.merge(df.apply(rule, axis=1), left_index=True, right_index=True)


In [66]:
# derive the weekday name from the datetime column
# this could be useful to know, as there could be more thefts on certain days of the week (like the weekend)
# thefts could also be less likely on the specific days when at-risk populations receive income and disability assistance
# This is only speculation and would require a proper study to make any explicit claims; what I am doing here can be seen as pre-study research

weekday_names = df['datetime'].dt.day_name()
df['weekday'] = weekday_names

In [103]:
# TURN MONTH INTO SEASON
# Splitting the months into warmer and colder halves keeps enough data points per group while still being relevant information

# map the numbered month of a row to its warmer/colder season label
def season_func(row):
    """Return the season label for the month number stored under row['MONTH'].

    Months 5 through 10 (inclusive) give the warmer-months label; any other
    value gives the colder-months label.
    """
    month = row['MONTH']
    is_warm = 5 <= month <= 10
    return 'May - October (Warmer Months)' if is_warm else 'November - April (Colder Months)'


In [104]:
# label every row with its season
df['Season'] = df.apply(season_func, axis=1)

In [105]:
# this is our new dataset, now including the datetime, lat, long, weekday and Season columns

df

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,datetime,lat,long,weekday,Season
0,Break and Enter Commercial,2021,11,21,6,10XX ALBERNI ST,West End,491015.9434,5459166.140,2021-11-21 06:00:00,49.285181,-123.123536,Sunday,November - April (Colder Months)
1,Break and Enter Commercial,2021,11,26,13,10XX BARCLAY ST,West End,490833.8455,5458886.535,2021-11-26 13:00:00,49.282663,-123.126034,Friday,November - April (Colder Months)
2,Break and Enter Commercial,2021,12,14,2,10XX BEACH AVE,Central Business District,490252.3815,5458162.723,2021-12-14 02:00:00,49.276144,-123.134011,Tuesday,November - April (Colder Months)
3,Break and Enter Commercial,2021,7,17,5,10XX BEACH AVE,Central Business District,490255.3982,5458158.788,2021-07-17 05:00:00,49.276108,-123.133970,Saturday,May - October (Warmer Months)
4,Break and Enter Commercial,2021,6,15,4,10XX BEACH AVE,Central Business District,490258.4148,5458154.853,2021-06-15 04:00:00,49.276073,-123.133928,Tuesday,May - October (Warmer Months)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35454,Vehicle Collision or Pedestrian Struck (with I...,2022,1,7,14,W 7TH AVE / ALDER ST,Fairview,490448.0000,5456949.000,2022-01-07 14:00:00,49.265229,-123.131293,Friday,November - April (Colder Months)
35455,Vehicle Collision or Pedestrian Struck (with I...,2022,1,18,20,WILLOW ST / W BROADWAY AVE,Fairview,491136.0000,5456735.000,2022-01-18 20:00:00,49.263314,-123.121832,Tuesday,November - April (Colder Months)
35456,Vehicle Collision or Pedestrian Struck (with I...,2022,2,8,19,WINDSOR ST / E 49TH AVE,Sunset,493792.0000,5452516.000,2022-02-08 19:00:00,49.225396,-123.085261,Tuesday,November - April (Colder Months)
35457,Vehicle Collision or Pedestrian Struck (with I...,2022,1,4,14,X BLOCK E 33RD AVE,Riley Park,492354.0000,5454205.000,2022-01-04 14:00:00,49.240573,-123.105042,Tuesday,November - April (Colder Months)


# Thefts from Vehicles Project

For the next part of the project I created a map which includes the thefts from vehicles.

Value statement: As someone who frequently parks their car in Vancouver, it is always a bit nerve-racking leaving my car parked on the street or in a parking lot, as I may come back to find a window smashed and the contents of my car stolen. I wanted to create a dashboard which could visualize where and when these thefts occur, so that the user could choose safer spots to park their vehicle.

In [106]:
# keep only the thefts from vehicles; every other type of crime is filtered out

is_vehicle_theft = df['TYPE'] == 'Theft from Vehicle'
df_car_thefts = df[is_vehicle_theft]

In [107]:
# occurrences of break-ins in 2021 and 2022 by neighbourhood

df_car_thefts.groupby(['NEIGHBOURHOOD'])[['TYPE']].count()

Unnamed: 0_level_0,TYPE
NEIGHBOURHOOD,Unnamed: 1_level_1
Arbutus Ridge,86
Central Business District,2192
Dunbar-Southlands,100
Fairview,400
Grandview-Woodland,392
Hastings-Sunrise,404
Kensington-Cedar Cottage,439
Kerrisdale,140
Killarney,150
Kitsilano,363


In [108]:
# number of break-ins on each day of the week
# break-ins are most frequent from Friday through Monday and drop off mid-week (Tuesday to Thursday)

df_car_thefts.groupby(['weekday'])[['TYPE']].count()

Unnamed: 0_level_0,TYPE
weekday,Unnamed: 1_level_1
Friday,1295
Monday,1217
Saturday,1248
Sunday,1218
Thursday,1067
Tuesday,1099
Wednesday,1068


In [109]:
# remove the columns we don't need

unused_columns = ['TYPE', 'HUNDRED_BLOCK', 'X', 'Y', 'datetime']
df_car_thefts = df_car_thefts.drop(columns=unused_columns)

In [110]:
# this is what the vehicle-theft dataset looks like after dropping the unused columns

df_car_thefts

Unnamed: 0,YEAR,MONTH,DAY,HOUR,NEIGHBOURHOOD,lat,long,weekday,Season
21218,2021,10,28,17,Central Business District,49.279185,-123.104351,Thursday,May - October (Warmer Months)
21219,2021,12,13,17,West End,49.284871,-123.123063,Monday,November - April (Colder Months)
21220,2021,2,4,17,West End,49.284981,-123.123053,Thursday,November - April (Colder Months)
21221,2021,2,15,14,West End,49.284794,-123.122946,Monday,November - April (Colder Months)
21222,2021,4,11,13,West End,49.284794,-123.122946,Sunday,November - April (Colder Months)
...,...,...,...,...,...,...,...,...,...
35223,2022,1,20,16,Central Business District,49.282458,-123.104155,Thursday,November - April (Colder Months)
35224,2022,1,3,21,Strathcona,49.281851,-123.099466,Monday,November - April (Colder Months)
35225,2022,1,25,21,Strathcona,49.281851,-123.099466,Tuesday,November - April (Colder Months)
35226,2022,2,2,14,Strathcona,49.281851,-123.099466,Wednesday,November - April (Colder Months)


In [111]:
# write the vehicle-theft data to an excel file for the dashboard

excel_path = 'car_thefts.xlsx'
df_car_thefts.to_excel(excel_path)