# Crime data pre-proccessing 

## 



In [87]:
# import libraries

import pandas as pd
import numpy as np
import utm
import re

In [88]:
# read csv

df = pd.read_csv('downloaded_data/crimedata_csv_AllNeighbourhoods_AllYears.csv')


In [89]:
# find number of unique values

df.nunique()

TYPE                 11
YEAR                 20
MONTH                12
DAY                  31
HOUR                 24
MINUTE               60
HUNDRED_BLOCK     23196
NEIGHBOURHOOD        24
X                144190
Y                144022
dtype: int64

## Cleaning data

In [90]:
#  for certain crimes like homicides, some of the data is unlabelled in order to respect the privacy of victims. 

df.isnull().sum()

TYPE               0
YEAR               0
MONTH              0
DAY                0
HOUR               0
MINUTE             0
HUNDRED_BLOCK     12
NEIGHBOURHOOD    138
X                 72
Y                 72
dtype: int64

In [91]:
# creating column with datetime object

dt = df.columns[1:5]
df['datetime'] = pd.to_datetime(df[dt])


In [92]:
# This is not important, recorded time the crime occcured does not accurately reflect the actual time the crime occurred
# There is always a slight delay in reporting of the crime

df = df.drop(columns=['MINUTE'])


In [93]:
# count of types of crime in 2021 and 2022

df[['YEAR', 'TYPE']].groupby(['TYPE']).count()


Unnamed: 0_level_0,YEAR
TYPE,Unnamed: 1_level_1
Break and Enter Commercial,45364
Break and Enter Residential/Other,70939
Homicide,298
Mischief,99177
Offence Against a Person,71746
Other Theft,203678
Theft from Vehicle,233428
Theft of Bicycle,35450
Theft of Vehicle,43825
Vehicle Collision or Pedestrian Struck (with Fatality),332


In [94]:
# set only to relevant years

df=df[df['YEAR']>=2021]

In [95]:
# dropping null UTM-coordinates in the X and Y columns
# I decided to drop this they wouldn't work with my tableau dashboard

df=df.dropna(subset=['X', 'Y'])


In [96]:
# The lattitude and longitude in the database are in UTM coordinate system
# In order to create a tableau dashboard, the UTM coordiantes need to be converted to lat/long WGS84 coordinate

# this function takes the UTM coordinates in that region and turns them into WGS84 coordinates
def rule(row):
    try:
        lat, long = utm.to_latlon(easting=row["X"], northing=row["Y"], zone_number=10, zone_letter='N')
    except:
        lat, long = None, None
        pd.Series({"lat": lat, "long": long})
    return pd.Series({"lat": lat, "long": long})


In [97]:
# run function

df=df.merge(df.apply(rule, axis=1), left_index=True, right_index=True)


In [98]:
# extract weekday from datetime object

# this could be useful to know as there could be more thefts during certain days of the week (like the weekend)
# thefts could also be less likely to occur during specific days when at-risk populations recieve income and disability assistance
# This is only speculation and would require a proper study to make any explicit claims. What I am doing here can be seen as pre-study research

# day name
df['weekday']=df['datetime'].dt.day_name()

# day number
df['weekday_date']=df['datetime'].dt.weekday

# month
df['month']=df['datetime'].dt.month_name()


Upon further expirmentation in tableau I found the best way to feature-engineer the data by weekdays, while still having enough data points is to split the days by weekend and weekday. If the user decides they want all the data points they can still choose the "All" button

In [99]:
# function that converts the value from 'weekday_date' column and turns it 'weekend' or 'weekday'

def time_of_week_func(x):
    if x>4 or x==0:
        w='Weekend'
    else:
        w='Weekday'
    return w

In [100]:
# run time_of_week_func function

df['time_of_week']=df['weekday_date'].apply(time_of_week_func)

In [101]:
# turning months into seasons

# I found the best way to have enough data points while still having relevent information is by splitting the months into warmer and colder months

# label numbered month into warmer/colder season
def season_func(row):
    if row['MONTH'] >=5 and row['MONTH'] <=10:
        season='May - October (Warmer Months)'
    else:
        season='November - April (Colder Months)'
    return season


In [102]:
# run function

df['season']=df.apply(season_func, axis=1, raw=False)

In [103]:
#this is our new dataset

df.head(5)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,datetime,lat,long,weekday,weekday_date,month,time_of_week,season
0,Theft from Vehicle,2022,3,17,21,39XX W 17TH AVE,Dunbar-Southlands,486111.0653,5456088.0,2022-03-17 21:00:00,49.257399,-123.190874,Thursday,3,March,Weekday,November - April (Colder Months)
12,Theft from Vehicle,2021,8,17,2,39XX W 18TH AVE,Dunbar-Southlands,486021.35,5455993.0,2021-08-17 02:00:00,49.256546,-123.192104,Tuesday,1,August,Weekday,May - October (Warmer Months)
37,Theft from Vehicle,2021,3,28,22,39XX W 20TH AVE,Dunbar-Southlands,485958.3983,5455794.0,2021-03-28 22:00:00,49.254753,-123.192962,Sunday,6,March,Weekend,November - April (Colder Months)
103,Theft from Vehicle,2021,8,20,0,39XX W 24TH AVE,Dunbar-Southlands,485973.4555,5455381.0,2021-08-20 00:00:00,49.251042,-123.192741,Friday,4,August,Weekday,May - October (Warmer Months)
128,Theft from Vehicle,2022,3,23,23,39XX W 30TH AVE,Dunbar-Southlands,486045.9833,5454742.0,2022-03-23 23:00:00,49.245288,-123.191722,Wednesday,2,March,Weekday,November - April (Colder Months)


In [104]:
# drop the columns which are not needed

df=df.drop(columns=['TYPE','X','Y', 'datetime','weekday_date'])

# Thefts from Vehicles Project

For the next part of the project I created a map which includes the thefts from vehicles.

Value statement: As someone who frequently parks their car in Vancouver, it always is a bit nerve-racking leaving my car parked on the street/in a parking lot, as I may come back to find a window smashed and the contents in my car stolen. I wanted to create a dashboard which could visualize where and when these thefts occur, so that the user could chooose safer spots to park their vehicle.

In [105]:
# first I will drop all the other types of crimes

df_car_thefts=df[df['TYPE']=='Theft from Vehicle']

KeyError: 'TYPE'

In [None]:
# occurences of breakins in 2021 and 2022 by neighbourhood

# creat filter for previous year (its the only full year you can get real data from)
filt=df_car_thefts['YEAR']>=2021

# groupby+filt
df_car_thefts[filt][['NEIGHBOURHOOD', 'TYPE']].groupby(['NEIGHBOURHOOD']).count()

KeyError: "['TYPE'] not in index"

In [None]:
# count of breakings during each day
# we can see that break-ins happen more often during the weekend and monday and decreases during the week

# groupby+filt
df_car_thefts[filt][['weekday', 'TYPE']].groupby(['weekday']).count()

Unnamed: 0_level_0,TYPE
weekday,Unnamed: 1_level_1
Friday,1746
Monday,1689
Saturday,1714
Sunday,1693
Thursday,1467
Tuesday,1522
Wednesday,1464


In [None]:
# how many breakings happen every month

# groupby+filt
df_car_thefts[filt][['MONTH', 'TYPE']].groupby(['MONTH']).count()

KeyError: "['TYPE'] not in index"

In [None]:
# how many car thefts happen during the colder/warmer months

# groupby+filt
df_car_thefts[filt][['season', 'TYPE']].groupby(['season']).count()

Unnamed: 0_level_0,TYPE
season,Unnamed: 1_level_1
May - October (Warmer Months),5238
November - April (Colder Months),6057


In [None]:
df_car_thefts=df_car_thefts.drop(columns=['TYPE'])

In [27]:
# dataset

df_car_thefts.head()

Unnamed: 0,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,lat,long,weekday,month,time_of_week,season
21218,2021,10,28,17,0X KEEFER ST,Central Business District,49.279185,-123.104351,Thursday,October,Weekday,May - October (Warmer Months)
21219,2021,12,13,17,10XX ALBERNI ST,West End,49.284871,-123.123063,Monday,December,Weekend,November - April (Colder Months)
21220,2021,2,4,17,10XX ALBERNI ST,West End,49.284981,-123.123053,Thursday,February,Weekday,November - April (Colder Months)
21221,2021,2,15,14,10XX ALBERNI ST,West End,49.284794,-123.122946,Monday,February,Weekend,November - April (Colder Months)
21222,2021,4,11,13,10XX ALBERNI ST,West End,49.284794,-123.122946,Sunday,April,Weekend,November - April (Colder Months)


### Car-thefts pre-processsing: keeping only the previous 12-months

Including the same months for both 2021 and 2022 will skew the data and make it appear that more car breakins are occurring that month.

Although more data points are generally a good thing, it will be bad data if they are included in the data-set.
Any sort of exploratory analysis, or statistical analysis will get spoiled if we use the bad data.

As the old adage goes "garbage in, garbage out"

Therefore the overlapping months need to get deleted removed from the previous year

In [106]:
# dropping the months of of 2021 which overlap with 2022


# first we need to find what the most recent month is in 2022

# create filter for 2022
year_filt=df_car_thefts['YEAR']==2022


# create filter to find the most recent month in 2022
month_max=df_car_thefts[year_filt]['MONTH'].max()
month_filt=df_car_thefts['MONTH']==month_max


# It would be bad practice to include a month that has only partially passed, however I still wanted to include it if the month had mostly passed. 
# I compromised by choosing to include the month  only if at least 24 days (around 80%) of that month had passed

#find the greatest day for the most recent month
day_max=df_car_thefts[month_filt][year_filt]['DAY'].max()

# function which finds out whether approx. 80% of the month has passed
def month_func(x):
    if x>=24:
        ret=month_max
    else:
        ret=month_max-1
    return ret

# run function
actual_month_max=month_func(day_max)
actual_month_max #this is the month which will be included

  day_max=df_car_thefts[month_filt][year_filt]['DAY'].max()


6

In [107]:
# a filter which gets rid of the month(s) in the previous year

sel_rows=df_car_thefts[(df_car_thefts['MONTH']<=actual_month_max) & (df_car_thefts['YEAR']==2021)].index
df_car_thefts=df_car_thefts.drop(sel_rows, axis=0)

# dataframe
df_car_thefts.head()

Unnamed: 0,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,lat,long,weekday,month,time_of_week,season
0,2022,3,17,21,39XX W 17TH AVE,Dunbar-Southlands,49.257399,-123.190874,Thursday,March,Weekday,November - April (Colder Months)
12,2021,8,17,2,39XX W 18TH AVE,Dunbar-Southlands,49.256546,-123.192104,Tuesday,August,Weekday,May - October (Warmer Months)
103,2021,8,20,0,39XX W 24TH AVE,Dunbar-Southlands,49.251042,-123.192741,Friday,August,Weekday,May - October (Warmer Months)
128,2022,3,23,23,39XX W 30TH AVE,Dunbar-Southlands,49.245288,-123.191722,Wednesday,March,Weekday,November - April (Colder Months)
175,2021,8,21,2,39XX W 33RD AVE,Dunbar-Southlands,49.242469,-123.191058,Saturday,August,Weekend,May - October (Warmer Months)


# Parking meter data set

In [108]:
# load parking meter data set

'''this is a dataset I found which includes the coordinates of all city pay parkins'''

park_df=pd.read_csv('parking-meters.csv', ';')
park_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


FileNotFoundError: [Errno 2] No such file or directory: 'parking-meters.csv'

In [None]:
# only need the longitude/lattitude column

park_df=park_df[['Geom']]

In [None]:
# number of long/lat points on the graph

park_df.nunique()

Geom    4715
dtype: int64

In [None]:
# dropping duplicates

park_df=park_df.drop_duplicates(subset=['Geom'], keep=False)

In [None]:
# null count

park_df.isnull().sum()

Geom    1
dtype: int64

In [None]:
# drop the nulls

park_df=park_df.dropna()

In [None]:
# pre-processing longitude/lattitude vales

# the data value describing the data for the long and lat is messy and requires some data cleaning
# the data is stored as a set of dictionaries. Pandas cannot interpret dictionaries, so they must be turned into a string and pre-processed further

# turn data into string to prepare it for pre-processing
park_df=park_df[['Geom']].astype(str)

# function to extract longitude from data
def long_func(x):
    a=x.split()
    if len(a)>1:
        ret=a[1]
        ret = re.sub(r'[^-\d*\.\d*]', "", ret)
    else:
        ret=None
    return ret

# function to extract lattitude from data
def lat_func(x):
    a=x.split()
    if len(a)>1:
        ret=a[2]
        ret = re.sub(r'[^-\d*\.\d*]', "", ret)
    else:
        ret=None
    return ret

In [None]:
# run functions

park_df['park_lat']=[lat_func(x) for x in park_df['Geom']]
park_df['park_long']=[long_func(x) for x in park_df['Geom']]

# dataset
park_df.head()

Unnamed: 0,Geom,park_lat,park_long
5,"{""coordinates"": [-123.10249423430041, 49.27706...",49.27706523628811,-123.1024942343004
6,"{""coordinates"": [-123.1029445090042, 49.277403...",49.27740316288875,-123.1029445090042
7,"{""coordinates"": [-123.0898869035528, 49.270338...",49.27033821755311,-123.0898869035528
8,"{""coordinates"": [-123.10556622620037, 49.27691...",49.27691853858081,-123.10556622620037
9,"{""coordinates"": [-123.10376713040408, 49.27681...",49.27681160535016,-123.10376713040408


In [None]:
# drop the columns that are not needed

park_df=park_df.drop(columns=['Geom'])

In [None]:
# change types from int to float
park_df=park_df.applymap(lambda x: float(x))

#check that it's correct
park_df.dtypes

park_lat     float64
park_long    float64
dtype: object

In [None]:
# dataset

park_df.head()

Unnamed: 0,park_lat,park_long
5,49.277065,-123.102494
6,49.277403,-123.102945
7,49.270338,-123.089887
8,49.276919,-123.105566
9,49.276812,-123.103767


# Export

In [None]:
# export the data sets

park_df.to_excel('cleaned_data/city_parking_dataset.xlsx')
df_car_thefts.to_excel('cleaned_data/car_thefts_dataset.xlsx')