# Crime data pre-proccessing 

## 



In [1]:
import pandas as pd
import numpy as np


In [2]:
# read csv

data = pd.read_csv('crimedata_csv_AllNeighbourhoods_2021+2022.csv')


In [3]:
df = data.copy()


In [4]:
df


Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,Break and Enter Commercial,2021,11,21,6,33,10XX ALBERNI ST,West End,491015.9434,5459166.140
1,Break and Enter Commercial,2021,11,26,13,58,10XX BARCLAY ST,West End,490833.8455,5458886.535
2,Break and Enter Commercial,2021,12,14,2,2,10XX BEACH AVE,Central Business District,490252.3815,5458162.723
3,Break and Enter Commercial,2021,7,17,5,0,10XX BEACH AVE,Central Business District,490255.3982,5458158.788
4,Break and Enter Commercial,2021,6,15,4,0,10XX BEACH AVE,Central Business District,490258.4148,5458154.853
...,...,...,...,...,...,...,...,...,...,...
35454,Vehicle Collision or Pedestrian Struck (with I...,2022,1,7,14,1,W 7TH AVE / ALDER ST,Fairview,490448.0000,5456949.000
35455,Vehicle Collision or Pedestrian Struck (with I...,2022,1,18,20,4,WILLOW ST / W BROADWAY AVE,Fairview,491136.0000,5456735.000
35456,Vehicle Collision or Pedestrian Struck (with I...,2022,2,8,19,52,WINDSOR ST / E 49TH AVE,Sunset,493792.0000,5452516.000
35457,Vehicle Collision or Pedestrian Struck (with I...,2022,1,4,14,56,X BLOCK E 33RD AVE,Riley Park,492354.0000,5454205.000


In [5]:
# find number of unique values

df.nunique()


TYPE                11
YEAR                 2
MONTH               12
DAY                 31
HOUR                24
MINUTE              60
HUNDRED_BLOCK     7901
NEIGHBOURHOOD       24
X                15691
Y                15587
dtype: int64

## Cleaning data

In [6]:
#  certain crimes like homicides, some of the data is unlabelled in order to respect the privacy of victims. I decided to drop it as

df.isnull().sum()


TYPE             0
YEAR             0
MONTH            0
DAY              0
HOUR             0
MINUTE           0
HUNDRED_BLOCK    0
NEIGHBOURHOOD    9
X                2
Y                2
dtype: int64

In [10]:
# creating column with datetime object

dt = df.columns[1:5]
df['datetime'] = pd.to_datetime(df[dt])


In [9]:
# This is not important, as the recorded time it occurred does not accurately reflect the actual time the crime occurred.
# There is sometimes a slight delay in reporting of the crime

df = df.drop(columns=['MINUTE'])


In [11]:
# count of types of crime in 2021 and 2022

df[['YEAR', 'TYPE']].groupby(['TYPE']).count()


Unnamed: 0_level_0,YEAR
TYPE,Unnamed: 1_level_1
Break and Enter Commercial,2322
Break and Enter Residential/Other,1649
Homicide,18
Mischief,5996
Offence Against a Person,4117
Other Theft,9595
Theft from Vehicle,8212
Theft of Bicycle,1508
Theft of Vehicle,927
Vehicle Collision or Pedestrian Struck (with Fatality),21


In [12]:
df.corr()


Unnamed: 0,YEAR,MONTH,DAY,HOUR,X,Y
YEAR,1.0,-0.434993,-0.065726,0.012041,0.000793,0.000612
MONTH,-0.434993,1.0,0.024007,-0.002807,-0.001674,-0.001526
DAY,-0.065726,0.024007,1.0,0.00258,-0.011597,-0.011679
HOUR,0.012041,-0.002807,0.00258,1.0,0.537629,0.537564
X,0.000793,-0.001674,-0.011597,0.537629,1.0,0.999877
Y,0.000612,-0.001526,-0.011679,0.537564,0.999877,1.0


In [8]:
# dropping null UTM-coordinates in the X and Y columns

df=df.dropna(subset=['X', 'Y'])


In [13]:
# the lattitude and longitude in the database are in UTM coordinate system
# In order to create a tableau dashboard, the UTM coordiantes need to be converted to lat/long WGS84 coordinate

# import library
import utm

# this function takes the UTM coordinates in that region and turns them into WGS84 coordinates
def rule(row):
    try:
        lat, long = utm.to_latlon(row["X"], row["Y"], 10, 'N')
    except:
        lat = None
        long = None
        pd.Series({"lat": lat, "long": long})
    return pd.Series({"lat": lat, "long": long})


In [14]:
# run function

df=df.merge(df.apply(rule, axis=1), left_index=True, right_index=True)


In [None]:
# export to excel file

df.to_excel('crime_data_preprocessed.xlsx')

# Thefts from Vehicles Project

For the next part of the project I want to create a map which includes the thefts from vehicles. 

In [20]:
# first I will drop all the other types of crime

df_car_thefts=df[df['TYPE']=='Theft from Vehicle']

In [21]:
df_car_thefts

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,datetime,lat,long
21218,Theft from Vehicle,2021,10,28,17,0X KEEFER ST,Central Business District,492410.2511,5458497.396,2021-10-28 17:00:00,49.279185,-123.104351
21219,Theft from Vehicle,2021,12,13,17,10XX ALBERNI ST,West End,491050.2906,5459131.648,2021-12-13 17:00:00,49.284871,-123.123063
21220,Theft from Vehicle,2021,2,4,17,10XX ALBERNI ST,West End,491051.0856,5459143.808,2021-02-04 17:00:00,49.284981,-123.123053
21221,Theft from Vehicle,2021,2,15,14,10XX ALBERNI ST,West End,491058.8169,5459123.086,2021-02-15 14:00:00,49.284794,-123.122946
21222,Theft from Vehicle,2021,4,11,13,10XX ALBERNI ST,West End,491058.8169,5459123.086,2021-04-11 13:00:00,49.284794,-123.122946
...,...,...,...,...,...,...,...,...,...,...,...,...
35223,Theft from Vehicle,2022,1,20,16,X E CORDOVA ST,Central Business District,492424.9863,5458861.332,2022-01-20 16:00:00,49.282458,-123.104155
35224,Theft from Vehicle,2022,1,3,21,X NK_LOC ST,Strathcona,492765.9374,5458793.303,2022-01-03 21:00:00,49.281851,-123.099466
35225,Theft from Vehicle,2022,1,25,21,X NK_LOC ST,Strathcona,492765.9374,5458793.303,2022-01-25 21:00:00,49.281851,-123.099466
35226,Theft from Vehicle,2022,2,2,14,X NK_LOC ST,Strathcona,492765.9374,5458793.303,2022-02-02 14:00:00,49.281851,-123.099466


In [22]:
# occurences of breakins in 2021 and 2022

df_car_thefts[['NEIGHBOURHOOD', 'TYPE']].groupby(['NEIGHBOURHOOD']).count()

Unnamed: 0_level_0,TYPE
NEIGHBOURHOOD,Unnamed: 1_level_1
Arbutus Ridge,86
Central Business District,2192
Dunbar-Southlands,100
Fairview,400
Grandview-Woodland,392
Hastings-Sunrise,404
Kensington-Cedar Cottage,439
Kerrisdale,140
Killarney,150
Kitsilano,363


In [31]:
# extract weekday from datetime object
# this could be useful as there could be more thefts during certain days

df_car_thefts['weekday']=df_car_thefts['datetime'].dt.day_name()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weekday']=df['datetime'].dt.day_name()


In [33]:
# count of breakings during each day
# we can see that breakins happen more often during the weekend and monday and decreases during the week

df_car_thefts[['weekday', 'TYPE']].groupby(['weekday']).count()

Unnamed: 0_level_0,TYPE
weekday,Unnamed: 1_level_1
Friday,1295
Monday,1217
Saturday,1248
Sunday,1218
Thursday,1067
Tuesday,1099
Wednesday,1068


In [41]:
# drop columns we don't need

df_car_thefts=df_car_thefts.drop(columns=['TYPE', 'HUNDRED_BLOCK','X','Y', 'datetime'])

In [44]:
df_car_thefts.to_excel('car_thefts.xlsx')