# Crime data pre-processing

## 



In [2]:
import pandas as pd
import numpy as np


In [31]:
# Load the raw crime records from the CSV export.
raw_csv_path = 'crimedata_csv_AllNeighbourhoods_2021+2022.csv'
data = pd.read_csv(raw_csv_path)


In [32]:
# Work on a deep copy so the raw frame held in `data` stays untouched.
df = data.copy(deep=True)


In [33]:
# Preview the working copy (the rendered output is truncated to the first and last rows).
df


Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,Break and Enter Commercial,2021,11,21,6,33,10XX ALBERNI ST,West End,491015.9434,5459166.140
1,Break and Enter Commercial,2021,11,26,13,58,10XX BARCLAY ST,West End,490833.8455,5458886.535
2,Break and Enter Commercial,2021,12,14,2,2,10XX BEACH AVE,Central Business District,490252.3815,5458162.723
3,Break and Enter Commercial,2021,7,17,5,0,10XX BEACH AVE,Central Business District,490255.3982,5458158.788
4,Break and Enter Commercial,2021,6,15,4,0,10XX BEACH AVE,Central Business District,490258.4148,5458154.853
...,...,...,...,...,...,...,...,...,...,...
35454,Vehicle Collision or Pedestrian Struck (with I...,2022,1,7,14,1,W 7TH AVE / ALDER ST,Fairview,490448.0000,5456949.000
35455,Vehicle Collision or Pedestrian Struck (with I...,2022,1,18,20,4,WILLOW ST / W BROADWAY AVE,Fairview,491136.0000,5456735.000
35456,Vehicle Collision or Pedestrian Struck (with I...,2022,2,8,19,52,WINDSOR ST / E 49TH AVE,Sunset,493792.0000,5452516.000
35457,Vehicle Collision or Pedestrian Struck (with I...,2022,1,4,14,56,X BLOCK E 33RD AVE,Riley Park,492354.0000,5454205.000


In [34]:
# Number of distinct values in each column
df.nunique(axis=0)


TYPE                11
YEAR                 2
MONTH               12
DAY                 31
HOUR                24
MINUTE              60
HUNDRED_BLOCK     7901
NEIGHBOURHOOD       24
X                15691
Y                15587
dtype: int64

## Cleaning data

In [35]:
# For certain crimes, such as homicides, some of the data is left unlabelled
# in order to respect the privacy of victims. Count the missing values per
# column to see which columns are affected before dropping incomplete rows.
df.isna().sum()


TYPE             0
YEAR             0
MONTH            0
DAY              0
HOUR             0
MINUTE           0
HUNDRED_BLOCK    0
NEIGHBOURHOOD    9
X                2
Y                2
dtype: int64

In [45]:
# Inspect the X/Y coordinate columns on their own.
df.loc[:, ['X', 'Y']]


Unnamed: 0,X,Y
0,491015.9434,5459166.140
1,490833.8455,5458886.535
2,490252.3815,5458162.723
3,490255.3982,5458158.788
4,490258.4148,5458154.853
...,...,...
35454,490448.0000,5456949.000
35455,491136.0000,5456735.000
35456,493792.0000,5452516.000
35457,492354.0000,5454205.000


In [61]:
# Remove every row in which either coordinate (X or Y) is missing.
df = df.dropna(how='any', subset=['X', 'Y'])


In [53]:
# The MINUTE column is not important: the recorded time does not accurately
# reflect the actual time the crime occurred, because there is sometimes a
# slight delay in the reporting of the crime.
df = df.drop('MINUTE', axis=1)


In [54]:
# Create a `datetime` column from the separate date/time component columns.
# The components are selected by name rather than by position
# (previously `df.columns[1:5]`), so the cell no longer depends on the column
# order or on MINUTE having been dropped beforehand.
dt = ['YEAR', 'MONTH', 'DAY', 'HOUR']
# pd.to_datetime assembles one timestamp per row from these component columns.
df['datetime'] = pd.to_datetime(df[dt])


In [55]:
# Number of recorded incidents per crime type across 2021 and 2022
# (counts the non-null YEAR entries within each TYPE group).
df.groupby('TYPE')[['YEAR']].count()


Unnamed: 0_level_0,YEAR
TYPE,Unnamed: 1_level_1
Break and Enter Commercial,2322
Break and Enter Residential/Other,1649
Homicide,18
Mischief,5996
Offence Against a Person,4117
Other Theft,9595
Theft from Vehicle,8212
Theft of Bicycle,1508
Theft of Vehicle,927
Vehicle Collision or Pedestrian Struck (with Fatality),21


In [56]:
# Pairwise correlation between the numeric columns only. The frame also holds
# text columns (TYPE, HUNDRED_BLOCK, NEIGHBOURHOOD) and a datetime column;
# pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr() and raises instead, so the numeric columns are selected
# explicitly (this form works on older pandas versions as well).
df.select_dtypes(include='number').corr()


Unnamed: 0,YEAR,MONTH,DAY,HOUR,X,Y
YEAR,1.0,-0.434988,-0.065721,0.012028,0.000793,0.000612
MONTH,-0.434988,1.0,0.023954,-0.002784,-0.001674,-0.001526
DAY,-0.065721,0.023954,1.0,0.002544,-0.011597,-0.011679
HOUR,0.012028,-0.002784,0.002544,1.0,0.537629,0.537564
X,0.000793,-0.001674,-0.011597,0.537629,1.0,0.999877
Y,0.000612,-0.001526,-0.011679,0.537564,0.999877,1.0


In [58]:
# The coordinates in the dataset (X and Y) are in the UTM coordinate system.
# In order to create a Tableau dashboard, the UTM coordinates need to be converted to lat/long WGS84 coordinates.

import utm


def rule(row):
    """Convert one row's UTM coordinates to latitude/longitude.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "X" and "Y"; both are passed to ``utm.to_latlon``
        together with zone number 10 and zone letter 'N'.

    Returns
    -------
    pd.Series
        Series with the keys "lat" and "long". Both values are None when the
        conversion rejects the coordinates (ValueError or TypeError).

    Raises
    ------
    KeyError
        If the row has no "X" or "Y" entry. Such programming errors are no
        longer hidden behind a catch-all ``except``.
    """
    try:
        lat, long = utm.to_latlon(row["X"], row["Y"], 10, 'N')
    except (ValueError, TypeError):
        # utm reports out-of-range coordinates through a ValueError subclass;
        # a non-numeric value such as None surfaces as TypeError. Only these
        # "bad coordinate" cases fall back to empty values.
        lat, long = None, None
    return pd.Series({"lat": lat, "long": long})


In [67]:
# Convert every row's UTM coordinates and attach the result as `lat`/`long`
# columns, aligned on the index. Any `lat`/`long` columns left over from an
# earlier run of this cell are dropped first, so re-running the cell does not
# create suffixed duplicates (lat_x, lat_y, ...).
latlong = df.apply(rule, axis=1)
df = df.drop(columns=['lat', 'long'], errors='ignore').join(latlong)


In [70]:
# Export the preprocessed frame to an Excel workbook.
output_path = 'crime_data_preprocessed.xlsx'
df.to_excel(output_path)

In [71]:
# Preview the final frame, which now includes the datetime, lat and long columns.
df

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,datetime,lat,long
0,Break and Enter Commercial,2021,11,21,6,10XX ALBERNI ST,West End,491015.9434,5459166.140,2021-11-21 06:00:00,49.285181,-123.123536
1,Break and Enter Commercial,2021,11,26,13,10XX BARCLAY ST,West End,490833.8455,5458886.535,2021-11-26 13:00:00,49.282663,-123.126034
2,Break and Enter Commercial,2021,12,14,2,10XX BEACH AVE,Central Business District,490252.3815,5458162.723,2021-12-14 02:00:00,49.276144,-123.134011
3,Break and Enter Commercial,2021,7,17,5,10XX BEACH AVE,Central Business District,490255.3982,5458158.788,2021-07-17 05:00:00,49.276108,-123.133970
4,Break and Enter Commercial,2021,6,15,4,10XX BEACH AVE,Central Business District,490258.4148,5458154.853,2021-06-15 04:00:00,49.276073,-123.133928
...,...,...,...,...,...,...,...,...,...,...,...,...
35454,Vehicle Collision or Pedestrian Struck (with I...,2022,1,7,14,W 7TH AVE / ALDER ST,Fairview,490448.0000,5456949.000,2022-01-07 14:00:00,49.265229,-123.131293
35455,Vehicle Collision or Pedestrian Struck (with I...,2022,1,18,20,WILLOW ST / W BROADWAY AVE,Fairview,491136.0000,5456735.000,2022-01-18 20:00:00,49.263314,-123.121832
35456,Vehicle Collision or Pedestrian Struck (with I...,2022,2,8,19,WINDSOR ST / E 49TH AVE,Sunset,493792.0000,5452516.000,2022-02-08 19:00:00,49.225396,-123.085261
35457,Vehicle Collision or Pedestrian Struck (with I...,2022,1,4,14,X BLOCK E 33RD AVE,Riley Park,492354.0000,5454205.000,2022-01-04 14:00:00,49.240573,-123.105042
