In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Source: Montreal open-data portal, "actes criminels" dataset
#   https://donnees.montreal.ca/ville-de-montreal/actes-criminels
# Local copy is a snapshot dated 3/11/2022.
# The CSV is decoded as Latin-1 so accented French category names load intact.
data = pd.read_csv(
    "data/montreal_crime_data.csv",
    encoding='latin1',
)

In [3]:
# Preview the first five records to sanity-check the load.
data.head(n=5)

Unnamed: 0,CATEGORIE,DATE,QUART,PDQ,X,Y,LONGITUDE,LATITUDE
0,Vol de véhicule à moteur,2018-09-13,jour,30.0,294904.159001,5047549.0,-73.626778,45.56778
1,Vol de véhicule à moteur,2018-04-30,jour,30.0,294904.159001,5047549.0,-73.626778,45.56778
2,Vol de véhicule à moteur,2018-09-01,nuit,7.0,290274.565,5042150.0,-73.685928,45.519122
3,Méfait,2017-07-21,jour,21.0,0.0,0.0,-76.23729,0.0
4,Méfait,2017-07-29,jour,12.0,0.0,0.0,-76.23729,0.0


## Preprocessing & exploring

In [4]:
# List the column labels of the loaded frame.
data.columns

Index(['CATEGORIE', 'DATE', 'QUART', 'PDQ', 'X', 'Y', 'LONGITUDE', 'LATITUDE'], dtype='object')

#### Dataset metadata

In [5]:
# Report the frame's dimensions as a (rows, columns) tuple.
print(f"Shape of data is: {data.shape}")

Shape of data is: (214322, 8)


In [6]:
# Show each column's dtype as a one-column table, with the index axis
# labelled 'Columns'.
data.dtypes.to_frame(name='Datatype').rename_axis('Columns')

Unnamed: 0_level_0,Datatype
Columns,Unnamed: 1_level_1
CATEGORIE,object
DATE,object
QUART,object
PDQ,float64
X,float64
Y,float64
LONGITUDE,float64
LATITUDE,float64


## Data cleaning

#### Converting the date feature to datetime format

In [7]:
# Parse the DATE column with pd.to_datetime and write the result back into
# the same column, replacing the original object-dtype values.
data['DATE'] = data['DATE'].pipe(pd.to_datetime)

In [8]:
# Check the first five converted values and the resulting dtype.
data['DATE'].head(n=5)

0   2018-09-13
1   2018-04-30
2   2018-09-01
3   2017-07-21
4   2017-07-29
Name: DATE, dtype: datetime64[ns]

In [9]:
# Rebuild the dtype summary after the DATE conversion: one 'Datatype'
# column, with the index axis labelled 'Columns'.
datatypes = (
    data.dtypes
    .to_frame(name='Datatype')
    .rename_axis(index='Columns')
)
datatypes

Unnamed: 0_level_0,Datatype
Columns,Unnamed: 1_level_1
CATEGORIE,object
DATE,datetime64[ns]
QUART,object
PDQ,float64
X,float64
Y,float64
LONGITUDE,float64
LATITUDE,float64


#### Checking for NaN values

In [10]:
# Count missing entries per column and present them as a one-column table,
# with the index axis labelled 'Feature'.
(
    data.isna()
    .sum()
    .to_frame(name='Nb of missing values')
    .rename_axis('Feature')
)

Unnamed: 0_level_0,Nb of missing values
Feature,Unnamed: 1_level_1
CATEGORIE,0
DATE,0
QUART,0
PDQ,5
X,0
Y,0
LONGITUDE,0
LATITUDE,0


## Visualization