In [None]:
!pip install pandas numpy lets-plot numerize tqdm sqlalchemy jupysql

In [None]:
import kaggle
import csv
import os
import json
import numpy as np
import pandas as pd
import sqlite3

from tqdm.notebook import tqdm, trange
tqdm.pandas()
from numerize import numerize as nz
from lets_plot import *
from lets_plot.mapping import *
LetsPlot.setup_html()
from dotenv import dotenv_values

# Part III: EDA


Before doing EDA, let's first read data from our SQLite database.

In [None]:
# Relative path of the folder that holds the SQLite database file read below
DATA_FOLDER = '../data/clean'

In [None]:
# pandas needs an open DB-API connection to run the query. The connection is
# closed as soon as the frame is loaded so the database file is not kept open
# for the rest of the session.
conn = sqlite3.connect(os.path.join(DATA_FOLDER, 'crime_db.db'))
try:
    # Load every row of the crime_listing table into a pandas DataFrame
    df = pd.read_sql_query('SELECT * FROM crime_listing;', conn)
finally:
    conn.close()

In [None]:
# Show the loaded DataFrame as the cell output to inspect what was read
df

Now we can do some data visualization and comparison. We will explore three main themes:
- Crime Count by Hour
- Different Types of Crimes
- Geographic Distribution of Crime Incidents

In [None]:
# Keep only incidents that have a recorded CrimeTime
df = df.loc[df['CrimeTime'].notna()]

In [None]:
# One frame per city, selected on the City column
df_Baltimore = df.loc[df['City'].eq('Baltimore')]
df_Vancouver = df.loc[df['City'].eq('Vancouver')]

In [None]:
# Study window, both bounds inclusive.
# NOTE(review): CrimeDate is compared against these date strings as-is. If the
# stored values carry a time component as text, the upper bound would exclude
# incidents on 2016-12-31 — confirm the stored format.
start_date = '2013-01-01'
end_date = '2016-12-31'

# All rows of df inside the study window
df_modify = df[
    (df['CrimeDate'] >= start_date) &
    (df['CrimeDate'] <= end_date)
]

# The per-city frames are already restricted to their city, so only the date
# window is applied here. `.copy()` makes each result an independent frame:
# later cells add columns to these frames, and assigning to a filtered slice
# triggers pandas' SettingWithCopyWarning.
df_Baltimore = df_Baltimore[
    (df_Baltimore['CrimeDate'] >= start_date) &
    (df_Baltimore['CrimeDate'] <= end_date)
].copy()

df_Vancouver = df_Vancouver[
    (df_Vancouver['CrimeDate'] >= start_date) &
    (df_Vancouver['CrimeDate'] <= end_date)
].copy()


In [None]:
# Baltimore: parse the time column and derive the hour of each incident.
# Values that cannot be parsed become NaT (errors='coerce').
baltimore_times = pd.to_datetime(df_Baltimore['CrimeTime'], errors='coerce')
df_Baltimore['CrimeTime'] = baltimore_times
df_Baltimore['hour'] = baltimore_times.dt.hour

# Number of incidents recorded in each hour
hourly_crime_counts = df_Baltimore.groupby('hour').size()
hourly_crime_counts_ba = (
    hourly_crime_counts
    .rename('crimecount')
    .reset_index()
    .assign(city='Baltimore')
)
hourly_crime_counts_ba

In [None]:
# save_path = '../docs/figures/' 
# os.makedirs(save_path, exist_ok=True)


# plot = (ggplot(hourly_crime_counts_ba, aes(x='hour', y='Baltimore', fill='Baltimore')) +
#         geom_bar(stat='identity') +
#         ggtitle('Amount of Crime in Baltimore by Hour') +
#         labs(x='Hour', y='Count') +
#         theme(axis_text_x=element_text(angle=45, hjust=1)) +
#         scale_fill_gradient(low='#ADD8E6', high='#00008B'))
# plot.show()

# ggsave(filename=os.path.join('Amount of Crime in Baltimore by Hour(2013-2016).html'), plot=plot, dpi=300)

In [None]:
# Vancouver: times are parsed with the explicit HH:MM:SS format; values that do
# not match become NaT (errors='coerce').
vancouver_times = pd.to_datetime(
    df_Vancouver['CrimeTime'],
    format='%H:%M:%S',
    errors='coerce',
)
df_Vancouver['CrimeTime'] = vancouver_times
df_Vancouver['hour'] = vancouver_times.dt.hour

# Number of incidents recorded in each hour, tagged with the city name
hourly_crime_counts_va = df_Vancouver.groupby('hour').size().reset_index(name='crimecount')
hourly_crime_counts_va['city'] = 'Vancouver'
hourly_crime_counts_va

In [None]:
# Stack the two per-city hourly tables into one long table for the faceted plot.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both inputs are kept and every label appears twice.
df_hour = pd.concat(
    [hourly_crime_counts_ba, hourly_crime_counts_va],
    ignore_index=True,
)

In [None]:
# Folder prefix joined into the file name of every figure saved by this notebook.
# NOTE(review): ggsave() is called without `path=`, so a relative file name is
# presumably resolved against lets-plot's default output folder rather than the
# notebook directory — confirm where the figures end up.
save_path = '../../docs/figures'
hourly_figure_file = os.path.join(save_path, 'Total_Amount_of_Crime_by_Hour(2013-2016).html')

# Bar chart of incident counts per hour, one panel per city
plot = ggplot(df_hour, aes(x='hour', y='crimecount', fill='city'))
plot = plot + geom_bar(stat='identity')
plot = plot + ggtitle('Total Amount of Crime by Hour(2013-2016): Baltimore VS Vancouver')
plot = plot + labs(x='Hour', y='Crime Count')
plot = plot + theme(axis_text_x=element_text(angle=45, hjust=1))
plot = plot + facet_wrap('city', ncol=2)
plot.show()

ggsave(filename=hourly_figure_file, plot=plot, dpi=300)

Next, we turn to the different types of crime:

- Overall

In [None]:
# Incidents per crime type over all rows of df_modify
crime_type_counts = (
    df_modify['Description']
    .value_counts()
    .rename_axis('Description')
    .reset_index(name='Count')
)

# One bar per crime type, on a linear count axis
type_aes = aes(x='Description', y='Count', fill='Description')
plot = (
    ggplot(crime_type_counts, type_aes)
    + geom_bar(stat='identity')
    + ggtitle('Repartition of Crime by Type from 2013 to 2016')
    + labs(x='Crime Type', y='Count')
)

plot.show()

ggsave(filename=os.path.join(save_path, 'Overall_Crime_Type_Count.html'), plot=plot, dpi=300)

In [None]:
# Same bar chart as the previous cell, with a log10 count axis
plot = ggplot(crime_type_counts, aes(x='Description', y='Count', fill='Description'))
plot = plot + geom_bar(stat='identity')
plot = plot + ggtitle('Repartition of Crime by Type from 2013 to 2016(axis scale modified)')
plot = plot + labs(x='Crime Type', y='Count')
plot = plot + scale_y_log10()

plot.show()

overall_log_file = os.path.join(save_path, 'Overall_Crime_Type_Count_log.html')
ggsave(filename=overall_log_file, plot=plot, dpi=300)

- Baltimore

In [None]:
# Parse CrimeDate as datetimes (unparseable values become NaT) and drop the
# rows that end up without a date
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')
df = df.dropna(subset=['CrimeDate'])

# Baltimore incidents dated 2013 through 2016
is_baltimore = df['City'] == 'Baltimore'
in_study_years = df['CrimeDate'].dt.year.isin([2013, 2014, 2015, 2016])
filtered_df = df[is_baltimore & in_study_years]

# Incidents per crime type
description_counts = filtered_df['Description'].value_counts()
crime_type_counts = description_counts.reset_index()
crime_type_counts.columns = ['Description', 'Count']

# Bar chart per crime type with a log10 count axis
plot = (
    ggplot(crime_type_counts, aes(x='Description', y='Count', fill='Description'))
    + geom_bar(stat='identity')
    + ggtitle('Repartition of Crime by Type in Baltimore from 2013 to 2016(axis scale modified)')
    + labs(x='Crime Type', y='Count')
    + scale_y_log10()
)

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Type_Count_in_Baltimore.html'), plot=plot, dpi=300)

- Vancouver

In [None]:
# Parse CrimeDate as datetimes (unparseable values become NaT) and drop the
# rows that end up without a date
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')
df = df.dropna(subset=['CrimeDate'])

# Vancouver incidents dated 2013 through 2016
vancouver_study_rows = (
    (df['City'] == 'Vancouver')
    & df['CrimeDate'].dt.year.isin([2013, 2014, 2015, 2016])
)
filtered_df = df[vancouver_study_rows]

# Incidents per crime type
crime_type_counts = (
    filtered_df['Description']
    .value_counts()
    .rename_axis('Description')
    .reset_index(name='Count')
)

# Bar chart per crime type with a log10 count axis
vancouver_type_title = 'Repartition of Crime by Type in Vancouver from 2013 to 2016(axis scale modified)'
plot = ggplot(crime_type_counts, aes(x='Description', y='Count', fill='Description'))
plot = plot + geom_bar(stat='identity')
plot = plot + ggtitle(vancouver_type_title)
plot = plot + labs(x='Crime Type', y='Count')
plot = plot + scale_y_log10()

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Type_Count_in_Vancouver.html'), plot=plot, dpi=300)

Finally, we map the geographic distribution of crime from 2013 to 2016:

- Baltimore

In [None]:
# Prepare the data for the maps: every plotted incident needs usable coordinates.
# Rows with a missing latitude/longitude are dropped, and so are rows where
# either coordinate is exactly 0.
df = df.dropna(subset=['Latitude', 'Longitude'])
df = df[(df['Longitude'] != 0) & (df['Latitude'] != 0)]

# The `Description` column is deliberately left as it is. lets-plot's
# as_discrete() is a mapping annotation meant to be used inside aes(...) with a
# column name (e.g. aes(color=as_discrete('Description'))); it is not a column
# converter, so the former `df['Description'] = as_discrete(df['Description'])`
# assignment was removed.


In [None]:
# Map of Baltimore incidents for 2013-2016, one point per incident, coloured by
# crime type. geom_livemap() provides the base map and geom_point() draws the
# incidents at their longitude/latitude.
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

is_baltimore = df['City'] == 'Baltimore'
in_study_years = df['CrimeDate'].dt.year.isin([2013, 2014, 2015, 2016])
filter_df = df[is_baltimore & in_study_years]

# Fixed colour for each crime type
baltimore_palette = {
    'Theft': '#FF0000',           # Red
    'Burglary': '#00FF00',        # Green
    'Other': '#A52A2A',           # Brown
    'Assault': '#0000FF',         # Blue
    'Homicide': '#FFA500',        # Orange
    'Robbery': '#800080',         # Purple
    'Sexual offense': '#FFFF00',  # Yellow
}

incident_points = geom_point(
    aes(x='Longitude', y='Latitude', color='Description'),
    size=2,
    alpha=0.3,
)
plot = (
    ggplot(data=filter_df)
    + geom_livemap()
    + incident_points
    + scale_color_manual(values=baltimore_palette)
    + ggtitle('Geographical Distribution of Crime in Baltimore from 2013 to 2016')
)

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Baltimore_(2013-2016).html'), plot=plot, dpi=300)


In [None]:
# Map of Baltimore incidents for 2013, coloured by crime type
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# Rows without a usable CrimeDate cannot be assigned to a year
df = df.dropna(subset=['CrimeDate'])

# Baltimore incidents dated 2013
baltimore_2013 = (df['City'] == 'Baltimore') & (df['CrimeDate'].dt.year == 2013)
filtered_df = df[baltimore_2013]

# Fixed colour for each crime type
crime_colors = {
    'Theft': '#FF0000',           # Red
    'Burglary': '#00FF00',        # Green
    'Other': '#A52A2A',           # Brown
    'Assault': '#0000FF',         # Blue
    'Homicide': '#FFA500',        # Orange
    'Robbery': '#800080',         # Purple
    'Sexual offense': '#FFFF00',  # Yellow
}

plot = (
    ggplot(data=filtered_df)
    + geom_livemap()
    + geom_point(aes(x='Longitude', y='Latitude', color='Description'), size=2, alpha=0.3)
    + scale_color_manual(values=crime_colors)
    + ggtitle('Geographical Distribution of Crime in Baltimore in 2013')
)

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Baltimore_(2013).html'), plot=plot, dpi=300)

In [None]:
# Map of Baltimore incidents for 2014, coloured by crime type
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# Remove rows whose CrimeDate is missing or could not be parsed
df = df.dropna(subset=['CrimeDate'])

# Baltimore incidents dated 2014
filtered_df = df.loc[df['City'].eq('Baltimore') & df['CrimeDate'].dt.year.eq(2014)]

# Layers are added one at a time: base map, incident points, colours, title
plot = ggplot(data=filtered_df)
plot = plot + geom_livemap()
plot = plot + geom_point(aes(x='Longitude', y='Latitude', color='Description'), size=2, alpha=0.3)
plot = plot + scale_color_manual(values={
    'Theft': '#FF0000',           # Red
    'Burglary': '#00FF00',        # Green
    'Other': '#A52A2A',           # Brown
    'Assault': '#0000FF',         # Blue
    'Homicide': '#FFA500',        # Orange
    'Robbery': '#800080',         # Purple
    'Sexual offense': '#FFFF00',  # Yellow
})
plot = plot + ggtitle('Geographical Distribution of Crime in Baltimore in 2014')

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Baltimore_(2014).html'), plot=plot, dpi=300)

In [None]:
# Map of Baltimore incidents for 2015, coloured by crime type
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# Keep only rows that have a parsed CrimeDate
df = df.dropna(subset=['CrimeDate'])

# Baltimore incidents dated 2015
in_baltimore = df['City'] == 'Baltimore'
in_2015 = df['CrimeDate'].dt.year == 2015
filtered_df = df[in_baltimore & in_2015]

# Fixed colour for each crime type
palette_2015 = {
    'Theft': '#FF0000',           # Red
    'Burglary': '#00FF00',        # Green
    'Other': '#A52A2A',           # Brown
    'Assault': '#0000FF',         # Blue
    'Homicide': '#FFA500',        # Orange
    'Robbery': '#800080',         # Purple
    'Sexual offense': '#FFFF00',  # Yellow
}
point_aes = aes(x='Longitude', y='Latitude', color='Description')

plot = (
    ggplot(data=filtered_df)
    + geom_livemap()
    + geom_point(point_aes, size=2, alpha=0.3)
    + scale_color_manual(values=palette_2015)
    + ggtitle('Geographical Distribution of Crime in Baltimore in 2015')
)

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Baltimore_(2015).html'), plot=plot, dpi=300)

In [None]:
# Map of Baltimore incidents for 2016, one point per incident, coloured by crime type
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# delete rows with missing values in the CrimeDate column
df = df.dropna(subset=['CrimeDate'])

# filter the data to only include crimes in Baltimore in 2016
filtered_df = df[(df['City'] == 'Baltimore') & (df['CrimeDate'].dt.year == 2016)]

# plot and add a title
plot = (ggplot(data=filtered_df) +
        geom_livemap() +
        geom_point(aes(x='Longitude', y='Latitude', color='Description'), size=2, alpha=0.3) +
        scale_color_manual(values={
            'Theft': '#FF0000',           # Red
            'Burglary': '#00FF00',        # Green
            'Other': '#A52A2A',           # Brown
            'Assault': '#0000FF',         # Blue
            'Homicide': '#FFA500',        # Orange
            'Robbery': '#800080',         # Purple
            'Sexual offense': '#FFFF00',  # Yellow
        }) +
        ggtitle('Geographical Distribution of Crime in Baltimore in 2016'))

plot.show()

# Save under the 2016 file name. This cell previously wrote to
# 'Crime_Map_in_Baltimore_(2015).html', which overwrote the 2015 map and left
# no 2016 file.
ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Baltimore_(2016).html'), plot=plot, dpi=300)

- Vancouver

In [None]:
# df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# df = df.dropna(subset=['CrimeDate'])

# filtered_df = df[(df['City'] == 'Vancouver') & (df['CrimeDate'].dt.year.isin([2013, 2014, 2015, 2016]))]

# crime_type_counts = filtered_df['Description'].value_counts().reset_index()
# crime_type_counts.columns = ['Description', 'Count']

# plot = (ggplot(crime_type_counts, aes(x='Description', y='Count', fill='Description')) +
#         geom_bar(stat='identity') +
#         ggtitle('Repartition of Crime by Type in Vancouver from 2013 to 2016') +
#         labs(x='Crime Type', y='Count') +
#         theme(axis_text_x=element_text(angle=45, hjust=1)))
        

# plot.show()

# ggsave(filename=os.path.join(save_path, 'Crime Type Count in Vancouver.html'), plot=plot, dpi=300)

In [None]:
# Map of Vancouver incidents for 2013-2016, one point per incident, coloured by
# crime type. geom_livemap() provides the base map and geom_point() draws the
# incidents at their longitude/latitude.
# This cell reads df['CrimeDate'] through `.dt`, so the column must already hold
# datetimes when it runs.
is_vancouver = df['City'] == 'Vancouver'
in_study_years = df['CrimeDate'].dt.year.isin([2013, 2014, 2015, 2016])
filter_df = df[is_vancouver & in_study_years]

# Fixed colour for each crime type
vancouver_palette = {
    'Theft': '#FF0000',     # Red
    'Burglary': '#800080',  # Purple
    'Other': '#00FF00',     # Green
    'Accident': '#0000FF',  # Blue
}

plot = (
    ggplot(data=filter_df)
    + geom_livemap()
    + geom_point(aes(x='Longitude', y='Latitude', color='Description'), size=2, alpha=0.3)
    + scale_color_manual(values=vancouver_palette)
    + ggtitle('Geographical Distribution of Crime in Vancouver from 2013 to 2016')
)

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Vancouver_(2013-2016).html'), plot=plot, dpi=300)


In [None]:
# Map of Vancouver incidents for 2013, coloured by crime type
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# Remove rows whose CrimeDate is missing or could not be parsed
df = df.dropna(subset=['CrimeDate'])

# Vancouver incidents dated 2013
vancouver_2013 = (df['City'] == 'Vancouver') & (df['CrimeDate'].dt.year == 2013)
filtered_df = df[vancouver_2013]

# Layers are added one at a time: base map, incident points, colours, title
plot = ggplot(data=filtered_df)
plot = plot + geom_livemap()
plot = plot + geom_point(aes(x='Longitude', y='Latitude', color='Description'), size=2, alpha=0.3)
plot = plot + scale_color_manual(values={
    'Theft': '#FF0000',     # Red
    'Burglary': '#800080',  # Purple
    'Other': '#00FF00',     # Green
    'Accident': '#0000FF',  # Blue
})
plot = plot + ggtitle('Geographical Distribution of Crime in Vancouver in 2013')

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Vancouver_(2013).html'), plot=plot, dpi=300)

In [None]:
# Map of Vancouver incidents for 2014, one point per incident, coloured by crime type
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# delete rows with missing values in the CrimeDate column
df = df.dropna(subset=['CrimeDate'])

# filter the data to only include crimes in Vancouver in 2014
filtered_df = df[(df['City'] == 'Vancouver') & (df['CrimeDate'].dt.year == 2014)]

# plot and add a title
# Colours and point size match the Vancouver 2013-2016 and 2013 maps
# (Burglary purple, Accident blue, size=2), so a crime type keeps the same
# colour on every Vancouver map. This cell previously had the two colours
# swapped and used size=3.
plot = (ggplot(data=filtered_df) +
        geom_livemap() +
        geom_point(aes(x='Longitude', y='Latitude', color='Description'), size=2, alpha=0.3) +
        scale_color_manual(values={
            'Theft': '#FF0000',     # Red
            'Burglary': '#800080',  # Purple
            'Other': '#00FF00',     # Green
            'Accident': '#0000FF',  # Blue
        }) +
        ggtitle('Geographical Distribution of Crime in Vancouver in 2014'))

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Vancouver_(2014).html'), plot=plot, dpi=300)

In [None]:
# Map of Vancouver incidents for 2015, one point per incident, coloured by crime type
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# delete rows with missing values in the CrimeDate column
df = df.dropna(subset=['CrimeDate'])

# filter the data to only include crimes in Vancouver in 2015
filtered_df = df[(df['City'] == 'Vancouver') & (df['CrimeDate'].dt.year == 2015)]

# plot and add a title
# Colours and point size match the Vancouver 2013-2016 and 2013 maps
# (Burglary purple, Accident blue, size=2), so a crime type keeps the same
# colour on every Vancouver map. This cell previously had the two colours
# swapped and used size=3.
plot = (ggplot(data=filtered_df) +
        geom_livemap() +
        geom_point(aes(x='Longitude', y='Latitude', color='Description'), size=2, alpha=0.3) +
        scale_color_manual(values={
            'Theft': '#FF0000',     # Red
            'Burglary': '#800080',  # Purple
            'Other': '#00FF00',     # Green
            'Accident': '#0000FF',  # Blue
        }) +
        ggtitle('Geographical Distribution of Crime in Vancouver in 2015'))

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Vancouver_(2015).html'), plot=plot, dpi=300)

In [None]:
# Map of Vancouver incidents for 2016, one point per incident, coloured by crime type
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'], errors='coerce')

# delete rows with missing values in the CrimeDate column
df = df.dropna(subset=['CrimeDate'])

# filter the data to only include crimes in Vancouver in 2016
filtered_df = df[(df['City'] == 'Vancouver') & (df['CrimeDate'].dt.year == 2016)]

# plot and add a title
# Colours and point size match the Vancouver 2013-2016 and 2013 maps
# (Burglary purple, Accident blue, size=2), so a crime type keeps the same
# colour on every Vancouver map. This cell previously had the two colours
# swapped and used size=3.
plot = (ggplot(data=filtered_df) +
        geom_livemap() +
        geom_point(aes(x='Longitude', y='Latitude', color='Description'), size=2, alpha=0.3) +
        scale_color_manual(values={
            'Theft': '#FF0000',     # Red
            'Burglary': '#800080',  # Purple
            'Other': '#00FF00',     # Green
            'Accident': '#0000FF',  # Blue
        }) +
        ggtitle('Geographical Distribution of Crime in Vancouver in 2016'))

plot.show()

ggsave(filename=os.path.join(save_path, 'Crime_Map_in_Vancouver_(2016).html'), plot=plot, dpi=300)

In [None]:
# Write the installed package versions reported by `pip freeze` to requirement.txt
# in the current working directory.
# NOTE(review): the conventional file name is `requirements.txt` (plural) — confirm
# whether anything else expects `requirement.txt` before renaming.
!pip freeze > requirement.txt