In [2]:
import kaggle
import csv
import os
import json
import numpy as np
import pandas as pd

from numerize import numerize as nz

from lets_plot import *
from lets_plot.mapping import *
LetsPlot.setup_html()
from dotenv import dotenv_values

# Part II: Build Database

We've already stored the crime data in two dataframes. The next step is to clean the data so that the two datasets are comparable, which includes:

- Delete unnecessary columns from each dataframe
- Add a 'City' column to distinguish the two cities
- Convert the 'CrimeTime' column so that its values are stored in a proper date/time format
- Reclassify the crime types
- Drop invalid and NA data

In [6]:
# Build CrimeDate / CrimeTime for the Vancouver records, drop the raw
# date/time components and the X/Y columns, and add the constant
# 'Total Incidents' and 'City' columns.
#
# Written as a single method chain so that no column is assigned onto the
# result of dropna() (a pattern that can raise SettingWithCopyWarning) and the
# incoming frame object is not modified in place.
#
# NOTE: this cell rebinds df_VANCOUVER and removes YEAR/MONTH/DAY/HOUR/MINUTE,
# so it can only be run once per kernel session; reload the raw data before
# re-running it.
df_VANCOUVER = (
    df_VANCOUVER
    # Parse the calendar date from DAY/MONTH/YEAR before any rows are removed.
    .assign(
        CrimeDate=lambda d: pd.to_datetime(
            d['DAY'].astype(str) + '/' + d['MONTH'].astype(str) + '/' + d['YEAR'].astype(str),
            format='%d/%m/%Y',
        )
    )
    # Rows without an hour or minute cannot yield a CrimeTime, so discard them.
    .dropna(subset=['HOUR', 'MINUTE'])
    # Zero-pad to two digits so the '%H:%M' format parses every value.
    .assign(
        HOUR=lambda d: d['HOUR'].astype(int).astype(str).str.zfill(2),
        MINUTE=lambda d: d['MINUTE'].astype(int).astype(str).str.zfill(2),
    )
    # Keep only the time-of-day part (datetime.time objects).
    .assign(
        CrimeTime=lambda d: pd.to_datetime(
            d['HOUR'] + ':' + d['MINUTE'], format='%H:%M'
        ).dt.time
    )
    .drop(columns=['YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'X', 'Y'])
    # One row is one incident; tag every row with its city.
    .assign(**{'Total Incidents': 1, 'City': 'Vancouver'})
)
df_VANCOUVER
# TODO(review): the original cell carried a "lower snake case" reminder here,
# presumably to normalise the column names; confirm and apply if still wanted.

Unnamed: 0,TYPE,HUNDRED_BLOCK,NEIGHBOURHOOD,Latitude,Longitude,CrimeDate,CrimeTime,Total Incidents,City
0,Other Theft,9XX TERMINAL AVE,Strathcona,49.269802,-123.083763,2003-05-12,16:15:00,1,Vancouver
1,Other Theft,9XX TERMINAL AVE,Strathcona,49.269802,-123.083763,2003-05-07,15:20:00,1,Vancouver
2,Other Theft,9XX TERMINAL AVE,Strathcona,49.269802,-123.083763,2003-04-23,16:40:00,1,Vancouver
3,Other Theft,9XX TERMINAL AVE,Strathcona,49.269802,-123.083763,2003-04-20,11:15:00,1,Vancouver
4,Other Theft,9XX TERMINAL AVE,Strathcona,49.269802,-123.083763,2003-04-12,17:45:00,1,Vancouver
...,...,...,...,...,...,...,...,...,...
530646,Mischief,14XX E HASTINGS ST,Grandview-Woodland,49.281276,-123.074746,2017-01-18,14:44:00,1,Vancouver
530647,Break and Enter Residential/Other,31XX ADANAC ST,Hastings-Sunrise,49.277420,-123.037595,2017-03-03,09:16:00,1,Vancouver
530648,Mischief,14XX E 7TH AVE,Grandview-Woodland,49.264163,-123.075129,2017-05-29,22:30:00,1,Vancouver
530650,Theft from Vehicle,8XX HAMILTON ST,Central Business District,49.278168,-123.117031,2017-06-05,17:00:00,1,Vancouver


In [7]:
# Show both column lists, one per line, to see which Vancouver columns still
# need to be renamed to match the Baltimore frame.
print(
    df_BALTIMORE.columns.tolist(),
    df_VANCOUVER.columns.tolist(),
    sep='\n',
)

['CrimeDate', 'CrimeTime', 'Location', 'Description', 'Neighborhood', 'Longitude', 'Latitude', 'Total Incidents', 'City']
['TYPE', 'HUNDRED_BLOCK', 'NEIGHBOURHOOD', 'Latitude', 'Longitude', 'CrimeDate', 'CrimeTime', 'Total Incidents', 'City']


In [8]:
# Give the Vancouver columns the same names as their Baltimore counterparts so
# the two frames line up column-for-column when concatenated.
# Rebinding to the returned frame (instead of inplace=True) avoids mutating a
# frame that earlier cells have already displayed; re-running is harmless
# because rename() ignores labels that are no longer present.
df_VANCOUVER = df_VANCOUVER.rename(
    columns={
        'TYPE': 'Description',
        'HUNDRED_BLOCK': 'Location',
        'NEIGHBOURHOOD': 'Neighborhood',
    }
)


In [9]:
# Count how often each raw crime label occurs in the Baltimore data
# (value_counts() lists the most frequent label first). The labels printed
# here are the keys used later when the crime types are reclassified.
value_counts = df_BALTIMORE['Description'].value_counts()

print(value_counts)

Description
LARCENY                 60528
COMMON ASSAULT          45518
BURGLARY                42538
LARCENY FROM AUTO       36295
AGG. ASSAULT            27513
AUTO THEFT              26838
ROBBERY - STREET        17691
ROBBERY - COMMERCIAL     4141
ASSAULT BY THREAT        3503
SHOOTING                 2910
ROBBERY - RESIDENCE      2866
RAPE                     1637
HOMICIDE                 1559
ROBBERY - CARJACKING     1528
ARSON                    1464
Name: count, dtype: int64


In [10]:
# Same frequency table for the Vancouver data; 'Description' is the renamed
# 'TYPE' column. Note that this rebinds the shared name `value_counts`.
value_counts = df_VANCOUVER['Description'].value_counts()
print(value_counts)

Description
Theft from Vehicle                                        172700
Mischief                                                   70413
Break and Enter Residential/Other                          60862
Other Theft                                                52167
Theft of Vehicle                                           38418
Break and Enter Commercial                                 33845
Theft of Bicycle                                           25730
Vehicle Collision or Pedestrian Struck (with Injury)       21901
Vehicle Collision or Pedestrian Struck (with Fatality)       254
Name: count, dtype: int64


In [11]:
# Stack the Baltimore rows on top of the Vancouver rows. Columns are matched by
# name, and ignore_index=True renumbers the rows 0..n-1 so index labels from
# the two source frames cannot collide.
df_merged = pd.concat(
    objs=[df_BALTIMORE, df_VANCOUVER],
    axis=0,
    ignore_index=True,
)

In [12]:
# Frequency of every raw label across both cities combined; at this point the
# Baltimore (upper-case) and Vancouver (title-case) vocabularies are still
# separate entries.
value_counts = df_merged['Description'].value_counts()

print(value_counts)

Description
Theft from Vehicle                                        172700
Mischief                                                   70413
Break and Enter Residential/Other                          60862
LARCENY                                                    60528
Other Theft                                                52167
COMMON ASSAULT                                             45518
BURGLARY                                                   42538
Theft of Vehicle                                           38418
LARCENY FROM AUTO                                          36295
Break and Enter Commercial                                 33845
AGG. ASSAULT                                               27513
AUTO THEFT                                                 26838
Theft of Bicycle                                           25730
Vehicle Collision or Pedestrian Struck (with Injury)       21901
ROBBERY - STREET                                           17691
ROBBERY - COM

In [13]:
# Harmonise the two cities' crime vocabularies into one shared set of
# categories. The mapping is written category -> raw labels so each group can
# be reviewed at a glance, then inverted into the label -> category dict that
# Series.replace expects. Labels not listed here are left unchanged.
category_to_labels = {
    'Theft': [
        'Theft from Vehicle',
        'LARCENY',
        'Other Theft',
        'Theft of Vehicle',
        'LARCENY FROM AUTO',
        'AUTO THEFT',
        'Theft of Bicycle',
    ],
    'Burglary': [
        'Break and Enter Residential/Other',
        'BURGLARY',
        'Break and Enter Commercial',
        'ROBBERY - RESIDENCE',
    ],
    'Assault': [
        'COMMON ASSAULT',
        'AGG. ASSAULT',
        'ASSAULT BY THREAT',
    ],
    'Robbery': [
        'ROBBERY - STREET',
        'ROBBERY - COMMERCIAL',
        'ROBBERY - CARJACKING',
    ],
    'Homicide': [
        'SHOOTING',
        'HOMICIDE',
    ],
    'Sexual offense': [
        'RAPE',
    ],
    'Accident': [
        'Vehicle Collision or Pedestrian Struck (with Injury)',
        'Vehicle Collision or Pedestrian Struck (with Fatality)',
    ],
    'Other': [
        'Mischief',
        'ARSON',
    ],
}
label_to_category = {
    raw_label: category
    for category, raw_labels in category_to_labels.items()
    for raw_label in raw_labels
}
df_merged['Description'] = df_merged['Description'].replace(label_to_category)

In [14]:
# Flag rows whose longitude falls in one of three ranges treated as invalid:
# strictly between -123.9 and -123.8, strictly between -122.9 and -122.8, or
# below -124.1.
# NOTE(review): these bounds are presumably outlier coordinates spotted in the
# Vancouver data; confirm their origin and consider naming them as constants.
#
# Vectorised comparisons replace the row-by-row .apply(lambda ...); results are
# the same, including for missing longitudes (every comparison is False, so
# such rows are kept).
longitude = df_merged['Longitude']
invalid_rows = (
    longitude.between(-123.9, -123.8, inclusive='neither')
    | longitude.between(-122.9, -122.8, inclusive='neither')
    | (longitude < -124.1)
)
df = df_merged[~invalid_rows]

- Here is our final dataframe:

In [15]:
# Display the cleaned, merged frame (pandas truncates the view to the first
# and last rows).
df

Unnamed: 0,CrimeDate,CrimeTime,Location,Description,Neighborhood,Longitude,Latitude,Total Incidents,City
0,2017-09-02,23:30:00,4200 AUDREY AVE,Burglary,Brooklyn,-76.605410,39.229510,1,Baltimore
1,2017-09-02,23:00:00,800 NEWINGTON AVE,Theft,Reservoir Hill,-76.632170,39.313600,1,Baltimore
2,2017-09-02,22:53:00,600 RADNOR AV,Homicide,Winston-Govans,-76.606970,39.347680,1,Baltimore
3,2017-09-02,22:50:00,1800 RAMSAY ST,Assault,Carrollton Ridge,-76.645260,39.283150,1,Baltimore
4,2017-09-02,22:31:00,100 LIGHT ST,Assault,Downtown West,-76.613650,39.287560,1,Baltimore
...,...,...,...,...,...,...,...,...,...
752814,2017-01-18,14:44:00,14XX E HASTINGS ST,Other,Grandview-Woodland,-123.074746,49.281276,1,Vancouver
752815,2017-03-03,09:16:00,31XX ADANAC ST,Burglary,Hastings-Sunrise,-123.037595,49.277420,1,Vancouver
752816,2017-05-29,22:30:00,14XX E 7TH AVE,Other,Grandview-Woodland,-123.075129,49.264163,1,Vancouver
752817,2017-06-05,17:00:00,8XX HAMILTON ST,Theft,Central Business District,-123.117031,49.278168,1,Vancouver


In [40]:
# Summarise column dtypes and non-null counts to check what is still missing
# after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750610 entries, 0 to 752818
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   CrimeDate        750610 non-null  datetime64[ns]
 1   CrimeTime        750610 non-null  object        
 2   Location         750594 non-null  object        
 3   Description      750610 non-null  object        
 4   Neighborhood     747817 non-null  object        
 5   Longitude        750610 non-null  float64       
 6   Latitude         750610 non-null  float64       
 7   Total Incidents  750610 non-null  int64         
 8   City             750610 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 57.3+ MB


TODO:

- Remember to save the database (`data/clean/crime.db`) (look back to this week's notebooks on databases)

In [38]:
!pip freeze > requirement.txt