In [1]:
# load libraries
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import requests
from urllib.request import urlopen
import json
from shapely import wkt
import matplotlib.pyplot as plt
%matplotlib inline

# Crime

### Load Crime Data

In [None]:
# load the crime GeoJSON from the open-data portal into a GeoDataFrame
# NOTE: `low_memory` is a pandas.read_csv parser option, not a geopandas.read_file one;
# read_file only forwards unknown keyword arguments to its I/O engine, so it is omitted
crime = gpd.read_file("https://opendata.arcgis.com/datasets/716338a41410457bb415a4bae2b2ad3e_0.geojson")

# confirm load - display the coordinate reference system that was read
crime.crs

In [None]:
# list every column name of the raw crime table (handy for copy/paste in later cells)
print(crime.columns.tolist())

In [None]:
# look at dataframe: dimensions, column summary, then the first rows
shape = crime.shape
head = crime.head()

print(shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than stored and printed (which only emitted a stray "None")
crime.info()
print(head)

### Reduce Crime Data

In [None]:
# columns that are not needed for the analysis
cols2drop = ['Index_', 'Division', 'reporteddate', 'location_type', 'ucr_code', 'ucr_ext', 'offence', 'reportedyear', 'reportedmonth', 
             'reportedday', 'reporteddayofyear', 'reporteddayofweek', 'reportedhour', 'occurrencedayofyear', 'occurrencedayofweek', 'occurrencehour']

# remove, in place, whichever of those columns are actually present
crime.drop(columns=[name for name in crime.columns if name in cols2drop], inplace=True)

# confirm reduced dataframe
crime.info()

In [None]:
# list the remaining column names (handy for copy/paste in later cells)
print(crime.columns.tolist())

In [None]:
# quick peek: number of occurrences recorded for each occurrence year
crime.occurrenceyear.value_counts()

In [None]:
# there appears to be a big change in crime data between 2013 and 2014 - a jump by 30 thousand a year!
# data from 2014 onwards appears to be similar and represents over 200,000 of the 281,692 entries in this dataset
# it is not clear why this is the case - perhaps due to a change in tracking or recording practices
# to keep the analysis from being skewed, keep only occurrences from 2014 onwards
from_2014 = crime['occurrenceyear'].gt(2013)
crime = crime.loc[from_2014]

# confirm reduced dataframe
crime.shape

In [None]:
# occurrences per year again, now that only 2014 onwards remains
crime.occurrenceyear.value_counts()

In [None]:
# check reduced dataframe: column dtypes and non-null counts after the year filter
crime.info()

In [None]:
# summary statistics for the occurrence date fields
date_fields = ['occurrencedate', 'occurrenceyear', 'occurrencemonth', 'occurrenceday']
dates = crime[date_fields]
dates.describe()

In [None]:
# write the reduced crime table to data/rough as a gzip-compressed CSV
path = os.path.join('data','rough')
fn = 'crimeData.csv.gz'
print(f"Writing to: {fn}")

# create the output folder the first time it is needed
folder_exists = os.path.exists(path)
if not folder_exists:
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

target = os.path.join(path,fn)
crime.to_csv(target, index=False, compression="gzip")
print("Done.")

### Clean Crime Data

In [None]:
# reload the reduced crime table saved under data/rough
path = os.path.join('data','rough')
fn = 'crimeData.csv.gz'

# geopandas has no read_csv() (gpd.read_csv raises AttributeError), so the CSV is
# read with pandas and the GeoDataFrame is rebuilt below
crime = pd.read_csv(os.path.join(path,fn), compression='gzip', low_memory=False)

# the geometry column comes back from CSV as text (presumably WKT, as written by to_csv);
# parse it into shapely geometries, mapping missing/non-text cells to None
crime['geometry'] = crime['geometry'].apply(lambda g: wkt.loads(g) if isinstance(g, str) else None)

# NOTE(review): EPSG:4326 is assumed because the source file was GeoJSON -
# confirm against the crime.crs value displayed when the data was first loaded
crime = gpd.GeoDataFrame(crime, geometry='geometry', crs='EPSG:4326')

# have a look at data
crime.head(10)

In [None]:
# missing values per column, largest first, to spot problem fields
crime.isnull().sum().sort_values(ascending=False)

In [None]:
# missing values per row, largest first
crime.isnull().sum(axis='columns').sort_values(ascending=False)

In [None]:
# no null values - great!
# now count NaNs per column (isna is pandas' alias of isnull, so this repeats the check above)
crime.isna().sum(axis='index').sort_values(ascending=False)

In [None]:
# NaNs per row, largest first
crime.isna().sum(axis='columns').sort_values(ascending=False)

In [None]:
# no nans - great!
# documentation indicates that for locations identified outside the city of Toronto limits or as invalidated locations
# the division/neighbourhood designation will be ‘NSA’ to indicate ‘Not Specified Area.’
# these are identified here and deleted later, as only occurrences with identifiable locations are of interest
crime[crime.Hood_ID.eq('NSA')]

In [None]:
# when 'Hood_ID' is NSA, 'Neighbourhood' appears to be NSA too - check that this query returns the same rows
crime[crime.Neighbourhood.eq('NSA')]

In [None]:
# they match! keep only rows where neither field is the 'NSA' placeholder
located = (crime.Hood_ID != 'NSA') & (crime.Neighbourhood != 'NSA')
crime = crime[located]

# confirm reduced dataframe
crime.info()

In [None]:
# Confirm reduction meets amount in NSA query:
# difference between the two hard-coded row counts (before and after removal)
rows_before, rows_after = 281692, 277071
x = rows_before - rows_after
print(x)

In [None]:
# Data review

# 1. crime types - 2 fields
    # offence and MCI fields - from documentation, we know that offence is a non-standardized open field, whereas MCI is categorized
    # we will drop offence and keep MCI
    
# 2. neighbourhood - 2 fields
    # Hood_ID and Neighbourhood fields are duplicates, but should both be left in for flexibility in matching with neighbourhood polygons later
    # we will check unique values to ensure both have the same amount and keep both of these 
    
# 3. categorical fields - 4 fields
    # location_type, premises_type, MCI, and Neighbourhood fields are all categorical data as indicated in the documentation
    # location_type is too detailed for our purposes
    # we will delete location_type and convert the rest to categories - NOTE GPKG WOULD NOT ALLOW CATEGORIES
    
# 4. ids - 2 fields
    # event_unique_id and ObjectId fields seem to both be unique values, but they are different data types
    # documentation suggests that occurrences with multiple types of crimes will show up as multiple entries
    # therefore, one of these fields will have multiple entries and the other will have unique values - we will need to confirm differences, address them, and possibly convert to int

# 5. dates - 7 fields
    # occurrencedate, occurrenceyear, occurrencemonth, occurrenceday, occurrencedayofyear, occurrencedayofweek, and occurrencehour fields
    # occurrencedate is generalized and is sufficient for our purposes, we will delete the rest
    
# 6. location - 3 fields
    # we will keep geometry
    # we will check lat and long to see if there are outliers to remove, then we will delete them

In [None]:
# 1 - drop the offence and location_type fields (skipped silently if already absent)
cols2drop2 = ['location_type','offence']
crime.drop(columns=[name for name in crime.columns if name in cols2drop2], inplace=True)

# confirm removal - makes copying text for later steps easier
print(crime.columns.tolist())

In [None]:
# 2 - number of distinct values in each neighbourhood field (expected to agree)
count1, count2 = (crime[field].nunique() for field in ('Hood_ID', 'Neighbourhood'))
print(count1)
print(count2)

In [None]:
# they match and we will keep both

# NOTE gpkg will not accept categories so the rest of this cell has been hashed out

# 3 - convert location_type, premises_type, MCI, and neighbourhood fields to categorical data
#for c in ['premises_type', 'MCI', 'Neighbourhood']:
#    crime[c] = crime[c].astype('category')
    
# confirm conversion to categories
#crime.info()

In [None]:
# store Hood_ID as an integer column
crime['Hood_ID'] = crime['Hood_ID'].astype('int')

# confirm conversion to int
crime.info()

In [None]:
# 4 - we have 277071 rows; compare the number of distinct values in each id field against that
count3, count4 = (crime[field].nunique() for field in ('event_unique_id', 'ObjectId'))
print(count3)
print(count4)

In [None]:
# the event_unique_id field has duplicate values, whereas the ObjectId field has a unique value for every row
# as per documentation one event can have multiple crime types - for example, a single event could have both an assault and a theft
# each crime type should be counted separately, so ObjectId is kept
# and event_unique_id is removed (skipped silently if already absent)
cols2drop3 = ['event_unique_id']
crime.drop(columns=[name for name in crime.columns if name in cols2drop3], inplace=True)

# confirm removal - makes copying text for later steps easier
print(crime.columns.tolist())

In [None]:
# 5 - check dates fields
# occurrencedayofyear, occurrencedayofweek and occurrencehour are already removed by the
# first column-reduction cell, so selecting them unconditionally raises KeyError on a
# fresh run; preview only the date fields that are still present
date_fields = ['occurrencedate', 'occurrenceyear', 'occurrencemonth', 'occurrenceday', 'occurrencedayofyear', 'occurrencedayofweek', 'occurrencehour']
dates = crime[[name for name in date_fields if name in crime.columns]]
dates.head(10)

In [None]:
# occurrencedate is all that is required; the derived date-part columns can go
cols2drop4 = ['occurrenceyear', 'occurrencemonth', 'occurrenceday', 'occurrencedayofyear', 'occurrencedayofweek', 'occurrencehour']
crime.drop(columns=[name for name in crime.columns if name in cols2drop4], inplace=True)

# confirm removal - makes copying text for later steps easier
print(crime.columns.tolist())

In [None]:
# occurrence dates from the preview frame in ascending order (shows earliest and latest values)
dates.occurrencedate.sort_values()

In [None]:
# 6 - summary statistics of the coordinate fields, to look for outliers
coordinate_fields = ['Long', 'Lat']
location = crime[coordinate_fields]
location.describe()

In [None]:
# min in the lat field is showing as 0 which is definitely not in Toronto

# number of rows whose latitude is exactly 0
print(crime['Lat'].eq(0).sum())

# vs. number of non-null latitude values overall
print(crime['Lat'].count())

In [None]:
# keep only rows whose latitude is not 0
nonzero_lat = crime['Lat'].ne(0)
crime = crime.loc[nonzero_lat]

# check the coordinate summary again
location2 = crime[['Long', 'Lat']]
location2.describe()

In [None]:
# that fixed the problem
# the Long and Lat columns can now be removed (skipped silently if already absent)
cols2drop5 = ['Long', 'Lat']
crime.drop(columns=[name for name in crime.columns if name in cols2drop5], inplace=True)

# confirm removal - makes copying text for later steps easier
print(crime.columns.tolist())

### Enrich Crime Data

In [None]:
# let's have another look at the data: dtypes and non-null counts of the cleaned table
crime.info()

In [None]:
# 278 rows have been removed (count noted by the author from an earlier run; not recomputed here)
# another look - first 10 rows of the cleaned table
crime.head(10)

In [None]:
# as we are completing an analysis of the impact of COVID-19 lockdowns on crime, we add a column
# recording which lockdown (if any) each occurrence falls in
# from research we know that Toronto had three lockdowns as follows
# (first and last calendar day of each period, both treated as inclusive):

lockdown = {'ONE':['2020-03-23','2020-07-31'],
            'TWO':['2020-11-23','2021-03-08'],
            'THREE':['2021-04-08','2021-06-02']}

# parse the occurrence dates; infer_datetime_format is deprecated in pandas 2.x
# (strict format inference is the default there), so it is no longer passed
crime['occurrencedate'] = pd.to_datetime(crime.occurrencedate.values)

# create the label column up front so it exists even when no row matches a period
# and so that re-running this cell starts from a clean state
crime['lockdownNum'] = None

# reuse the column's timezone (None when tz-naive) so the bounds stay comparable
tz = crime['occurrencedate'].dt.tz

for k, (s,e) in lockdown.items():
    start = pd.Timestamp(s, tz=tz)
    # an end-date string compares as midnight, which would exclude occurrences later
    # on the final lockdown day; use an exclusive bound one day after instead
    end_exclusive = pd.Timestamp(e, tz=tz) + pd.Timedelta(days=1)
    in_period = (crime['occurrencedate'] >= start) & (crime['occurrencedate'] < end_exclusive)
    crime.loc[in_period, 'lockdownNum'] = k

crime.head()

In [None]:
# rows outside every lockdown have no label yet - mark them NONE (no lockdown)
has_label = crime['lockdownNum'].notna()
crime['lockdownNum'] = crime['lockdownNum'].where(has_label, 'NONE')

# check revised number of occurrences by lockdownNum
crime['lockdownNum'].value_counts()

In [None]:
# NOTE gpkg will not accept categories so this cell has been hashed out

# convert lockdownNum field to categorical data
#for c in ['lockdownNum']:
#    crime[c] = crime[c].astype('category')

# confirm changes
#crime.info()

In [None]:
# binary lockdown flag derived from lockdownNum: '1' = in one of the three lockdowns, '0' = not in lockdown
# (values are strings at this point)
lockdown_labels = ('ONE', 'TWO', 'THREE')
crime['lockdownBinary'] = ['1' if label in lockdown_labels else '0' for label in crime['lockdownNum']]

#confirm new column
crime.head()

In [None]:
# check datatypes of all columns, including the newly added lockdown fields
crime.info()

In [None]:
# store the lockdown flag as an integer column
crime['lockdownBinary'] = crime['lockdownBinary'].astype('int')

# confirm conversion to int
crime.info()

In [None]:
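# NOTE: disabled scratch code kept for reference - occurrenceyear and occurrencemonth
# were dropped from crime in an earlier cell, so this sort cannot run as written
# crime.sort_values(by=['occurrenceyear', 'occurrencemonth'])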

In [None]:
# random check of data frame
# a small, seeded sample is enough for an eyeball check: it is reproducible between runs
# and avoids copying half of the table (frac=0.5) just to display a few rows
crime.sample(n=min(10, len(crime)), random_state=42)

### Save Cleaned Crime Data

In [None]:
# have a look at data - first 10 rows of the table about to be saved
crime.head(10)

In [None]:
# write the cleaned crime table to data/clean as a gzip-compressed CSV
path = os.path.join('data','clean')
fn = 'crimeData.csv.gz'
print(f"Writing to: {fn}")

# create the output folder the first time it is needed
folder_exists = os.path.exists(path)
if not folder_exists:
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

target = os.path.join(path,fn)
crime.to_csv(target, index=False, compression="gzip")
print("Done.")

In [None]:
# write the cleaned crime table, with its geometry, to a GeoPackage under data/geo

# output location
path = os.path.join('data','geo')
fn = 'crimeData.gpkg'
print(f"Writing to: {fn}")

# create the output folder the first time it is needed
folder_exists = os.path.exists(path)
if not folder_exists:
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

target = os.path.join(path,fn)
crime.to_file(target, index=False, driver='GPKG')
print("Done.")

# COVID Cases

### Load COVID Data

In [None]:
# now the Covid case data:
# read the COVID-19 cases CSV from the open-data portal into a pandas dataframe

url = 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/64b54586-6180-4485-83eb-81e8fae3b8fe/resource/fff4ee65-3527-43be-9a8a-cb9401377dbc/download/COVID19%20cases.csv'
covidcases = pd.read_csv(url, low_memory=False)

# confirm load by reporting the table dimensions
n_rows, n_cols = covidcases.shape
print(f"Data frame is {n_rows:,} rows x {n_cols} columns")

In [None]:
# check dataframe: column dtypes and non-null counts of the raw COVID case table
covidcases.info()

In [None]:
# list every column name of the COVID case table (handy for copy/paste in later cells)
print(covidcases.columns.tolist())

In [None]:
# look at data - first rows of the raw COVID case table
covidcases.head()

### Reduce COVID Data

In [None]:
# drop columns we don't need
# the frame loaded above is named covidcases; `df` is only created further down when the
# rough file is reloaded, so referring to it here raised NameError on a fresh kernel and
# left the saved rough file unreduced
cols2drop6 = ['Outbreak Associated', 'Age Group', 'Source of Infection', 'Client Gender', 'Outcome', 'Currently Hospitalized', 
              'Currently in ICU', 'Currently Intubated', 'Ever Hospitalized', 'Ever in ICU', 'Ever Intubated']

# remove, in place, whichever of those columns are actually present
covidcases.drop(columns=[name for name in covidcases.columns if name in cols2drop6], inplace=True)

# confirm reduced dataframe
covidcases.info()

In [None]:
# write the COVID case table to data/rough as a gzip-compressed CSV
path = os.path.join('data','rough')
fn = 'covidData.csv.gz'
print(f"Writing to: {fn}")

# create the output folder the first time it is needed
folder_exists = os.path.exists(path)
if not folder_exists:
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

target = os.path.join(path,fn)
covidcases.to_csv(target, index=False, compression="gzip")
print("Done.")

In [None]:
# read the rough COVID case file back from data/rough into `df`
path = os.path.join('data','rough')
fn = 'covidData.csv.gz'
rough_file = os.path.join(path,fn)
df = pd.read_csv(rough_file, compression='gzip', low_memory=False)

# have a look at data
df.head(10)

In [None]:
# list the column names of the reloaded COVID table (handy for copy/paste in later cells)
print(df.columns.tolist())

### Clean COVID Data

In [None]:
# missing values per column, largest first, to spot problem fields
df.isnull().sum().sort_values(ascending=False)

In [None]:
# missing values per row, largest first
df.isnull().sum(axis='columns').sort_values(ascending=False)

In [None]:
# many null values
# look at the remaining columns in more detail to see whether more can be dropped
review_fields = ['_id', 'Assigned_ID', 'Neighbourhood Name', 'FSA', 'Classification', 'Episode Date', 'Reported Date']
df[review_fields].head()

In [None]:
# Data review
# 1. ids - 2 fields
    # the _id and Assigned_ID fields seem to be identical and have the same amount
    # however, documentation suggests Assigned_ID is from Toronto Public Health, not the database, and cases can disappear
    # based on this, we will delete Assigned_ID

# 2. Locations - 2 fields
    # the Neighbourhood Name and Forward Sortation Area (FSA) fields both can be used to geolocate the cases
    # documentation suggests that the FSA field and Census Tracts (CTs) were used to determine the Neighbourhood Name
    # documentation also mentions that neighbourhood information is missing for cases with missing postalcodes
    # for our purposes aggregate numbers are okay and we don't need to geolocate so we will drop both of these fields 

# 3. Status - 1 field
    # According to the documentation, the Classification field classifies cases as either confirmed or probable
    # as with above, for our purposes, aggregate numbers are okay, we will delete probable cases and keep confirmed
    
# 4. dates - 2 fields
    # we do not need both of these dates, so we will keep the reported date as an official record and drop the episode date
    # change reported date to reported_date for ease
    
# drop the id, location and episode-date fields identified in the review above
cols2drop7 = ['Assigned_ID', 'Neighbourhood Name', 'FSA', 'Episode Date']
df.drop(columns=[name for name in df.columns if name in cols2drop7], inplace=True)

# confirm reduced dataframe
df.info()

In [None]:
# list the remaining column names (handy for copy/paste in later cells)
print(df.columns.tolist())

In [None]:
# missing values per column again, now that the sparse fields are gone
df.isnull().sum().sort_values(ascending=False)

In [None]:
# missing values per row again, largest first
df.isnull().sum(axis='columns').sort_values(ascending=False)

In [None]:
# no more nulls - perfect!
# number of cases in each Classification value
df['Classification'].value_counts()

In [None]:
# keep every case whose Classification is not PROBABLE (i.e. the CONFIRMED cases)
not_probable = df['Classification'].ne('PROBABLE')
df = df.loc[not_probable]

# confirm reduced dataframe
df.info()

In [None]:
# Classification counts again, after removing PROBABLE cases
df['Classification'].value_counts()

In [None]:
# give the reported date field a name without a space
new_names = {'Reported Date': 'ReportedDate'}
df.rename(columns=new_names, inplace=True)

# confirm renamed column
df.head()

In [None]:
# summary count of all confirmed cases per reported day, busiest days first
casesperday = (
    df.groupby(['ReportedDate'])
      .size()
      .reset_index(name='CasesPerDay')
      .sort_values(by=['CasesPerDay'], ascending=False)
)
casesperday

### Save Cleaned COVID Data

In [None]:
# write the per-day confirmed case counts to data/clean as a gzip-compressed CSV
path = os.path.join('data','clean')
fn = 'covidData.csv.gz'
print(f"Writing to: {fn}")

# create the output folder the first time it is needed
folder_exists = os.path.exists(path)
if not folder_exists:
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

target = os.path.join(path,fn)
casesperday.to_csv(target, index=False, compression="gzip")
print("Done.")

# Key Dates

### Load Key Dates Data

In [None]:
# now the key dates:
# read the local key dates CSV from data/rough into a pandas dataframe

path = os.path.join('data','rough')
fn = 'keyDates.csv'
key_dates_file = os.path.join(path,fn)
kd = pd.read_csv(key_dates_file, low_memory=False)

# have a look at data
kd.head(10)

In [None]:
# check dataframe: column dtypes and non-null counts of the key dates table
kd.info()

In [None]:
# Data review

# 1. dates - 3 fields
    # Date field - refers to date of event (see keydates) - will leave as object
    # LockdownDates - is binary - refers to the dates the lockdowns began and ended - will be used for visualization - will leave as int
    # KeyDates - is binary - refers to key dates for headlines in the visualization - will leave as int
    
# 2. Event - 1 field
    # can be used as headlines or scrolling tickers for the visualization
    
# 3. categorical fields - 1 field
    # Lockdown field - represents which lockdown
    # we will convert these to categories

### Clean Key Dates

In [None]:
# 3 - store the Lockdown field as categorical data
kd['Lockdown'] = kd['Lockdown'].astype('category')

# confirm conversion to categories
kd.info()

### Save Cleaned Key Dates

In [None]:
# write the cleaned key dates table to data/clean as a plain CSV
path = os.path.join('data','clean')
fn = 'keyDates.csv'
print(f"Writing to: {fn}")

# create the output folder the first time it is needed
folder_exists = os.path.exists(path)
if not folder_exists:
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

target = os.path.join(path,fn)
kd.to_csv(target, index=False)
print("Done.")

# Neighbourhood Boundaries

### Load Neighbourhood data

In [None]:
# now the neighbourhood boundaries:
# read the neighbourhoods GeoJSON from the open-data portal into a GeoDataFrame
nbs_url = 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/4def3f65-2a65-4a4f-83c4-b2a4aed72d46/resource/9ce32bd1-91ac-4422-925a-bdc256702756/download/Neighbourhoods%20-%20historical%20140.geojson'
tor_nbs = gpd.read_file(nbs_url)

# confirm load
tor_nbs.head()

In [None]:
# check data: column dtypes and non-null counts of the neighbourhood table
tor_nbs.info()

In [None]:
# list every column name of the neighbourhood table (handy for copy/paste in later cells)
print(tor_nbs.columns.tolist())

### Reduce Neighbourhood data

In [None]:
# only the fields needed to match neighbourhood geography to the Hood_ID and Neighbourhood
# columns of the crimeData table are kept; the rest are removed in place
cols2drop8 = ['AREA_ID', 'AREA_ATTR_ID', 'PARENT_AREA_ID', 'AREA_DESC', 'CLASSIFICATION', 'CLASSIFICATION_CODE', 'OBJECTID']
tor_nbs.drop(columns=[name for name in tor_nbs.columns if name in cols2drop8], inplace=True)

# confirm reduced dataframe
tor_nbs.info()

### Clean Neighbourhood data

In [None]:
# data review

# 1 - area name fields
# we will rename this column to Neighbourhood to match our data

# 2 - area code fields
# we need to confirm which of short_code or long_code is more accurate, then delete the other
# we will then rename the column to Hood_ID to match our data

In [None]:
# 1 - call the area name column Neighbourhood, matching the crime data
name_mapping = {'AREA_NAME': 'Neighbourhood'}
tor_nbs.rename(columns=name_mapping, inplace=True)

# check to confirm
tor_nbs.head()

In [None]:
# 2 - area code fields
# decide which of the short and long codes to keep (the other is deleted, and the
# survivor renamed to Hood_ID to match our data) - start by comparing distinct counts
count5, count6 = (tor_nbs[field].nunique() for field in ('AREA_SHORT_CODE', 'AREA_LONG_CODE'))
print(count5)
print(count6)

In [None]:
# they seem to be identical - one more check: summary of the short code field
tor_nbs['AREA_SHORT_CODE'].describe()

In [None]:
# and... the same summary for the long code field
tor_nbs['AREA_LONG_CODE'].describe()

In [None]:
# confirmed, they are identical - remove AREA_LONG_CODE (skipped silently if already absent)...
cols2drop9 = ['AREA_LONG_CODE']
tor_nbs.drop(columns=[name for name in tor_nbs.columns if name in cols2drop9], inplace=True)

# ... and call AREA_SHORT_CODE Hood_ID instead
code_mapping = {'AREA_SHORT_CODE': 'Hood_ID'}
tor_nbs.rename(columns=code_mapping, inplace=True)

# confirm changes
tor_nbs.head()

In [None]:
# check plot: report CRS and bounding box, then draw the neighbourhood outlines
print(tor_nbs.geometry.crs)
print(tor_nbs.total_bounds)

outline_style = dict(figsize=(18,14), edgecolor='red', facecolor='none', linewidth=1, alpha=0.75)
ax = tor_nbs.plot(**outline_style)

In [None]:
# now the crime layer on its own

# report CRS and bounding box, then draw the crime locations
print(crime.geometry.crs)
print(crime.total_bounds)

point_style = dict(figsize=(18,14), marker='*', color='green', markersize=3)
crime.plot(**point_style)

In [None]:
# base layer: toronto neighbourhood polygons
fig_size = (18,14)
base = tor_nbs.plot(figsize=fig_size, color='white', edgecolor='black')

# overlay: crime locations drawn on the same axes
crime.plot(ax=base, figsize=fig_size, marker='o', color='red', markersize=3);

In [None]:
# this looks good!
# NOTE - it appears that some dots along the edges are outside toronto
    # HOWEVER - documentation notes crime locations are moved to the closest intersection to maintain anonymity
    # AS A RESULT - All included points have been verified as within an identified Toronto Neighbourhood, but may not appear so visually

### Save Neighbourhood data

In [None]:
# write the neighbourhood boundaries to a GeoPackage under data/geo

# output location
path = os.path.join('data','geo')
fn = 'tor_nbs.gpkg'

# create the output folder the first time it is needed
folder_exists = os.path.exists(path)
if not folder_exists:
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

print(f"Using '{fn}' as basis for saving data...")

target = os.path.join(path,fn)
tor_nbs.to_file(target, driver='GPKG')
print("Done.")