In [1]:
import pandas as pd
import geopandas as gpd
import json

import bokeh
from bokeh import plotting
from bokeh.models import FactorRange
from bokeh.palettes import Category20c
from bokeh.layouts import column, row, widgetbox
from bokeh.io import save, show, output_file, output_notebook, reset_output, export_png
from bokeh.plotting import figure
from bokeh.io.doc import curdoc
from bokeh.models import (
    GeoJSONDataSource, ColumnDataSource, ColorBar, Slider, Spacer,
    HoverTool, TapTool, Panel, Tabs, Legend, Toggle, LegendItem, Button, Select, CategoricalColorMapper
)
from bokeh.palettes import brewer
from bokeh.models.widgets import Div
from matplotlib import pyplot as plt
from matplotlib.colors import rgb2hex

import warnings
from pandas.core.common import SettingWithCopyWarning

# Data preparation

### Filter data set
We start out by doing an overall "cleanup" of the data set. First we add a datetime column with a proper format, and then we remove all entries that are not part of the focuscalls defined below. Finally, we remove all unnecessary columns. Please note that the data set must be downloaded in order for the following code to run. It can be found here: https://data.sfgov.org/Public-Safety/Fire-Department-Calls-for-Service/nuek-vuh3

In [2]:
# Load the data set.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of types.
# NOTE(review): the output recorded for this cell looks like the tail of a pandas
# mixed-dtype warning -- confirm it disappears with this setting.
df = pd.read_csv('Fire.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Parse the 'Received DtTm' strings into timestamps, store them in a new
# 'Datetime' column and use that column (kept in the frame) as the index.
# NOTE: the module-level name `format` shadows Python's built-in `format`.
format = '%m/%d/%Y %I:%M:%S %p'
received_at = pd.to_datetime(df['Received DtTm'], format=format)
df['Datetime'] = received_at
df = df.set_index('Datetime', drop=False)

In [4]:
# Call types kept for the analysis; rows with any other 'Call Type' are filtered out.
focuscalls = [
    'Medical Incident',
    'Structure Fire',
    'Alarms',
    'Traffic Collision',
    'Citizen Assist / Service Call',
    'Outside Fire',
    'Water Rescue',
    'Vehicle Fire',
    'Gas Leak (Natural and LP Gases)',
    'Electrical Hazard',
    'Elevator / Escalator Rescue',
    'Odor (Strange / Unknown)',
    'Smoke Investigation (Outside)',
    'Other',
]

In [5]:
# Define neighborhoods: every distinct, non-missing neighborhood name in the data.
# unique() returns values in order of first appearance, so a missing value (NaN)
# is not guaranteed to be the last element; it is therefore removed explicitly
# with dropna() rather than by slicing off the final entry.
neighborhoods = df['Neighborhooods - Analysis Boundaries'].dropna().unique()

In [6]:
# District names, one per line.
districts = [
    'Central',
    'Southern',
    'Bayview',
    'Mission',
    'Park',
    'Richmond',
    'Ingleside',
    'Taraval',
    'Northern',
    'Tenderloin',
]

In [7]:
# Filter the data set: keep only the focus call types that fall in one of the
# selected neighborhoods, then keep only the columns needed downstream.
# The bare `df.dropna(how='any')` calls that used to sit here discarded their
# result, so they never removed any rows; they are dropped without changing
# which rows survive.
df = df[df['Call Type'].isin(focuscalls)]
df = df[df['Neighborhooods - Analysis Boundaries'].isin(neighborhoods)]

cols = ['Call Number', 'Call Type', 'Final Priority', 'Call Type Group',
        'Neighborhooods - Analysis Boundaries',  'Location', 'Datetime', 'Supervisor Districts' ]
# Take an explicit copy so that adding 'year' writes to an independent frame
# instead of a slice of `df` (which raises SettingWithCopyWarning).
df_fil = df[cols].copy()
df_fil['year'] = df_fil.index.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


### Dataframes for choropleth map
In this section we make 5 different dataframes for plotting on the choropleth map. Having individual dataframes is very helpful in this case, as we will be assigning e.g. bins and colors based on the values in the dataframes. 

The five dataframes contain the following:
1. Number of calls in each neighborhood
2. Number of high and low priority calls in each neighborhood
3. Number of calls in each district
4. Number of high and low priority calls in each district
5. Number of calls and number of calls per 100 residents in each neighborhood

Furthermore, all the dataframes except number 5 have a "year" column such that we can make a slider that slides through the years.

In [8]:
# Create dataframe containing all calls grouped by neighborhood.
# Counting the 'Call Number' column directly and naming the result in
# reset_index() yields the flat columns year / Neighborhood / Calls without
# set_axis(..., inplace=...), whose `inplace` keyword was removed in pandas 2.0.
nhood_all = (
    df_fil
    .groupby(['year', 'Neighborhooods - Analysis Boundaries'])['Call Number']
    .count()
    .reset_index(name='Calls')
    .rename(columns={'Neighborhooods - Analysis Boundaries': 'Neighborhood'})
)

# save and show dataframe
nhood_all.to_csv('nhood_all.csv', index=False)
nhood_all.head()

Unnamed: 0,year,Neighborhood,Calls
0,2000,Bayview Hunters Point,10102
1,2000,Bernal Heights,3119
2,2000,Castro/Upper Market,3860
3,2000,Chinatown,3243
4,2000,Excelsior,3127


In [9]:
# Create dataframe containing all calls grouped by neighborhood and priority.
# Counting the 'Call Number' column directly and naming the result in
# reset_index() yields the flat columns year / Neighborhood / Priority / Calls
# without set_axis(..., inplace=...), whose `inplace` keyword was removed in
# pandas 2.0.
nhood_priority = (
    df_fil
    .groupby(['year', 'Neighborhooods - Analysis Boundaries', 'Final Priority'])['Call Number']
    .count()
    .reset_index(name='Calls')
    .rename(columns={
        'Neighborhooods - Analysis Boundaries': 'Neighborhood',
        'Final Priority': 'Priority',
    })
)

# save and show dataframe
nhood_priority.to_csv('nhood_priority.csv', index=False)
nhood_priority.head()

Unnamed: 0,year,Neighborhood,Priority,Calls
0,2000,Bayview Hunters Point,2,1199
1,2000,Bayview Hunters Point,3,8903
2,2000,Bernal Heights,2,440
3,2000,Bernal Heights,3,2679
4,2000,Castro/Upper Market,2,635


In [10]:
# Create dataframe containing all calls grouped by district.
# Counting the 'Call Number' column directly and naming the result in
# reset_index() yields the flat columns year / District / Calls without
# set_axis(..., inplace=...), whose `inplace` keyword was removed in pandas 2.0.
district_all = (
    df_fil
    .groupby(['year', 'Supervisor Districts'])['Call Number']
    .count()
    .reset_index(name='Calls')
    .rename(columns={'Supervisor Districts': 'District'})
    # Store the district identifier as a generic object column rather than a number.
    .astype({'District': 'object'})
)

# save and show dataframe
district_all.to_csv('district_all.csv', index=False)
district_all.head()

Unnamed: 0,year,District,Calls
0,2000,1,9814
1,2000,2,7684
2,2000,3,5606
3,2000,4,7567
4,2000,5,10586


In [11]:
# Create dataframe containing all calls grouped by district and priority.
# Counting the 'Call Number' column directly and naming the result in
# reset_index() yields the flat columns year / District / Priority / Calls
# without set_axis(..., inplace=...), whose `inplace` keyword was removed in
# pandas 2.0.
district_priority = (
    df_fil
    .groupby(['year', 'Supervisor Districts', 'Final Priority'])['Call Number']
    .count()
    .reset_index(name='Calls')
    .rename(columns={
        'Supervisor Districts': 'District',
        'Final Priority': 'Priority',
    })
    # Store the district identifier as a generic object column rather than a number.
    .astype({'District': 'object'})
)

# save and show dataframe
district_priority.to_csv('district_priority.csv', index=False)
district_priority.head()

Unnamed: 0,year,District,Priority,Calls
0,2000,1,2,1038
1,2000,1,3,8776
2,2000,2,2,955
3,2000,2,3,6729
4,2000,3,2,743


Finally, we make a dataframe containing the number of calls per 100 residents in each neighborhood. Since the population/income data we use is based on the years 2012-2016, we choose to look only at calls from 2015. We use data from: https://default.sfplanning.org/publications_reports/SF_NGBD_SocioEconomic_Profiles/2012-2016_ACS_Profile_Neighborhoods_Final.pdf

 

In [12]:
# Hand-entered per-neighborhood figures: income per capita, population and
# number of households. The four lists are parallel (same neighborhood order).
_nhood_names = [
    'Bayview Hunters Point', 'Bernal Heights', 'Castro/Upper Market', 'Chinatown',
    'Excelsior', 'Financial District/South Beach', 'Glen Park', 'Golden Gate Park',
    'Haight Ashbury', 'Hayes Valley', 'Inner Richmond', 'Inner Sunset',
    'Japantown', 'Lakeshore', 'Lincoln Park', 'Lone Mountain/USF',
    'Marina', 'McLaren Park', 'Mission', 'Mission Bay',
    'Nob Hill', 'Noe Valley', 'North Beach', 'Oceanview/Merced/Ingleside',
    'Outer Mission', 'Outer Richmond', 'Pacific Heights', 'Portola',
    'Potrero Hill', 'Presidio', 'Presidio Heights', 'Russian Hill',
    'Seacliff', 'South of Market', 'Sunset/Parkside', 'Tenderloin',
    'Treasure Island', 'Twin Peaks', 'Visitacion Valley', 'West of Twin Peaks',
    'Western Addition',
]
_income_per_capita = [
    24817, 53243, 94317, 24653,
    28057, 114083, 72039, 108439,
    81392, 61210, 56925, 63133,
    68352, 22570, 43922, 50860,
    98411, 15387, 53196, 70287,
    58623, 91014, 60254, 26413,
    32582, 44745, 102141, 29659,
    84521, 86967, 88517, 91854,
    117489, 54202, 42430, 27946,
    15886, 64279, 20942, 67869,
    51264,
]
_population = [
    37600, 26140, 21090, 14820,
    39340, 17460, 8210, 90,
    18050, 18250, 22500, 29120,
    3650, 14300, 320, 18070,
    25110, 850, 58640, 10530,
    22300, 18650, 12600, 28010,
    24270, 44870, 24070, 16410,
    13770, 3830, 10720, 17830,
    2460, 19180, 81050, 28220,
    3090, 7410, 18570, 38180,
    22220,
]
_households = [
    11310, 9120, 11160, 6840,
    10970, 10030, 3710, 80,
    8360, 9090, 9510, 12240,
    2280, 4820, 70, 6520,
    13880, 290, 24340, 5190,
    13330, 8810, 6370, 7960,
    6740, 18450, 13440, 4820,
    6070, 1310, 4830, 9920,
    900, 10270, 28140, 17120,
    600, 3540, 4960, 13890,
    11230,
]

Income_Pop = pd.DataFrame({
    'Neighborhood': pd.Categorical(_nhood_names),
    'N_Income_capita$': _income_per_capita,
    'N_ population': _population,
    'N_Households': _households,
})

In [13]:
# Attach the per-neighborhood figures from Income_Pop to every call by joining
# on the neighborhood name, then expose that name as a regular column as well.
calls_by_nhood = df_fil.set_index('Neighborhooods - Analysis Boundaries')
figures_by_nhood = Income_Pop.set_index('Neighborhood')
df_com = calls_by_nhood.join(figures_by_nhood)
df_com['Neighborhoods'] = df_com.index

In [14]:
# Create dataframe containing all calls and calls per 100 residents grouped by neighborhood.
# NOTE: this rebinds `df_fil` to the 2015-only rows of the joined frame, so
# re-running the earlier aggregation cells after this one would see this subset.
df_fil = df_com[df_com['year'] == 2015]

# Count calls per (neighborhood, population) pair. Naming the count in
# reset_index() avoids set_axis(..., inplace=...), whose `inplace` keyword was
# removed in pandas 2.0.
nhood_pop = (
    df_fil
    .groupby(['Neighborhoods', 'N_ population'])['Call Number']
    .count()
    .reset_index(name='Calls')
    .rename(columns={
        'Neighborhoods': 'Neighborhood',
        'N_ population': 'Population',
    })
)

# Kept from the original cell: `neighborhoods` now holds the names present in nhood_pop.
neighborhoods = nhood_pop['Neighborhood'].unique()

# Calculate calls per 100 residents row by row as a column expression, so each
# neighborhood is always divided by its own population, then truncate to int32.
nhood_pop['Calls_pr_100'] = (
    nhood_pop['Calls'] / nhood_pop['Population'] * 100
).astype('int32')

# save and show dataframe
nhood_pop.to_csv('nhood_pop.csv', index=False)
nhood_pop.head()

Unnamed: 0,Neighborhood,Population,Calls,Calls_pr_100
0,Bayview Hunters Point,37600,14615,38
1,Bernal Heights,26140,4946,18
2,Castro/Upper Market,21090,7422,35
3,Chinatown,14820,6106,41
4,Excelsior,39340,5718,14


### Dataframe for histograms
Next, we create the dataframe that we will be using for our interactive histograms. The dataframe will simply contain the call type, neighborhood, year, month, day and hour of all entries from the original dataframe. 

In [15]:
# Build the histogram data: call type, neighborhood and the year / month /
# weekday / hour extracted from the datetime index of each call.
# The global filter is kept so the warning state seen by any later cell is unchanged.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
cols = ['Call Type', 'Neighborhooods - Analysis Boundaries', 'Datetime']
# Explicit copy: the new columns are written to an independent frame, not a slice of `df`.
df_hist = df[cols].copy()
df_hist['year'] = df_hist.index.year
df_hist['month'] = df_hist.index.month
df_hist['day'] = df_hist.index.weekday
df_hist['hour'] = df_hist.index.hour
df_hist = df_hist.drop(columns='Datetime')

# Take a sample of dataframe, as csv file will otherwise be too large to upload to github.
# The sample is seeded so the exported file is reproducible, and its size is
# capped at the number of available rows so sampling cannot fail on a smaller data set.
hist_sample_size = min(700000, len(df_hist))
df_hist = df_hist.sample(n=hist_sample_size, random_state=42)

# Save and show dataframe
df_hist.to_csv('histdata.csv', index=False)
df_hist.head()

Unnamed: 0_level_0,Call Type,Neighborhooods - Analysis Boundaries,year,month,day,hour
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-09-09 17:44:43,Medical Incident,Outer Richmond,2010,9,3,17
2007-03-07 06:22:56,Vehicle Fire,Lakeshore,2007,3,2,6
2016-07-03 09:48:07,Medical Incident,Mission,2016,7,6,9
2002-02-19 05:06:10,Medical Incident,Visitacion Valley,2002,2,1,5
2003-10-26 17:14:20,Alarms,South of Market,2003,10,6,17
