# Machine learning

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import folium
from sklearn import tree
import pydotplus
from IPython.display import Image

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import model_selection
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
# low_memory=False makes pandas parse the file in a single pass, so every column
# gets one inferred dtype. The default chunked parsing can leave columns with
# mixed types and emits a DtypeWarning when that happens.
data = pd.read_csv("Fire.csv", low_memory=False)

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
# `data` stays bound to the raw load; `df` is the name the following cells work on.
df = data

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Data cleaning
# Restrict the calls to the call types and neighborhoods that the analysis covers.

# The 14 call types in focus
focuscalls = [
    'Medical Incident',
    'Structure Fire',
    'Alarms',
    'Traffic Collision',
    'Other',
    'Citizen Assist / Service Call',
    'Outside Fire',
    'Water Rescue',
    'Vehicle Fire',
    'Gas Leak (Natural and LP Gases)',
    'Electrical Hazard',
    'Elevator / Escalator Rescue',
    'Odor (Strange / Unknown)',
    'Smoke Investigation (Outside)',
]

# The neighborhoods to keep
neighborhoods = [
    'Pacific Heights', 'Tenderloin', 'Mission', 'Marina',
    'Bayview Hunters Point', 'Inner Sunset', 'Outer Richmond',
    'Financial District/South Beach', 'Sunset/Parkside',
    'Outer Mission', 'Lone Mountain/USF', 'Western Addition',
    'Nob Hill', 'Hayes Valley', 'Oceanview/Merced/Ingleside',
    'Seacliff', 'West of Twin Peaks', 'Mission Bay', 'South of Market',
    'Potrero Hill', 'Treasure Island', 'Noe Valley', 'Chinatown',
    'Inner Richmond', 'Portola', 'Russian Hill', 'Bernal Heights',
    'Golden Gate Park', 'North Beach', 'Excelsior', 'Lakeshore',
    'Haight Ashbury', 'Visitacion Valley', 'Presidio', 'Twin Peaks',
    'Japantown', 'Castro/Upper Market', 'Presidio Heights',
    'McLaren Park', 'Lincoln Park', 'Glen Park',
]

# A row survives only if both its call type and its neighborhood are listed above.
# (The triple "o" in the neighborhood column name is how the column is spelled.)
is_focus_call = df['Call Type'].isin(focuscalls)
in_listed_neighborhood = df['Neighborhooods - Analysis Boundaries'].isin(neighborhoods)
df = df[is_focus_call & in_listed_neighborhood]

In [4]:
# Parse the call timestamps and derive calendar features from them.

# Layout of 'Received DtTm': month/day/year followed by a 12-hour clock time with AM/PM.
# (Named DATETIME_FORMAT rather than `format` so the builtin format() is not shadowed.)
DATETIME_FORMAT = '%m/%d/%Y %I:%M:%S %p'

# assign() returns a new frame, so the parsed column is not written into a
# filtered slice of an earlier frame (which triggers SettingWithCopyWarning).
df = df.assign(Datetime=pd.to_datetime(df['Received DtTm'], format=DATETIME_FORMAT))
df = df.set_index(pd.DatetimeIndex(df['Datetime']))

# Create column in dataframe with hour (0-23)
df['Hour'] = df.index.hour

# Create column in dataframe with hour of the week.
# dayofweek counts Monday as 0 and one is added to the hour, so the value is 1-based.
df['HourOfWeek'] = df['Datetime'].dt.dayofweek * 24 + (df['Datetime'].dt.hour + 1)

# Create column in dataframe with month
df['Month'] = df.index.month

# Create column in dataframe with (ISO) week number.
# DatetimeIndex.week was deprecated and then removed in pandas 2.0;
# isocalendar().week yields the same week numbers. The resulting column uses the
# nullable UInt32 dtype, so a missing timestamp shows up as <NA>.
df['Week'] = df['Datetime'].dt.isocalendar().week

# Create column in dataframe with year
df['Year'] = df.index.year

# Create column in dataframe with weekday (Monday=0 ... Sunday=6)
df['Weekday'] = df.index.weekday

In [5]:
# Load weather data
df_weather = pd.read_csv('weather_data.csv')

# Index the weather observations by their parsed 'date' timestamp.
# The trailing 'Z' is matched as a literal character, so the index is timezone-naive.
df_weather = df_weather.set_index(pd.to_datetime(df_weather['date'], format='%Y-%m-%dT%H:%M:%S.%fZ'))

# The timestamp now lives in the index, so the original column is dropped.
# columns= is used because pandas 2.0 no longer accepts the axis as a positional argument.
df_weather = df_weather.drop(columns=['date'])

In [6]:
# Take 10.000 datapoints from each call type, such that the dataset is balanced.

SAMPLES_PER_CALL_TYPE = 10000
# Fixed seed: without it every run draws different rows, so the trained model
# and its reported accuracy cannot be reproduced.
SAMPLING_SEED = 1


def sample_call_type(calls, call_type, n=SAMPLES_PER_CALL_TYPE, seed=SAMPLING_SEED):
    """Draw a reproducible random sample of the calls of one call type.

    Parameters
    ----------
    calls : pandas.DataFrame
        Frame with a 'Call Type' column.
    call_type : str
        Value of 'Call Type' to select.
    n : int
        Number of rows to draw (without replacement).
    seed : int
        Seed passed to ``DataFrame.sample`` as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows of ``calls`` whose 'Call Type' equals ``call_type``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when fewer than ``n`` matching rows exist.
    """
    return calls[calls['Call Type'] == call_type].sample(n, random_state=seed)


MedI = sample_call_type(df, 'Medical Incident')
Ala = sample_call_type(df, 'Alarms')
StrF = sample_call_type(df, 'Structure Fire')
TraC = sample_call_type(df, 'Traffic Collision')
CitA = sample_call_type(df, 'Citizen Assist / Service Call')
Oth = sample_call_type(df, 'Other')
OutF = sample_call_type(df, 'Outside Fire')
WatR = sample_call_type(df, 'Water Rescue')
VehF = sample_call_type(df, 'Vehicle Fire')
GasL = sample_call_type(df, 'Gas Leak (Natural and LP Gases)')
EleH = sample_call_type(df, 'Electrical Hazard')
Ele = sample_call_type(df, 'Elevator / Escalator Rescue')
Odor = sample_call_type(df, 'Odor (Strange / Unknown)')
SmoI = sample_call_type(df, 'Smoke Investigation (Outside)')

In [7]:
# Stack the per-call-type samples chosen for modelling into a single dataframe.
#
# To use all 14 call types instead, select:
#   [MedI, Ala, StrF, TraC, CitA, Oth, OutF, WatR, VehF, GasL, EleH, Ele, Odor, SmoI]
#
# Current selection: the two call types Water Rescue and Electrical Hazard.
selected_samples = [WatR, EleH]
df_fil = pd.concat(selected_samples)

In [8]:
# Join weather and fire department data

# Round each call timestamp to the nearest full hour so it can be matched against
# the weather index. The lowercase 'h' alias is used because the uppercase 'H'
# alias is deprecated since pandas 2.2.
df_fil.index = df_fil.index.round('h')

# Inner join on the index: calls with no weather row for their hour are dropped.
# NOTE(review): the weather timestamps end in 'Z' (conventionally UTC) - confirm
# that the call timestamps are in the same timezone before trusting the match.
df_merged = pd.merge(df_fil, df_weather, how='inner', left_index=True, right_index=True)

In [13]:
# Pick the attribute to predict and the attributes the prediction is based on.
# Here: predict the call type from neighborhood, month and weather.
# Other attributes that have been used for our decision trees:
# 'Final Priority', 'Year', 'Month', 'Week', 'Hour', 'Weekday', 'HourOfWeek'
target_column = 'Call Type'
feature_columns = ['Neighborhooods - Analysis Boundaries', 'Month', 'weather']

df_new = df_merged[[target_column] + feature_columns]
df1 = df_new.dropna()  # default how='any': drop rows with any missing value

# Convert categorical variables into dummy/indicator variables.
# Only the predictor attributes are encoded, never the target.
# get_dummies leaves numeric columns as they are; 'Month' is presumably one of those.
joindata = pd.get_dummies(df1[feature_columns])

# Split data into training and test set (33% held out, fixed seed)
X_trainw, X_testw, y_trainw, y_testw = model_selection.train_test_split(
    joindata,
    df1[target_column],
    test_size=0.33,
    random_state=1,
)

In [14]:
# Make decision tree
# random_state is fixed because the tree builder permutes the features at each
# split; without a seed, repeated fits on the same data can give different trees
# and therefore a different score.
clfw = DecisionTreeClassifier(random_state=1)
clfw = clfw.fit(X_trainw, y_trainw)

# Evaluate the classifier: accuracy = share of test rows whose call type is predicted correctly
predicted = clfw.predict(X_testw)
acc = accuracy_score(y_testw, predicted)
print(acc)

0.8990030342436064
