# Jackson Gawarecki
## Preliminary Model for Semester Project (Decision Tree Regression)
### 3/5/24

In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv("Fire_Open_Data.csv", parse_dates=['DATE', 'CREATE', 'DISPATCH', 'ENROUTE', 'ARRIVE', 'CLEAR'])

###### Note: Priority is a scale 1-10 with 1 being the most important

In [55]:
#Create Dummy Variable for the Agency Names
dummy_agency = pd.get_dummies(df['AGENCY_NAME'], prefix="Agency")
# Concat original dummy vars
df = pd.concat([df, dummy_agency], axis=1)

In [56]:
# Drop OG column
df= df.drop('AGENCY_NAME', axis=1)

In [57]:
event_type_counts = df['EVENT TYPE'].value_counts()

# Individually Print the count of each entry in the column

print("Count of each entry in the 'EVENT TYPE' column:")
print(event_type_counts)

Count of each entry in the 'EVENT TYPE' column:
MEDICAL--MEDICAL - MED_CALL               88178
ALARM--Fire Alarm Sounding-Commercial     14278
ASSIST--Assist EMS - Code 3                6821
ALARM--Fire Alarm Sounding-Residential     6578
MEDICAL--MEDICAL - CPR                     4954
                                          ...  
SEARCH--Search-Missing Person                 3
STEAM_RUPTURE                                 1
CHEMICAL--Chemical Spill/Fire                 1
MEDICAL--Medical - Charlie                    1
AIRCRAFT--Aircraft has Crashed                1
Name: EVENT TYPE, Length: 93, dtype: int64


In [58]:
# Group the Event types into the major categories (Reduced from 100+ unique)

df['EVENT TYPE'] = df['EVENT TYPE'].str.split('-').str[0].str.strip()

In [59]:
# Drop all events where there are less than 1000 occurances

values_to_drop = ['Arson Investigation', 'SPECIAL', 'GENERAL HAZARD', 'ACTIVE AGGRESSOR', 'SEARCH',
                  'AIRCRAFT', 'WEATHER', 'STEAM_RUPTURE']
df = df[~df['EVENT TYPE'].isin(values_to_drop)]

In [60]:
#Convert the event types to dummy variables

dummy_event = pd.get_dummies(df['EVENT TYPE'], prefix="ET")
# Concat original dummy vars
df = pd.concat([df, dummy_event], axis=1)

In [61]:
df= df.drop('EVENT TYPE', axis=1)

In [62]:
df = df.rename(columns={'HOUR OF': 'HOUR_OF'})

In [63]:
#Dropping the unused Columns (potential for other models)
df.drop(['LOCATION'], axis=1, inplace=True)
df.drop(['FD EVENT NUMBER'], axis=1, inplace=True)
df.drop(['ZIP_CODE'], axis=1, inplace=True)
df.drop(['ENROUTE'], axis=1, inplace=True)
df.drop(['CREATE'], axis=1, inplace=True)
df.drop(['CLEAR'], axis=1, inplace=True)

In [64]:
# adding a column for total response time (disbatch to arrive)
df['Response_Time'] = (df['ARRIVE'] - df['DISPATCH']).dt.total_seconds()

In [65]:
# Move Response_Time to be after Arrive
df.insert(df.columns.get_loc('ARRIVE') + 1, 'Response_Time', df.pop('Response_Time'))

In [66]:
# Go from ~167k to ~147k entries
df = df.dropna(subset=['Response_Time'])

In [67]:
# Check for null values
for column_name in df.columns:
    nan_percentage = (df[column_name].isna().sum() / len(df[column_name])) * 100
    print(nan_percentage)


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [102]:
# Make two datasets to split how we handle the PRIORITY column
# Turn them into dummy variables
Priority_Dummy = df

In [88]:
# Try to re-order the list so that 1=10, 2=9, ....
Priority_re = df

#### PRIORITY_Dummy Cleaning

In [103]:
# Drop any row with priority as 0 because it only appears once
Priority_Dummy = Priority_Dummy[Priority_Dummy['PRIORITY'] != 0]

In [104]:
dummy_PRIORITY = pd.get_dummies(Priority_Dummy['PRIORITY'], prefix="PRIORITY")

In [105]:
Priority_Dummy = pd.concat([Priority_Dummy, dummy_PRIORITY], axis=1)

In [106]:
Priority_Dummy= Priority_Dummy.drop('PRIORITY', axis=1)

In [107]:
Priority_Dummy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146509 entries, 1 to 167585
Data columns (total 39 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   DATE                               146509 non-null  datetime64[ns]
 1   DISPATCH                           146509 non-null  datetime64[ns]
 2   ARRIVE                             146509 non-null  datetime64[ns]
 3   Response_Time                      146509 non-null  float64       
 4   HOUR_OF                            146509 non-null  int64         
 5   Agency_Buechel FD                  146509 non-null  uint8         
 6   Agency_Camp Taylor FD              146509 non-null  uint8         
 7   Agency_Eastwood FD                 146509 non-null  uint8         
 8   Agency_Fairdale FD                 146509 non-null  uint8         
 9   Agency_Fern Creek FD               146509 non-null  uint8         
 10  Agency_Highview FD  

#### PRIORITY_re Cleaning

In [91]:
Priority_count = Priority_re['PRIORITY'].value_counts()

# Individually Print the count of each entry in the column

print("Count of each entry in the 'Priority' column:")
print(Priority_count)

Count of each entry in the 'Priority' column:
7    82290
2    37546
3     9224
1     8374
8     4845
4     1873
5     1569
9      788
Name: PRIORITY, dtype: int64


In [90]:
# Drop the one row where priority = 0
Priority_re = Priority_re[Priority_re['PRIORITY'] != 0]

In [92]:
# Create a mapping dictionary
mapping = {1: 10, 2: 9, 3: 8, 4: 7, 5: 6, 6: 5, 7: 4, 8: 3, 9: 2, 10: 1}

# Apply Mapping to the 'PRIORITY' column
Priority_re['PRIORITY'] = Priority_re['PRIORITY'].map(mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Priority_re['PRIORITY'] = Priority_re['PRIORITY'].map(mapping)


##### Swapped all priority values to their correct counterparts

In [94]:
# Check and see it worked
Priority_re.head()

Unnamed: 0,DATE,DISPATCH,ARRIVE,Response_Time,HOUR_OF,PRIORITY,Agency_Buechel FD,Agency_Camp Taylor FD,Agency_Eastwood FD,Agency_Fairdale FD,...,ET_ALARM,ET_ASSIST,ET_CHEMICAL,ET_FIRE,ET_GAS_LEAK,ET_LOCK_OUT/IN,ET_MEDICAL,ET_RESCUE,ET_WATER_LEAK,ET_WIRES_DOWN
1,2021-03-01,2024-03-05 00:34:19,2024-03-05 00:40:17,358.0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2021-03-01,2024-03-05 00:54:06,2024-03-05 01:06:34,748.0,0,4,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2021-03-01,2024-03-05 01:03:45,2024-03-05 01:08:32,287.0,100,4,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,2021-03-01,2024-03-05 01:13:58,2024-03-05 01:16:25,147.0,100,4,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,2021-03-01,2024-03-05 02:05:40,2024-03-05 02:09:46,246.0,200,4,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


### Decision Tree Regression

In [37]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as msc
from sklearn.metrics import accuracy_score
from math import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn import tree

In [17]:
# Split the dataframe into a X and y variables. y is our response time and X is everything else
# (minus the datetimes because Hour_of already takes care of that)

X = df.loc[:, ~df.columns.isin(['Response_time', 'ARRIVE', 'DISPATCH', 'DATE'])]
y = df['Response_Time']

### Split into Training and Test

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2, random_state=0)

In [20]:
regressor=DecisionTreeRegressor(random_state=0)
regressor.fit(X,y)

DecisionTreeRegressor(random_state=0)

In [22]:
y_pred=regressor.predict(X_test)

In [23]:
# Check the Root mean squared error value for the predictions on the test values
rms=sqrt(msc(y_test,y_pred))
print('The mean squared error is',rms)

R-Square 1.0
The mean squared error is 9.29624044327521e-15


In [44]:
from sklearn.model_selection import cross_val_score

# Evaluate on Training and Test Sets
y_train_pred = regressor.predict(X_train)
train_mse = msc(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

print("Train Set Mean Squared Error (MSE):", train_mse)


# Use Cross-Validation to test over different splits of the data
cv_scores = cross_val_score(regressor, X, y, cv=5, scoring='r2')

print("Mean Cross-Validation R2:", np.mean(cv_scores))



Train Set Mean Squared Error (MSE): 8.402082066785892e-29
Mean Cross-Validation R2: 0.999818656283875


##### Try with different regression parameters

In [48]:
regressor=DecisionTreeRegressor(splitter='random', random_state=1)
regressor.fit(X,y)

DecisionTreeRegressor(random_state=1, splitter='random')

In [49]:
y_pred=regressor.predict(X_test)

In [50]:
rms=sqrt(msc(y_test,y_pred))
print('The mean squared error is',rms)

The mean squared error is 8.215282748448146e-15


### Using the Priority corrected dataframes

### Dummy Priority 

In [108]:
XDP = Priority_Dummy.loc[:, ~Priority_Dummy.columns.isin(['Response_time', 'ARRIVE', 'DISPATCH', 'DATE'])]
yDP = Priority_Dummy['Response_Time']
XDP_train,XDP_test,yDP_train,yDP_test=train_test_split(XDP,yDP,test_size=.2, random_state=0)

In [109]:
# Same Regression parameters as Original
regressor=DecisionTreeRegressor(random_state=0)
regressor.fit(XDP,yDP)
yDP_pred=regressor.predict(XDP_test)
rms=sqrt(msc(yDP_test,yDP_pred))
print('The mean squared error of the Priority_Dummy is',rms)

The mean squared error of the Priority_Dummy is 9.135838253467874e-15


In [110]:
# Changed regression parameters
regressor=DecisionTreeRegressor(splitter='random', random_state=1)
regressor.fit(XDP,yDP)
yDP_pred=regressor.predict(XDP_test)
rms=sqrt(msc(yDP_test,yDP_pred))
print('The mean squared error of the Priority_Dummy is',rms)

The mean squared error of the Priority_Dummy is 9.558203648479954e-15


## Priority Reorder

In [97]:
XPR = Priority_re.loc[:, ~Priority_re.columns.isin(['Response_time', 'ARRIVE', 'DISPATCH', 'DATE'])]
yPR = Priority_re['Response_Time']
XPR_train,XPR_test,yPR_train,yPR_test=train_test_split(XPR,yPR,test_size=.2, random_state=0)

In [99]:
# Same regression parameters as the original
regressor=DecisionTreeRegressor(random_state=0)
regressor.fit(XPR,yPR)
yPR_pred=regressor.predict(XPR_test)
rms=sqrt(msc(yPR_test,yPR_pred))
print('The mean squared error of the Priority_re is',rms)

The mean squared error of the Priority_re is 9.037325085585732e-15


In [98]:
# Changed regression parameters
regressor=DecisionTreeRegressor(splitter='random', random_state=1)
regressor.fit(XPR,yPR)
yPR_pred=regressor.predict(XPR_test)
rms=sqrt(msc(yPR_test,yPR_pred))
print('The mean squared error of the Priority_re is',rms)

The mean squared error of the Priority_re is 8.015148211178783e-15


The Decision Tree Regression seems to fit the model extremely well. I am suspicious of potential overfitting issues, however it seems to maintain a very low mean squared error when put through different parameters on the regression.