# 1. Initial Setup

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# Apply seaborn's "whitegrid" style to every figure drawn after this cell.
# set_theme() is the preferred entry point; sns.set() is only an alias for it
# that the seaborn docs say may be removed in the future.
sns.set_theme(style="whitegrid")

In [23]:
# Source CSV, given as a relative path.
file_path = "../data/sf-fire-calls.csv"

# Code-like columns are forced to text instead of being left to dtype inference
# (StationArea, for instance, holds zero-padded values such as "01").
column_types = {
    name: 'str'
    for name in ('StationArea', 'Box', 'ALSUnit', 'CallTypeGroup')
}

# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-type inference on the remaining columns.
df = pd.read_csv(file_path, low_memory=False, dtype=column_types)

# 2. Cleaning and Pre-processing

In [24]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157190 entries, 0 to 157189
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CallNumber                  157190 non-null  int64  
 1   UnitID                      157190 non-null  object 
 2   IncidentNumber              157190 non-null  int64  
 3   CallType                    157190 non-null  object 
 4   CallDate                    157190 non-null  object 
 5   WatchDate                   157190 non-null  object 
 6   CallFinalDisposition        157190 non-null  object 
 7   AvailableDtTm               155445 non-null  object 
 8   Address                     157190 non-null  object 
 9   City                        157011 non-null  object 
 10  Zipcode                     157055 non-null  float64
 11  Battalion                   157190 non-null  object 
 12  StationArea                 157116 non-null  object 
 13  Box           

In [25]:
# Remove irrelevant columns.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second execution is a no-op rather than a KeyError for
# columns that are already gone. Trade-off: a misspelled name in the list is
# skipped silently as well.
cols_to_drop = ['CallNumber', 'IncidentNumber', 'Address', 'RowID', 'Location']
df = df.drop(columns=cols_to_drop, errors="ignore")

In [26]:
# Fill missing entries:
#   Delay -> the mean of its observed values
#   City  -> the placeholder label 'Unknown'
# NOTE(review): the mean is taken over the whole frame, ahead of the
# train/test split in section 3 — confirm that is intended.
delay_mean = df['Delay'].mean()
df['Delay'] = df['Delay'].fillna(delay_mean)
df['City'] = df['City'].fillna('Unknown')

In [28]:
# Parse CallDate into datetime values, store them back in place of the raw
# strings, and derive the calendar year as a separate Year column.
call_dates = pd.to_datetime(df['CallDate'])
df['CallDate'] = call_dates
df['Year'] = call_dates.dt.year

In [30]:
# One-hot encode the listed nominal columns into a new frame. drop_first=True
# omits the first level of each encoded column (k-1 indicators for k levels);
# every column not listed here is carried over unchanged.
categorical_features = [
    'CallType',
    'City',
    'CallFinalDisposition',
    'UnitType',
]
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
)

In [31]:
# Leave the preview as the cell's final expression so Jupyter renders it as a
# table; print() falls back to wrapped plain text for a frame this wide.
df_encoded.head()

  UnitID   CallDate   WatchDate           AvailableDtTm  Zipcode Battalion  \
0    T13 2002-01-11  01/10/2002  01/11/2002 01:51:44 AM  94109.0       B04   
1    M17 2002-01-11  01/10/2002  01/11/2002 03:01:18 AM  94124.0       B10   
2    M41 2002-01-11  01/10/2002  01/11/2002 02:39:50 AM  94102.0       B03   
3    E11 2002-01-11  01/10/2002  01/11/2002 04:16:46 AM  94110.0       B06   
4    B04 2002-01-11  01/10/2002  01/11/2002 06:01:58 AM  94109.0       B04   

  StationArea   Box OriginalPriority Priority  ...  \
0          38  3362                3        3  ...   
1          42  6495                3        3  ...   
2          01  1455                3        3  ...   
3          32  5626                3        3  ...   
4          03  3223                3        3  ...   

   CallFinalDisposition_Unable to Locate UnitType_CHIEF UnitType_ENGINE  \
0                                  False          False           False   
1                                  False          False 

# 3. Training/Test Split