# London Fire Incidents Dimensionality Reduction

In [1]:
%matplotlib inline 
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

### Downloading and loading the data

In [2]:
# Read the incident records from the local data directory.
# import data_download   (left disabled, as in the original cell)

data_dir = "LFB-data"
incident_csv = os.path.join(data_dir, "lfb_incident.csv")
# Alternative source file name (disabled):
# "LFB Incident data - Datastore - with notional cost and UPRN from January 2009.csv"
LFB_data = pd.read_csv(incident_csv)

# Report the in-memory size of the frame in megabytes, rounded to 2 decimals.
bytes_used = LFB_data.memory_usage(deep=True).sum()
megabytes_used = round(bytes_used / (1024 * 1024), 2)
print(f'Total Memory Used : {megabytes_used} MB')
LFB_data.head()

Total Memory Used : 2091.97 MB


Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,...,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpHoursRoundUp,Notional Cost (£),NumCalls
0,235138081,01 Jan 2009,2009,00:00:37,0,Special Service,Special Service,RTC,Road Vehicle,Car,...,319.0,Battersea,342.0,Clapham,2.0,2.0,2.0,1.0,255.0,1.0
1,1091,01 Jan 2009,2009,00:00:46,0,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,...,,,,,,,,,,1.0
2,2091,01 Jan 2009,2009,00:03:00,0,Fire,Secondary Fire,,Outdoor,Road surface/pavement,...,308.0,Edmonton,,,1.0,1.0,1.0,1.0,255.0,2.0
3,3091,01 Jan 2009,2009,00:04:27,0,Fire,Secondary Fire,,Outdoor,Domestic garden (vegetation not equipment),...,210.0,Hillingdon,,,1.0,1.0,1.0,1.0,255.0,2.0
4,5091,01 Jan 2009,2009,00:05:39,0,Fire,Secondary Fire,,Outdoor,Cycle path/public footpath/bridleway,...,233.0,Holloway,250.0,Holloway,1.0,2.0,2.0,1.0,255.0,1.0


### Primary Data analysis

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
LFB_data.shape

In [None]:
# Print pandas' concise summary of the raw frame (columns, dtypes, memory usage).
LFB_data.info()

In [None]:
# Visual check for missing data: heatmap of the null mask of every column.
colours = ['#000099', '#ffff00']  # blue = value present, yellow = value missing
cols = LFB_data.columns[:]
null_mask = LFB_data[cols].isnull()

plt.figure(figsize=(8, 6))
sns.heatmap(null_mask, cmap=sns.color_palette(colours))
print("Yellow - Missing Values\nBlue - Non Missing")

We can see that there are missing values

In [None]:
# Let's see which columns have missing values, and how many each one has.

LFB_data.isnull().sum()

## Exploratory data analysis

### Numerical data

In [None]:
# Keep only the numeric-dtype columns for the numerical part of the analysis.
df_numeric = LFB_data.select_dtypes(include="number")

print("Numeric data shape : ", df_numeric.shape)
df_numeric.columns.to_numpy()

In [None]:
# Summary statistics with one row per numeric column; every cell is rendered
# as a string with two decimals so large values are not shown in scientific notation.
df_numeric.describe().transpose().apply(lambda stats: stats.map('{0:.2f}'.format))

We can tell from the summary above that the numeric data contains outliers.
For instance, some columns show a very large difference between the 75th percentile and the maximum value.

#### Numerical data visualization

In [None]:
# One box plot per numeric column, split by incident group.
# The 9 x 3 grid has room for at most 27 columns.
fig = plt.figure(figsize=(20, 50))

for position, column in enumerate(df_numeric.columns, start=1):
    sub = fig.add_subplot(9, 3, position)
    chart = sns.boxplot(data=df_numeric, y=column, x=LFB_data["IncidentGroup"], ax=sub)
    chart.set_title(f"{column} by incident group")

#### Missing values on Numeric data

In [None]:
# Count the missing entries of each numeric column, report how many columns
# are affected, then show the per-column counts.
missing_per_column = df_numeric.isna().sum()
print("Number of cols with Missing Vals: ", (missing_per_column > 0).sum())
display(missing_per_column)

<h6>We need to fill the missing values so that they cluster around the centre of each column's distribution.<br>We will replace them with random values drawn between the column's 30th and 70th percentiles.</h6>

#### Missing values

In [None]:
def fill_missing_between_percentiles(frame, lower_pct=30, upper_pct=70, seed=42):
    """Return a copy of ``frame`` with missing values replaced by random draws.

    For every column that has missing entries, each missing entry is replaced
    by a value drawn uniformly between the column's ``lower_pct`` and
    ``upper_pct`` percentiles (both rounded to 2 decimals). The percentiles
    are computed from the observed (non-missing) values only, and values that
    are already present -- including genuine zeros -- are never modified.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric columns to impute.
    lower_pct, upper_pct : float
        Percentile bounds (0-100) of the sampling interval.
    seed : int
        Seed of the random generator, so re-running produces the same fill.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is left unmodified.
    """
    rng = np.random.default_rng(seed)
    filled = frame.copy()
    for name in filled.columns:
        is_missing = filled[name].isna()
        if not is_missing.any():
            continue
        # Series.quantile skips missing values, so no placeholder fill is needed.
        low = round(filled[name].quantile(lower_pct / 100), 2)
        high = round(filled[name].quantile(upper_pct / 100), 2)
        if pd.isna(low) or pd.isna(high):
            # The column has no observed values, so there is nothing to sample from.
            continue
        filled.loc[is_missing, name] = rng.uniform(low, high, size=int(is_missing.sum()))
    return filled


# These four columns are left out of the imputation and outlier steps.
# errors="ignore" keeps the cell re-runnable after they have already been dropped.
df_numeric = df_numeric.drop(columns=['CalYear', 'HourOfCall', 'Latitude', 'Longitude'], errors="ignore")
df_numeric = fill_missing_between_percentiles(df_numeric)
            

#### Outliers in numerical data

In [None]:
# Cap outliers column by column using the mean +/- 3 standard deviations rule.
for i in df_numeric.columns:
    # Fill any remaining gaps in THIS column with its own most frequent value
    # (a frame-wide fillna here would push one column's mode into every column).
    column_mode = df_numeric[i].mode()
    if not column_mode.empty:
        df_numeric[i] = df_numeric[i].fillna(column_mode[0])

    centre = df_numeric[i].mean()
    spread = 3 * df_numeric[i].std()
    highest_val = centre + spread
    lowest_val = centre - spread
    print(f"Range for {i} : ", round(lowest_val,2), " to ",round(highest_val,2))

    # Values outside the range are pulled back to the nearest bound
    # (rows are kept; only the extreme values are capped).
    df_numeric[i] = df_numeric[i].clip(lower=lowest_val, upper=highest_val)

print( "\n","*"*120)
df_numeric.describe().T.apply(lambda s: s.apply('{0:.2f}'.format))

In [None]:
# Per-column count of missing values still present in df_numeric.
df_numeric.isnull().sum()

### Categorical Data

In [3]:
# Every column whose dtype is not numeric is treated as categorical.
df_categorical = LFB_data.select_dtypes(exclude="number")
print(df_categorical.shape)
print("\n", "-" * 120)
df_categorical.columns.to_numpy()

(1465060, 21)

 ------------------------------------------------------------------------------------------------------------------------


array(['IncidentNumber', 'DateOfCall', 'TimeOfCall', 'IncidentGroup',
       'StopCodeDescription', 'SpecialServiceType', 'PropertyCategory',
       'PropertyType', 'AddressQualifier', 'Postcode_full',
       'Postcode_district', 'IncGeo_BoroughCode', 'IncGeo_BoroughName',
       'ProperCase', 'IncGeo_WardCode', 'IncGeo_WardName',
       'IncGeo_WardNameNew', 'FRS', 'IncidentStationGround',
       'FirstPumpArriving_DeployedFromStation',
       'SecondPumpArriving_DeployedFromStation'], dtype=object)

In [4]:
# Peek at 10 randomly chosen rows; no random_state is passed, so the rows differ between runs.
df_categorical.sample(10)

Unnamed: 0,IncidentNumber,DateOfCall,TimeOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,...,IncGeo_BoroughCode,IncGeo_BoroughName,ProperCase,IncGeo_WardCode,IncGeo_WardName,IncGeo_WardNameNew,FRS,IncidentStationGround,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_DeployedFromStation
1293310,157338-14122020,14 Dec 2020,00:37:13,Special Service,Special Service,Flooding,Outdoor,Road surface/pavement,In street close to gazetteer location,EN2 7HA,...,E09000010,ENFIELD,Enfield,E05000202,HIGHLANDS,HIGHLANDS,London,Southgate,Southgate,
36974,61792091,14 Apr 2009,13:08:33,False Alarm,False alarm - Good intent,,Road Vehicle,Car,In street outside gazetteer location,UB3 2TY,...,E09000017,HILLINGDON,Hillingdon,E05000325,Botwell,Botwell,London,Hillingdon,Hayes,
525477,72885131,08 Jun 2013,20:51:17,Fire,Secondary Fire,,Outdoor,Domestic garden (vegetation not equipment),On land associated with building,E15 4RY,...,E09000025,NEWHAM,Newham,E05000492,Stratford and New Town,Stratford and New Town,London,Stratford,Stratford,Stratford
249869,203944101,03 Dec 2010,21:02:37,False Alarm,AFA,,Non Residential,Warehouse,Correct incident location,SE7 7RX,...,E09000011,GREENWICH,Greenwich,E05000230,Woolwich Riverside,Woolwich Riverside,London,East Greenwich,East Greenwich,
10636,17355091,30 Jan 2009,14:28:57,Fire,Primary Fire,,Road Vehicle,Car,Correct incident location,NW10 6TD,...,E09000009,EALING,Ealing,E05000175,East Acton,East Acton,London,Park Royal,Park Royal,
171066,62800101,25 Apr 2010,00:59:27,Fire,Primary Fire,,Dwelling,Purpose Built Flats/Maisonettes - 10 or more s...,Correct incident location,,...,E09000021,KINGSTON UPON THAMES,Kingston Upon thames,E05000405,Chessington South,Chessington South,London,Surbiton,Surbiton,
706606,42927151,11 Apr 2015,18:23:00,False Alarm,AFA,,Dwelling,Self contained Sheltered Housing,Correct incident location,,...,E09000033,WESTMINSTER,Westminster,E05000643,Regent's Park,Regent's Park,London,Paddington,Paddington,Euston
670942,154881141,17 Nov 2014,21:43:05,False Alarm,False alarm - Good intent,,Non Residential,Petrol station,Correct incident location,BR6 6AA,...,E09000006,BROMLEY,Bromley,E05000110,Chelsfield and Pratts Bottom,Chelsfield and Pratts Bottom,London,Orpington,Orpington,
506184,39474131,30 Mar 2013,15:36:11,Special Service,Special Service,Lift Release,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Within same building,,...,E09000007,CAMDEN,Camden,E05000143,St. Pancras and Somers Town,St. Pancras and Somers Town,London,Euston,,
481636,161253121,25 Dec 2012,19:17:42,False Alarm,AFA,,Dwelling,Self contained Sheltered Housing,Correct incident location,,...,E09000005,BRENT,Brent,E05000087,Brondesbury Park,Brondesbury Park,London,Willesden,Willesden,Willesden


In [5]:
# Element-wise comparison of the two ward-name columns.
# NOTE(review): the printed Series is abbreviated, so this shows only the first
# and last rows rather than proving the columns agree everywhere.
ward_names_match = df_categorical['IncGeo_WardName'].eq(df_categorical['IncGeo_WardNameNew'])
print(ward_names_match)
print('-' * 50)

0          True
1          True
2          True
3          True
4          True
           ... 
1465055    True
1465056    True
1465057    True
1465058    True
1465059    True
Length: 1465060, dtype: bool
--------------------------------------------------


In [6]:
# Drop the IncidentNumber, Postcode_full and IncGeo_WardNameNew columns since
# they are not needed. Rebinding the name (instead of inplace=True) gives a
# fresh frame, so the column assignments below never write into a slice of LFB_data.
df_categorical = df_categorical.drop(columns=['IncidentNumber', 'Postcode_full', 'IncGeo_WardNameNew'])

# Derive the month and year from DateOfCall ("01 Jan 2009" -> "Jan", "2009").
# The vectorised .str accessor propagates missing values instead of raising.
date_parts = df_categorical['DateOfCall'].str.split(" ")
df_categorical['MonthOfCall'] = date_parts.str[1]
df_categorical['YearOfCall'] = date_parts.str[2]

# Derive the hour from TimeOfCall ("00:00:37" -> "00"); it stays a string.
df_categorical['HourOfCall'] = df_categorical['TimeOfCall'].str.split(":").str[0]

# The raw date/time columns are no longer needed once their parts are extracted.
df_categorical = df_categorical.drop(columns=['DateOfCall', 'TimeOfCall'])

In [7]:
# Show the categorical frame after the drops and the derived month/year/hour columns.

df_categorical

Unnamed: 0,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_district,IncGeo_BoroughCode,IncGeo_BoroughName,ProperCase,IncGeo_WardCode,IncGeo_WardName,FRS,IncidentStationGround,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_DeployedFromStation,MonthOfCall,YearOfCall,HourOfCall
0,Special Service,Special Service,RTC,Road Vehicle,Car,In street close to gazetteer location,SW11,E09000032,WANDSWORTH,Wandsworth,E05000620,Queenstown,London,Battersea,Battersea,Clapham,Jan,2009,00
1,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,Open land/water - nearest gazetteer location,SE1,E09000022,LAMBETH,Lambeth,E05000416,Bishop's,London,Lambeth,,,Jan,2009,00
2,Fire,Secondary Fire,,Outdoor,Road surface/pavement,In street outside gazetteer location,N9,E09000010,ENFIELD,Enfield,E05000201,Haselbury,London,Edmonton,Edmonton,,Jan,2009,00
3,Fire,Secondary Fire,,Outdoor,Domestic garden (vegetation not equipment),On land associated with building,UB10,E09000017,HILLINGDON,Hillingdon,E05000332,Hillingdon East,London,Hillingdon,Hillingdon,,Jan,2009,00
4,Fire,Secondary Fire,,Outdoor,Cycle path/public footpath/bridleway,In street outside gazetteer location,N7,E09000019,ISLINGTON,Islington,E05000375,Holloway,London,Holloway,Holloway,Holloway,Jan,2009,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465055,False Alarm,AFA,,Non Residential,Warehouse,Correct incident location,TW6,E09000017,HILLINGDON,Hillingdon,E05013570,HEATHROW VILLAGES,London,Heathrow,Feltham,,Jun,2022,23
1465056,Special Service,Special Service,Effecting entry/exit,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,N19,E09000019,ISLINGTON,Islington,E05013712,TOLLINGTON,London,Holloway,Kentish Town,,Jun,2022,23
1465057,False Alarm,AFA,,Non Residential,Pub/wine bar/bar,Correct incident location,UB4,E09000017,HILLINGDON,Hillingdon,E05013582,WOOD END,London,Hillingdon,Hillingdon,,Jun,2022,23
1465058,False Alarm,False alarm - Good intent,,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Correct incident location,E1W,E09000030,TOWER HAMLETS,Tower Hamlets,E05009332,SHADWELL,London,Shadwell,Shadwell,,Jun,2022,23


In [8]:
# Show the number of unique values of each categorical variable

df_categorical.nunique()

IncidentGroup                                3
StopCodeDescription                         10
SpecialServiceType                          21
PropertyCategory                             9
PropertyType                               291
AddressQualifier                            11
Postcode_district                          328
IncGeo_BoroughCode                          33
IncGeo_BoroughName                          33
ProperCase                                  33
IncGeo_WardCode                           1274
IncGeo_WardName                           1560
FRS                                          1
IncidentStationGround                      103
FirstPumpArriving_DeployedFromStation      114
SecondPumpArriving_DeployedFromStation     113
MonthOfCall                                 12
YearOfCall                                  14
HourOfCall                                  24
dtype: int64

#### Missing values for categorical data

We can see that we have a couple of categorical columns with missing values

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # used below but was never imported

# Features are every categorical column except the target; the target is IncidentGroup.
# NOTE(review): X still holds the raw string values (and the missing entries);
# presumably they need encoding/imputing before the forest is fitted - confirm.
X = df_categorical.drop('IncidentGroup', axis = 1).values
Y = df_categorical['IncidentGroup'].values

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)

# random_state was the boolean False; 0 is the same seed written as an integer.
# NOTE(review): min_samples_leaf was passed without a value (a syntax error after
# keyword arguments). 1 is scikit-learn's documented default - confirm the intended value.
forest = RandomForestClassifier(n_estimators = 50, max_depth = 25,
                               random_state = 0, max_features = 0.6,
                               min_samples_leaf = 1)