# London Fire Incidents Dimensionality Reduction

In [1]:
%matplotlib inline 
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

### Downloading and loading the data

In [2]:
# Read the London Fire Brigade incident extract from the local data folder.
# import data_download

data_dir = "LFB-data"
# Alternative source file name that was used previously:
# "LFB Incident data - Datastore - with notional cost and UPRN from January 2009.csv"
incident_csv = os.path.join(data_dir, "lfb_incident.csv")
LFB_data = pd.read_csv(incident_csv)

# Report the in-memory footprint of the frame in MB (deep=True inspects object columns too).
total_bytes = LFB_data.memory_usage(deep=True).sum()
print(f'Total Memory Used : {round(total_bytes/(1024*1024), 2)} MB')
LFB_data.head()

Total Memory Used : 2091.97 MB


Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,...,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpHoursRoundUp,Notional Cost (£),NumCalls
0,235138081,01 Jan 2009,2009,00:00:37,0,Special Service,Special Service,RTC,Road Vehicle,Car,...,319.0,Battersea,342.0,Clapham,2.0,2.0,2.0,1.0,255.0,1.0
1,1091,01 Jan 2009,2009,00:00:46,0,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,...,,,,,,,,,,1.0
2,2091,01 Jan 2009,2009,00:03:00,0,Fire,Secondary Fire,,Outdoor,Road surface/pavement,...,308.0,Edmonton,,,1.0,1.0,1.0,1.0,255.0,2.0
3,3091,01 Jan 2009,2009,00:04:27,0,Fire,Secondary Fire,,Outdoor,Domestic garden (vegetation not equipment),...,210.0,Hillingdon,,,1.0,1.0,1.0,1.0,255.0,2.0
4,5091,01 Jan 2009,2009,00:05:39,0,Fire,Secondary Fire,,Outdoor,Cycle path/public footpath/bridleway,...,233.0,Holloway,250.0,Holloway,1.0,2.0,2.0,1.0,255.0,1.0


### Primary Data analysis

In [3]:
# (number of rows, number of columns) of the raw incident table
LFB_data.shape

(1465060, 39)

In [4]:
# Column names, non-null counts and dtypes of the raw incident table
LFB_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465060 entries, 0 to 1465059
Data columns (total 39 columns):
 #   Column                                  Non-Null Count    Dtype  
---  ------                                  --------------    -----  
 0   IncidentNumber                          1465060 non-null  object 
 1   DateOfCall                              1465060 non-null  object 
 2   CalYear                                 1465060 non-null  int64  
 3   TimeOfCall                              1465060 non-null  object 
 4   HourOfCall                              1465060 non-null  int64  
 5   IncidentGroup                           1465060 non-null  object 
 6   StopCodeDescription                     1465060 non-null  object 
 7   SpecialServiceType                      459204 non-null   object 
 8   PropertyCategory                        1465060 non-null  object 
 9   PropertyType                            1465060 non-null  object 
 10  AddressQualifier              

In [None]:
# Visual check for missing data: the heatmap shows one cell per (row, column) entry.

plt.figure(figsize = (8,6))
cols = LFB_data.columns[:]
colours = ['#000099', '#ffff00'] # blue = value present, yellow = value missing
is_missing = LFB_data[cols].isnull()
two_tone = sns.color_palette(colours)
sns.heatmap(is_missing, cmap=two_tone)
print("Yellow - Missing Values\nBlue - Non Missing")

We can see that there are missing values

## Exploratory data analysis

### Numerical data

In [None]:
# The numerical analysis works on the numeric-dtype columns only.
numeric_dtypes = [np.number]
df_numeric = LFB_data.select_dtypes(include=numeric_dtypes)

print("Numeric data shape : ",df_numeric.shape)
df_numeric.columns.values

In [None]:
# Summary statistics, one row per numeric column, rendered with two decimals.
numeric_summary = df_numeric.describe().T
numeric_summary.apply(lambda stat_col: stat_col.apply('{0:.2f}'.format))

We can tell from the summary above that there are outliers in the numeric data.
For instance, some columns show a very large difference between the 75th percentile and the maximum value.

#### Numerical data visualization

In [None]:
# One box plot per numeric column, split by incident group, placed on a 9x3 subplot grid.
fig = plt.figure(figsize= (20,50))

for position, column in enumerate(df_numeric.columns, start=1):
    sub = fig.add_subplot(9, 3, position)
    # Draw on the subplot that was just created for this column.
    chart = sns.boxplot(data=df_numeric, y=column, x=LFB_data["IncidentGroup"], ax=sub)
    chart.set_title(column + " by incident group")

#### Missing values on Numeric data

In [None]:
# Count missing entries per numeric column, then report how many columns are affected.
missing_per_column = df_numeric.isna().sum()
print("Number of cols with Missing Vals: ",(missing_per_column > 0).sum())
display(missing_per_column)

<h6>We need to fill the missing values so that they cluster around the mean value<br>We will randomize the missing values between the 30th and 70th percentiles of each column</h6>

#### Missing values

In [None]:

# Impute missing numeric values one column at a time.
#
# Each column that contains missing entries gets a single fill value drawn
# uniformly between the 30th and 70th percentile of that column's *observed*
# (non-missing) values. Genuine zeros are left untouched, and the percentiles
# are not distorted by placeholder values.
LOWER_PCT, UPPER_PCT = 30, 70

# Dedicated generator: the fill values are reproducible and the global
# `random` state used elsewhere in the notebook is not disturbed.
impute_rng = random.Random(10)

for col in df_numeric.columns[df_numeric.isna().any()]:
    observed = df_numeric[col].dropna()
    if observed.empty:
        # Nothing to estimate percentiles from; fall back to 0 so no NaN is left behind.
        df_numeric[col] = df_numeric[col].fillna(0)
        continue

    # Percentile bounds, rounded to 2 decimals.
    low = round(np.percentile(observed, LOWER_PCT), 2)
    high = round(np.percentile(observed, UPPER_PCT), 2)
    rand_30_70 = impute_rng.uniform(low, high)

    # Assign back instead of using inplace=True so only this column is touched.
    df_numeric[col] = df_numeric[col].fillna(rand_30_70)

#### Outliers in numerical data

In [None]:
# Cap outliers at mean +/- 3 standard deviations, one column at a time.
for i in df_numeric.columns:
    # Fill any remaining gaps with THIS column's own mode; other columns are not touched.
    if df_numeric[i].isna().any():
        df_numeric[i] = df_numeric[i].fillna(df_numeric[i].mode()[0])

    col_mean = df_numeric[i].mean()
    col_std = df_numeric[i].std()
    highest_val = col_mean + 3*col_std
    lowest_val = col_mean - 3*col_std
    print(f"Range for {i} : ", round(lowest_val,2), " to ",round(highest_val,2))

    # Capping, not trimming: values outside the range are replaced by the nearest
    # bound and every row is kept. The column is cast to float because the bounds are floats.
    df_numeric[i] = df_numeric[i].astype(float).clip(lower=lowest_val, upper=highest_val)

print( "\n","*"*120)
df_numeric.describe().T.apply(lambda s: s.apply('{0:.2f}'.format))

In [None]:
# Count the missing values left in each numeric column after the steps above
df_numeric.isnull().sum()

### Categorical Data

In [None]:
# Every column that is not numeric is treated as categorical.
df_categorical = LFB_data.select_dtypes(exclude=[np.number])
categorical_shape = df_categorical.shape
print(categorical_shape)
print( "\n","-"*120)
df_categorical.columns.values

In [None]:
# Peek at 10 randomly chosen rows (no random_state is passed, so the rows differ between runs)
df_categorical.sample(10)

In [None]:
# Row-by-row check of whether the two ward-name columns hold the same value.
same_ward_name = df_categorical['IncGeo_WardName'] == df_categorical['IncGeo_WardNameNew']
print(same_ward_name)
print('-'*50)

In [None]:
# Drop columns that are not needed for the analysis: the incident number, the full
# postcode, and IncGeo_WardNameNew (compared against IncGeo_WardName above).
df_categorical = df_categorical.drop(columns=['IncidentNumber', 'Postcode_full', 'IncGeo_WardNameNew'])

# Derive month and year from DateOfCall (formatted like "01 Jan 2009"), then drop it.
# The vectorised .str accessor yields NaN for a missing date instead of raising.
date_parts = df_categorical['DateOfCall'].str.split(" ")
df_categorical['MonthOfCall'] = date_parts.str[1]
df_categorical['YearOfCall'] = date_parts.str[2]
df_categorical = df_categorical.drop(columns='DateOfCall')


# Derive the hour (as a string such as "00") from TimeOfCall (formatted like "00:00:37"), then drop it.
# NOTE(review): LFB_data already has a numeric HourOfCall column, so concatenating
# df_numeric and df_categorical later produces two columns with this name — confirm this is intended.
df_categorical['HourOfCall'] = df_categorical['TimeOfCall'].str.split(":").str[0]
df_categorical = df_categorical.drop(columns='TimeOfCall')

In [None]:
# Show the categorical dataframe after the column drops and derived date/time columns

df_categorical

In [None]:
# Show the number of unique values for each categorical variable

df_categorical.nunique()

### Joining dataframes

In [None]:
# Re-assemble the full feature table side by side: numeric columns first, then categorical ones.
frames_to_join = [df_numeric, df_categorical]
final_df = pd.concat(frames_to_join, axis = 1)
final_df.shape

In [None]:
# Peek at 7 randomly chosen rows of the joined table (no random_state, so rows differ between runs)
final_df.sample(7)

### One hot encoding

In [None]:
# One-hot encode the non-numeric columns; final_df is rebound to the encoded frame,
# so re-running this cell operates on the already-encoded table.
final_df = pd.get_dummies(final_df)
final_df.tail()

# Dimensionality reduction

In [None]:
# Reproducibly pick 100000 distinct row positions for the working subset.
random.seed(10)
row_positions = range(len(final_df))
rand_num = random.sample(row_positions, 100000)
rand_num[:10]

In [None]:
# Select the sampled rows by position (rand_num holds positional row indices), then preview 10 of them
working_df = final_df.iloc[rand_num]
working_df.sample(10)

In [None]:
# (rows, columns) of the sampled subset that is fed to the dimensionality reduction
working_df.shape

## t-Distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
%%time
import scipy
import time
from scipy.spatial.distance import pdist
from sklearn.manifold import TSNE
from scipy import stats
# A=scipy.spatial.distance.pdist(working_df, metric='euclidean')
# kendTSNE=[]

# Fit a 2-D t-SNE embedding on the sampled rows and report the wall-clock time.
# random_state is fixed so that re-running the notebook reproduces the same embedding
# (it reuses the seed value 10 from the row-sampling step).
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` — check the installed version.
start_time = time.time()
tsne = TSNE(n_components = 2, verbose=1, learning_rate=200, n_iter=500, random_state=10)
tsne_result = tsne.fit_transform(X=working_df)
end_time = time.time()
print("Learning completed in {} seconds".format(end_time - start_time))

In [None]:
# Put the two embedding coordinates into a DataFrame for plotting.
first_axis, second_axis = tsne_result[:, 0], tsne_result[:, 1]
tsne_df = pd.DataFrame({"t-SNE 1": first_axis, "t-SNE 2": second_axis})
tsne_df.head()

In [None]:
# Plot the t-SNE embedding, coloured by incident group.

# Row k of tsne_df comes from row rand_num[k] of final_df (working_df = final_df.iloc[rand_num]),
# so the incident-group labels are looked up by the same positions in the raw table.
# Assumes no rows were dropped or reordered between LFB_data and final_df — verify if the
# preprocessing changes.
incident_group = LFB_data["IncidentGroup"].iloc[rand_num].to_numpy()

fig, ax = plt.subplots(1, figsize=(15,10))
# `hue` is what makes `palette` and the legend meaningful; without it all points share one colour.
sns.scatterplot(x = 't-SNE 1', y= 't-SNE 2', hue = incident_group, data = tsne_df, ax = ax, s=20, palette = 'dark')
# Padded per-column min/max of the embedding; not used in this cell, kept for any later use.
lim = (tsne_df.min()-5, tsne_df.max()+5)
ax.set_title('t-SNE Visualization of Incident Group', fontsize = 16, weight = 'bold')
# Place the legend just outside the top-right corner of the axes.
ax.legend(title = 'IncidentGroup', bbox_to_anchor = (1,1), loc =2, borderaxespad = 0.0)

## Same Degree Distribution

In [None]:
from scipy import linalg
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.sparse import csr_matrix, issparse
from sklearn.neighbors import NearestNeighbors
from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils.validation import check_non_negative
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances

# Machine epsilon of the double-precision float type, as reported by np.finfo.
MACHINE_EPSILON = np.finfo(np.double).eps