# London Fire Incidents Dimensionality Reduction

In [None]:
%matplotlib inline 
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

### Downloading and loading the data

In [None]:
# Read the incident extract from the local data directory.
# Alternative inputs left unused by the author: the `data_download` helper module and the
# full "LFB Incident data - Datastore - with notional cost and UPRN from January 2009.csv" export.
data_dir = "LFB-data"
incident_csv = os.path.join(data_dir, "lfb_incident.csv")
LFB_data = pd.read_csv(incident_csv)

# Report the in-memory footprint of the frame in megabytes.
bytes_used = LFB_data.memory_usage(deep=True).sum()
megabytes_used = round(bytes_used / (1024 * 1024), 2)
print(f'Total Memory Used : {megabytes_used} MB')
LFB_data.head()

### Primary Data analysis

In [None]:
# (rows, columns) of the freshly loaded frame
LFB_data.shape

In [None]:
# Per-column dtype and non-null count; used to spot columns with missing values
LFB_data.info()

We can see that there are missing values

## Data Preprocessing

#### Time Processing

In [None]:
# Discard the supplied HourOfCall column; it is rebuilt below from TimeOfCall.
LFB_data.drop(columns=['HourOfCall'], inplace=True)

# DateOfCall is split on "-": the third piece becomes the year, the second the month.
# The pieces stay as strings.
year_of_call = LFB_data['DateOfCall'].map(lambda stamp: stamp.split("-")[2])
month_of_call = LFB_data['DateOfCall'].map(lambda stamp: stamp.split("-")[1])
LFB_data['YearOfCall'] = year_of_call
LFB_data['MonthOfCall'] = month_of_call

# The hour is whatever precedes the first ":" in TimeOfCall (also kept as a string).
LFB_data['HourOfCall'] = LFB_data['TimeOfCall'].map(lambda clock: clock.split(":")[0])

# The identifier and the raw date/time columns are no longer needed.
obsolete_columns = ['IncidentNumber', 'TimeOfCall', 'DateOfCall', 'CalYear']
LFB_data.drop(columns=obsolete_columns, inplace=True)

#### Service and group processing

In [None]:
# Ordinal-encode the incident group: cast the text column to a categorical and
# keep only its integer codes.
# Author's note on the resulting mapping: False alarm = 0, Fire = 1, special service = 2
incident_group = LFB_data['IncidentGroup'].astype('category')
LFB_data['IncidentGroup'] = incident_group.cat.codes

# These two descriptive columns are removed from the frame.
LFB_data.drop(columns=['StopCodeDescription', 'SpecialServiceType'], inplace=True)
LFB_data.sample(4)

#### Inc Geo data

In [None]:
# The borough code and the borough name both identify the same borough.
# Compare how many distinct values each of the two columns holds.
borough_columns = ['IncGeo_BoroughCode', 'IncGeo_BoroughName']
print(LFB_data[borough_columns].nunique())

In [None]:
# Join the two borough columns into a single "code-name" key so that one-hot
# encoding later produces one set of indicator columns instead of two.
borough_code = LFB_data['IncGeo_BoroughCode']
borough_name = LFB_data['IncGeo_BoroughName']
LFB_data['IncGeo_Borough'] = borough_code + '-' + borough_name

# The two source columns are now redundant.
LFB_data.drop(columns=['IncGeo_BoroughCode', 'IncGeo_BoroughName'], inplace=True)
LFB_data['IncGeo_Borough'].tail(3)

In [None]:
# Similarly, the ward code, ward name and "new" ward name columns all describe the same ward.
ward_columns = ['IncGeo_WardCode', 'IncGeo_WardName', 'IncGeo_WardNameNew']

# Number of distinct values in each ward column
print(LFB_data[ward_columns].nunique())

# Merge code and name into a single "code-name" key (IncGeo_WardNameNew is not used).
ward_code = LFB_data['IncGeo_WardCode']
ward_name = LFB_data['IncGeo_WardName']
LFB_data['IncGeo_Ward'] = ward_code + '-' + ward_name

# Drop the now-unnecessary source columns
LFB_data.drop(columns=ward_columns, inplace=True)
LFB_data['IncGeo_Ward'].sample(3)

#### Trimming Northing_rounded and Easting_rounded (dropping the first two digits) and binning them into 10 categories

In [None]:
# Turn the rounded easting/northing values into categorical data.
# For each axis: drop the first two characters of the value's text form, parse the
# remainder as an integer, then cut the result into 10 equal-width bins labelled
# "<column>_0" ... "<column>_9".
for axis_name in ('Easting_rounded', 'Northing_rounded'):
    trimmed = LFB_data[axis_name].apply(lambda value: int(str(value)[2:]))
    bin_labels = [f'{axis_name}_{bin_no}' for bin_no in range(10)]
    LFB_data[axis_name] = pd.cut(trimmed, bins=10, labels=bin_labels)

In [None]:
# Distinct-value count for every column of the preprocessed frame
LFB_data.nunique()

In [None]:

# Keep the IncidentGroup column aside; it is used later as the grouping label
# for plots and is re-attached to the final frame.
grouping_labels = LFB_data['IncidentGroup']

#### Dropping unnecessary columns

In [None]:
# Remove the label column (already saved in grouping_labels) together with the
# postcode and second-pump columns.
unused_columns = [
    'IncidentGroup',
    'Postcode_full',
    'SecondPumpArriving_AttendanceTime',
    'SecondPumpArriving_DeployedFromStation',
]
LFB_data.drop(columns=unused_columns, inplace=True)

## Exploratory data analysis

### Numerical data

In [None]:
# Keep only the columns with a numeric dtype.
numeric_dtypes = [np.number]
df_numeric = LFB_data.select_dtypes(include=numeric_dtypes)

numeric_shape = df_numeric.shape
print("Numeric data shape : ", numeric_shape)
df_numeric.columns.values

In [None]:
# Summary statistics, one row per numeric column, rendered with two decimals.
numeric_summary = df_numeric.describe().T
numeric_summary.apply(lambda stat_column: stat_column.apply('{0:.2f}'.format))

We can tell from the summary above that there are outliers in the numeric data.
For instance, some columns show a very large difference between the 75th percentile and the maximum value.

#### Numerical data visualization

In [None]:
fig = plt.figure(figsize=(20, 50))

# One box plot per numeric column, laid out on a 9 x 3 grid and split by incident group.
for position, column in enumerate(df_numeric.columns, start=1):
    panel = fig.add_subplot(9, 3, position)
    sns.boxplot(data=df_numeric, y=column, x=grouping_labels, ax=panel)
    panel.set_title(column + " by incident group")

#### Missing values on Numeric data

In [None]:
# Count missing values per numeric column, then report how many columns have any.
missing_per_column = df_numeric.isna().sum()
columns_with_gaps = (missing_per_column > 0).sum()
print("Number of cols with Missing Vals: ", columns_with_gaps)
display(missing_per_column)

<h6>We need to impute the missing values so that they fall near the centre of each column's distribution.<br>We will replace them with a random value drawn between the 30th and 70th percentiles of the column.</h6>

#### Missing values

In [None]:

# Impute missing numeric values column by column.
#
# For every column that has gaps, one random value is drawn between the column's
# 30th and 70th percentiles (rounded to 2 decimals) and written into the missing
# cells only. The percentiles are computed on the observed values, before any
# filling, so placeholder values cannot distort them.
#
# The previous version zero-filled the whole frame first and then looped over the
# characters of each column name comparing them with 0, so the random replacement
# never ran and every gap silently became 0.
LOWER_PERCENTILE, UPPER_PERCENTILE = 30, 70

for column_name in df_numeric.columns:
    is_missing = df_numeric[column_name].isna()
    if not is_missing.any():
        continue

    observed = df_numeric.loc[~is_missing, column_name]
    if observed.empty:
        # No observed values to take percentiles from: fall back to 0.
        df_numeric[column_name] = df_numeric[column_name].fillna(0)
        continue

    lower_bound = round(np.percentile(observed, LOWER_PERCENTILE), 2)
    upper_bound = round(np.percentile(observed, UPPER_PERCENTILE), 2)
    rand_30_70 = random.uniform(lower_bound, upper_bound)
    df_numeric.loc[is_missing, column_name] = rand_30_70

df_numeric.isnull().sum()

In [None]:
# Summary statistics of the numeric columns after the missing-value step
df_numeric.describe()

#### Outliers in numerical data

In [None]:
# Cap outliers at mean +/- 3 standard deviations, one column at a time.
# Values beyond a bound are replaced by that bound (rows are kept, not removed).
for i in df_numeric.columns:
    # Work in float, as the capped result is float.
    column_values = df_numeric[i].astype(float)

    # Fill any remaining gaps in THIS column with its own most frequent value.
    # (Filling the whole frame with one column's mode, as before, would push that
    # value into unrelated columns.) An all-missing column has no mode and is left alone.
    column_modes = column_values.mode()
    if not column_modes.empty:
        column_values = column_values.fillna(column_modes.iloc[0])

    centre = column_values.mean()
    spread = column_values.std()
    highest_val = centre + 3 * spread
    lowest_val = centre - 3 * spread
    print(f"Range for {i} : ", round(lowest_val,2), " to ",round(highest_val,2))

    # clip() leaves values untouched when a bound is NaN (e.g. std of a single row).
    df_numeric[i] = column_values.clip(lower=lowest_val, upper=highest_val)

print( "\n","*"*120)
df_numeric.describe().T.apply(lambda s: s.apply('{0:.2f}'.format))

In [None]:
# Count of missing values still present in each numeric column
df_numeric.isnull().sum()

### Categorical Data

In [None]:
# Every column that does not have a numeric dtype is treated as categorical.
non_numeric = LFB_data.select_dtypes(exclude=[np.number])
df_categorical = non_numeric
print(df_categorical.shape)

separator = "-" * 120
print("\n", separator)
df_categorical.columns.values

In [None]:
# Random sample of 10 rows from the non-numeric columns
df_categorical.sample(10)

In [None]:
# Show the number of unique values for each categorical variable

df_categorical.nunique()

### Joining dataframes

In [None]:
# Put the processed numeric columns and the categorical columns back side by side.
frame_parts = [df_numeric, df_categorical]
final_df = pd.concat(frame_parts, axis='columns')
final_df.shape

In [None]:
final_df.sample(7)

### One hot encoding

In [None]:
# One-hot encode final_df with pandas' default column selection.
# NOTE(review): this overwrites final_df in place, so re-running the cell operates
# on the already-encoded frame.
final_df = pd.get_dummies(final_df)
final_df.tail()

#### Adding the grouping data (labels)

In [None]:
# Prepend the label Series as the first column. The column keeps the Series name,
# 'IncidentGroup', because grouping_labels was taken from LFB_data['IncidentGroup'].
# NOTE(review): re-running this cell prepends the label column again.
final_df = pd.concat([grouping_labels, final_df], axis=1)
final_df.head(5)

# Dimensionality reduction

In [None]:
# Fix the seed so the same row positions are drawn on every run, then pick
# 10000 distinct positions out of final_df.
random.seed(10)
row_count = len(final_df)
rand_num = random.sample(range(row_count), 10000)
rand_num[:10]

In [None]:
# Select the sampled rows by integer position to get a smaller working frame
working_df = final_df.iloc[rand_num]
working_df.sample(10)

In [None]:
# (rows, columns) of the sampled working frame
working_df.shape

## t-Distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
%%time
import scipy
import time
from scipy.spatial.distance import pdist
from sklearn.manifold import TSNE
from scipy import stats
# A=scipy.spatial.distance.pdist(working_df, metric='euclidean')
# kendTSNE=[]

start_time = time.time()
tsne = TSNE(n_components = 2, verbose=1, learning_rate=200, n_iter=500)
tsne_result = tsne.fit_transform(X=working_df)
end_time = time.time()
print("Learning completed in {} seconds".format(end_time - start_time))

In [None]:
# Collect the two embedding coordinates together with the incident-group label.
# The label column in working_df is named 'IncidentGroup' (the name of the Series
# it came from); there is no 'grouping_labels' column, so that lookup raised KeyError.
tsne_df = pd.DataFrame({
    "t-SNE 1": tsne_result[:, 0],
    "t-SNE 2": tsne_result[:, 1],
    "label": working_df['IncidentGroup'],
})
tsne_df.head()

In [None]:
# Scatter plot of the t-SNE embedding, coloured by incident group.

fig, ax = plt.subplots(1, figsize=(15, 10))
scatter_options = dict(x='t-SNE 1', y='t-SNE 2', hue='label', s=20, palette='dark')
sns.scatterplot(data=tsne_df, ax=ax, **scatter_options)

# NOTE(review): the returned palette is discarded, so this call does not affect the plot.
sns.color_palette("hls", 10)
# NOTE(review): `lim` is computed but not applied to the axes in this cell.
column_minima, column_maxima = tsne_df.min(), tsne_df.max()
lim = (column_minima - 5, column_maxima + 5)

ax.set_title('t-SNE Visualization of Incident Group', fontsize=16, weight='bold')
# Anchor the legend just outside the top-right corner of the axes.
ax.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.0)

## Same Degree Distribution

In [None]:
from scipy import linalg
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.sparse import csr_matrix, issparse
from sklearn.neighbors import NearestNeighbors
from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils.validation import check_non_negative
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances

MACHINE_EPSILON = np.finfo(np.double).eps