# Import libraries

In [28]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geopy.distance import geodesic
from pyproj import Proj, Transformer, transform

# Read raw data

In [29]:

# Load the raw incident data from the CSV file
file_path = '../data/raw/df_i.csv'
df_i = pd.read_csv(file_path)

# Preview the first and last 10 rows
print(df_i.head(10))
print(df_i.tail(10))

# Summarise columns, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
df_i.info()

    IncidentNumber DateOfCall  CalYear TimeOfCall  HourOfCall  \
0  043453-28032023  28-Mar-23     2023   23:08:30          23   
1  144295-18092023  18-Sep-23     2023   08:19:06           8   
2  197526-16122023  16-Dec-23     2023   12:37:18          12   
3  137787-08092023  08-Sep-23     2023   08:33:16           8   
4  143525-16092023  16-Sep-23     2023   23:20:57          23   
5  184192-24112023  24-Nov-23     2023   03:36:47           3   
6  079155-03062023  03-Jun-23     2023   22:43:46          22   
7  056510-24042023  24-Apr-23     2023   06:12:08           6   
8  127526-22082023  22-Aug-23     2023   15:07:37          15   
9  122000-12082023  12-Aug-23     2023   21:35:23          21   

     IncidentGroup      StopCodeDescription     SpecialServiceType  \
0  Special Service          Special Service   Effecting entry/exit   
1      False Alarm                      AFA                    NaN   
2      False Alarm                      AFA                    NaN   
3   

# Keep columns important to the model

In [32]:
import pandas as pd

# File path to the CSV file
file_path = '../data/raw/df_i.csv'

# Re-read the raw incidents so this cell starts from the file contents
df_i = pd.read_csv(file_path)

# Columns kept for the following steps
columns_to_keep = ['IncidentNumber', 'DateOfCall', 'CalYear', 'HourOfCall',
                   'IncGeo_BoroughName', 'Easting_m', 'Northing_m',
                   'Easting_rounded', 'Northing_rounded', 'Latitude',
                   'Longitude', 'FirstPumpArriving_AttendanceTime',
                   'FirstPumpArriving_DeployedFromStation']

# Select the columns into an independent copy and shorten the two
# 'FirstPumpArriving_*' names. rename() returns a new DataFrame, so no
# in-place mutation is needed and the cell can be re-run safely.
df_i2 = (
    df_i[columns_to_keep]
    .copy()
    .rename(columns={'FirstPumpArriving_AttendanceTime': 'AttendanceTime',
                     'FirstPumpArriving_DeployedFromStation': 'DeployedFromStation'})
)

# Display the first few rows of the DataFrame
print(df_i2.head())

# Summarise the DataFrame. info() prints by itself and returns None,
# so it must not be wrapped in print().
df_i2.info()


    IncidentNumber DateOfCall  CalYear  HourOfCall      IncGeo_BoroughName  \
0  043453-28032023  28-Mar-23     2023          23               ISLINGTON   
1  144295-18092023  18-Sep-23     2023           8  KENSINGTON AND CHELSEA   
2  197526-16122023  16-Dec-23     2023          12                  EALING   
3  137787-08092023  08-Sep-23     2023           8                  CAMDEN   
4  143525-16092023  16-Sep-23     2023          23                  MERTON   

   Easting_m  Northing_m  Easting_rounded  Northing_rounded   Latitude  \
0        NaN         NaN           532350            186150        NaN   
1        NaN         NaN           527850            178950        NaN   
2   517597.0    179596.0           517550            179550  51.503080   
3   530803.0    182694.0           530850            182650  51.528024   
4        NaN         NaN           526050            170950        NaN   

   Longitude  AttendanceTime DeployedFromStation  
0        NaN           140.0     St

# Check whether 'IncidentNumber' is unique

In [33]:
# Check whether 'IncidentNumber' is unique: count its distinct (non-null) values
# so the number can be compared with the row count of df_i2.
incident_ids = df_i2['IncidentNumber']
unique_ids_count = incident_ids.nunique()
print(f"Number of unique IncidentNumbers: {unique_ids_count}")


Number of unique IncidentNumbers: 10000


# Convert geodata and add columns

In [34]:

# Convert geodata and add columns


# Coordinate transformer from EPSG:27700 (OSGB, Ordnance Survey grid) to EPSG:4326.
# Requires `Transformer` to be imported from pyproj in the imports cell.
# No `always_xy` is passed, so each CRS keeps its own axis order; the code
# below feeds (easting, northing) and unpacks the result as (lat, lon).
osgb_proj = Transformer.from_crs(
    crs_from="EPSG:27700",
    crs_to="EPSG:4326",
)

# Convert one easting/northing pair to latitude/longitude via `osgb_proj`
def utm_to_wgs84(easting, northing):
    """Transform a single grid coordinate with the module-level `osgb_proj`.

    Despite the "utm" in the name, the transformer defined in this notebook
    goes from EPSG:27700 to EPSG:4326.

    Args:
        easting: Easting value passed as first argument to the transformer.
        northing: Northing value passed as second argument to the transformer.

    Returns:
        A 2-tuple ``(lat, lon)`` in the order returned by the transformer.
    """
    latitude, longitude = osgb_proj.transform(easting, northing)
    return (latitude, longitude)

# Row-level helper: convert both the exact and the rounded grid coordinates of a row
def utm_to_wgs84_wrapper(row):
    """Convert the exact and the rounded grid coordinates of one row.

    Args:
        row: Mapping-like row providing 'Easting_m', 'Northing_m',
            'Easting_rounded' and 'Northing_rounded'.

    Returns:
        pd.Series with the keys 'lat_cal', 'long_cal' (from the exact
        coordinates) and 'lat_cal_r', 'long_cal_r' (from the rounded ones),
        in that order.
    """
    # (easting column, northing column, latitude key, longitude key)
    column_pairs = (
        ('Easting_m', 'Northing_m', 'lat_cal', 'long_cal'),
        ('Easting_rounded', 'Northing_rounded', 'lat_cal_r', 'long_cal_r'),
    )
    converted = {}
    for east_col, north_col, lat_key, lon_key in column_pairs:
        converted[lat_key], converted[lon_key] = utm_to_wgs84(row[east_col], row[north_col])
    return pd.Series(converted)

# Run the row-wise conversion and attach the four resulting columns to df_i2
geo_columns = ['lat_cal', 'long_cal', 'lat_cal_r', 'long_cal_r']
with pd.option_context('mode.chained_assignment', None):  # silence SettingWithCopyWarning for this assignment
    df_i2[geo_columns] = df_i2.apply(utm_to_wgs84_wrapper, axis=1)

# Preview the first rows, now including the converted coordinates
print(df_i2.head())



    IncidentNumber DateOfCall  CalYear  HourOfCall      IncGeo_BoroughName  \
0  043453-28032023  28-Mar-23     2023          23               ISLINGTON   
1  144295-18092023  18-Sep-23     2023           8  KENSINGTON AND CHELSEA   
2  197526-16122023  16-Dec-23     2023          12                  EALING   
3  137787-08092023  08-Sep-23     2023           8                  CAMDEN   
4  143525-16092023  16-Sep-23     2023          23                  MERTON   

   Easting_m  Northing_m  Easting_rounded  Northing_rounded   Latitude  \
0        NaN         NaN           532350            186150        NaN   
1        NaN         NaN           527850            178950        NaN   
2   517597.0    179596.0           517550            179550  51.503080   
3   530803.0    182694.0           530850            182650  51.528024   
4        NaN         NaN           526050            170950        NaN   

   Longitude  AttendanceTime DeployedFromStation    lat_cal  long_cal  \
0        NaN 

# Read the CSV file stations_boroughs

In [35]:
# Read the CSV file stations_boroughs
file_path = '../data/external/stations_boroughs_1.csv'
df_stations_boroughs = pd.read_csv(file_path)

# Display the first 10 rows
print(df_stations_boroughs.head(10))

# Summarise the DataFrame. info() prints its report itself and returns None,
# so print(...) around it only added a stray "None" line.
df_stations_boroughs.info()

     borough  bor_sqkm       pop  pop_per_sqkm_bor  in_o_out         reg  \
0     Camden     21.80  210390.0            9651.0       1.0  north_west   
1     Camden     21.80  210390.0            9651.0       1.0  north_west   
2     Camden     21.80  210390.0            9651.0       1.0  north_west   
3  Greenwich     47.35  289254.0            6109.0       1.0  south_east   
4  Greenwich     47.35  289254.0            6109.0       1.0  south_east   
5  Greenwich     47.35  289254.0            6109.0       1.0  south_east   
6  Greenwich     47.35  289254.0            6109.0       1.0  south_east   
7  Greenwich     47.35  289254.0            6109.0       1.0  south_east   
8    Hackney     19.06  259956.0           13639.0       1.0  north_east   
9    Hackney     19.06  259956.0           13639.0       1.0  north_east   

   borough.1            stat c_stat  \
0     Camden          Euston    A23   
1     Camden   West Hampstad    A41   
2     Camden    Kentish town    A43   
3  Gree

# Merge with stations_boroughs to attach geoinfo of the stations

In [36]:
# Station attributes that are attached to every incident
station_columns = ['stat', 'lat', 'long', 'bor_sqkm', 'pop_per_stat', 'distance_stat']

# Left-join the incidents with the station table: 'DeployedFromStation' in df_i2
# is matched against 'stat' in df_stations_boroughs. All incidents are kept;
# incidents without a matching station get NaN in the station columns.
# NOTE(review): the join is an exact string match. In the recorded run 9476
# incidents carried a station name but only 8999 found a match - presumably
# spelling/case differences between the two sources; verify the station names.
df_mi2 = pd.merge(
    df_i2,
    df_stations_boroughs[station_columns],
    left_on='DeployedFromStation',
    right_on='stat',
    how='left',
)

# Display the first few rows of the merged DataFrame
print(df_mi2.head())

# info() prints by itself and returns None, so it is not wrapped in print()
df_mi2.info()

    IncidentNumber DateOfCall  CalYear  HourOfCall      IncGeo_BoroughName  \
0  043453-28032023  28-Mar-23     2023          23               ISLINGTON   
1  144295-18092023  18-Sep-23     2023           8  KENSINGTON AND CHELSEA   
2  197526-16122023  16-Dec-23     2023          12                  EALING   
3  137787-08092023  08-Sep-23     2023           8                  CAMDEN   
4  143525-16092023  16-Sep-23     2023          23                  MERTON   

   Easting_m  Northing_m  Easting_rounded  Northing_rounded   Latitude  ...  \
0        NaN         NaN           532350            186150        NaN  ...   
1        NaN         NaN           527850            178950        NaN  ...   
2   517597.0    179596.0           517550            179550  51.503080  ...   
3   530803.0    182694.0           530850            182650  51.528024  ...   
4        NaN         NaN           526050            170950        NaN  ...   

     lat_cal  long_cal  lat_cal_r  long_cal_r           

# Example data records when 'stat' is NaN

In [37]:
# Example records for which 'stat' is NaN, i.e. the merge found no station
station_missing = df_mi2['stat'].isna()
nan_station_records = df_mi2.loc[station_missing]

print("Example data records when 'stat' is NaN:")
print(nan_station_records)

Example data records when 'stat' is NaN:
       IncidentNumber DateOfCall  CalYear  HourOfCall IncGeo_BoroughName  \
10    197033-15122023  15-Dec-23     2023          16             CAMDEN   
15    028725-26022023  26-Feb-23     2023          18      TOWER HAMLETS   
16    188392-01122023  01-Dec-23     2023           5         WANDSWORTH   
18    088343-17062023  17-Jun-23     2023          15             EALING   
19    012588-26012023  26-Jan-23     2023           5              BRENT   
...               ...        ...      ...         ...                ...   
9973  057598-26042023  26-Apr-23     2023          10           HARINGEY   
9980  128328-23082023  23-Aug-23     2023          21             CAMDEN   
9985  050872-12042023  12-Apr-23     2023          19             EALING   
9997  005727-13012023  13-Jan-23     2023           3        WESTMINSTER   
9999  182176-20112023  20-Nov-23     2023          17             CAMDEN   

      Easting_m  Northing_m  Easting_rounded  

# Calculate and add 'distance' between incident and firestation

In [38]:

# Calculate and add 'distance' between incident and firestation

from geopy.distance import geodesic

def calculate_distance(row):
    """Return the geodesic distance in meters between incident and station.

    The incident position is taken from 'Latitude'/'Longitude' when both are
    present, otherwise from 'lat_cal_r'/'long_cal_r'. The station position is
    always 'lat'/'long'.

    Args:
        row: Mapping-like row with the keys 'Latitude', 'Longitude', 'lat',
            'long', 'lat_cal_r' and 'long_cal_r'.

    Returns:
        ``geodesic(...).meters`` for the chosen coordinate pair; ``None`` when
        'Latitude' and 'lat' are both present but one of them lies outside
        49..53; ``0`` when no usable coordinate pair exists.

    NOTE(review): the ``0`` fallback cannot be told apart from a genuine
    distance of zero - confirm that downstream steps filter these rows.
    """
    def has_values(*columns):
        # True only if every listed column holds a non-missing value
        return all(pd.notna(row[name]) for name in columns)

    # Plausibility check: reject rows whose latitudes fall outside 49..53 degrees
    if has_values('Latitude', 'lat'):
        in_range = 49 <= row['Latitude'] <= 53 and 49 <= row['lat'] <= 53
        if not in_range:
            return None

    # Pick the incident coordinates: recorded position first, rounded grid position second
    if has_values('Latitude', 'Longitude', 'lat', 'long'):
        incident_cols = ('Latitude', 'Longitude')
    elif has_values('lat_cal_r', 'long_cal_r', 'lat', 'long'):
        incident_cols = ('lat_cal_r', 'long_cal_r')
    else:
        return 0  # Replace NaN with 0 for distance

    incident_point = tuple(row[name] for name in incident_cols)
    station_point = (row['lat'], row['long'])
    return geodesic(incident_point, station_point).meters

# Compute the incident-to-station distance row by row and store it as 'distance'
incident_station_distance = df_mi2.apply(calculate_distance, axis=1)
df_mi2['distance'] = incident_station_distance

# Preview the first rows of the updated DataFrame
print(df_mi2.head())

    IncidentNumber DateOfCall  CalYear  HourOfCall      IncGeo_BoroughName  \
0  043453-28032023  28-Mar-23     2023          23               ISLINGTON   
1  144295-18092023  18-Sep-23     2023           8  KENSINGTON AND CHELSEA   
2  197526-16122023  16-Dec-23     2023          12                  EALING   
3  137787-08092023  08-Sep-23     2023           8                  CAMDEN   
4  143525-16092023  16-Sep-23     2023          23                  MERTON   

   Easting_m  Northing_m  Easting_rounded  Northing_rounded   Latitude  ...  \
0        NaN         NaN           532350            186150        NaN  ...   
1        NaN         NaN           527850            178950        NaN  ...   
2   517597.0    179596.0           517550            179550  51.503080  ...   
3   530803.0    182694.0           530850            182650  51.528024  ...   
4        NaN         NaN           526050            170950        NaN  ...   

   long_cal  lat_cal_r long_cal_r             stat      

In [39]:
# DataFrame.info() prints its report directly and returns None,
# so it is called without a print() wrapper (which would add a "None" line).
df_mi2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   IncidentNumber       10000 non-null  object 
 1   DateOfCall           10000 non-null  object 
 2   CalYear              10000 non-null  int64  
 3   HourOfCall           10000 non-null  int64  
 4   IncGeo_BoroughName   10000 non-null  object 
 5   Easting_m            4120 non-null   float64
 6   Northing_m           4120 non-null   float64
 7   Easting_rounded      10000 non-null  int64  
 8   Northing_rounded     10000 non-null  int64  
 9   Latitude             4120 non-null   float64
 10  Longitude            4120 non-null   float64
 11  AttendanceTime       9476 non-null   float64
 12  DeployedFromStation  9476 non-null   object 
 13  lat_cal              10000 non-null  float64
 14  long_cal             10000 non-null  float64
 15  lat_cal_r            10000 non-null  

# Delete 'Easting_m', 'Northing_m', 'Latitude', 'Longitude'

In [40]:
# Delete 'Easting_m', 'Northing_m', 'Latitude', 'Longitude'

# List of columns to remove
columns_to_remove = ['Easting_m', 'Northing_m', 'Latitude', 'Longitude']

# drop() returns a new DataFrame without these columns; df_mi2 is rebound to it.
# Re-running this cell raises a KeyError because the columns are already gone.
df_mi2 = df_mi2.drop(columns=columns_to_remove)

# Summarise the updated DataFrame. info() prints by itself and returns None,
# so it is not wrapped in print().
df_mi2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   IncidentNumber       10000 non-null  object 
 1   DateOfCall           10000 non-null  object 
 2   CalYear              10000 non-null  int64  
 3   HourOfCall           10000 non-null  int64  
 4   IncGeo_BoroughName   10000 non-null  object 
 5   Easting_rounded      10000 non-null  int64  
 6   Northing_rounded     10000 non-null  int64  
 7   AttendanceTime       9476 non-null   float64
 8   DeployedFromStation  9476 non-null   object 
 9   lat_cal              10000 non-null  float64
 10  long_cal             10000 non-null  float64
 11  lat_cal_r            10000 non-null  float64
 12  long_cal_r           10000 non-null  float64
 13  stat                 8999 non-null   object 
 14  lat                  8999 non-null   float64
 15  long                 8999 non-null   

# Remove all rows with NaN values

In [41]:
# Remove all rows that contain a NaN value in any column
# NOTE(review): 'lat_cal'/'long_cal' were reported as 10000 non-null although
# 'Easting_m'/'Northing_m' had only 4120 values, so whatever placeholder the
# coordinate transform produced for missing input is not removed here - TODO confirm.
df_mi2 = df_mi2.dropna()

# Summarise the updated DataFrame. info() prints by itself and returns None,
# so it is not wrapped in print().
df_mi2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8976 entries, 0 to 9998
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   IncidentNumber       8976 non-null   object 
 1   DateOfCall           8976 non-null   object 
 2   CalYear              8976 non-null   int64  
 3   HourOfCall           8976 non-null   int64  
 4   IncGeo_BoroughName   8976 non-null   object 
 5   Easting_rounded      8976 non-null   int64  
 6   Northing_rounded     8976 non-null   int64  
 7   AttendanceTime       8976 non-null   float64
 8   DeployedFromStation  8976 non-null   object 
 9   lat_cal              8976 non-null   float64
 10  long_cal             8976 non-null   float64
 11  lat_cal_r            8976 non-null   float64
 12  long_cal_r           8976 non-null   float64
 13  stat                 8976 non-null   object 
 14  lat                  8976 non-null   float64
 15  long                 8976 non-null   f

# Calculate target variable

In [42]:
# Target variable: 'AttendanceTime' is divided by 60 (seconds -> minutes,
# judging by the 'min' labels) and classified into 3-minute intervals.
attendance_bins_min = [0, 3, 6, 9, 12, 15, float('inf')]
attendance_labels = ['0-3min', '3-6min', '6-9min', '9-12min', '12-15min', '> 15min']

# pd.cut uses right-closed intervals by default: (0, 3], (3, 6], ...
# NOTE(review): a value of exactly 0 therefore falls outside the first bin and
# becomes NaN - confirm that 0 cannot occur in 'AttendanceTime'.
df_mi2['AttendanceTimeClasses3'] = pd.cut(
    df_mi2['AttendanceTime'] / 60,
    bins=attendance_bins_min,
    labels=attendance_labels,
)

# Display the updated DataFrame
print(df_mi2.head())

# info() prints by itself and returns None, so it is not wrapped in print()
df_mi2.info()

    IncidentNumber DateOfCall  CalYear  HourOfCall      IncGeo_BoroughName  \
0  043453-28032023  28-Mar-23     2023          23               ISLINGTON   
1  144295-18092023  18-Sep-23     2023           8  KENSINGTON AND CHELSEA   
2  197526-16122023  16-Dec-23     2023          12                  EALING   
3  137787-08092023  08-Sep-23     2023           8                  CAMDEN   
4  143525-16092023  16-Sep-23     2023          23                  MERTON   

   Easting_rounded  Northing_rounded  AttendanceTime DeployedFromStation  \
0           532350            186150           140.0     Stoke Newington   
1           527850            178950           283.0             Chelsea   
2           517550            179550           521.0            Chiswick   
3           530850            182650           273.0              Euston   
4           526050            170950           254.0           Wimbledon   

     lat_cal  ...  lat_cal_r  long_cal_r             stat        lat  \
0 

# Save processed file in ../data/processed/

In [43]:
# Save processed file in ../data/processed/

# Destination of the processed CSV file
output_file_path = '../data/processed/df_mi5.csv'

# Write df_mi2 to that path without the index column
df_mi2.to_csv(path_or_buf=output_file_path, index=False)
