<span style="color:#42a5f5; font-size:2em; font-weight:bold;">Notebook de chargement des données d'incidents et nettoyage</span>

<span style="color:#e91e63; font-size:1em; font-weight:bold;">1. Import des bibliothèques Python</span>

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path

In [2]:
# Pour afficher toutes les colonnes et tout vérifier!
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

<span style="color:#e91e63; font-size:1em; font-weight:bold;"> 2.Définition des chemins propres "</span>

In [3]:
# Revenir au dossier parent
base_path = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Dossiers contenant les fichiers incidents 
incidents_path = os.path.join(base_path, "data", "raw", "incidents")


# Vérifier que les chemins existent
print("Répertoire courant :", os.getcwd())
print("-Chemin incidents :", incidents_path)

# Fonction pour lister les fichiers dans les dossiers incidents
def lister_fichiers(dossier):
    if os.path.exists(dossier):
        fichiers = os.listdir(dossier)
        if fichiers:
            print(f"\nFichiers dans {dossier} :")
            for f in fichiers:
                print("   -", f)
        else:
            print(f"\nLe dossier {dossier} est vide.")
    else:
        print(f"\nLe chemin {dossier} n'existe pas.")

# Lister les fichiers dans incidents
lister_fichiers(incidents_path)

Répertoire courant : c:\Users\9609241C\london-fire-response\notebooks\Ingestion
-Chemin incidents : c:\Users\9609241C\london-fire-response\data\raw\incidents

Fichiers dans c:\Users\9609241C\london-fire-response\data\raw\incidents :
   - incident Metadata.xlsx
   - incidents_2009_2017.csv
   - Incidents_2018_2025.csv


<span style="color:#e91e63; font-size:1em; font-weight:bold;"> 3.Chargement des fichiers CSV "incidents"</span>

In [35]:
incident_files = glob.glob(os.path.join(incidents_path, "*.csv"))

# On charge tous les fichiers et on les regroupe
df_all_incidents = pd.concat([pd.read_csv(f, dtype={'IncidentNumber': str}) for f in incident_files], ignore_index=True)

# Définition initiale des types pour une lecture robuste
from IPython.display import display
# Affichage interactif du DataFrame
display(df_all_incidents.head(10)) 

# Aperçu rapide des données
print("Données incidents chargées !")
print("-Nombre de lignes :", len(df_all_incidents))
print("-Nombre de colonnes :", len(df_all_incidents.columns))
print("-Colonnes :", df_all_incidents.columns.tolist())
print("-Types de données :", df_all_incidents.dtypes)
# Affichage avec mise en forme
#df_all_incidents.head(20).style.set_table_attributes("style='display:inline'").set_caption("Aperçu des incidents")

Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,UPRN,USRN,IncGeo_BoroughCode,IncGeo_BoroughName,ProperCase,IncGeo_WardCode,IncGeo_WardName,IncGeo_WardNameNew,Easting_m,Northing_m,Easting_rounded,Northing_rounded,Latitude,Longitude,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,Notional Cost (£),NumCalls
0,235138081.0,01-Jan-09,2009,00:00:37,0,Special Service,Special Service,RTC,Road Vehicle,Car,In street close to gazetteer location,SW11 4LB,SW11,,,E09000032,WANDSWORTH,Wandsworth,E05014010,Battersea Park,Battersea Park,528652.0,176830.0,528650,176850,51.475812,-0.148894,London,Battersea,319.0,Battersea,342.0,Clapham,2.0,2.0,2,60,255,1.0
1,1091.0,01-Jan-09,2009,00:00:46,0,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,Open land/water - nearest gazetteer location,SE1 7SG,SE1,,,E09000022,LAMBETH,Lambeth,E05014118,Waterloo & South Bank,Waterloo & South Bank,530485.0,179007.0,530450,179050,51.494957,-0.121712,London,Lambeth,,,,,,,1,60,255,1.0
2,2091.0,01-Jan-09,2009,00:03:00,0,Fire,Secondary Fire,,Outdoor,Road surface/pavement,In street outside gazetteer location,N9 9EL,N9,,,E09000010,ENFIELD,Enfield,E05013682,Haselbury,Haselbury,533773.0,194492.0,533750,194450,51.633342,-0.068488,London,Edmonton,308.0,Edmonton,,,1.0,1.0,1,60,255,2.0
3,3091.0,01-Jan-09,2009,00:04:27,0,Fire,Secondary Fire,,Outdoor,Domestic garden (vegetation not equipment),On land associated with building,UB10 0DG,UB10,100021500000.0,21401491.0,E09000017,HILLINGDON,Hillingdon,E05013571,Hillingdon East,Hillingdon East,507738.0,182805.0,507750,182850,51.533882,-0.448089,London,Hillingdon,210.0,Hillingdon,,,1.0,1.0,1,60,255,2.0
4,5091.0,01-Jan-09,2009,00:05:39,0,Fire,Secondary Fire,,Outdoor,Cycle path/public footpath/bridleway,In street outside gazetteer location,N7 8HG,N7,,,E09000019,ISLINGTON,Islington,E05013708,Laycock,Laycock,531058.0,185307.0,531050,185350,51.551441,-0.11112,London,Holloway,233.0,Holloway,250.0,Holloway,1.0,2.0,2,60,255,1.0
5,6091.0,01-Jan-09,2009,00:06:03,0,False Alarm,AFA,,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,,NW5,0.0,20499122.0,E09000007,CAMDEN,Camden,E05013663,Kentish Town North,Kentish Town North,,,529450,185250,,,London,Kentish Town,172.0,Kentish Town,222.0,Kentish Town,1.0,2.0,2,60,255,1.0
6,8091.0,01-Jan-09,2009,00:12:31,0,Special Service,Special Service,RTC,Road Vehicle,Car,On motorway / elevated road,SE3 9PP,SE3,,,E09000011,GREENWICH,Greenwich,E05014084,Kidbrooke Village & Sutcliffe,Kidbrooke Village & Sutcliffe,541086.0,175646.0,541050,175650,51.462211,0.029557,London,Lee Green,522.0,East Greenwich,,,1.0,1.0,1,60,255,1.0
7,9091.0,01-Jan-09,2009,00:13:42,0,Fire,Secondary Fire,,Outdoor Structure,Refuse/rubbish tip,On land associated with building,TW3 2RE,TW3,100021600000.0,21500957.0,E09000018,HOUNSLOW,Hounslow,E05013623,Hounslow South,Hounslow South,514529.0,174907.0,514550,174950,51.46156,-0.352783,London,Heston,342.0,Heston,349.0,Heston,1.0,2.0,2,89,378,1.0
8,10091.0,01-Jan-09,2009,00:14:51,0,Fire,Secondary Fire,,Outdoor Structure,Small refuse/rubbish container,On land associated with building,EC1V 7PB,EC1V,5300037000.0,21604971.0,E09000019,ISLINGTON,Islington,E05013711,St. Peter's & Canalside,St. Peter's & Canalside,531902.0,182654.0,531950,182650,51.527403,-0.099948,London,Shoreditch,255.0,Clerkenwell,,,1.0,1.0,1,60,255,2.0
9,11091.0,01-Jan-09,2009,00:14:57,0,False Alarm,AFA,,Other Residential,Student Hall of Residence,Correct incident location,WC1B 3RA,WC1B,5048015.0,20401090.0,E09000007,CAMDEN,Camden,E05013653,Bloomsbury,Bloomsbury,529914.0,181705.0,529950,181750,51.519334,-0.128939,London,Soho,297.0,Euston,,,1.0,1.0,1,60,255,1.0


Données incidents chargées !
-Nombre de lignes : 1824973
-Nombre de colonnes : 39
-Colonnes : ['IncidentNumber', 'DateOfCall', 'CalYear', 'TimeOfCall', 'HourOfCall', 'IncidentGroup', 'StopCodeDescription', 'SpecialServiceType', 'PropertyCategory', 'PropertyType', 'AddressQualifier', 'Postcode_full', 'Postcode_district', 'UPRN', 'USRN', 'IncGeo_BoroughCode', 'IncGeo_BoroughName', 'ProperCase', 'IncGeo_WardCode', 'IncGeo_WardName', 'IncGeo_WardNameNew', 'Easting_m', 'Northing_m', 'Easting_rounded', 'Northing_rounded', 'Latitude', 'Longitude', 'FRS', 'IncidentStationGround', 'FirstPumpArriving_AttendanceTime', 'FirstPumpArriving_DeployedFromStation', 'SecondPumpArriving_AttendanceTime', 'SecondPumpArriving_DeployedFromStation', 'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'PumpCount', 'PumpMinutesRounded', 'Notional Cost (£)', 'NumCalls']
-Types de données : IncidentNumber                             object
DateOfCall                                 object
CalYear                

<span style="color:#e91e63; font-size:1em; font-weight:bold;">4. Nettoyage et typage </span>

In [39]:
# Nettoyage et transformations
# 1. Conversion de 'DateOfCall' en format de date
# gérer les erreurs de format de date remontées!
df_all_incidents['DateOfCall'] = pd.to_datetime(df_all_incidents['DateOfCall'], format='%d-%b-%y', errors='coerce')
    
# 2. Remplacement des virgules par des points et conversion
df_all_incidents['Latitude'] = df_all_incidents['Latitude'].astype(str).str.replace(',', '.', regex=False)
df_all_incidents['Longitude'] = df_all_incidents['Longitude'].astype(str).str.replace(',', '.', regex=False)
   
# 3. Nettoyage de 'UPRN','USRN' (suppression des virgules)
df_all_incidents['UPRN'] = df_all_incidents['UPRN'].astype(str).str.replace(',', '', regex=False)
df_all_incidents['USRN'] = df_all_incidents['USRN'].astype(str).str.replace(',', '', regex=False)
  
# 4. Conversion en entier
colonnes_entiers = ['NumCalls','Easting_m','Northing_m','Easting_rounded','Northing_rounded','NumStationsWithPumpsAttending','NumPumpsAttending','FirstPumpArriving_AttendanceTime','SecondPumpArriving_AttendanceTime']
for col in colonnes_entiers:
 if col in df_all_incidents.columns:
  df_all_incidents[col] = pd.to_numeric(df_all_incidents[col], errors='coerce').astype('Int64')

  
# 5. Renommage de la colonne 'Notional Cost (£)' en 'NotionalCost'
df_all_incidents = df_all_incidents.rename(columns={'Notional Cost (£)': 'NotionalCost'})
  
print("Les données sont maintenant nettoyées et transformées !")
print("Aperçu des 5 premières lignes du DataFrame nettoyé :"),display(df_all_incidents.head())
print("Informations sur le DataFrame (types de données et valeurs non-nulles) :")
print("Taille:",df_all_incidents.shape)
print(display(df_all_incidents.info()))
print(display(df_all_incidents.describe()))

Les données sont maintenant nettoyées et transformées !
Aperçu des 5 premières lignes du DataFrame nettoyé :


Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,UPRN,USRN,IncGeo_BoroughCode,IncGeo_BoroughName,ProperCase,IncGeo_WardCode,IncGeo_WardName,IncGeo_WardNameNew,Easting_m,Northing_m,Easting_rounded,Northing_rounded,Latitude,Longitude,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,NotionalCost,NumCalls
0,235138081.0,2009-01-01,2009,00:00:37,0,Special Service,Special Service,RTC,Road Vehicle,Car,In street close to gazetteer location,SW11 4LB,SW11,,,E09000032,WANDSWORTH,Wandsworth,E05014010,Battersea Park,Battersea Park,528652,176830,528650,176850,51.47581173,-0.148894436,London,Battersea,319.0,Battersea,342.0,Clapham,2.0,2.0,2,60,255,1
1,1091.0,2009-01-01,2009,00:00:46,0,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,Open land/water - nearest gazetteer location,SE1 7SG,SE1,,,E09000022,LAMBETH,Lambeth,E05014118,Waterloo & South Bank,Waterloo & South Bank,530485,179007,530450,179050,51.49495659,-0.12171203,London,Lambeth,,,,,,,1,60,255,1
2,2091.0,2009-01-01,2009,00:03:00,0,Fire,Secondary Fire,,Outdoor,Road surface/pavement,In street outside gazetteer location,N9 9EL,N9,,,E09000010,ENFIELD,Enfield,E05013682,Haselbury,Haselbury,533773,194492,533750,194450,51.63334229,-0.068487965,London,Edmonton,308.0,Edmonton,,,1.0,1.0,1,60,255,2
3,3091.0,2009-01-01,2009,00:04:27,0,Fire,Secondary Fire,,Outdoor,Domestic garden (vegetation not equipment),On land associated with building,UB10 0DG,UB10,100021491527.0,21401491.0,E09000017,HILLINGDON,Hillingdon,E05013571,Hillingdon East,Hillingdon East,507738,182805,507750,182850,51.53388171,-0.448088609,London,Hillingdon,210.0,Hillingdon,,,1.0,1.0,1,60,255,2
4,5091.0,2009-01-01,2009,00:05:39,0,Fire,Secondary Fire,,Outdoor,Cycle path/public footpath/bridleway,In street outside gazetteer location,N7 8HG,N7,,,E09000019,ISLINGTON,Islington,E05013708,Laycock,Laycock,531058,185307,531050,185350,51.55144063,-0.111120228,London,Holloway,233.0,Holloway,250.0,Holloway,1.0,2.0,2,60,255,1


Informations sur le DataFrame (types de données et valeurs non-nulles) :
Taille: (1824973, 39)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1824973 entries, 0 to 1824972
Data columns (total 39 columns):
 #   Column                                  Dtype         
---  ------                                  -----         
 0   IncidentNumber                          object        
 1   DateOfCall                              datetime64[ns]
 2   CalYear                                 int64         
 3   TimeOfCall                              object        
 4   HourOfCall                              int64         
 5   IncidentGroup                           object        
 6   StopCodeDescription                     object        
 7   SpecialServiceType                      object        
 8   PropertyCategory                        object        
 9   PropertyType                            object        
 10  AddressQualifier                        object        
 11  Postcod

None

None


Unnamed: 0,DateOfCall,CalYear,HourOfCall,Easting_m,Northing_m,Easting_rounded,Northing_rounded,FirstPumpArriving_AttendanceTime,SecondPumpArriving_AttendanceTime,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,NotionalCost,NumCalls
count,1824973,1824973.0,1824973.0,905076.0,905076.0,1824973.0,1824973.0,1686620.0,654385.0,1810351.0,1810351.0,1824973.0,1824973.0,1824973.0,1823124.0
mean,2017-03-12 20:50:59.275944960,2016.695,13.4626,530630.371066,180366.797409,530658.198916,180484.681445,318.399393,395.596258,1.370929,1.555361,1.592914,76.24512,407.816,1.327807
min,2009-01-01 00:00:00,2009.0,0.0,503582.0,155901.0,503550.0,155950.0,1.0,1.0,1.0,1.0,1.0,60.0,255.0,1.0
25%,2012-09-29 00:00:00,2012.0,9.0,525176.0,175993.0,525250.0,176150.0,231.0,295.0,1.0,1.0,1.0,60.0,290.0,1.0
50%,2017-04-15 00:00:00,2017.0,14.0,530796.0,180983.0,530950.0,181050.0,297.0,369.0,1.0,1.0,1.0,60.0,328.0,1.0
75%,2021-09-02 00:00:00,2021.0,19.0,536834.0,184949.0,536350.0,185150.0,379.0,464.0,2.0,2.0,2.0,60.0,364.0,1.0
max,2025-03-31 00:00:00,2025.0,23.0,561126.0,200906.0,611150.0,302450.0,1200.0,1200.0,46.0,106.0,951.0,525629.0,2277726.0,369.0
std,,4.902916,6.258045,10339.286666,7435.775843,9725.050698,7360.097705,138.476331,153.169852,0.687726,0.845371,1.582791,562.3934,2701.663,1.487098


None


In [34]:
#Vérifier la période des données d'incidents : doit etre de 2009 à 2025
print("Période des données d'incidents chargées: De", min(df_all_incidents["CalYear"]), "à", max(df_all_incidents["CalYear"]))

Période des données d'incidents chargées: De 2009 à 2025


<span style="color:#e91e63; font-size:1em; font-weight:bold;">5. Export des résultats nettoyés</span>

In [None]:
# Définition du chemin d’export
output_path = Path("../../data/raw/Cleaned_data/InUSE/cleaned_data_incidents.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Exporter au format CSV
df_all_incidents.to_csv(output_path, index=False, encoding='utf-8')

print("l'export est mnt terminé => le fichier est sauvegardé dans:", output_path)

l'export est mnt terminé => le fichier est sauvegardé dans: data\raw\Cleaned_data\InUSE\cleaned_data_incidents.csv
