 <span style="color:#42a5f5; font-size:2em; font-weight:bold;">Jointure des données d'incidents et de mobilisations</span>

 <span style="font-weight:bold">Ce notebook à pour but d'explorer et de réaliser une jointure entre les données de  mobilisations et d'incidents !</span>

<span style="color:#e91e63; font-size:1em; font-weight:bold;"> 1. Import des données précedemment chargées et nettoyées</span>

In [1]:
import pandas as pd
import warnings

# Supprimer les warnings pour une meilleure lisibilité
warnings.filterwarnings("ignore")

In [2]:
# Chargement avec typage explicite et parsing des dates
incidents_dtypes = {
    "IncidentNumber": str, "CalYear": "Int64", "HourOfCall": "Int64", "IncidentGroup": str,
    "StopCodeDescription": str, "SpecialServiceType": str, "PropertyCategory": str,
    "PropertyType": str, "AddressQualifier": str, "Postcode_full": str,
    "Postcode_district": str, "UPRN": str, "USRN": str,
    "IncGeo_BoroughCode": str, "IncGeo_BoroughName": str, "ProperCase": str,
    "IncGeo_WardCode": str, "IncGeo_WardName": str, "IncGeo_WardNameNew": str,
    "Easting_m": "Int64", "Northing_m": "Int64", "Easting_rounded": "Int64", "Northing_rounded": "Int64",
    "Latitude": str, "Longitude": str, "FRS": str, "IncidentStationGround": str,
    "FirstPumpArriving_AttendanceTime": "Int64", "FirstPumpArriving_DeployedFromStation": str,
    "SecondPumpArriving_AttendanceTime": "Int64", "SecondPumpArriving_DeployedFromStation": str,
    "NumStationsWithPumpsAttending": "Int64", "NumPumpsAttending": "Int64", "PumpCount": "int64",
    "PumpMinutesRounded": "int64", "NotionalCost": "int64", "NumCalls": "Int64"
}

mobilisations_dtypes = {
    "IncidentNumber": str, "CalYear": "Int64", "BoroughName": str, "WardName": str,
    "HourOfCall": "Int64", "ResourceMobilisationId": "int64", "Resource_Code": str,
    "PerformanceReporting": str, "TurnoutTimeSeconds": "Int64", "TravelTimeSeconds": "Int64",
    "AttendanceTimeSeconds": "Int64", "DeployedFromStation_Code": str, "DeployedFromStation_Name": str,
    "DeployedFromLocation": str, "PumpOrder": "Int64", "PlusCode_Code": str,
    "PlusCode_Description": str, "DelayCodeId": "Int64", "DelayCode_Description": str
}

# Parsing dates
date_cols_incidents = ["DateOfCall"]
date_cols_mobilisations = [
    "DateAndTimeMobilised", "DateAndTimeMobile", "DateAndTimeArrived",
    "DateAndTimeLeft", "DateAndTimeReturned"
]

# Chargement des CSV nettoyés
df_incidents = pd.read_csv("../../data/raw/Cleaned_data/InUSE/cleaned_data_incidents.csv", dtype=incidents_dtypes, parse_dates=date_cols_incidents)
df_mobilisations = pd.read_csv("../../data/raw/Cleaned_data/InUSE/cleaned_data_mobilisations.csv", dtype=mobilisations_dtypes, parse_dates=date_cols_mobilisations)


In [3]:
def clean_incident_number(val):
    if pd.isna(val):
        return None
    val = str(val).strip()
    if '.' in val:
        val = val.split('.')[0]
    if '-' in val:
        val = val.split('-')[0]
    return val.lstrip('0')

df_incidents["IncidentNumber_clean"] = df_incidents["IncidentNumber"].apply(clean_incident_number)
df_mobilisations["IncidentNumber_clean"] = df_mobilisations["IncidentNumber"].apply(clean_incident_number)
# Filtrage des incidents pour ne garder que ceux avec un IncidentNumber propre
df_incidents = df_incidents[df_incidents["IncidentNumber_clean"].notna()]
# Filtrage des mobilisations pour ne garder que celles avec un IncidentNumber propre
df_mobilisations = df_mobilisations[df_mobilisations["IncidentNumber_clean"].notna()]
# Réinitialisation des index après filtrage
df_incidents.reset_index(drop=True, inplace=True)
df_mobilisations.reset_index(drop=True, inplace=True)

In [4]:
print("Données incidents clean chargées:")
display(df_incidents)

print("Données mobilisations clean chargées:")
display(df_mobilisations)

print("Période des données d'incidents : De", min(df_incidents["CalYear"]), "à", max(df_incidents["CalYear"]))
print("Période des données de mobilisations : De", min(df_mobilisations["CalYear"]), "à", max(df_mobilisations["CalYear"]))

print("Vérification taille:")
print("Incidents :", len(df_incidents), " | Mobilisations :", len(df_mobilisations))

print("Incidents - IDs uniques :", df_incidents["IncidentNumber_clean"].nunique())
print("Mobilisations - IDs uniques :", df_mobilisations["IncidentNumber_clean"].nunique())

print("\nVérification doublons dans incidents :", df_incidents["IncidentNumber_clean"].duplicated().sum())
print("Vérification doublons dans mobilisations :", df_mobilisations["IncidentNumber_clean"].duplicated().sum())

print("\nNAs dans incidents :", df_incidents["IncidentNumber_clean"].isna().sum())
print("NAs dans mobilisations :", df_mobilisations["IncidentNumber_clean"].isna().sum())


Données incidents clean chargées:


Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,...,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,NotionalCost,NumCalls,IncidentNumber_clean
0,235138081.00,2009-01-01,2009,00:00:37,0,Special Service,Special Service,RTC,Road Vehicle,Car,...,Battersea,342,Clapham,2,2,2,60,255,1,235138081
1,1091.00,2009-01-01,2009,00:00:46,0,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,...,,,,,,1,60,255,1,1091
2,2091.00,2009-01-01,2009,00:03:00,0,Fire,Secondary Fire,,Outdoor,Road surface/pavement,...,Edmonton,,,1,1,1,60,255,2,2091
3,3091.00,2009-01-01,2009,00:04:27,0,Fire,Secondary Fire,,Outdoor,Domestic garden (vegetation not equipment),...,Hillingdon,,,1,1,1,60,255,2,3091
4,5091.00,2009-01-01,2009,00:05:39,0,Fire,Secondary Fire,,Outdoor,Cycle path/public footpath/bridleway,...,Holloway,250,Holloway,1,2,2,60,255,1,5091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1824968,054201-31032025,2025-03-31,2025,23:39:44,23,False Alarm,AFA,,Non Residential,Shopping Centre,...,Hammersmith,247,Hammersmith,2,3,3,60,430,2,54201
1824969,054203-31032025,2025-03-31,2025,23:40:09,23,Special Service,Special Service,Flooding,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,...,Holloway,,,1,1,1,60,430,1,54203
1824970,054204-31032025,2025-03-31,2025,23:42:16,23,Special Service,Special Service,Effecting entry/exit,Dwelling,Licensed House in Multiple Occupation - Up to ...,...,Harrow,,,1,1,1,60,430,1,54204
1824971,054205-31032025,2025-03-31,2025,23:42:42,23,Fire,Secondary Fire,,Outdoor,Loose refuse,...,East Ham,412,Stratford,2,2,2,60,430,2,54205


Données mobilisations clean chargées:


Unnamed: 0,IncidentNumber,CalYear,BoroughName,WardName,HourOfCall,ResourceMobilisationId,Resource_Code,PerformanceReporting,DateAndTimeMobilised,DateAndTimeMobile,...,DateAndTimeReturned,DeployedFromStation_Code,DeployedFromStation_Name,DeployedFromLocation,PumpOrder,PlusCode_Code,PlusCode_Description,DelayCodeId,DelayCode_Description,IncidentNumber_clean
0,000004-01012025,2025,HAMMERSMITH AND FULHAM,FULHAM REACH,0,6862256,H331,2,2025-01-01 00:02:00,2025-01-01 00:07:00,...,NaT,H33,Wandsworth,Home Station,2,Initial,Initial Mobilisation,12,Not held up,4
1,000004-01012025,2025,HAMMERSMITH AND FULHAM,FULHAM REACH,0,6862257,G261,1,2025-01-01 00:02:00,2025-01-01 00:02:00,...,NaT,G36,Hammersmith,Other Station,1,Initial,Initial Mobilisation,12,Not held up,4
2,000005-01012025,2025,MERTON,WEST BARNES,0,6862259,H401,1,2025-01-01 00:03:00,2025-01-01 00:04:00,...,NaT,H40,New Malden,Home Station,1,Initial,Initial Mobilisation,,,5
3,000006-01012025,2025,CROYDON,PURLEY OAKS & RIDDLESDOWN,0,6862260,H291,1,2025-01-01 00:04:00,2025-01-01 00:06:00,...,NaT,H29,Purley,Home Station,1,Initial,Initial Mobilisation,,,6
4,000007-01012025,2025,BARNET,BURNT OAK,0,6862261,G222,1,2025-01-01 00:05:00,2025-01-01 00:06:00,...,NaT,G22,Stanmore,Home Station,1,Initial,Initial Mobilisation,,,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2578271,221555-30122024,2024,KINGSTON UPON THAMES,COOMBE VALE,23,6861269,H391,2,2024-12-30 23:47:00,2024-12-30 23:49:00,...,NaT,H39,Surbiton,Home Station,2,Initial,Initial Mobilisation,9,"Traffic, roadworks, etc",221555
2578272,221556-30122024,2024,ISLINGTON,FINSBURY PARK,23,6861270,A311,1,2024-12-30 23:48:00,2024-12-30 23:49:00,...,NaT,A31,Holloway,Home Station,1,Initial,Initial Mobilisation,,,221556
2578273,221556-30122024,2024,ISLINGTON,FINSBURY PARK,23,6861271,F321,2,2024-12-30 23:48:00,2024-12-30 23:49:00,...,NaT,F32,Stoke Newington,Home Station,2,Initial,Initial Mobilisation,12,Not held up,221556
2578274,221557-30122024,2024,KENSINGTON AND CHELSEA,ROYAL HOSPITAL,23,6861272,G341,1,2024-12-30 23:59:00,2024-12-31 00:01:00,...,NaT,G34,Chelsea,Home Station,1,Initial,Initial Mobilisation,,,221557


Période des données d'incidents : De 2009 à 2025
Période des données de mobilisations : De 2009 à 2025
Vérification taille:
Incidents : 1824973  | Mobilisations : 2578276
Incidents - IDs uniques : 982931
Mobilisations - IDs uniques : 907153

Vérification doublons dans incidents : 842042
Vérification doublons dans mobilisations : 1671123

NAs dans incidents : 0
NAs dans mobilisations : 0


In [5]:
cols_incidents = set(df_incidents.columns)
cols_mobilisations = set(df_mobilisations.columns)
common_columns = cols_incidents.intersection(cols_mobilisations)

print("Colonnes communes :")
for col_name in sorted(common_columns):
    print(f"- {col_name}")

print(f"Nombre de colonnes communes : {len(common_columns)}")
print("\nVérification des types de données dans incidents :")
for col in df_incidents.columns:
    print(f"{col}: {df_incidents[col].dtype}")
print("\nVérification des types de données dans mobilisations :")
for col in df_mobilisations.columns:
    print(f"{col}: {df_mobilisations[col].dtype}")
print("\nVérification des doublons dans les colonnes communes :")
for col in common_columns:
    if df_incidents[col].duplicated().any() or df_mobilisations[col].duplicated().any():
        print(f"Colonne {col} a des doublons dans au moins un des DataFrames.")
    else:
        print(f"Colonne {col} n'a pas de doublons dans les deux DataFrames.")
print("\nVérification des valeurs uniques dans les colonnes communes :")
for col in common_columns:
    unique_incidents = df_incidents[col].nunique()
    unique_mobilisations = df_mobilisations[col].nunique()
    print(f"Colonne {col} - Incidents: {unique_incidents} uniques, Mobilisations: {unique_mobilisations} uniques")

Colonnes communes :
- CalYear
- HourOfCall
- IncidentNumber
- IncidentNumber_clean
Nombre de colonnes communes : 4

Vérification des types de données dans incidents :
IncidentNumber: object
DateOfCall: datetime64[ns]
CalYear: Int64
TimeOfCall: object
HourOfCall: Int64
IncidentGroup: object
StopCodeDescription: object
SpecialServiceType: object
PropertyCategory: object
PropertyType: object
AddressQualifier: object
Postcode_full: object
Postcode_district: object
UPRN: object
USRN: object
IncGeo_BoroughCode: object
IncGeo_BoroughName: object
ProperCase: object
IncGeo_WardCode: object
IncGeo_WardName: object
IncGeo_WardNameNew: object
Easting_m: Int64
Northing_m: Int64
Easting_rounded: Int64
Northing_rounded: Int64
Latitude: object
Longitude: object
FRS: object
IncidentStationGround: object
FirstPumpArriving_AttendanceTime: Int64
FirstPumpArriving_DeployedFromStation: object
SecondPumpArriving_AttendanceTime: Int64
SecondPumpArriving_DeployedFromStation: object
NumStationsWithPumpsAttendin

In [None]:
df_inner = df_incidents.merge(
    df_mobilisations,
    how="inner",
    on="IncidentNumber_clean",
    suffixes=("_incident", "_mobilisation")
)

print(f"Jointure INNER : {df_inner.shape[0]} lignes")

df_inner.to_csv("../../data/df_jointure_inner.csv", index=False)

joined_ratio = len(df_inner) / len(df_mobilisations)
print(f"Taux de jointure réussie : {joined_ratio:.2%}")

mobilisations_per_incident = df_mobilisations["IncidentNumber_clean"].value_counts()
print("Nb moyen de mobilisations par incident :", mobilisations_per_incident.mean())
print("Nb max de mobilisations pour un incident :", mobilisations_per_incident.max())
print("Nb min de mobilisations pour un incident :", mobilisations_per_incident.min())
print("Liste des incidents avec le plus de mobilisations :")
print(mobilisations_per_incident[mobilisations_per_incident == mobilisations_per_incident.max()])
print("\nListe des incidents avec le moins de mobilisations :")
print(mobilisations_per_incident[mobilisations_per_incident == mobilisations_per_incident.min()])
print("\nVérification des doublons dans les données jointes :")
print("Nombre de doublons dans les données jointes :", df_inner.duplicated().sum())
print("\nVérification des valeurs uniques dans les données jointes :")
for col in df_inner.columns:
    unique_count = df_inner[col].nunique()
    print(f"Colonne {col} - {unique_count} valeurs uniques")


Jointure INNER : 9962884 lignes
Taux de jointure réussie : 386.42%
Nb moyen de mobilisations par incident : 2.842162237241127
Nb max de mobilisations pour un incident : 61
Nb min de mobilisations pour un incident : 1
Liste des incidents avec le plus de mobilisations :
IncidentNumber_clean
84944    61
Name: count, dtype: int64

Liste des incidents avec le moins de mobilisations :
IncidentNumber_clean
221558    1
221539    1
221542    1
221546    1
221547    1
         ..
221512    1
221510    1
221557    1
221551    1
221549    1
Name: count, Length: 436886, dtype: int64

Vérification des doublons dans les données jointes :
Nombre de doublons dans les données jointes : 24131

Vérification des valeurs uniques dans les données jointes :
Colonne IncidentNumber_incident - 1746976 valeurs uniques
Colonne DateOfCall - 5931 valeurs uniques
Colonne CalYear_incident - 17 valeurs uniques
Colonne TimeOfCall - 86393 valeurs uniques
Colonne HourOfCall_incident - 24 valeurs uniques
Colonne IncidentGr

In [2]:
print(" vérification combien d'incidents sans mobilisations:")
no_mobilisations = df_incidents[~df_incidents["IncidentNumber_clean"].isin(df_mobilisations["IncidentNumber_clean"])]
print(f"Nombre d'incidents sans mobilisations : {len(no_mobilisations)}")
print("Liste des incidents sans mobilisations :")
print(no_mobilisations[["IncidentNumber_clean", "CalYear", "IncidentGroup"]].head(10))
print("\nVérification combien de mobilisations sans incidents:")
no_incidents = df_mobilisations[~df_mobilisations["IncidentNumber_clean"].isin(df_incidents["IncidentNumber_clean"])]
print(f"Nombre de mobilisations sans incidents : {len(no_incidents)}")
print("Liste des mobilisations sans incidents :")
print(no_incidents[["IncidentNumber_clean", "CalYear", "ResourceMobilisationId"]].head(10))

 vérification combien d'incidents sans mobilisations:


NameError: name 'df_incidents' is not defined

In [None]:
print(df_inner.columns)
print("Types de données après jointure :")
for col in df_inner.columns:
    print(f"{col}: {df_inner[col].dtype}")

dups_jointure = df_inner["IncidentNumber_clean"].duplicated().sum()
print(f"\nDoublons dans la jointure (IDs répétés) : {dups_jointure}")

print("\nColonnes critiques valeurs manquantes :")
print(df_inner[["IncidentNumber_clean", "DateAndTimeArrived", "AttendanceTimeSeconds"]].isna().sum())


Index(['IncidentNumber_incident', 'DateOfCall', 'CalYear_incident',
       'TimeOfCall', 'HourOfCall_incident', 'IncidentGroup',
       'StopCodeDescription', 'SpecialServiceType', 'PropertyCategory',
       'PropertyType', 'AddressQualifier', 'Postcode_full',
       'Postcode_district', 'UPRN', 'USRN', 'IncGeo_BoroughCode',
       'IncGeo_BoroughName', 'ProperCase', 'IncGeo_WardCode',
       'IncGeo_WardName', 'IncGeo_WardNameNew', 'Easting_m', 'Northing_m',
       'Easting_rounded', 'Northing_rounded', 'Latitude', 'Longitude', 'FRS',
       'IncidentStationGround', 'FirstPumpArriving_AttendanceTime',
       'FirstPumpArriving_DeployedFromStation',
       'SecondPumpArriving_AttendanceTime',
       'SecondPumpArriving_DeployedFromStation',
       'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'PumpCount',
       'PumpMinutesRounded', 'NotionalCost', 'NumCalls',
       'IncidentNumber_clean', 'IncidentNumber_mobilisation',
       'CalYear_mobilisation', 'BoroughName', 'WardName'

In [8]:
print("Années uniques :", sorted(df_inner["CalYear_incident"].unique()))

invalid_years = df_inner[~df_inner["CalYear_incident"].between(2009, 2025)]
print("Nombre d'années hors intervalle 2009–2025 :", len(invalid_years))
print("Années hors intervalle 2009–2025 :", invalid_years["CalYear_incident"].unique())

Années uniques : [np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
Nombre d'années hors intervalle 2009–2025 : 0
Années hors intervalle 2009–2025 : <IntegerArray>
[]
Length: 0, dtype: Int64


In [9]:
# Jointure OUTER complète
df_merge = df_incidents.merge(
    df_mobilisations,
    how="outer",
    on="IncidentNumber_clean",
    suffixes=("_incident", "_mobilisation"),
    indicator=True
)

In [12]:
# Extraction des cas selon le type de correspondance
df_outer = df_merge.copy()  # → Jointure OUTER complète     
df_left_only = df_merge[df_merge["_merge"] == "left_only"].copy()  # Incidents sans mobilisation
df_right_only = df_merge[df_merge["_merge"] == "right_only"].copy()# Mobilisations sans incident

# Affichage
print(f"Jointure OUTER complète : {df_outer.shape[0]} lignes")
print(f"→ Appariements (INNER) : {df_inner_from_outer.shape[0]} lignes")
print(f"→ Incidents sans mobilisation : {df_left_only.shape[0]} lignes")
print(f"→ Mobilisations sans incident : {df_right_only.shape[0]} lignes")

MemoryError: Unable to allocate 1.80 GiB for an array with shape (24, 10043305) and data type object

In [None]:
output_path = "../../data/processed/"
df_outer.to_csv("../../data/df_jointure_outer.csv", index=False)
df_left_only.to_csv(f"{output_path}df_incidents_without_mobilisations.csv", index=False)
df_right_only.to_csv(f"{output_path}df_mobilisations_without_incidents.csv", index=False)
print("Fichiers de jointure sauvegardés dans :", output_path)
print("Nettoyage et jointure des données terminés avec succès.")