In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
geolocator = Nominatim(user_agent="my_app")

In [8]:
def summarize_dataset(file_path):
    # Read the Parquet file
    df = pd.read_parquet(file_path)
    
    # Calculate missing values per column
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    data_types = df.dtypes  # Get the data types of each column
    
    # Prepare the summary DataFrame
    summary_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing Values': missing_values.values,
        'Percentage Missing (%)': missing_percentage.values,
        'Data Type': data_types.values  # Add the data types to the summary
    })
    
    # Sort the summary DataFrame by the number of missing values, descending
    summary_df = summary_df.sort_values(by='Missing Values', ascending=False)
    
    # Reset index for neat presentation
    summary_df.reset_index(drop=True, inplace=True)
    
    return summary_df

def summarize_dataframe(df):
    # Calculate missing values per column
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    data_types = df.dtypes  # Get the data types of each column
    
    # Prepare the summary DataFrame
    summary_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing Values': missing_values.values,
        'Percentage Missing (%)': missing_percentage.values,
        'Data Type': data_types.values  # Add the data types to the summary
    })
    
    # Sort the summary DataFrame by the number of missing values, descending
    summary_df = summary_df.sort_values(by='Missing Values', ascending=False)
    
    # Reset index for neat presentation
    summary_df.reset_index(drop=True, inplace=True)
    
    return summary_df

parquet_files = {
    "aed_locations": r"data\aed_locations.parquet.gzip",
    "cad9": r"data\cad9.parquet.gzip",
    "interventions_bxl": r"data\interventions_bxl.parquet.gzip",
    "ambulance_locations": r"data\ambulance_locations.parquet.gzip",
    "interventions_bxl": r"data\interventions_bxl.parquet.gzip",
    "interventions_bxl2": r"data\interventions_bxl2.parquet.gzip",
    "interventions1": r"data\interventions1.parquet",
    "interventions2": r"data\interventions2.parquet",
    "interventions3": r"data\interventions3.parquet",
    "mug_locations": r"data\mug_locations.parquet.gzip",
    "pit_locations": r"data\pit_locations.parquet.gzip"
}
    

#### Drop 
- PostalCode permanence,CityName permanence,StreetName permanence,HouseNumber permanence, -> These are already in the long and lat value + high missing value
- Permanence short name,	Permanence long name -> can be used as identifier but delete at least one if not both 
- EventType Firstcall, EventType Trip -> keep only those realted to heart problems: Chest pain, P039 - Cardiac problem (other than thoracic pain)','P019 - Unconscious - syncope', 'P003 - Cardiac arrest','P038 - Person does not answer the call', 'P008 - Patient with defibrillator - pacemaker'
- Delete every observation that has a 'abondon reason?' as these people were 'fixed' 
- drop everything related to location that is not coordiantes 

Transform T0 -> T9 to a date column for the incident and a time column; this can be used to calculate time to get to person and to see how long it takes for the person to get to the hospital 

All 'intervention' datasets have simmilar structure so i propose the same as above for all. 

### Cleaning Data 

In [81]:
def process_interventions_dataset(file_path):
    # List of heart disorder related event types
    heart_disorder_events = [
        'P011 - Chest pain',
        'P039 - Cardiac problem (other than thoracic pain)',
        'P019 - Unconscious - syncope',
        'P003 - Cardiac arrest',
        'P038 - Person does not answer the call',
        'P008 - Patient with defibrillator - pacemaker'
    ]

    # Columns to drop
    columns_to_drop = [
    'name_destination_hospital',
    'postalcode_destination_hospital', 'cityname_destination_hospital',
    'streetname_destination_hospital', 'housenumber_destination_hospital',
    'eventtype_firstcall',
    'permanence_long_name', 
    'service_name', 'abandon_reason',
    'unavailable_time', 't9','permanence_short_name',
    'eventlevel_firstcall','eventlevel_trip',
    't1confirmed','housenumber_permanence'
]

    
    # Load the dataset for parquet
    df = pd.read_parquet(file_path)

    #Make all the intervetion datasets uniform: make lowercase + add _ as spacing
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    
    # Filter based on heart disorders
    df = df[df['eventtype_trip'].isin(heart_disorder_events)]
    
    # Further filtering to remove rows with an abandon reason as these people are okay
    df = df[df['abandon_reason'].isna()]
       
    # Drop some columns that are in some datasets but not in all
    if 'province_intervention' in df.columns:
        columns_to_drop.append('province_intervention')

    if 'intervention_time_(t1reported)' in df.columns:
        columns_to_drop.append('intervention_time_(t1reported)')  

    if 'departure_time_(t1reported)' in df.columns:
        columns_to_drop.append('departure_time_(t1reported)')  
 

    # drop the rest of the columns 
    df.drop(columns=columns_to_drop, inplace=True)
    
    # Extracting the date that is a string from t0 (least missing values)
    df['date'] = df['t0'].str.extract('(\d{2}[A-Z]{3}\d{2})')[0]
    
    #Extracting the time for t2 till t7 
    t2_till_t7 = ['t2','t3', 't4', 't5', 't6', 't7']

    for col in t2_till_t7:
        # Convert to datetime
        df[col] = pd.to_datetime(df[col], errors='coerce')
        # Extract only the time part and replace the column
        df[col] = df[col].dt.strftime('%H:%M:%S')

    #Fixing t0 and t1 as they have string value in the data
    date_prefix_pattern = r'\d{2}[A-Z]{3}\d{2}:' #this sets the parameters that need to be removerd 01JUN22:

    df['t0'] = df['t0'].str.replace(date_prefix_pattern, '', regex=True)
    df['t1'] = df['t1'].str.replace(date_prefix_pattern, '', regex=True)

    
    # Convert 'date' column to the format dd/mm/yy
    df['date'] = pd.to_datetime(df['date'], format='%d%b%y').dt.strftime('%d/%m/%y')
    
    df['cityname_permanence'] = df['cityname_permanence'].str.replace(r"\s*\([^()]*\)", "", regex=True)
    df['cityname_intervention'] = df['cityname_intervention'].str.replace(r"\s*\([^()]*\)", "", regex=True)
    return df



def process_interventions_BXL2_dataset(file_path):
    # List of heart disorder related event types
    heart_disorder_events = [
        'P026 N02 - ONWEL ZONDER DUIDELIJKE REDEN',
        'P008 N03 - PATIËNT MET DEFIBRILLATOR OF PACEMAKER',
        'P011 N05 - PIJN OP DE BORST (Chest Pain)',
        'P039 N05 - CARDIAAL PROBLEEM (ANDERE DAN PIJN AAN DE BORST)',
        'P011 N01 - PIJN OP DE BORST (Chest Pain)',
        'P039 N03 - CARDIAAL PROBLEEM (ANDERE DAN PIJN AAN DE BORST)', 
        'P011 N03 - PIJN OP DE BORST (Chest Pain)',
        'P039 N01 - CARDIAAL PROBLEEM (ANDERE DAN PIJN AAN DE BORST)',
        'P011 N04 - PIJN OP DE BORST (Chest Pain)',
        'P003  N01 - HARTSTILSTAND - DOOD - OVERLEDEN',
        'P019 N01 - Bewusteloos - coma - syncope',
       'P059 N05 - Duizeligheid - onpasselijk',
       'P019 N03 - Bewusteloos - coma - syncope',
       'P026 N01 - ONWEL ZONDER DUIDELIJKE REDEN',
    ]

    # Columns to drop
    columns_to_drop = [
    'description_nl','ic_description_nl',
    'permanence_long_name_nl',
    'permanence_long_name_fr',
    'service_name_nl',
    'vector_type_fr',
    'abandon_reason_fr',
    'permanence_short_name_nl',
    'permanence_short_name_fr',
    'name_destination_hospital',
    'cityname_destination_hospital',
    'streetname_destination_hospital', 
    'housenumber_destination_hospital',
    'housenumber_permanence',
    'eventtype_and_eventlevel',
    'service_name_fr',
    'abandon_reason_nl',
    'creationtime',
    'vector_type_nl'
    ]
    
    # Load the dataset for parquet
    df = pd.read_parquet(file_path)

    #Make all the intervetion datasets uniform: make lowercase + add _ as spacing
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    
    #Make all the intervetion datasets uniform: make lowercase + add _ as spacing
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    
    # Filter based on heart disorders
    df = df[df['eventtype_and_eventlevel'].isin(heart_disorder_events)]
        
    # Further filtering to remove rows with an abandon reason as these people are okay
    df = df[df['abandon_reason_nl'].isna()]
       
    # Extracting the date that is a string from t0 (least missing values)
    df['date'] = df['t0'].str.extract('(\d{2}[A-Z]{3}\d{2})')[0]
    
    #Extracting the time for t2 till t7 
    t2_till_t7 = ['t2','t3', 't4', 't5', 't6', 't7']

    for col in t2_till_t7:
        # Convert to datetime
        df[col] = pd.to_datetime(df[col], errors='coerce')
        # Extract only the time part and replace the column
        df[col] = df[col].dt.strftime('%H:%M:%S')

    #Fixing t0 and t1 as they have string value in the data
    date_prefix_pattern = r'\d{2}[A-Z]{3}\d{2}:' #this sets the parameters that need to be removerd 01JUN22:

    df['t0'] = df['t0'].str.replace(date_prefix_pattern, '', regex=True)
    df['t1'] = df['t1'].str.replace(date_prefix_pattern, '', regex=True)
    
    #Getting the postalcode + cleaning cityname columns
    df['postalcode_intervention'] = df['cityname_intervention'].str.extract(r'(\d{4})')[0]
    df['cityname_intervention'] = df['cityname_intervention'].str.extract(r'\((.*?)\)')[0]
    df['postalcode_permanence'] = df['cityname_permanence'].str.extract(r'(\d{4})')[0]
    df['cityname_permanence'] = df['cityname_permanence'].str.extract(r'\((.*?)\)')[0]
    df['vector_type']=df['vector_type_nl']
    
    #Changing date format 
    df['date'] = pd.to_datetime(df['date'], format='%d%b%y').dt.strftime('%d/%m/%y')

    # drop the rest of the columns 
    df.drop(columns=columns_to_drop, inplace=True)
    return df

#Function to count duplicate mission id's 
def count_same_mission(df):
    duplicate_mission_ids = df.duplicated(subset='mission_id', keep=False)
    return duplicate_mission_ids.sum()


In [77]:
file_path_interventions1 = "data/interventions1.parquet"
df_internetions_1 = process_interventions_dataset(file_path_interventions1)
file_path_interventions2 = "data/interventions2.parquet"
df_internetions_2 = process_interventions_dataset(file_path_interventions2)
file_path_interventions3 = "data/interventions3.parquet"
df_internetions_3 = process_interventions_dataset(file_path_interventions3)

file_path_bxl2 = "data\interventions_bxl2.parquet.gzip"
df_interventions_bxl2 = process_interventions_BXL2_dataset(file_path_bxl2)

In [108]:
def process_interventions_BXL_dataset(file_path):
    # List of heart disorder related event types
    heart_disorder_events = [
        'P011 - Chest pain',
        'P039 - Cardiac problem (other than thoracic pain)',
        'P019 - Unconscious - syncope',
        'P003 - Cardiac arrest',
        'P038 - Person does not answer the call',
        'P008 - Patient with defibrillator - pacemaker'
    ]

    # Columns to drop
    columns_to_drop = [
    'name_destination_hospital',
    'postalcode_destination_hospital', 'cityname_destination_hospital',
    'streetname_destination_hospital', 'housenumber_destination_hospital',
    'eventtype_firstcall',
    'permanence_long_name', 
    'service_name', 'abandon_reason',
    'unavailable_time', 't9','permanence_short_name',
    'eventlevel_firstcall','eventlevel_trip',
    't1confirmed','housenumber_permanence',
    't0_str',
    'eventtype_trip'
]

    
    # Load the dataset for parquet
    df = pd.read_parquet(file_path)

    #Make all the intervetion datasets uniform: make lowercase + add _ as spacing
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    
    # Filter based on heart disorders
    df = df[df['eventtype_trip'].isin(heart_disorder_events)]
    
    # Further filtering to remove rows with an abandon reason as these people are okay
    df = df[df['abandon_reason'].isna()]
       
    # Drop some columns that are in some datasets but not in all
    if 'province_intervention' in df.columns:
        columns_to_drop.append('province_intervention')

    if 'intervention_time_(t1reported)' in df.columns:
        columns_to_drop.append('intervention_time_(t1reported)')  

    if 'departure_time_(t1reported)' in df.columns:
        columns_to_drop.append('departure_time_(t1reported)')  
 

    
    df['t0_str'] = df['t0'].astype(str)
    df['date'] = df['t0_str'].str.extract(r'(\d{4})-(\d{2})-(\d{2})').apply(lambda x: f"{x[2]}/{x[1]}/{x[0][2:]}", axis=1)

    


    

    
   

    
    df['cityname_permanence'] = df['cityname_permanence'].str.replace(r"\s*\([^()]*\)", "", regex=True)
    df['cityname_intervention'] = df['cityname_intervention'].str.replace(r"\s*\([^()]*\)", "", regex=True)

    
    # drop the rest of the columns 
    df.drop(columns=columns_to_drop, inplace=True)
    return df

file_path_bxl = "data\interventions_bxl.parquet.gzip"
df_interventions_bxl = process_interventions_BXL_dataset(file_path_bxl)
df_interventions_bxl['t0']

23        2022-09-06 14:37:30.4641126 +02:00
24        2022-09-06 14:38:11.1705396 +02:00
28        2022-09-06 14:51:41.9686773 +02:00
29        2022-09-06 14:51:41.9686773 +02:00
34        2022-09-06 15:01:24.1076499 +02:00
                         ...                
115559    2023-05-31 19:24:30.8218695 +02:00
115563    2023-05-31 19:35:08.2542141 +02:00
115567    2023-05-31 19:35:08.2542141 +02:00
115623    2023-05-31 22:17:36.0431930 +02:00
115641    2023-05-31 23:26:41.8035155 +02:00
Name: t0, Length: 13850, dtype: object

In [109]:
print(f"Same mission ID in intervention_1 for {count_same_mission(df_internetions_1)} rows")
print(f"Same mission ID in intervention_2 for {count_same_mission(df_internetions_2)} rows")
print(f"Same mission ID in intervention_3 for {count_same_mission(df_internetions_3)} rows")
print(f"Same mission ID in intervention_bxl for {count_same_mission(df_interventions_bxl)} rows")
print(f"Same mission ID in intervention_bxl for {count_same_mission(df_interventions_bxl2)} rows")

Same mission ID in intervention_1 for 17632 rows
Same mission ID in intervention_2 for 17681 rows
Same mission ID in intervention_3 for 18254 rows
Same mission ID in intervention_bxl for 7042 rows
Same mission ID in intervention_bxl for 2024 rows


There are a lot of interventions where a Ambulance and MUG has been sent out to resulting in a dubble registration -> we could combine these but that we have to look at Intervention duration as they are different for Ambulance and Mug. 

In [110]:
def combine_intervention_rows(df):
    # Helper function to combine vector types
    def combine_vector_types(x):
        return " + ".join(sorted(set(x)))

    # Helper function to get the maximum value, assuming the values are numeric or can be converted
    def get_max_value(x):
        def convert_to_float(v):
            try:
                # Try to directly convert to float
                return float(str(v).replace(',', '.'))
            except ValueError:
                # Handle time or duration strings
                parts = v.split(':')
                if len(parts) == 3:
                    # Assuming format is HH:MM:SS.SSS
                    return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
                elif len(parts) == 2:
                    # Assuming format is MM:SS.SSS
                    return int(parts[0]) * 60 + float(parts[1])
                else:
                    # Default case if format is unknown
                    return 0
        max_value = max(x, key=lambda v: convert_to_float(v))
        return max_value if isinstance(max_value, float) else str(max_value)

    # Define aggregation rules
    aggregation_rules = {
        'latitude_permanence': 'first',
        'longitude_permanence': 'first',
        'latitude_intervention': 'first',
        'longitude_intervention': 'first',
        't0': 'first',
        'vector_type': combine_vector_types,
        't1': 'first', 
        'date':'first',
        'streetname_permanence':'first',
        'postalcode_permanence':'first',
        'postalcode_intervention':'first',
        'cityname_intervention':'first',
    }

    # Columns for which we need the maximum value
    max_columns = ['t2', 't3', 't4', 't5', 't6', 't7', 'intervention_time_(t1reported)', 
                   'intervention_time_(t1confirmed)', 'waiting_time', 'intervention_duration', 
                   'departure_time_(t1reported)', 'departure_time_(t1confirmed)', 
                   'calculated_traveltime_destination', 'calculated_distance_destination',
                   'calculated_traveltime_destinatio', 'number_of_transported_persons']

    # Add max rules for specific columns
    for col in max_columns:
        if col in df.columns:
            aggregation_rules[col] = get_max_value

    # Group by 'mission_id' and apply the aggregation rules
    df_combined = df.groupby('mission_id', as_index=False).agg(aggregation_rules)
    
    return df_combined


In [111]:
df_interventions_1_comb= combine_intervention_rows(df_internetions_1)
df_interventions_2_comb= combine_intervention_rows(df_internetions_2)
df_interventions_3_comb= combine_intervention_rows(df_internetions_3)
df_interventions_bxl= combine_intervention_rows(df_interventions_bxl)
df_interventions_bxl2= combine_intervention_rows(df_interventions_bxl)
print(f"Same mission ID in intervention_1 for {count_same_mission(df_interventions_1_comb)} rows")
print(f"Same mission ID in intervention_2 for {count_same_mission(df_interventions_2_comb)} rows")
print(f"Same mission ID in intervention_3 for {count_same_mission(df_interventions_3_comb)} rows")
print(f"Same mission ID in intervention_bxl for {count_same_mission(df_interventions_bxl)} rows")
print(f"Same mission ID in intervention_bxl2 for {count_same_mission(df_interventions_bxl2)} rows")
df_combined_interventions = pd.concat([df_interventions_1_comb, df_interventions_2_comb, df_interventions_3_comb,df_interventions_bxl,df_interventions_bxl2], ignore_index=True)


AttributeError: 'NoneType' object has no attribute 'split'

## Summary of data before imputing 

In [63]:
summarize_dataframe(df_combined_interventions)

Unnamed: 0,Column,Missing Values,Percentage Missing (%),Data Type
0,postalcode_intervention,57656,73.725129,object
1,calculated_distance_destination,39432,50.421973,float64
2,t5,30677,39.226894,object
3,t6,27424,35.06726,object
4,t4,27015,34.544269,object
5,number_of_transported_persons,26782,34.24633,float64
6,intervention_time_(t1confirmed),25635,32.779653,float64
7,departure_time_(t1confirmed),23808,30.443456,float64
8,calculated_traveltime_destinatio,23798,30.430669,float64
9,date,20522,26.241624,object


- number_of_transported_persons -> mean impute 
- calculated_traveltime_destinatio -> regression impute 
- calculated_distance_destination -> regression impute 
- t's -> use time between and devide by coumns needed
- intervention_duration -> calculate from t's?  
- longitude_intervention + latitude_permanence -> get from adress 
- longitude_intervention + latitude_intervention -> get from adres?

### Impute the numbers of transported persons 

In [64]:
# Create a SimpleImputer object with strategy set to 'mean'
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
number_of_transported = df_combined_interventions['number_of_transported_persons'].values.reshape(-1, 1)
df_combined_interventions['number_of_transported_persons'] = imputer.fit_transform(number_of_transported)
df_combined_interventions['number_of_transported_persons']=round(df_combined_interventions['number_of_transported_persons'],0)



### Impute the T columns 

In [66]:
def impute_timestamps(df):
    def to_seconds(time_str):
        """Convert HH:MM:SS or full datetime to total seconds."""
        if pd.isna(time_str):
            return None
        # Check if the string includes a date part and extract only the time part if so
        if ' ' in time_str:
            time_str = time_str.split(' ')[-1]  # Assume the last part is time
        # Split the time part into hours, minutes, and seconds
        parts = time_str.split(':')
        h, m = int(parts[0]), int(parts[1])
        s = int(parts[2]) if len(parts) == 3 else 0  # Handle cases without seconds
        return h * 3600 + m * 60 + s

    def to_hms(seconds):
        """Convert total seconds back to HH:MM:SS."""
        if seconds is None:
            return None
        h = seconds // 3600
        m = (seconds % 3600) // 60
        s = seconds % 60
        return f"{h:02d}:{m:02d}:{s:02d}"

    # Iterate over each row
    for index, row in df.iterrows():
        timestamps = row[['t0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']]
        missing_indices = [i for i, x in enumerate(timestamps) if pd.isna(x)]
        if not missing_indices:
            continue
        
        for missing_index in missing_indices:
            if missing_index == 0 or missing_index == len(timestamps) - 1:
                continue
            prev_known_index = next((i for i in range(missing_index - 1, -1, -1) if not pd.isna(timestamps[i])), None)
            next_known_index = next((i for i in range(missing_index + 1, len(timestamps)) if not pd.isna(timestamps[i])), None)
            
            if prev_known_index is None or next_known_index is None:
                continue
            
            start_time = to_seconds(timestamps.iloc[prev_known_index])
            end_time = to_seconds(timestamps.iloc[next_known_index])
            if start_time is not None and end_time is not None:
                interval = (end_time - start_time) // (next_known_index - prev_known_index)
                
                for i in range(prev_known_index + 1, next_known_index):
                    start_time += interval
                    df.at[index, f't{i}'] = to_hms(start_time)

    return df


In [67]:
df_combined_interventions = impute_timestamps(df_combined_interventions)

In [68]:
summarize_dataframe(df_combined_interventions)

Unnamed: 0,Column,Missing Values,Percentage Missing (%),Data Type
0,postalcode_intervention,57656,73.725129,object
1,calculated_distance_destination,39432,50.421973,float64
2,intervention_time_(t1confirmed),25635,32.779653,float64
3,departure_time_(t1confirmed),23808,30.443456,float64
4,calculated_traveltime_destinatio,23798,30.430669,float64
5,date,20522,26.241624,object
6,intervention_duration,19474,24.90154,float64
7,t7,17682,22.610097,object
8,t6,10407,13.307503,object
9,t5,9876,12.62851,object


### Impute the intervention coordinates 

In [105]:
def fetch_coordinates(row):
    # Check if both latitude and longitude are missing
    if pd.isnull(row['latitude_intervention']) or pd.isnull(row['longitude_intervention']):
        try:
            # Attempt to fetch location using city name
            location = geolocator.geocode(row['cityname_intervention'])
            if location:
                return location.latitude, location.longitude
            else:
                return None, None
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Geocoding error for {row['cityname_intervention']}: {e}")
            return None, None
    else:
        # Return existing coordinates
        return row['latitude_intervention'], row['longitude_intervention']
    
df_combined_interventions[['latitude_intervention', 'longitude_intervention']] = df_combined_interventions.apply(fetch_coordinates, axis=1, result_type='expand')
