In [1]:
import pandas as pd


In [118]:
def summarize_dataset(file_path):
    """Build a per-column missing-value report for a Parquet file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the Parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        One row per column of the file, with the columns ``'Column'``,
        ``'Missing Values'``, ``'Percentage Missing (%)'`` and ``'Data Type'``,
        ordered from most to fewest missing values and re-indexed from 0.
    """
    frame = pd.read_parquet(file_path)

    # Null count per column, and the same figure as a share of all rows.
    null_counts = frame.isnull().sum()
    null_share = null_counts / len(frame) * 100

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Values': null_counts.values,
        'Percentage Missing (%)': null_share.values,
        'Data Type': frame.dtypes.values,
    })

    # Worst-affected columns first; renumber the rows so the index follows the rank.
    ranked = report.sort_values(by='Missing Values', ascending=False)
    return ranked.reset_index(drop=True)


# Parquet inputs keyed by dataset name. The paths are relative and use forward
# slashes so they resolve on every operating system, not only on Windows.
# Each dataset is listed exactly once.
parquet_files = {
    "aed_locations": "Data/Data/aed_locations.parquet.gzip",
    "cad9": "Data/Data/cad9.parquet.gzip",
    "interventions_bxl": "Data/Data/interventions_bxl.parquet.gzip",
    "ambulance_locations": "Data/Data/ambulance_locations.parquet.gzip",
    "interventions_bxl2": "Data/Data/interventions_bxl2.parquet.gzip",
    "interventions1": "Data/Data/interventions1.parquet",
    "interventions2": "Data/Data/interventions2.parquet",
    "interventions3": "Data/Data/interventions3.parquet",
    "mug_locations": "Data/Data/mug_locations.parquet.gzip",
    "pit_locations": "Data/Data/pit_locations.parquet.gzip",
}
    

#### Drop 
- PostalCode permanence, CityName permanence, StreetName permanence, HouseNumber permanence -> this information is already captured by the longitude and latitude values, and these columns have a high share of missing values
- Permanence short name, Permanence long name -> either can be used as an identifier, but delete at least one of them, if not both
- EventType Firstcall, EventType Trip -> keep only the types related to heart problems: 'P011 - Chest pain', 'P039 - Cardiac problem (other than thoracic pain)', 'P019 - Unconscious - syncope', 'P003 - Cardiac arrest', 'P038 - Person does not answer the call', 'P008 - Patient with defibrillator - pacemaker'
- Delete every observation that has an 'abandon reason', as these people were 'fixed'
- Drop everything related to location that is not coordinates

Transform T0 -> T9 into a date column for the incident plus time columns; these can be used to calculate how long it takes to reach the person and how long it takes for the person to get to the hospital.

All 'intervention' datasets have a similar structure, so I propose applying the same steps as above to all of them.

In [114]:
def process_interventions_dataset(file_path):
    """Load an interventions Parquet file and keep only cleaned, heart-related rows.

    The column names are normalised to lowercase with underscores. Rows are
    kept only when ``eventtype_trip`` is one of the heart-related event types
    and ``abandon_reason`` is empty. A fixed set of columns is then dropped,
    a ``date`` column (``dd/mm/yy``) is derived from ``t0``, and ``t0``..``t7``
    are reduced to whatever follows their first ``':'`` (presumably the
    time-of-day part of the timestamp).

    Parameters
    ----------
    file_path : str or path-like
        Location of the Parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding the filtered rows (original index
        labels preserved) with the ``date`` column appended last.

    Raises
    ------
    KeyError
        If ``eventtype_trip``, ``abandon_reason``, ``t0`` or any column in the
        drop list (other than the optional ``province_intervention``) is
        missing after the column names have been normalised.
    """
    # List of heart disorder related event types
    heart_disorder_events = [
        'P011 - Chest pain',
        'P039 - Cardiac problem (other than thoracic pain)',
        'P019 - Unconscious - syncope',
        'P003 - Cardiac arrest',
        'P038 - Person does not answer the call',
        'P008 - Patient with defibrillator - pacemaker'
    ]

    # Columns to drop (each listed once)
    columns_to_drop = [
        'postalcode_intervention', 'name_destination_hospital',
        'postalcode_destination_hospital', 'cityname_destination_hospital',
        'streetname_destination_hospital', 'housenumber_destination_hospital',
        'eventtype_firstcall', 'cityname_intervention',
        'permanence_long_name', 'postalcode_permanence', 'cityname_permanence',
        'streetname_permanence', 'housenumber_permanence', 'service_name', 'abandon_reason',
        'eventtype_trip', 'unavailable_time', 't9'
    ]

    # Columns to process for time extraction
    columns_to_process = ['t0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']

    # Load the dataset from parquet
    df = pd.read_parquet(file_path)

    # Make all the intervention datasets uniform: lowercase + '_' as spacing
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

    # Drop some columns that are in some datasets but not in all
    if 'province_intervention' in df.columns:
        columns_to_drop.append('province_intervention')

    # Keep heart-disorder events only, and remove rows with an abandon reason
    # as these people are okay.
    is_heart_event = df['eventtype_trip'].isin(heart_disorder_events)
    not_abandoned = df['abandon_reason'].isna()

    # Filter and drop without ``inplace``; the explicit copy makes the result an
    # independent frame, so the column assignments below never write into a
    # slice of the loaded data (no SettingWithCopyWarning).
    df = df.loc[is_heart_event & not_abandoned].drop(columns=columns_to_drop).copy()

    # Extract the date string from t0 (least missing values) before t0 is
    # stripped below, and convert it to the format dd/mm/yy.
    date_text = df['t0'].str.extract(r'(\d{2}[A-Z]{3}\d{2})')[0]
    incident_date = pd.to_datetime(date_text, format='%d%b%y').dt.strftime('%d/%m/%y')

    # Keep only what follows the first ':' in every timestamp column present.
    for col in columns_to_process:
        if col in df.columns:
            df[col] = df[col].str.split(':', n=1).str[1]

    df['date'] = incident_date

    return df

#Function to count duplicate mission id's 
def count_same_mission(df):
    """Return how many rows share their ``mission_id`` with at least one other row.

    Every member of a repeated group is counted (``keep=False``), not only
    the second and later occurrences.
    """
    shares_mission_id = df.duplicated(subset=['mission_id'], keep=False)
    return shares_mission_id.sum()


### Cleaning Data 

In [107]:
# Source files for the three interventions extracts.
file_path_interventions1 = "Data/Data/interventions1.parquet"
file_path_interventions2 = "Data/Data/interventions2.parquet"
file_path_interventions3 = "Data/Data/interventions3.parquet"

# Run every extract through the shared cleaning function, in file order.
df_internetions_1, df_internetions_2, df_internetions_3 = map(
    process_interventions_dataset,
    (file_path_interventions1, file_path_interventions2, file_path_interventions3),
)

17632


In [116]:
# Report, per cleaned extract, how many rows share a mission id with another row.
for label, frame in (
    ("intervention_1", df_internetions_1),
    ("intervention_2", df_internetions_2),
    ("intervention_3", df_internetions_3),
):
    print(f"Same mission ID in {label} for {count_same_mission(frame)} rows")

Same mission ID in intervention_1 for 17632 rows
Same mission ID in intervention_2 for 17681 rows
Same mission ID in intervention_3 for 18254 rows


There are a lot of interventions where both an ambulance and a MUG have been sent out, resulting in a double registration -> we could combine these, but then we have to look at the intervention duration, as it differs between the ambulance and the MUG.

In [117]:
# Forward slashes keep the path portable and free of backslash escape sequences.
file_path_bxl = "Data/Data/interventions_bxl.parquet.gzip"
df_internetions_bxl = process_interventions_dataset(file_path_bxl)


In [119]:
# Missing-value overview of the raw BXL file (forward slashes: portable, no backslash escapes).
summarize_dataset("Data/Data/interventions_bxl.parquet.gzip")

Unnamed: 0,Column,Missing Values,Percentage Missing (%),Data Type
0,unavailable_time,115643,99.996541,float64
1,t9,115592,99.952441,object
2,t1confirmed,115362,99.75356,object
3,housenumber_permanence,113417,98.071718,object
4,abandon_reason,86613,74.89429,object
5,number_of_transported_persons,62159,53.748908,float64
6,housenumber_destination_hospital,62030,53.637362,float64
7,t5,56507,48.861622,object
8,t4,47558,41.123419,object
9,streetname_destination_hospital,47008,40.647833,object


In [120]:
# Missing-value overview of the raw BXL2 file (forward slashes: portable, no backslash escapes).
summarize_dataset("Data/Data/interventions_bxl2.parquet.gzip")


Unnamed: 0,Column,Missing Values,Percentage Missing (%),Data Type
0,Housenumber destination hospital,38619,99.997411,object
1,Abandon reason FR,34326,88.881409,object
2,Abandon reason NL,34326,88.881409,object
3,Streetname destination hospital,25914,67.099948,object
4,Cityname destination hospital,25914,67.099948,object
5,Name destination hospital,25914,67.099948,object
6,T6,25905,67.076644,object
7,T4,25113,65.025893,object
8,T5,24748,64.080787,object
9,T3,21746,56.307613,object


The BXL2 data is very different from BXL1 -> further work needed.