In [1]:
import pandas as pd


In [2]:
def summarize_dataset(file_path):
    """Build a per-column missing-value report for a Parquet file.

    Parameters
    ----------
    file_path : path handed to ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        One row per column of the file, with the columns 'Column',
        'Missing Values', 'Percentage Missing (%)' and 'Data Type',
        ordered by 'Missing Values' (largest first) and re-indexed from 0.
    """
    frame = pd.read_parquet(file_path)

    # Null count per column, and the same count as a share of all rows
    null_counts = frame.isnull().sum()
    null_share = (null_counts / len(frame)) * 100

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Values': null_counts.values,
        'Percentage Missing (%)': null_share.values,
        'Data Type': frame.dtypes.values,
    })

    # Worst-affected columns first; discard the pre-sort index for a clean 0..n-1 index
    ranked = report.sort_values(by='Missing Values', ascending=False)
    return ranked.reset_index(drop=True)


# Dataset name -> location of its Parquet file, relative to the notebook's working directory.
# Forward slashes are used so the paths resolve on Windows, Linux and macOS alike
# (the later cells in this notebook already use this form).
parquet_files = {
    "aed_locations": "Data/Data/aed_locations.parquet.gzip",
    "cad9": "Data/Data/cad9.parquet.gzip",
    "interventions_bxl": "Data/Data/interventions_bxl.parquet.gzip",
    "ambulance_locations": "Data/Data/ambulance_locations.parquet.gzip",
    "interventions_bxl2": "Data/Data/interventions_bxl2.parquet.gzip",
    "interventions1": "Data/Data/interventions1.parquet",
    "interventions2": "Data/Data/interventions2.parquet",
    "interventions3": "Data/Data/interventions3.parquet",
    "mug_locations": "Data/Data/mug_locations.parquet.gzip",
    "pit_locations": "Data/Data/pit_locations.parquet.gzip",
}
    

In [3]:
# Load the raw interventions1 extract and list the distinct first-call event types.
# Forward-slash path so the cell also runs outside Windows.
df_internetions_1 = pd.read_parquet("Data/Data/interventions1.parquet")
df_internetions_1['EventType Firstcall'].unique()

array(['P020 - Intoxication alcohol', 'P068 - Urogenital problem',
       'P033 - Trauma', 'P022 - Intoxication medication',
       'P026 - Unclear problem', 'P031 - Psychiatric problem',
       'P065 - Sudden deafness or ringing in the ears', 'P009 - Diabetes',
       'FI (1.3.0) fire building', 'P030 - Hanging and strangulation',
       'P011 - Chest pain', 'P012 - Non-traumatic abdominal pain',
       'P010 - Respiratory problems', 'P099 - Interhospital transport',
       'P017 - Non-traumatic bleeding',
       'P071 - Sick child < 15 years with abdominal pain',
       'P096 - Out of service',
       'P039 - Cardiac problem (other than thoracic pain)',
       'P019 - Unconscious - syncope', 'P003 - Cardiac arrest',
       'P001 - Traffic accident',
       'P023 - Intoxication (household, agricultural, industrial)',
       'P015 - Epilepsy - convulsions', 'P013 - Non-traumatic back pain',
       'P004 - Stroke', 'P034 - Skull trauma', 'P005 - Wounded by weapon',
       'P066 - Post-o

In [13]:
# Missing-value overview for the raw interventions1 extract.
# Forward-slash path so the cell also runs outside Windows.
print("Data Summary for: interventions1")
summary_df = summarize_dataset("Data/Data/interventions1.parquet")
display(summary_df)


Data Summary for: interventions1


Unnamed: 0,Column,Missing Values,Percentage Missing (%),Data Type
0,PostalCode destination hospital,200627,100.0,object
1,PostalCode intervention,200356,99.864923,object
2,HouseNumber destination hospital,200320,99.84698,object
3,T9,197952,98.66668,object
4,Unavailable time,197061,98.222572,float64
5,HouseNumber permanence,192194,95.796677,object
6,Abandon reason,177789,88.616687,object
7,T5,83028,41.38426,object
8,T4,76091,37.9266,object
9,Calculated travelTime destinatio,75042,37.403739,float64


#### Drop 
- PostalCode permanence, CityName permanence, StreetName permanence, HouseNumber permanence -> this information is already captured by the latitude/longitude values, and these columns have a high share of missing values
- Permanence short name, Permanence long name -> either can be used as an identifier, so delete at least one of them, if not both
- EventType Firstcall, EventType Trip -> keep only the values related to heart problems: 'P011 - Chest pain', 'P039 - Cardiac problem (other than thoracic pain)', 'P019 - Unconscious - syncope', 'P003 - Cardiac arrest', 'P038 - Person does not answer the call', 'P008 - Patient with defibrillator - pacemaker'
- Delete every observation that has an 'Abandon reason' filled in, as these people were 'fixed'
- Drop everything related to location that is not coordinates

Transform T0 -> T9 into a date column for the incident plus a time-only value per column; these can be used to calculate how long it takes to reach the person and how long it takes for the person to get to the hospital.

All 'intervention' datasets have a similar structure, so I propose applying the same steps as above to all of them.

In [57]:
def process_interventions_dataset(file_path):
    """Load an interventions Parquet file and reduce it to heart-related trips.

    Steps, in order:
      1. keep rows whose 'EventType Trip' is one of the heart-disorder codes
         and whose 'Abandon reason' is empty;
      2. drop the address / descriptive columns listed in ``columns_to_drop``;
      3. strip everything up to the first ':' from the T-columns, leaving the
         time part (values are presumably shaped like '01JUN22:01:26:09');
      4. add a 'date' column (string, dd/mm/yy) parsed from the date token
         found in 'T0'.

    Parameters
    ----------
    file_path : path handed to ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. The row index of the source file is preserved
        (it is not reset), and 'date' is appended as the last column.

    Raises
    ------
    KeyError
        If 'EventType Trip', 'Abandon reason', 'T0' or any column listed in
        ``columns_to_drop`` is absent from the file.
    """
    # Event types treated as (possible) heart disorders
    heart_disorder_events = [
        'P011 - Chest pain',
        'P039 - Cardiac problem (other than thoracic pain)',
        'P019 - Unconscious - syncope',
        'P003 - Cardiac arrest',
        'P038 - Person does not answer the call',
        'P008 - Patient with defibrillator - pacemaker'
    ]

    # Columns removed after filtering (each label listed once)
    columns_to_drop = [
        'PostalCode intervention', 'Name destination hospital',
        'PostalCode destination hospital', 'CityName destination hospital',
        'StreetName destination hospital', 'HouseNumber destination hospital',
        'EventType Firstcall', 'CityName intervention',
        'Permanence long name', 'PostalCode permanence', 'CityName permanence',
        'StreetName permanence', 'HouseNumber permanence', 'Service Name', 'Abandon reason',
        'EventType Trip', 'Province intervention', 'Unavailable time', 'T9'
    ]

    # Timestamp columns whose date prefix is stripped (skipped if not present)
    timestamp_columns = ['T0', 'T1', 'T3', 'T4', 'T5', 'T6', 'T7']

    raw = pd.read_parquet(file_path)

    # Heart-related trips that were not abandoned
    keep = raw['EventType Trip'].isin(heart_disorder_events) & raw['Abandon reason'].isna()

    # .drop() returns a new frame, so the assignments below never touch a slice of `raw`
    df = raw.loc[keep].drop(columns=columns_to_drop)

    # Grab the date token from T0 before T0 is cut down to its time part.
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    date_token = df['T0'].str.extract(r'(\d{2}[A-Z]{3}\d{2})', expand=False)

    for col in timestamp_columns:
        if col in df.columns:
            # Keep only what follows the first ':'
            df[col] = df[col].str.split(':', n=1).str[1]

    # Re-format the date token as dd/mm/yy
    df['date'] = pd.to_datetime(date_token, format='%d%b%y').dt.strftime('%d/%m/%y')

    return df


In [58]:
# Run the cleaning pipeline on the first interventions extract and preview the first rows
file_path_interventions1 = "Data/Data/interventions1.parquet"
df_internetions_1 = process_interventions_dataset(file_path=file_path_interventions1)
df_internetions_1.head(n=5)

Unnamed: 0,Mission ID,Latitude permanence,Longitude permanence,Permanence short name,Vector type,EventLevel Firstcall,EventLevel Trip,Latitude intervention,Longitude intervention,T0,...,Intervention time (T1Reported),Intervention time (T1Confirmed),Waiting time,Intervention duration,Departure time (T1Reported),Departure time (T1Confirmed),Calculated travelTime destinatio,Calculated Distance destination,Number of transported persons,date
17,10221520015,51.23355,4.49318,AAWIJN01A,Ambulance,N5,N5,51.23266,4.4444,01:26:09,...,6.0,6.0,8.0,58.0,4.0,4.0,172.0,2031.0,1.0,01/06/22
18,10221520015,51.22249,4.43629,UAANTW01A,MUG,N5,N2,51.23266,4.4444,01:26:09,...,7.0,7.0,26.0,36.0,3.0,3.0,,,,01/06/22
22,10221520020,51.18678,5.11457,PAMOL_01A,PIT,N4,N4,51.16376,4.98392,01:51:06,...,16.0,15.0,18.0,62.0,6.0,5.0,220.0,2119.0,1.0,01/06/22
29,10221520032,51.29857,4.488,AABRAS01A,Ambulance,N4,N4,51.2985,4.47723,04:05:01,...,6.0,5.0,8.0,23.0,3.0,2.0,128.0,1289.0,,01/06/22
30,10221520033,51.22249,4.43629,PAANTW01A,PIT,N4,N4,51.25461,4.49728,04:06:31,...,14.0,14.0,17.0,166.0,3.0,3.0,396.0,4414.0,1.0,01/06/22


In [60]:
# keep=False flags every row of a repeated Mission ID, not only the later occurrences
duplicate_mission_ids = df_internetions_1.duplicated(subset=['Mission ID'], keep=False)

# Each True counts as 1, so the sum is the number of rows that share their Mission ID
duplicate_count = duplicate_mission_ids.sum()
duplicate_count

17632

In [63]:
# Same cleaning and duplicate check for the second interventions extract
file_path_interventions2 = "Data/Data/interventions2.parquet"
df_internetions_2 = process_interventions_dataset(file_path=file_path_interventions2)

# keep=False flags every row of a repeated Mission ID, not only the later occurrences
duplicate_mission_ids = df_internetions_2.duplicated(subset=['Mission ID'], keep=False)

# Number of rows that share their Mission ID with at least one other row
duplicate_count = duplicate_mission_ids.sum()
duplicate_count

17681

In [65]:
# Same cleaning and duplicate check for the third interventions extract
file_path_interventions3 = "Data/Data/interventions3.parquet"
df_internetions_3 = process_interventions_dataset(file_path=file_path_interventions3)

# keep=False flags every row of a repeated Mission ID, not only the later occurrences
duplicate_mission_ids = df_internetions_3.duplicated(subset=['Mission ID'], keep=False)

# Number of rows that share their Mission ID with at least one other row
duplicate_count = duplicate_mission_ids.sum()
duplicate_count

18254

There are many interventions where both an ambulance and a MUG were sent out, resulting in a double registration (the same Mission ID appears on more than one row). We could combine these rows, but then we have to look at 'Intervention duration', as it differs between the ambulance and the MUG.