In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
geolocator = Nominatim(user_agent="my_app")

In [2]:
def summarize_dataset(file_path):
    # Read the Parquet file
    df = pd.read_parquet(file_path)
    
    # Calculate missing values per column
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    data_types = df.dtypes  # Get the data types of each column
    
    # Prepare the summary DataFrame
    summary_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing Values': missing_values.values,
        'Percentage Missing (%)': missing_percentage.values,
        'Data Type': data_types.values  # Add the data types to the summary
    })
    
    # Sort the summary DataFrame by the number of missing values, descending
    summary_df = summary_df.sort_values(by='Missing Values', ascending=False)
    
    # Reset index for neat presentation
    summary_df.reset_index(drop=True, inplace=True)
    
    return summary_df

def summarize_dataframe(df):
    # Calculate missing values per column
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    data_types = df.dtypes  # Get the data types of each column
    
    # Prepare the summary DataFrame
    summary_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing Values': missing_values.values,
        'Percentage Missing (%)': missing_percentage.values,
        'Data Type': data_types.values  # Add the data types to the summary
    })
    
    # Sort the summary DataFrame by the number of missing values, descending
    summary_df = summary_df.sort_values(by='Missing Values', ascending=False)
    
    # Reset index for neat presentation
    summary_df.reset_index(drop=True, inplace=True)
    
    return summary_df  

#### Drop 
- PostalCode permanence,CityName permanence,StreetName permanence,HouseNumber permanence, -> These are already in the long and lat value + high missing value
- Permanence short name,	Permanence long name -> can be used as identifier but delete at least one if not both 
- EventType Firstcall, EventType Trip -> keep only those realted to heart problems: Chest pain, P039 - Cardiac problem (other than thoracic pain)','P019 - Unconscious - syncope', 'P003 - Cardiac arrest','P038 - Person does not answer the call', 'P008 - Patient with defibrillator - pacemaker'
- Delete every observation that has a 'abondon reason?' as these people were 'fixed' 
- drop everything related to location that is not coordiantes 

Transform T0 -> T9 to a date column for the incident and a time column; this can be used to calculate time to get to person and to see how long it takes for the person to get to the hospital 

All 'intervention' datasets have simmilar structure so i propose the same as above for all. 

### Cleaning Data 

In [3]:
def process_interventions_dataset(file_path):
    # List of heart disorder related event types these are the events we want to use
    heart_disorder_events = [
        'P011 - Chest pain',
        'P039 - Cardiac problem (other than thoracic pain)',
        'P019 - Unconscious - syncope',
        'P003 - Cardiac arrest',
        'P038 - Person does not answer the call',
        'P008 - Patient with defibrillator - pacemaker'
    ]

    # Columns to drop, these are in my opinion not important or have to many NaN 
    columns_to_drop = [
    'name_destination_hospital',
    'postalcode_destination_hospital', 'cityname_destination_hospital',
    'streetname_destination_hospital', 'housenumber_destination_hospital',
    'eventtype_firstcall',
    'permanence_long_name', 
    'service_name', 'abandon_reason',
    'unavailable_time', 't9','permanence_short_name',
    'eventlevel_firstcall','eventlevel_trip',
    't1confirmed','housenumber_permanence'
]

    
    # Load the dataset for parquet
    df = pd.read_parquet(file_path)

    #Make all the intervetion datasets uniform: make lowercase + add _ as spacing
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    
    # Filter based on heart disorders
    df = df[df['eventtype_trip'].isin(heart_disorder_events)]
    
    # Further filtering to remove rows with an abandon reason as these people are okay
    # I assumed that the ambulance was not hurrying back to the hospital of to the person 
    df = df[df['abandon_reason'].isna()]
       
    # Some columns are in the bxl dataset and not in the interventions dataset of vica versa 
    # These lines of code check if they are there and than deletes them 
    if 'province_intervention' in df.columns:
        columns_to_drop.append('province_intervention')

    if 'intervention_time_(t1reported)' in df.columns:
        columns_to_drop.append('intervention_time_(t1reported)')  

    if 'departure_time_(t1reported)' in df.columns:
        columns_to_drop.append('departure_time_(t1reported)')  
 

    # drop the columns from above 
    df.drop(columns=columns_to_drop, inplace=True)
    
    # The date of the intervention is in T0 
    # we are extrecting these as they are string based (06JAN22) and creating a new column named 'date'
    df['date'] = df['t0'].str.extract('(\d{2}[A-Z]{3}\d{2})')[0]
    
    # The date is is also in culumns t2 - t7 and is redundend 
    # this deletes this from the indicidual cell  
    t2_till_t7 = ['t2','t3', 't4', 't5', 't6', 't7']
    for col in t2_till_t7:
        # Convert to datetime
        df[col] = pd.to_datetime(df[col], errors='coerce')
        # Extract only the time part and replace the column
        df[col] = df[col].dt.strftime('%H:%M:%S')
    #Fixing t0 and t1 as they have string value in the data
    date_prefix_pattern = r'\d{2}[A-Z]{3}\d{2}:' #this sets the parameters that need to be removerd 01JUN22:
    df['t0'] = df['t0'].str.replace(date_prefix_pattern, '', regex=True)
    df['t1'] = df['t1'].str.replace(date_prefix_pattern, '', regex=True)

    
    # Convert 'date' column to the format dd/mm/yy to make it easier to read
    df['date'] = pd.to_datetime(df['date'], format='%d%b%y').dt.strftime('%d/%m/%y')
    
    # The cityname has the name twice like Antwerpen (Antwerpen) -> this makes it just Antwerpen 
    df['cityname_permanence'] = df['cityname_permanence'].str.replace(r"\s*\([^()]*\)", "", regex=True)
    df['cityname_intervention'] = df['cityname_intervention'].str.replace(r"\s*\([^()]*\)", "", regex=True)
    
    return df

# Modify 't' columns by slicing off the first 11 characters
def slice_time_columns(df, columns):
    for col in columns:
        if col in df.columns:  
            df[col] = df[col].str.slice(start=11)
    return df

def process_interventions_BXL_dataset(file_path):
    # List of heart disorder related event types
    heart_disorder_events = [
        'P011 - Chest pain',
        'P039 - Cardiac problem (other than thoracic pain)',
        'P019 - Unconscious - syncope',
        'P003 - Cardiac arrest',
        'P038 - Person does not answer the call',
        'P008 - Patient with defibrillator - pacemaker'
    ]

    # Columns to drop
    columns_to_drop = [
    'name_destination_hospital',
    'postalcode_destination_hospital', 'cityname_destination_hospital',
    'streetname_destination_hospital', 'housenumber_destination_hospital',
    'eventtype_firstcall',
    'permanence_long_name', 
    'service_name', 'abandon_reason',
    'unavailable_time', 't9','permanence_short_name',
    'eventlevel_firstcall','eventlevel_trip',
    't1confirmed','housenumber_permanence',
    't0_str',
    'eventtype_trip'
]

    
    # Load the dataset for parquet
    df = pd.read_parquet(file_path)

    #Make all the intervetion datasets uniform: make lowercase + add _ as spacing
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    
    # Filter based on heart disorders
    df = df[df['eventtype_trip'].isin(heart_disorder_events)]
    
    # Further filtering to remove rows with an abandon reason as these people are okay
    df = df[df['abandon_reason'].isna()]
       
    # Drop some columns that are in some datasets but not in all
    if 'province_intervention' in df.columns:
        columns_to_drop.append('province_intervention')

    if 'intervention_time_(t1reported)' in df.columns:
        columns_to_drop.append('intervention_time_(t1reported)')  

    if 'departure_time_(t1reported)' in df.columns:
        columns_to_drop.append('departure_time_(t1reported)')  
 

    
    df['t0_str'] = df['t0'].astype(str)
    df['date'] = df['t0_str'].str.extract(r'(\d{4})-(\d{2})-(\d{2})').apply(lambda x: f"{x[2]}/{x[1]}/{x[0][2:]}", axis=1)
    
    # Modify the 't' columns 
    time_columns = ['t0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']
    df = slice_time_columns(df, time_columns)
    t2_till_t7 = ['t2','t3', 't4', 't5', 't6', 't7']

    for col in t2_till_t7:
        # Convert to datetime
        df[col] = pd.to_datetime(df[col], errors='coerce')
        # Extract only the time part and replace the column
        df[col] = df[col].dt.strftime('%H:%M:%S')
   

    
    df['cityname_permanence'] = df['cityname_permanence'].str.replace(r"\s*\([^()]*\)", "", regex=True)
    df['cityname_intervention'] = df['cityname_intervention'].str.replace(r"\s*\([^()]*\)", "", regex=True)

    
    # drop the rest of the columns 
    df.drop(columns=columns_to_drop, inplace=True)
    return df



def process_interventions_BXL2_dataset(file_path):
    # List of heart disorder related event types these are the events we want to use
    heart_disorder_events = [
        'P026 N02 - ONWEL ZONDER DUIDELIJKE REDEN',
        'P008 N03 - PATIËNT MET DEFIBRILLATOR OF PACEMAKER',
        'P011 N05 - PIJN OP DE BORST (Chest Pain)',
        'P039 N05 - CARDIAAL PROBLEEM (ANDERE DAN PIJN AAN DE BORST)',
        'P011 N01 - PIJN OP DE BORST (Chest Pain)',
        'P039 N03 - CARDIAAL PROBLEEM (ANDERE DAN PIJN AAN DE BORST)', 
        'P011 N03 - PIJN OP DE BORST (Chest Pain)',
        'P039 N01 - CARDIAAL PROBLEEM (ANDERE DAN PIJN AAN DE BORST)',
        'P011 N04 - PIJN OP DE BORST (Chest Pain)',
        'P003  N01 - HARTSTILSTAND - DOOD - OVERLEDEN',
        'P019 N01 - Bewusteloos - coma - syncope',
       'P059 N05 - Duizeligheid - onpasselijk',
       'P019 N03 - Bewusteloos - coma - syncope',
       'P026 N01 - ONWEL ZONDER DUIDELIJKE REDEN',
    ]

    # Columns to drop, these are in my opinion not important or have to many NaN 
    columns_to_drop = [
    'description_nl','ic_description_nl',
    'permanence_long_name_nl',
    'permanence_long_name_fr',
    'service_name_nl',
    'vector_type_fr',
    'abandon_reason_fr',
    'permanence_short_name_nl',
    'permanence_short_name_fr',
    'name_destination_hospital',
    'cityname_destination_hospital',
    'streetname_destination_hospital', 
    'housenumber_destination_hospital',
    'housenumber_permanence',
    'eventtype_and_eventlevel',
    'service_name_fr',
    'abandon_reason_nl',
    'creationtime',
    'vector_type_nl'
    ]
    
    # Load the dataset for parquet
    df = pd.read_parquet(file_path)

    # Some datasets have a different column name structure 
    # This make all the intervetion datasets uniform: make lowercase + add _ as spacing
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    
    # Filter based on heart disorders (only keep heart related iunterventions)
    df = df[df['eventtype_and_eventlevel'].isin(heart_disorder_events)]
        
    # If an abandomend reason was given these rows are dropped
    df = df[df['abandon_reason_nl'].isna()]
       
    # Extracting the date that is a string from t0 (least missing values)
    df['date'] = df['t0'].str.extract('(\d{2}[A-Z]{3}\d{2})')[0]
    
    #Extracting the time for t2 till t7 
    t2_till_t7 = ['t2','t3', 't4', 't5', 't6', 't7']

    for col in t2_till_t7:
        # Convert to datetime
        df[col] = pd.to_datetime(df[col], errors='coerce')
        # Extract only the time part and replace the column
        df[col] = df[col].dt.strftime('%H:%M:%S')

    #Fixing t0 and t1 as they have string value in the data
    date_prefix_pattern = r'\d{2}[A-Z]{3}\d{2}:' #this sets the parameters that need to be removerd 01JUN22:

    df['t0'] = df['t0'].str.replace(date_prefix_pattern, '', regex=True)
    df['t1'] = df['t1'].str.replace(date_prefix_pattern, '', regex=True)
    
    #Getting the postalcode + cleaning cityname columns
    df['postalcode_intervention'] = df['cityname_intervention'].str.extract(r'(\d{4})')[0]
    df['cityname_intervention'] = df['cityname_intervention'].str.extract(r'\((.*?)\)')[0]
    df['postalcode_permanence'] = df['cityname_permanence'].str.extract(r'(\d{4})')[0]
    df['cityname_permanence'] = df['cityname_permanence'].str.extract(r'\((.*?)\)')[0]
    df['vector_type']=df['vector_type_nl']
    
    #Changing date format 
    df['date'] = pd.to_datetime(df['date'], format='%d%b%y').dt.strftime('%d/%m/%y')

    # drop the rest of the columns 
    df.drop(columns=columns_to_drop, inplace=True)
    return df

#Function to count duplicate mission id's 
def count_same_mission(df):
    duplicate_mission_ids = df.duplicated(subset='mission_id', keep=False)
    return duplicate_mission_ids.sum()


There are a lot of interventions where a Ambulance and MUG has been sent out to resulting in a dubble registration -> we could combine these but that we have to look at Intervention duration as they are different for Ambulance and Mug. 

In [5]:
def combine_intervention_rows(df):
    # Helper function to combine vector types, handling None values
    def combine_vector_types(x):
        # Filter out None values before joining
        filtered_x = filter(None, x)
        return " + ".join(sorted(set(filtered_x)))

    # Helper function to get the maximum value, assuming the values are numeric or can be converted
    def get_max_value(x):
        def convert_to_float(v):
            try:
                # Try to directly convert to float
                return float(str(v).replace(',', '.'))
            except ValueError:
                # Handle time or duration strings
                parts = v.split(':')
                if len(parts) == 3:
                    # Assuming format is HH:MM:SS.SSS
                    return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
                elif len(parts) == 2:
                    # Assuming format is MM:SS.SSS
                    return int(parts[0]) * 60 + float(parts[1])
                else:
                    # Default case if format is unknown
                    return 0
        # Ensure x is a list with no None values before finding max
        non_none_x = [v for v in x if v is not None]
        max_value = max(non_none_x, key=lambda v: convert_to_float(v), default=None)
        return max_value if isinstance(max_value, float) else str(max_value)

    # Define aggregation rules
    aggregation_rules = {
        'latitude_permanence': 'first',
        'longitude_permanence': 'first',
        'latitude_intervention': 'first',
        'longitude_intervention': 'first',
        't0': 'first',
        'vector_type': combine_vector_types,
        't1': 'first', 
        'date': 'first',
        'streetname_permanence': 'first',
        'postalcode_permanence': 'first',
        'postalcode_intervention': 'first',
        'cityname_intervention': 'first',
    }

    # Columns for which we need the maximum value
    max_columns = ['t2', 't3', 't4', 't5', 't6', 't7', 'intervention_time_(t1reported)', 
                   'intervention_time_(t1confirmed)', 'waiting_time', 'intervention_duration', 
                   'departure_time_(t1reported)', 'departure_time_(t1confirmed)', 
                   'calculated_traveltime_destination', 'calculated_distance_destination',
                   'calculated_traveltime_destinatio', 'number_of_transported_persons']

    # Add max rules for specific columns
    for col in max_columns:
        if col in df.columns:
            aggregation_rules[col] = get_max_value

    # Group by 'mission_id' and apply the aggregation rules
    df_combined = df.groupby('mission_id', as_index=False).agg(aggregation_rules)
    
    return df_combined


In [39]:
file_path_interventions1 = "Data\Data/interventions1.parquet"
df_internetions_1 = process_interventions_dataset(file_path_interventions1)
file_path_interventions2 = "Data\Data/interventions2.parquet"
df_internetions_2 = process_interventions_dataset(file_path_interventions2)
file_path_interventions3 = "Data\Data/interventions3.parquet"
df_internetions_3 = process_interventions_dataset(file_path_interventions3)
file_path_bxl = "Data\Data\interventions_bxl.parquet.gzip"
df_interventions_bxl = process_interventions_BXL_dataset(file_path_bxl)
file_path_bxl2 = "Data\Data\interventions_bxl2.parquet.gzip"
df_interventions_bxl2 = process_interventions_BXL2_dataset(file_path_bxl2)
df_interventions_1_comb= combine_intervention_rows(df_internetions_1)
df_interventions_2_comb= combine_intervention_rows(df_internetions_2)
df_interventions_3_comb= combine_intervention_rows(df_internetions_3)
df_interventions_bxl= combine_intervention_rows(df_interventions_bxl)
df_interventions_bxl2= combine_intervention_rows(df_interventions_bxl2)
print(f"Same mission ID in intervention_1 for {count_same_mission(df_interventions_1_comb)} rows")
print(f"Same mission ID in intervention_2 for {count_same_mission(df_interventions_2_comb)} rows")
print(f"Same mission ID in intervention_3 for {count_same_mission(df_interventions_3_comb)} rows")
print(f"Same mission ID in intervention_bxl for {count_same_mission(df_interventions_bxl)} rows")
print(f"Same mission ID in intervention_bxl2 for {count_same_mission(df_interventions_bxl2)} rows")
df_combined_interventions = pd.concat([df_interventions_1_comb, df_interventions_2_comb, df_interventions_3_comb,df_interventions_bxl,df_interventions_bxl2], ignore_index=True)


  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')


Same mission ID in intervention_1 for 0 rows
Same mission ID in intervention_2 for 0 rows
Same mission ID in intervention_3 for 0 rows
Same mission ID in intervention_bxl for 0 rows
Same mission ID in intervention_bxl2 for 0 rows


## Data imputing 

### Cleaning postal codes

In [40]:
def clean_postal_code(value):
    # Convert value to string if not None
    if value is not None:
        value_str = str(value)
        # Remove leading 'B' if present
        if value_str.startswith('B'):
            value_str = value_str[1:]
        # Remove trailing '.0' if present
        if value_str.endswith('.0'):
            value_str = value_str[:-2]
        #Delete first 0
        if value_str.startswith('0'):
            value_str = value_str[1:]
        return value_str
    return None

df_combined_interventions['postalcode_intervention'] = df_combined_interventions['postalcode_intervention'].apply(clean_postal_code)
df_combined_interventions['postalcode_permanence'] = df_combined_interventions['postalcode_permanence'].apply(clean_postal_code)

### Imputing intervention coordiantes based on the intervention city

In [8]:
def fetch_coordinates_by_city_name(city_name):
    try:
        # Attempt to fetch location using city name
        location = geolocator.geocode(city_name)
        if location:
            return location.latitude, location.longitude
        else:
            return None, None
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"Geocoding error for city {city_name}: {e}")
        return None, None
    
# Assuming 'cityname_intervention' has been cleaned as needed
unique_city_names_intervention = df_combined_interventions['cityname_intervention'].dropna().unique()

# Dictionary to store city name coordinates
city_name_coordinates_intervention = {}

# Fetch coordinates for each unique city name
for city_name in unique_city_names_intervention:
    if city_name not in city_name_coordinates_intervention:  # Check if coordinates are already fetched
        lat, lon = fetch_coordinates_by_city_name(city_name)
        city_name_coordinates_intervention[city_name] = (lat, lon)

def apply_city_coordinates(row):
    # Update intervention coordinates if missing
    city_name_intervention = row['cityname_intervention']
    if city_name_intervention in city_name_coordinates_intervention:
        city_lat_intervention, city_lon_intervention = city_name_coordinates_intervention[city_name_intervention]
        
        if pd.isnull(row['latitude_intervention']):
            row['latitude_intervention'] = city_lat_intervention
        if pd.isnull(row['longitude_intervention']):
            row['longitude_intervention'] = city_lon_intervention
    
    return row  # Ensure you return the modified row

# Apply the function to each row of df_combined_interventions
df_combined_interventions = df_combined_interventions.apply(apply_city_coordinates, axis=1)

Geocoding error for city H√©l√©cine: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=H%E2%88%9A%C2%A9l%E2%88%9A%C2%A9cine&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
Geocoding error for city Fontaine-l'Ev√™que: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Fontaine-l%27Ev%E2%88%9A%E2%84%A2que&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
Geocoding error for city Saint-L√©ger: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Saint-L%E2%88%9A%C2%A9ger&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))

### Impute the T columns 

In [41]:
def impute_timestamps(df):
    def to_seconds(time_str):
        """Convert HH:MM:SS or full datetime to total seconds."""
        if pd.isna(time_str):
            return None
        # Check if the string includes a date part and extract only the time part if so
        if ' ' in time_str:
            time_str = time_str.split(' ')[-1]  # Assume the last part is time
        # Split the time part into hours, minutes, and seconds
        parts = time_str.split(':')
        h, m = int(parts[0]), int(parts[1])
        s = int(parts[2]) if len(parts) == 3 else 0  # Handle cases without seconds
        return h * 3600 + m * 60 + s

    def to_hms(seconds):
        """Convert total seconds back to HH:MM:SS."""
        if seconds is None:
            return None
        h = seconds // 3600
        m = (seconds % 3600) // 60
        s = seconds % 60
        return f"{h:02d}:{m:02d}:{s:02d}"

    # Iterate over each row
    for index, row in df.iterrows():
        timestamps = row[['t0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']]
        missing_indices = [i for i, x in enumerate(timestamps) if pd.isna(x)]
        if not missing_indices:
            continue
        
        for missing_index in missing_indices:
            if missing_index == 0 or missing_index == len(timestamps) - 1:
                continue
            prev_known_index = next((i for i in range(missing_index - 1, -1, -1) if not pd.isna(timestamps[i])), None)
            next_known_index = next((i for i in range(missing_index + 1, len(timestamps)) if not pd.isna(timestamps[i])), None)
            
            if prev_known_index is None or next_known_index is None:
                continue
            
            start_time = to_seconds(timestamps.iloc[prev_known_index])
            end_time = to_seconds(timestamps.iloc[next_known_index])
            if start_time is not None and end_time is not None:
                interval = (end_time - start_time) // (next_known_index - prev_known_index)
                
                for i in range(prev_known_index + 1, next_known_index):
                    start_time += interval
                    df.at[index, f't{i}'] = to_hms(start_time)

    return df

df_combined_interventions = impute_timestamps(df_combined_interventions)

### Calculate the differences between the timesstamps 

In [42]:
from datetime import datetime

def calculate_time_differences(df):
    # Define a helper function to convert time strings to datetime objects
    def time_to_datetime(t):
        if pd.isnull(t):  # Check for None values
            return None
        try:
            return datetime.strptime(t, '%H:%M:%S')
        except ValueError:
            # Handle cases where conversion fails due to incorrect format
            return None

    # Iterate over the t columns to calculate differences
    for i in range(0, 7):
        t_col_current = f't{i}'
        t_col_next = f't{i+1}'
        
        # Check if both columns exist in the DataFrame
        if t_col_current in df.columns and t_col_next in df.columns:
            # Convert time strings to datetime objects, handling None values
            current_times = df[t_col_current].apply(time_to_datetime)
            next_times = df[t_col_next].apply(time_to_datetime)
            
            # Calculate time differences where both current and next times are not None
            time_diffs = [(next - current).total_seconds() / 60 if current is not None and next is not None else None
                          for current, next in zip(current_times, next_times)]
            
            # Store the results in a new column
            df[f'{t_col_current}_{t_col_next}_diff'] = time_diffs

    return df

# Apply the function to your DataFrame
df_combined_interventions = calculate_time_differences(df_combined_interventions)


#### there are negative values so these have be be changed to the mean

In [45]:
negative_rows_count = sum((df_combined_interventions[col] < 0).sum() for col in [f't{i}-t{i+1} diff in minutes' for i in range(0, 7)] if col in df_combined_interventions.columns)
negative_rows_count

4503

In [None]:
# List of the newly created time difference columns
time_diff_cols = [f't{i}_t{i+1}_diff' for i in range(0, 7)]

# Impute missing values, round to full integer, and rename columns
for i, col in enumerate(time_diff_cols):
    if col in df_combined_interventions.columns:  # Check if the column exists in the DataFrame
       
        # Rename the column to "tx-ty diff in minutes"
        new_col_name = f't{i}-t{i+1} diff in minutes'
        df_combined_interventions.rename(columns={col: new_col_name}, inplace=True)


In [None]:
time_diff_cols_renamed = [f't{i}-t{i+1} diff in minutes' for i in range(0, 7)]

# Iterate over each renamed time difference column
for col in time_diff_cols_renamed:
    if col in df_combined_interventions.columns:
        # Calculate the mean excluding negative values
        mean_value = df_combined_interventions[col][df_combined_interventions[col] > 0].mean()

        # Replace negative values with the calculated mean
        df_combined_interventions[col] = df_combined_interventions[col].apply(lambda x: mean_value if x < 0 else x)

        # Ensure the column is rounded to full integers
        df_combined_interventions[col] = df_combined_interventions[col].round(0)


In [None]:
negative_rows_count = sum((df_combined_interventions[col] < 0).sum() for col in [f't{i}-t{i+1} diff in minutes' for i in range(0, 7)] if col in df_combined_interventions.columns)
negative_rows_count

In [51]:
# Create a SimpleImputer object with strategy set to 'mean'
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
number_of_transported = df_combined_interventions['number_of_transported_persons'].values.reshape(-1, 1)
df_combined_interventions['number_of_transported_persons'] = imputer.fit_transform(number_of_transported)
df_combined_interventions['number_of_transported_persons']=round(df_combined_interventions['number_of_transported_persons'],0)
df_combined_interventions['t1-t2 diff in minutes']=round(df_combined_interventions['t1-t2 diff in minutes'],0)
df_combined_interventions['t2-t3 diff in minutes']=round(df_combined_interventions['t2-t3 diff in minutes'],0)
df_combined_interventions['t3-t4 diff in minutes']=round(df_combined_interventions['t3-t4 diff in minutes'],0)
df_combined_interventions['t4-t5 diff in minutes']=round(df_combined_interventions['t4-t5 diff in minutes'],0)
df_combined_interventions['t5-t6 diff in minutes']=round(df_combined_interventions['t5-t6 diff in minutes'],0)
df_combined_interventions['t6-t7 diff in minutes']=round(df_combined_interventions['t6-t7 diff in minutes'],0)

In [56]:
df_combined_interventions['t1-t2 diff in minutes'].mean()

4.34028895811989