In [None]:
# Import packages
from datetime import datetime
import pandas as pd
import numpy as np
import statsmodels.api as sm
import os

In [None]:
# Read the facilities table from disk and preview its first rows
facilities_path = "./Data/facilities.csv"
fac = pd.read_csv(facilities_path)
fac.head()

In [None]:
# Print, for every facilities column, the fraction of rows whose value is missing
n_fac_rows = float(len(fac))
for col in fac.columns:
    n_missing = int(fac[col].isnull().sum())
    missings = n_missing / n_fac_rows
    print(col, missings)

# All sales_* columns have a very high number of missing values indicating a data quality issue.

In [None]:
# Closer look at the missing values in the facility columns

# Columns that are NOT inspected: the sales_* columns (data-quality issue noted above)
# and the descriptive metadata columns listed here.
exclude_prefix = 'sales_'
metadata = {'station_id', 'station', 'name', 'street', 'zip', 'city'}
cols_to_check = []
for column in fac.columns:
    if column.startswith(exclude_prefix) or column in metadata:
        continue
    cols_to_check.append(column)

print(f"Number of columns checked: {len(cols_to_check)}")
print("Some of the checked columns:", cols_to_check[:12])

# Cell-level NaN flags for the inspected columns; a row is selected by nan_mask
# as soon as one of its inspected columns is NaN.
missing_flags = fac[cols_to_check].isna()
nan_mask = missing_flags.any(axis=1)

# Result table: the identifier columns that exist in fac, followed by the inspected columns
id_cols = [c for c in ('station_id', 'station', 'name') if c in fac.columns]
result_cols = id_cols + cols_to_check
nan_stations = fac.loc[nan_mask, result_cols].copy()

# Per station: how many of the inspected columns are NaN
nan_stations['n_missing'] = missing_flags.loc[nan_mask].sum(axis=1)

# Per inspected column: how many of the selected stations are missing it, largest first
missing_per_column = nan_stations[cols_to_check].isna().sum()
nan_per_column = missing_per_column.sort_values(ascending=False).to_frame(name='n_missing_in_subset')

# Overview
print("Number of stations with at least 1 NaN:", nan_stations.shape[0])
display(nan_stations.head(50))
display(nan_per_column)

In [None]:
# From the above, we see that the stations with missing facility data contain no information in all facility columns.
# Those stations are therefore dropped before the analysis, and the sales_* columns
# are left out of the resulting dataframe as well.
sales_cols = [col for col in fac.columns if col.startswith('sales_')]
cols_to_keep = [col for col in fac.columns if not col.startswith('sales_')]
rows_to_keep = ~nan_mask  # stations without any NaN in the inspected columns
fac_no_missing_facilities = fac.loc[rows_to_keep, cols_to_keep].copy()
fac_no_missing_facilities.head()

In [None]:
# Turn disabled_parking_spots into a binary indicator (1.0 when the value is > 0, else 0.0).
# The column is read from the filtered frame itself instead of from `fac`, so the
# assignment no longer depends on pandas re-aligning the unfiltered frame by index.
fac_no_missing_facilities["disabled_parking_ind"] = (
    fac_no_missing_facilities["disabled_parking_spots"].fillna(0) > 0
).astype(float)

# Columns treated as binary facility flags (adjust the names if they differ slightly in your data).
# NOTE(review): "bicycle_spots" is summed as if it were a 0/1 flag — confirm it is not a
# count like disabled_parking_spots, otherwise it should get its own indicator as well.
binary_cols = [
    "ticket_vending_machine", "luggage_lockers", "free_parking", "taxi",
    "bicycle_spots", "blue-bike", "bus", "tram", "metro",
    "wheelchair_available", "ramp", "disabled_parking_ind",
    "elevated_platform", "escalator_up", "escalator_down",
    "elevator_platform", "audio_induction_loop",
]

# Number of facilities per station: row-wise sum over the binary columns, using
# disabled_parking_ind rather than the raw disabled_parking_spots value.
fac_no_missing_facilities["n_facilities"] = fac_no_missing_facilities[binary_cols].sum(axis=1)
fac_no_missing_facilities


In [None]:
# Read the travelers table (semicolon-separated, first column used as the index)
travelers = pd.read_csv("./Data/travelers.csv", sep=";", index_col=0)

# Shorter column names that are easier to work with in the cells below
short_names = {
    "Station": "station",
    "Avg number of travelers in the week": "week",
    "Avg number of travelers on Saturday": "saturday",
    "Avg number of travelers on Sunday": "sunday",
}
travelers = travelers.rename(columns=short_names)
travelers.head()

In [None]:
# Print, for every travelers column, the fraction of rows whose value is missing
n_traveler_rows = float(len(travelers))
for col in travelers.columns:
    n_missing = int(travelers[col].isnull().sum())
    missings = n_missing / n_traveler_rows
    print(col, missings)

In [None]:
# Further inspection on Wikipedia and the NMBS website reveals that there are no train rides on these dates for these stations.
# For example, Baasrode-Zuid & Buda only have train rides during the week and none in the weekend.
# Every missing traveler count is therefore replaced by zero.
for day_col in ('week', 'saturday', 'sunday'):
    travelers[day_col] = travelers[day_col].fillna(0)

# Show
travelers.head()

In [None]:
# Total travelers over a full week: five times the "week" average plus Saturday and Sunday
five_weekdays = travelers["week"].mul(5)
travelers["week_total"] = five_weekdays.add(travelers["saturday"]).add(travelers["sunday"])

In [None]:
# Average travelers per day over all seven days (weekend included)
days_per_week = float(7)
travelers["avg_day"] = travelers["week_total"].div(days_per_week)
travelers

In [None]:
# Read the stations table from disk and preview its first rows
stations_path = "./Data/stations.csv"
stations = pd.read_csv(stations_path)
stations.head()

In [None]:
# Expose avg_stop_times under the name daily_trains, which the merge below expects.
# NOTE(review): presumably avg_stop_times is the average number of train stops per day — confirm.
stations = stations.assign(daily_trains=stations["avg_stop_times"])
stations["daily_trains"]

In [None]:
# Merge datasets on station_id and name
from unidecode import unidecode


def _normalise_station_name(raw_name):
    """Return the name as text, transliterated with unidecode, lower-cased and stripped."""
    return unidecode(str(raw_name)).lower().strip()


# Comparable join keys for the name-based merge with the travelers table
fac_no_missing_facilities['name_clean'] = fac_no_missing_facilities['name'].map(_normalise_station_name)
travelers['station_clean'] = travelers['station'].map(_normalise_station_name)

# Left joins keep every facilities row: first the train counts (by station_id),
# then the average daily travelers (by normalised name).
df = (
    fac_no_missing_facilities
    .merge(stations[['station_id', 'daily_trains']], on='station_id', how='left')
    .merge(travelers[['station_clean', 'avg_day']],
           left_on='name_clean', right_on='station_clean', how='left')
)
df

In [None]:
# OLS regression of the number of facilities on the number of trains and travelers
predictors = ['daily_trains', 'avg_day']

# Only rows with all three variables present enter the regression
reg_df = df[['n_facilities'] + predictors].dropna()
y = reg_df['n_facilities']
X = sm.add_constant(reg_df[predictors])  # adds the intercept column

model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Read the incidents table (semicolon-separated, first column used as the index)
incidents = pd.read_csv("./Data/incidents.csv", sep=";", index_col=0)
# Remove the duplicate columns 'Place.1' and 'Place.2' when they are present
incidents = incidents.drop(columns=['Place.1', 'Place.2'], errors='ignore')
incidents.head()

Unnamed: 0_level_0,Incident date,Line,Place,Incident description NL,Incident description FR,Incident description,Minutes of delay,Number of cancelled trains
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-08,2025-08-25,96A,Y.RUISBROEK,Schade bovenleiding,Avarie caténaire,Damage catenary,1275,9
2025-08,2025-08-13,59,SINAAI,Storing aan een overweg,Dérangement à un passage à niveau,Disturbance at a level crossing,1020,38
2025-08,2025-08-13,0/1,BRUSSEL-NOORD,Aanwezigheid verdacht pakket,Présence d'un colis suspect,Presence of a suspicious package,5866,169
2025-08,2025-08-11,50A,Y.MEULEWIJK,Storing seininrichting,Dérangement à la signalisation,Disturbance with signalling,1986,17
2025-08,2025-08-07,50C,ANDERLECHT,Brand in de nabijheid van het spoor,Incendie à proximité des voies,Fire near the tracks,9671,207


In [None]:
# TODO: still need to analyse the null values in incidents

In [None]:
# Descriptive statistics of the delay minutes, per incident type
delays_by_type = incidents.groupby('Incident description')['Minutes of delay']
incident_delay_stats = delays_by_type.describe()
print(incident_delay_stats)

                                                    count         mean  \
Incident description                                                     
Accident at a level crossing                         18.0  1420.166667   
Body in the tracks                                    7.0  1994.285714   
Bomb alert                                            4.0  3016.250000   
Cable theft                                          15.0  1627.466667   
Collision with a person                             117.0  2293.905983   
Collision with an animal                              4.0  2841.750000   
Crossing of a red signal                              8.0  1694.750000   
Damage catenary                                      63.0  1893.269841   
Damage high speed train                               2.0  4679.500000   
Damage rolling stock                                122.0  1602.795082   
Dangerous goods near the tracks                       2.0  2084.500000   
Delays from neighbouring networks     

In [39]:
# Extreme-delay hotspots: for every incident type, find the location with the most
# extreme delays and summarise it. numpy (np) and pandas (pd) come from the import
# cell at the top of the notebook.

# A delay is "extreme" when it reaches this quantile of the delays of its own incident type.
EXTREME_DELAY_QUANTILE = 0.9


def _top_extreme_location(incident_type, group, station_travelers=None,
                          quantile=EXTREME_DELAY_QUANTILE):
    """Summarise the location with the most extreme delays for one incident type.

    Parameters
    ----------
    incident_type : value of 'Incident description' shared by all rows of ``group``.
    group : DataFrame with the columns 'Minutes of delay', 'Place' and
        'Number of cancelled trains'.
    station_travelers : optional DataFrame with the columns 'station' and 'avg_day';
        when given, it is left-joined on Place == station.
    quantile : quantile of 'Minutes of delay' used as the extreme-delay threshold.

    Returns
    -------
    dict with the summary of the top location, or None when no threshold can be
    computed or no extreme row has a Place.
    """
    delays = group['Minutes of delay']
    # Series.quantile skips NaN. np.percentile returns NaN as soon as a single delay
    # is missing, which made every comparison False and dropped the incident type.
    threshold = delays.quantile(quantile)
    if pd.isna(threshold):
        return None

    extreme = group[delays >= threshold]
    loc_stats = (
        extreme.groupby('Place')
        .agg(
            n_extreme_delays=('Minutes of delay', 'count'),
            mean_extreme_delay=('Minutes of delay', 'mean'),
            max_extreme_delay=('Minutes of delay', 'max'),
            cancelled_trains=('Number of cancelled trains', 'sum'),
        )
        .sort_values('n_extreme_delays', ascending=False)
        .reset_index()
    )
    if loc_stats.empty:
        return None

    if station_travelers is not None:
        # The left join keeps the ranking order of loc_stats; places without a
        # matching station get NaN.
        loc_stats = loc_stats.merge(station_travelers, left_on='Place', right_on='station', how='left')
        loc_stats['estimated_affected_travelers'] = loc_stats['cancelled_trains'] * loc_stats['avg_day']
    else:
        loc_stats['estimated_affected_travelers'] = np.nan

    top_loc = loc_stats.iloc[0]  # location with the highest number of extreme delays
    return {
        'incident_type': incident_type,
        'location': top_loc['Place'],
        'n_extreme_delays': int(top_loc['n_extreme_delays']),
        'mean_extreme_delay': float(top_loc['mean_extreme_delay']),
        'max_extreme_delay': float(top_loc['max_extreme_delay']),
        'cancelled_trains': int(top_loc['cancelled_trains']),
        # Missing estimates stay NaN so the column is uniformly float
        'estimated_affected_travelers': float(top_loc['estimated_affected_travelers']),
        'threshold': float(threshold),
    }


# The travelers slice does not change between incident types, so it is built once.
station_travelers = travelers[['station', 'avg_day']] if 'avg_day' in travelers.columns else None

results = []
for incident_type, group in incidents.groupby('Incident description'):
    top_location = _top_extreme_location(incident_type, group, station_travelers)
    if top_location is not None:
        results.append(top_location)

# Passing the column list guarantees all expected columns exist, even without results
expected_cols = [
    'incident_type', 'location', 'n_extreme_delays', 'mean_extreme_delay',
    'max_extreme_delay', 'cancelled_trains', 'estimated_affected_travelers', 'threshold'
]
summary_df = pd.DataFrame(results, columns=expected_cols)

ranked_summary = summary_df.sort_values('estimated_affected_travelers', ascending=False)
print('Summary of top incident-location combinations with extreme delays:')
print(ranked_summary[expected_cols])

# Recommendations (example logic): the first keyword found in the incident type wins.
# NOTE(review): none of the incident types visible in the notebook output contain
# these keywords, so the fallback recommendation is what gets printed — confirm the
# intended keyword mapping.
recommendation_rules = (
    ('technical', '  - Recommendation: Increase preventive maintenance and rapid response teams.'),
    ('overcrowding', '  - Recommendation: Improve scheduling and add more carriages.'),
    ('infrastructure', '  - Recommendation: Upgrade infrastructure or provide alternative routes.'),
)
fallback_recommendation = '  - Recommendation: Improve real-time communication and incident management.'

for row in ranked_summary.head(5).itertuples():
    print(f"Priority: {row.incident_type} at {row.location} (Extreme delays: {row.n_extreme_delays}, Cancelled trains: {row.cancelled_trains}, Estimated affected travelers: {row.estimated_affected_travelers})")
    incident_label = row.incident_type.lower()
    recommendation = next(
        (text for keyword, text in recommendation_rules if keyword in incident_label),
        fallback_recommendation,
    )
    print(recommendation)

Summary of top incident-location combinations with extreme delays:
                                        incident_type           location  \
9                                Damage rolling stock             BRUGGE   
16                        Disturbance with signalling  ANTWERPEN-BERCHEM   
18                            Error during a maneuver           MECHELEN   
31                        Obstacle in/near the tracks        DENDERLEEUW   
3                                         Cable theft           GEMBLOUX   
7                                     Damage catenary             AALTER   
28                           Late completion of works             TIENEN   
4                             Collision with a person         KORTENBERG   
8                             Damage high speed train              HALLE   
0                        Accident at a level crossing              LEUZE   
17                            Disturbance with switch             TUBIZE   
25                   