In [1]:
# add number of delays per day over full dataset

In [2]:
import mca
import numpy as np
import pandas as pd
import seaborn as sns
import requests
from datetime import datetime, timedelta
import os
import html
import re
from dateutil.parser import parse
import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_val_score, GridSearchCV, StratifiedKFold

## Data loading 

In [3]:
df_lines = pd.read_csv("./Online_Data/referentiel-des-lignes.csv", sep=';')
stops_data = pd.read_csv("./Online_Data/arrets.csv", sep=';')

trafic2023_ratio = pd.read_csv("./Online_Data/validations-1er-semestre.csv", sep=';')
trafic2023_raw = pd.read_csv("./Online_Data/validations-reseau.csv", sep=";")

hourly_weather = pd.read_csv("./Online_Data/Weather/hourly_weather.csv")
minutely_15_weather = pd.read_csv("./Online_Data/Weather/minutely_15_weather.csv")

metro_incident = pd.read_csv("./Collected_Data/metro_line_reports.csv")
rer_incident = pd.read_csv("./Collected_Data/rer_line_reports.csv")

df_holidays = pd.read_csv("./Online_Data/vacances-scolaires-par-zone.csv", sep = ";")
df_bank_holidays = pd.read_csv("./Online_Data/jours_feries_metropole.csv", sep=",")

In [4]:
delays_metro = pd.read_csv("./Collected_Data/metro_delays.csv")
delays_rer = pd.read_csv("./Collected_Data/rail_delays.csv")
onTime_metro = pd.read_csv("./Collected_Data/metro_onTime.csv", low_memory=False)
onTime_rer = pd.read_csv("./Collected_Data/rail_onTime.csv", low_memory=False)

In [5]:
delays_metro2 = pd.read_csv("./Collected_Data/metro_delays2.csv")
delays_rer2 = pd.read_csv("./Collected_Data/rail_delays2.csv")
onTime_metro2 = pd.read_csv("./Collected_Data/metro_onTime2.csv", low_memory=False)
onTime_rer2 = pd.read_csv("./Collected_Data/rail_onTime2.csv", low_memory=False)

## Data cleaning

In [6]:
delays_metro = pd.concat([delays_metro, delays_metro2], ignore_index=True)
delays_rer = pd.concat([delays_rer, delays_rer2], ignore_index=True)
onTime_metro = pd.concat([onTime_metro, onTime_metro2], ignore_index=True)
onTime_rer = pd.concat([onTime_rer, onTime_rer2], ignore_index=True)

In [7]:
#Create filter
stops_filter = [22086, 463013, 22136, 462993, 21964, 462969, 22125, 463113, 41295, 473921, 473993, 41354, 474060, 474061]
name_filter = ["CH.D.G.ETOILE", "CHATELET", "SAINT-LAZARE","ST-GERM.D.PRES", "BLANCHE", "AVENUE DU PRESIDENT KENNEDY", "BUNO GIRONVILLE", "MASSY PALAISEAU"]
lines_filter = ["C01371", "C01372", "C01374" ," C01382", "C01742", "C01743", "C01727", "C0172"]

In [8]:
#Clean datasets
line_refs = df_lines[(~df_lines['TransportSubmode'].isin(['suburbanRailway', 'regionalRail', 'railShuttle']))]
line_refs = line_refs[['ID_Line', 'TransportMode', 'Name_Line']]
line_refs = line_refs.sort_values(by='ID_Line')

stops_data = stops_data[stops_data['ArRType'].isin(['metro', 'rail'])]
stops_data = stops_data.sort_values(by=['ArRType', 'ArRId'])
stops_data = stops_data[['ArRId', 'ArRName', 'ArRType', 'ArRTown']]

def cleaning_message(df):
    df['ref'] = df['ref'].str.replace('stop_point:IDFM:', '', regex=False)
    df['ref'] = df['ref'].str.replace('line:IDFM:', '', regex=False)
    df['ref'] = df['ref'].str.replace('stop_area:IDFM:', '', regex=False)
    df['message_text'] = df['message_text'].str.replace('<p>', '', regex=False)
    df['message_text'] = df['message_text'].str.replace('<br>', '', regex=False) 
    df = df[df["channel_name"].isin(["moteur"])]

cleaning_message(metro_incident)
cleaning_message(rer_incident)

metro_incident['message_text'] = metro_incident['message_text'].str.split('</p>').str[0]
metro_incident.drop_duplicates(subset=["disruption_id"], inplace=True)
metro_incident.drop(['tags', 'category', 'updated_at', 'channel_name'], axis=1, inplace=True) 

rer_incident['message_text'] = rer_incident['message_text'].str.replace('</p>', '', regex=False)
rer_incident['message_text'] = rer_incident['message_text'].apply(html.unescape)
rer_incident.drop_duplicates(subset=["disruption_id"], inplace=True)
rer_incident.drop(['tags', 'category', 'updated_at', 'channel_name'], axis=1, inplace=True) 

def refs(df):
    df['line_ref'] = df['line_ref'].str.replace('STIF:Line::', '', regex=False).str.rstrip(':')
    df['stop_reference'] = pd.to_numeric(df['stop_reference'], errors='coerce')
    df['stop_reference'] = df['stop_reference'].fillna(0).astype('int64')

refs(delays_metro)
refs(delays_rer)
refs(onTime_metro)
refs(onTime_rer)

delays_metro.drop(['scheduled_arrival','scheduled_departure','arrival_difference','departure_difference'], axis=1, inplace=True)
onTime_metro.drop(['scheduled_arrival','scheduled_departure','arrival_difference','departure_difference'], axis=1, inplace=True)

#mapping = dict(zip(name_filter, stops_filter))
#trafic2023_ratio['LIBELLE_ARRET_REA'] = trafic2023_ratio['LIBELLE_ARRET'].replace(mapping) #issue -  incorrect mapping? stop number refers to wrong stop
#trafic2023_ratio = trafic2023_ratio[trafic2023_ratio['LIBELLE_ARRET_REA'].isin(stops_filter)]

#trafic2023_raw['LIBELLE_ARRET_REA'] = trafic2023_raw['LIBELLE_ARRET'].replace(mapping) #don't need anymore?
#trafic2023_raw.drop(["lda"],axis=1, inplace=True)

In [9]:
trafic2023_ratio_rer = trafic2023_ratio[trafic2023_ratio['CODE_STIF_TRNS'].isin([810, 800])]
trafic2023_ratio_metro = trafic2023_ratio[trafic2023_ratio['CODE_STIF_TRNS'].isin([100])]
trafic2023_raw_rer = trafic2023_raw[trafic2023_raw['CODE_STIF_TRNS'].isin([810, 800])]
trafic2023_raw_metro = trafic2023_raw[trafic2023_raw['CODE_STIF_TRNS'].isin([100])]

In [10]:
#Holidays and day types
df_bank_holidays["Date"] = pd.to_datetime(df_bank_holidays["date"])
df_holidays["Date"] = pd.to_datetime(df_holidays["Date"])

df_with_holidays = df_holidays.merge(df_bank_holidays, on = "Date", how = "left")
df_with_holidays = df_with_holidays.sort_values(by='Date')

df_with_holidays['holiday_type'] = df_with_holidays['nom_jour_ferie'].combine_first(df_with_holidays['Nom de la période'])
df_with_holidays['day_of_week'] = df_with_holidays['Date'].dt.weekday

def classify_day(row):
    if row['day_of_week'] < 5:  # Weekdays (Monday to Friday)
        if pd.notna(row['holiday_type']):
            return 'JOVS'  # Weekday with a holiday
        else:
            return 'JOHV'  # Weekday without a holiday
    elif row['day_of_week'] == 5:  # Saturday
        if pd.notna(row['holiday_type']):
            return 'SAVS'  # Saturday with a holiday
        else:
            return 'SAHV'  # Saturday without a holiday
    elif row['day_of_week'] == 6:  # Sunday
        return 'DIJFP'  # Sunday (always labeled DIJFP)
    return None

df_with_holidays['day_type'] = df_with_holidays.apply(classify_day, axis=1)
df_with_holidays["is_bank_holiday"] = (df_with_holidays["nom_jour_ferie"]).notna().astype(int)
df_with_holidays["is_holiday"] = (df_with_holidays["Nom de la période"]).notna().astype(int)
df_with_holidays["saturday"]= (df_with_holidays["Date"].dt.weekday == 5).astype(int)
df_with_holidays["sunday"]=  (df_with_holidays["Date"].dt.weekday == 6).astype(int)
df_with_holidays["is_weekend"]= df_with_holidays["Date"].dt.weekday.isin([5,6]).astype(int)
df_with_holidays["is_weekend_or_bank_holiday"] = df_with_holidays[["is_weekend", "is_bank_holiday"]].max(axis=1)

df_with_holidays.drop(['timestamp_unix', 'date', 'annee', 'zone', 'Calendrier Zone A', 'Calendrier Zone B', 'Calendrier Zone C'], axis=1, inplace=True)

In [11]:
#filter data set

In [12]:
#Merge datasets
merged_metro = pd.concat([delays_metro, onTime_metro], ignore_index=True)
merged_metro = pd.merge(merged_metro, stops_data, left_on='stop_reference', right_on='ArRId')
merged_metro.drop(['ArRId', 'ArRName', 'ArRType', 'transport_mode', 'recorded_at_time'], axis=1, inplace=True) 

merged_rer = pd.concat([delays_rer, onTime_rer], ignore_index=True)
merged_rer = pd.merge(merged_rer, stops_data, left_on='stop_reference', right_on='ArRId')
merged_rer.drop(['ArRId', 'ArRName', 'ArRType', 'transport_mode', 'recorded_at_time'],  axis=1, inplace=True)

def date_format(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'].str.replace('Z', ''), errors='coerce')
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['real_arrival'] = pd.to_datetime(df['real_arrival']).dt.strftime('%Y-%m-%d %H:%M:%S')
    df['real_arrival'] = pd.to_datetime(df['real_arrival'])
    df['real_departure'] = pd.to_datetime(df['real_departure']).dt.strftime('%Y-%m-%d %H:%M:%S')
    df['real_departure'] = pd.to_datetime(df['real_departure'])


def format_rer(df):
    df['scheduled_arrival'] = pd.to_datetime(df['scheduled_arrival']).dt.strftime('%Y-%m-%d %H:%M:%S')
    df['scheduled_arrival'] = pd.to_datetime(df['scheduled_arrival'])
    df['scheduled_departure'] = pd.to_datetime(df['scheduled_departure']).dt.strftime('%Y-%m-%d %H:%M:%S')
    df['scheduled_departure'] = pd.to_datetime(df['scheduled_departure'])
    df.loc[(df['arrival_difference'] >= 3) | (df['departure_difference'] >= 3), 'departure_status'] = 'delayed'
    df['nearest_datetime'] = df['scheduled_arrival'].combine_first(df['scheduled_departure'])

date_format(merged_metro)
date_format(merged_rer)
format_rer(merged_rer)

merged_metro = merged_metro.sort_values(by='real_arrival')
merged_rer = merged_rer.sort_values(by='nearest_datetime')

In [13]:
#Add holidays
merged_rer['day'] = pd.to_datetime(merged_rer['nearest_datetime']).dt.normalize()
merged_rer = merged_rer.merge(df_with_holidays, left_on='day', right_on='Date', how='left')
merged_rer['hour'] = merged_rer['nearest_datetime'].dt.hour.astype('int64')

merged_metro['day'] = pd.to_datetime(merged_metro['real_arrival']).dt.normalize()
merged_metro = merged_metro.merge(df_with_holidays, left_on='day', right_on='Date', how='left')
merged_metro['hour'] = merged_metro['real_arrival'].dt.hour.astype('int64')

merged_metro = merged_metro.sort_values(by='real_arrival')
merged_rer = merged_rer.sort_values(by='nearest_datetime')

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
merged_metro = merged_metro.merge(trafic2023_ratio_metro, left_on=['day_type', 'hour'], right_on=['CAT_JOUR', 'start_hour'], how='left')
#SGP = SGP.merge(SGP_traffic, left_on=['day_type', 'hour'], right_on=['CAT_JOUR', 'start_hour'], how='left')

In [270]:
merged_rer['scheduled_arrival'] = pd.to_datetime(merged_rer['scheduled_arrival'], format='%Y-%m-%d %H:%M:%S')
merged_rer['pourc_validations'] = merged_rer['pourc_validations'].fillna(0)
merged_rer["departure_status"] = merged_rer["departure_status"].astype("string")

merged_metro['real_arrival'] = pd.to_datetime(merged_metro['real_arrival'], format='%Y-%m-%d %H:%M:%S')
merged_metro['pourc_validations'] = merged_metro['pourc_validations'].fillna(0)
merged_metro["departure_status"] = merged_metro["departure_status"].astype("string")

start_date = pd.to_datetime('2024-11-02 00:00:00')

merged_metro = merged_metro[(merged_metro['real_arrival'] >= start_date)]
merged_rer = merged_rer[(merged_rer['scheduled_arrival'] >= start_date)]

# Summary Statistics

In [None]:
## for better readability, merge noth/south east/west directions

# Ensure the datetime column is in the correct format (if not already)
CLH['scheduled_arrival'] = pd.to_datetime(CLH['scheduled_arrival'])

# Filter for delayed departures
delayed_rer = CLH[CLH['departure_status'] == 'delayed']

# Add a 'day_of_week' column (0 = Monday, 1 = Tuesday, ..., 6 = Sunday) using .loc
delayed_rer.loc[:, 'day_of_week'] = delayed_rer['scheduled_arrival'].dt.dayofweek

# Map numerical days to actual weekday names using .loc
day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
delayed_rer.loc[:, 'day_of_week'] = delayed_rer['day_of_week'].map(day_names)

# Group by day of the week and destination_name, then count the number of delays
delayed_count_per_weekday = delayed_rer.groupby(['day_of_week', 'destination_name']).size().reset_index(name='delay_count')

# Calculate the average number of delays per weekday for each destination
average_delays_per_weekday = delayed_count_per_weekday.groupby(['day_of_week', 'destination_name'])['delay_count'].mean().reset_index()

# Sort by weekday order
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
average_delays_per_weekday['day_of_week'] = pd.Categorical(average_delays_per_weekday['day_of_week'], categories=ordered_days, ordered=True)

# Plot the average delays per weekday for each destination as a bar chart
plt.figure(figsize=(14, 6))
sns.barplot(
    data=average_delays_per_weekday,
    x='day_of_week',
    y='delay_count',
    hue='destination_name',
    palette = "tab10"
)

# Add labels and title
plt.xlabel('Day of Week')
plt.ylabel('Average Number of Delays')
plt.title('Average Number of Delays Per Weekday by Destination')

# Rotate x-ticks for better readability
plt.xticks(rotation=45)

# Add grid for better readability
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
rer["stop_name"].value_counts()

In [None]:
# Ensure datetime format
rer_filtered = rer[rer['arrival_difference'] <= 100].copy()
rer_filtered['hour'] = rer_filtered['scheduled_arrival'].dt.hour

# Group by 'hour' and 'ArRTown' to calculate mean delay
average_delay_by_category = rer_filtered.groupby(['hour', 'stop_name'])['arrival_difference'].mean().reset_index()

custom_palette = {
    'Massy - Palaiseau': '#4B92DB',
    'Avenue du Président Kennedy': '#F3D311',
    'Châtelet - Les Halles': '#F7403A',
    'Gare de Buno Gironville': '#3F9C35'
}

# Plot
plt.figure(figsize=(10, 6))
sns.lineplot(
    data=average_delay_by_category,
    x='hour',
    y='arrival_difference',
    hue='stop_name',
    palette=custom_palette,
    marker="o"
)

#plt.xticks(ticks=range(0, 24), labels=[str(hour) for hour in range(24)])
plt.gca().set_xticks(range(0, 24, 4))  # Major ticks every 4 hours

plt.xlabel('Hour of the Day')
plt.ylabel('Average Delay (in mins)')
plt.title('Average Delay Duration Over a Single Day by Stop')
plt.legend(title="stop_name")
plt.grid(True)
plt.savefig("./Figures/Average-Delay-Stop.png", format='png', dpi=300)
plt.show()


In [None]:
# Remove outliers where delay exceeds 100 seconds
rer_filtered = rer[rer['arrival_difference'] <= 100]

rer_filtered.loc[:, 'day_of_week'] = rer_filtered['scheduled_arrival'].dt.dayofweek
rer_filtered.loc[:, 'day_of_week'] = rer_filtered['day_of_week'].map(day_names)
rer_filtered.loc[:, 'day_of_week'] = pd.Categorical(
    rer_filtered['day_of_week'],
    categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ordered=True
)

# Group by 'day_of_week' and 'ArRTown' to calculate mean delay
average_delay_by_day = rer_filtered.groupby(
    ['day_of_week', 'stop_name'], observed=False
)['arrival_difference'].mean().reset_index()


custom_palette = {
    'Massy - Palaiseau': '#4B92DB',
    'Avenue du Président Kennedy': '#F3D311',
    'Châtelet - Les Halles': '#F7403A',
    'Gare de Buno Gironville': '#3F9C35'
}

# Plot
plt.figure(figsize=(10, 6))
sns.lineplot(
    data=average_delay_by_day,
    x='day_of_week',
    y='arrival_difference',
    hue='stop_name',
    palette=custom_palette,
    marker="o"
)
plt.xlabel('Day of the Week')
plt.ylabel('Average Delay (in secs)')
plt.title('Average Delay Duration by Day of the Week and Stop (Filtered)')
plt.legend(title="stop_name")
plt.grid(True)
plt.savefig("./Figures/Average-Delay-DayOfWeek-Stop.png", format='png', dpi=300)
plt.show()


In [None]:
## idk how to format the dates propertly and on axis ticker location

# Ensure the datetime column is in proper format
rer['Date'] = pd.to_datetime(rer['Date'])

# Filter for delayed departures
#delayed_rer = rer[rer['departure_status'] == 'delayed']

# Group by date and count the number of delays
#delayed_count_per_day = delayed_rer.groupby(delayed_rer['Date'].dt.date).size().reset_index(name='delay_count')
average_delay_by_category = rer_filtered.groupby(['Date', 'stop_name'])['arrival_difference'].mean().reset_index()

# Rename columns for clarity
average_delay_by_category.rename(columns={'Date': 'date'}, inplace=True)

# Convert 'date' to datetime for better handling of date ticks
average_delay_by_category['date'] = pd.to_datetime(average_delay_by_category['date'])

# Plot as a bar chart
plt.figure(figsize=(14, 6))
sns.barplot(
    data=average_delay_by_category,
    x='date',
    y='arrival_difference',
    color="#FF4500"  # Custom orange-red color
)

# Set x-axis ticks to be on each Monday
plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(byweekday=mdates.MO))  # Major ticks every Monday

# Get the current axis and set the ticks
ax = plt.gca()
ticks = ax.get_xticks()

# Alternate the height of the ticks
for i, tick in enumerate(ticks):
    if i % 2 == 0:
        ax.get_xticklabels()[i].set_verticalalignment('bottom')  # Lower even-index ticks
    else:
        ax.get_xticklabels()[i].set_verticalalignment('top')  # Raise odd-index ticks

# Add labels and title
plt.xlabel('Date')
plt.ylabel('Duration of Delays')
plt.title('Duration of Delays Per Day')

# Enable grid for better visibility of bar heights
plt.grid(axis='y', linestyle='--', linewidth=0.7)

# Tight layout for better spacing
plt.tight_layout()

# Save and show the plot
plt.savefig("./Figures/Duration-Delays-Per-Day-Weekly-Ticks-Alternating.png", format='png', dpi=300)
plt.show()


In [None]:
## idk how to format the dates propertly and on axis ticker location

# Ensure the datetime column is in proper format
rer['Date'] = pd.to_datetime(rer['Date'])

# Filter for delayed departures
delayed_rer = rer[rer['departure_status'] == 'delayed']

# Group by date and count the number of delays
delayed_count_per_day = delayed_rer.groupby(delayed_rer['Date'].dt.date).size().reset_index(name='delay_count')

# Rename columns for clarity
delayed_count_per_day.rename(columns={'Date': 'date'}, inplace=True)

# Convert 'date' to datetime for better handling of date ticks
delayed_count_per_day['date'] = pd.to_datetime(delayed_count_per_day['date'])

# Plot as a bar chart
plt.figure(figsize=(14, 6))
sns.barplot(
    data=delayed_count_per_day,
    x='date',
    y='delay_count',
    color="#FF4500"  # Custom orange-red color
)

# Set x-axis ticks to be on each Monday
plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(byweekday=mdates.MO))  # Major ticks every Monday

# Get the current axis and set the ticks
ax = plt.gca()
ticks = ax.get_xticks()

# Alternate the height of the ticks
for i, tick in enumerate(ticks):
    if i % 2 == 0:
        ax.get_xticklabels()[i].set_verticalalignment('bottom')  # Lower even-index ticks
    else:
        ax.get_xticklabels()[i].set_verticalalignment('top')  # Raise odd-index ticks

# Add labels and title
plt.xlabel('Date')
plt.ylabel('Number of Delays')
plt.title('Number of Delays Per Day')

# Enable grid for better visibility of bar heights
plt.grid(axis='y', linestyle='--', linewidth=0.7)

# Tight layout for better spacing
plt.tight_layout()

# Save and show the plot
plt.savefig("./Figures/Delays-Per-Day-Weekly-Ticks-Alternating.png", format='png', dpi=300)
plt.show()


In [None]:
# Select numerical variables
numerical_vars = metro_learn.select_dtypes(include=['float64'])

# Standardize the numerical variables
scaler = StandardScaler()
numerical_vars_scaled = scaler.fit_transform(numerical_vars)

# Perform PCA
pca = PCA()
pca_results = pca.fit_transform(numerical_vars_scaled)

In [None]:
# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(
    pca_results,
    columns=[f'PC{i+1}' for i in range(pca_results.shape[1])]
)

# Explained variance ratio
explained_variance = pca.explained_variance_ratio_

# Output the PCA results and explained variance
print("Explained Variance Ratio:", explained_variance)
print("PCA DataFrame Head:\n", pca_df.head())

In [None]:
# -or
# Assuming you already have your dataset loaded into 'APK' DataFrame and it's preprocessed

# Select numerical columns
numerical_vars = metro_learn.select_dtypes(include=['float64'])

# Standardize the data
scaler = StandardScaler()
numerical_vars_scaled = scaler.fit_transform(numerical_vars)

# Perform PCA
pca = PCA()
pca.fit(numerical_vars_scaled)

# Eigenvalues (explained variance for each principal component)
eigval = pca.explained_variance_

# Number of components (k)
k = len(eigval)

# Scree plot
plt.plot(np.arange(1, k + 1), eigval, marker='o')
plt.title("Scree plot")
plt.ylabel("Eigen values")
plt.xlabel("Factor number")
plt.grid(True)
plt.savefig('Screeplot.png', bbox_inches='tight', dpi=300)
plt.show()


In [None]:
plt.plot(np.arange(1,k+1),np.cumsum(pca.explained_variance_ratio_)) 
plt.title("Explained variance vs. # of factors") 
plt.ylabel("Cumsum explained variance ratio") 
plt.xlabel("Factor number") 
plt.savefig('CumSumPlot.png', bbox_inches='tight', dpi=300)
plt.show()

In [219]:
## 5 factors picked for expainability

In [None]:
eigval

In [None]:
print(np.cumsum(pca.explained_variance_ratio_))

In [None]:
# Assuming APK_learn is your DataFrame, and categorical columns are encoded as integers
# Select columns that are of integer type (numerical encoded categorical columns)
categorical_columns = APK_learn.select_dtypes(include=['int64']).columns

# Run MCA using the mca library, specifying n_components for multiple components
mca_model = mca.MCA(APK_learn[categorical_columns], ncols=7)  # Request 7 components

# Access the MCA results (coordinates of the data points in the reduced space)
mca_results = mca_model.fs_r()  # Call the function to get the result

# Create a DataFrame with the MCA results
mca_df = pd.DataFrame(mca_results, columns=[f'MCA{i+1}' for i in range(mca_results.shape[1])])

# Display the first few rows of the MCA results
print(mca_df.head())


In [None]:
# Optionally, you can plot the first two components (MCA1 vs MCA2)
plt.figure(figsize=(10, 6))
plt.scatter(mca_df['MCA1'], mca_df['MCA2'], alpha=0.5)
plt.title('MCA: First Two Components')
plt.xlabel('MCA1')
plt.ylabel('MCA2')
plt.grid(True)
plt.show()

In [None]:
# Ensure the datetime column is in the correct format (if not already)
rer['scheduled_arrival'] = pd.to_datetime(rer['scheduled_arrival'])

# Extract hour from the datetime column
rer['hour'] = rer['scheduled_arrival'].dt.hour

# Group by hour and calculate the average pourc_validations
hourly_validations = rer.groupby('hour')['pourc_validations'].mean().reset_index()

# Plot the hourly average pourc_validations
plt.figure(figsize=(14, 6))
sns.lineplot(
    data=hourly_validations,
    x='hour',
    y='pourc_validations',
    marker='o',
    color='b'  # Blue color for the line
)

# Add labels and title
plt.xlabel('Hour of Day')
plt.ylabel('Average Pourc Validations')
plt.title('Average Pourc Validations Per Hour of Day')

# Set the x-ticks to be from 0 to 23 (representing hours)
plt.xticks(range(24))

# Add grid for better readability
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()


# metro

In [None]:
## for better readability, merge noth/south east/west directions

# Ensure the datetime column is in the correct format (if not already)
metro['real_arrival'] = pd.to_datetime(metro['real_arrival'])

# Filter for delayed departures
delayed_rer = metro[metro['departure_status'] == 'delayed']

# Add a 'day_of_week' column (0 = Monday, 1 = Tuesday, ..., 6 = Sunday) using .loc
delayed_rer.loc[:, 'day_of_week'] = delayed_rer['real_arrival'].dt.dayofweek

# Map numerical days to actual weekday names using .loc
day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
delayed_rer.loc[:, 'day_of_week'] = delayed_rer['day_of_week'].map(day_names)

# Group by day of the week and destination_name, then count the number of delays
delayed_count_per_weekday = delayed_rer.groupby(['day_of_week', 'destination_name']).size().reset_index(name='delay_count')

# Calculate the average number of delays per weekday for each destination
average_delays_per_weekday = delayed_count_per_weekday.groupby(['day_of_week', 'destination_name'])['delay_count'].mean().reset_index()

# Sort by weekday order
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
average_delays_per_weekday['day_of_week'] = pd.Categorical(average_delays_per_weekday['day_of_week'], categories=ordered_days, ordered=True)

# Plot the average delays per weekday for each destination as a bar chart
plt.figure(figsize=(14, 6))
sns.barplot(
    data=average_delays_per_weekday,
    x='day_of_week',
    y='delay_count',
    hue='destination_name',
    palette = "tab10"
)

# Add labels and title
plt.xlabel('Day of Week')
plt.ylabel('Average Number of Delays')
plt.title('Average Number of Delays Per Weekday by Destination')

# Rotate x-ticks for better readability
plt.xticks(rotation=45)

# Add grid for better readability
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
## idk how to format the dates propertly and on axis ticker location

# Ensure the datetime column is in proper format
metro['real_arrival'] = pd.to_datetime(metro['real_arrival'])

# Filter for delayed departures
delayed_metro = metro[metro['departure_status'] == 'delayed']

# Group by date and count the number of delays
delayed_count_per_day = delayed_metro.groupby(delayed_metro['real_arrival'].dt.date).size().reset_index(name='delay_count')

# Rename columns for clarity
delayed_count_per_day.rename(columns={'real_arrival': 'date'}, inplace=True)

# Convert 'date' to datetime for better handling of date ticks
delayed_count_per_day['date'] = pd.to_datetime(delayed_count_per_day['date'])

# Plot as a bar chart
plt.figure(figsize=(14, 6))
sns.barplot(
    data=delayed_count_per_day,
    x='date',
    y='delay_count',
    color="#FF4500"  # Custom orange-red color
)

# Set x-axis ticks to be on each Monday
plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(byweekday=mdates.MO))  # Major ticks every Monday

# Get the current axis and set the ticks
ax = plt.gca()
ticks = ax.get_xticks()

# Alternate the height of the ticks
for i, tick in enumerate(ticks):
    if i % 2 == 0:
        ax.get_xticklabels()[i].set_verticalalignment('bottom')  # Lower even-index ticks
    else:
        ax.get_xticklabels()[i].set_verticalalignment('top')  # Raise odd-index ticks

# Add labels and title
plt.xlabel('Date')
plt.ylabel('Number of Delays')
plt.title('Number of Delays Per Day')

# Enable grid for better visibility of bar heights
plt.grid(axis='y', linestyle='--', linewidth=0.7)

# Tight layout for better spacing
plt.tight_layout()

# Save and show the plot
plt.savefig("./Figures/Delays-Per-Day-Weekly-Ticks-Alternating.png", format='png', dpi=300)
plt.show()


In [None]:
# Ensure the datetime column is in the correct format (if not already)
metro['real_arrival'] = pd.to_datetime(metro['real_arrival'])

# Extract hour from the datetime column
metro['hour'] = metro['real_arrival'].dt.hour

# Group by hour and calculate the average pourc_validations
hourly_validations = metro.groupby('hour')['pourc_validations'].mean().reset_index()

# Plot the hourly average pourc_validations
plt.figure(figsize=(14, 6))
sns.lineplot(
    data=hourly_validations,
    x='hour',
    y='pourc_validations',
    marker='o',
    color='b'  # Blue color for the line
)

# Add labels and title
plt.xlabel('Hour of Day')
plt.ylabel('Average Pourc Validations')
plt.title('Average Pourc Validations Per Hour of Day')

# Set the x-ticks to be from 0 to 23 (representing hours)
plt.xticks(range(24))

# Add grid for better readability
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
