In [12]:
import pandas as pd
import statsmodels.api as sm

print("--- Starting Analysis of Preprocessed Flight Data ---")

try:
    # Load the preprocessed CSV file
    df = pd.read_csv('preprocessed_flight_data.csv')
    print("Successfully loaded 'preprocessed_flight_data.csv'.")
except FileNotFoundError:
    print("Error: 'preprocessed_flight_data.csv' not found.")
    print("Please make sure the preprocessed file is in the same directory as the script.")
    exit()

# --- 1. Average Delay and Percentage of Delayed Flights ---
print("\n--- Question 1: Delays ---")
delayed_flights = df[df['departure_delay'] > 0]
print(f"No. of delayed flights: {len(delayed_flights)} out of {len(df)} total flights")
average_delay = delayed_flights['departure_delay'].mean()
percentage_delayed = (len(delayed_flights) / len(df)) * 100
print(f"Average departure delay for late flights: {average_delay:.2f} minutes")
print(f"Percentage of flights that depart late: {percentage_delayed:.2f}%")


--- Starting Analysis of Preprocessed Flight Data ---
Successfully loaded 'preprocessed_flight_data.csv'.

--- Question 1: Delays ---
No. of delayed flights: 4018 out of 8099 total flights
Average departure delay for late flights: 47.06 minutes
Percentage of flights that depart late: 49.61%


In [13]:
df.head()

Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,scheduled_departure_datetime_local,scheduled_arrival_datetime_local,actual_departure_datetime_local,actual_arrival_datetime_local,total_seats,...,ground_time_diff,total_pax,children_count,stroller_users,lap_child_count,ssr_count,Hot Transfer,checked_bags,transfer_bags,load_factor
0,OO,4792,2025-08-04,ORD,ROA,2025-08-04 17:57:00+00:00,2025-08-04 21:04:00+00:00,2025-08-04 18:04:00+00:00,2025-08-04 20:52:00+00:00,76,...,8,65,1,0,0,3.0,16,6,20,0.855263
1,UA,920,2025-08-03,ORD,LHR,2025-08-03 18:05:00+00:00,2025-08-04 08:20:00+00:00,2025-08-03 18:27:00+00:00,2025-08-04 08:06:00+00:00,167,...,90,171,5,0,1,3.0,16,90,37,1.0
2,UA,1776,2025-08-10,ORD,PHL,2025-08-10 18:20:00+00:00,2025-08-10 21:35:00+00:00,2025-08-10 20:11:00+00:00,2025-08-10 23:26:00+00:00,166,...,25,180,5,2,0,0.0,1,46,38,1.0
3,OO,5790,2025-08-06,ORD,CRW,2025-08-06 18:20:00+00:00,2025-08-06 21:04:00+00:00,2025-08-06 20:05:00+00:00,2025-08-06 22:42:00+00:00,50,...,194,55,2,0,0,2.0,0,14,40,1.0
4,UA,1398,2025-08-05,ORD,ATL,2025-08-05 18:20:00+00:00,2025-08-05 21:29:00+00:00,2025-08-05 18:16:00+00:00,2025-08-05 21:49:00+00:00,166,...,24,136,3,0,0,2.0,0,19,51,0.819277


In [14]:

# --- 2. Flights with Ground Time Below Minimum ---
print("\n--- Question 2: Ground Time ---")
# ground_time_diff was calculated as scheduled_ground_time_minutes - minimum_turn_minutes
tight_schedule_flights = df[df['ground_time_diff'] <= 0]
print(f"Number of flights with scheduled ground time at or below minimum: {len(tight_schedule_flights)}")



--- Question 2: Ground Time ---
Number of flights with scheduled ground time at or below minimum: 652


In [19]:

# --- 3. Ratio of Transfer Bags vs. Checked Bags ---
print("\n--- Question 3: Bag Ratio ---")
# To avoid division by zero, only consider flights with at least one checked bag
bag_ratio_df = df[df['checked_bags'] > 0].copy()
bag_ratio_df['bag_ratio'] = bag_ratio_df['transfer_bags'] / bag_ratio_df['checked_bags']
# print(bag_ratio_df.head())
average_bag_ratio = bag_ratio_df['bag_ratio'].mean()
print(f"Average ratio of transfer bags to checked bags: {average_bag_ratio:.2f}")



--- Question 3: Bag Ratio ---
Average ratio of transfer bags to checked bags: 3.04


In [16]:

# --- 4. Passenger Load vs. Operational Difficulty (Delay) ---
print("\n--- Question 4: Load Factor vs. Delay ---")
correlation = df['load_factor'].corr(df['departure_delay'])
print(f"Correlation between passenger load factor and departure delay: {correlation:.2f}")



--- Question 4: Load Factor vs. Delay ---
Correlation between passenger load factor and departure delay: -0.17


In [17]:

# --- 5. High SSR Flights vs. Delays (Controlling for Load) ---
print("\n--- Question 5: SSRs and Delays ---")
# Prepare data for regression
analysis_df = df[['departure_delay', 'load_factor', 'ssr_count']].dropna()
X = analysis_df[['load_factor', 'ssr_count']]
X = sm.add_constant(X)  # Add a constant (intercept) to the model
y = analysis_df['departure_delay']

# Fit the Ordinary Least Squares (OLS) model
model = sm.OLS(y, X).fit()

# Extract the p-value for the 'ssr_count' coefficient
p_value_ssr = model.pvalues['ssr_count']
print(f"P-value for ssr_count's impact on delay (controlling for load factor): {p_value_ssr:.2f}")

if p_value_ssr < 0.05:
    print("Conclusion: SSRs have a statistically significant impact on delays, even after accounting for load factor.")
else:
    print("Conclusion: SSRs do not have a statistically significant impact on delays when controlling for load factor.")


--- Question 5: SSRs and Delays ---
P-value for ssr_count's impact on delay (controlling for load factor): 0.00
Conclusion: SSRs have a statistically significant impact on delays, even after accounting for load factor.
