In [1]:
import pandas as pd
import statsmodels.api as sm

print("--- Starting Analysis of Preprocessed Flight Data ---")

try:
    # Load the preprocessed CSV file
    df = pd.read_csv('preprocessed_flight_data.csv')
except FileNotFoundError:
    print("Error: 'preprocessed_flight_data.csv' not found.")
    print("Please make sure the preprocessed file is in the same directory as the script.")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() requests
    # a kernel shutdown rather than cleanly aborting this cell. Re-raising stops
    # execution right here with a traceback and leaves the kernel usable.
    raise
else:
    print("Successfully loaded 'preprocessed_flight_data.csv'.")

# --- 1. Average Delay and Percentage of Delayed Flights ---
print("\n--- Question 1: Delays ---")
# A flight counts as "late" when its departure_delay is strictly positive.
delayed_flights = df[df['departure_delay'] > 0]
# The average is taken over late flights only, not over all flights.
average_delay = delayed_flights['departure_delay'].mean()
# Guard the share computation so an empty file reports NaN instead of
# raising ZeroDivisionError.
total_flights = len(df)
if total_flights:
    percentage_delayed = (len(delayed_flights) / total_flights) * 100
else:
    percentage_delayed = float('nan')
print(f"Average departure delay for late flights: {average_delay:.2f} minutes")
print(f"Percentage of flights that depart late: {percentage_delayed:.2f}%")


--- Starting Analysis of Preprocessed Flight Data ---
Successfully loaded 'preprocessed_flight_data.csv'.

--- Question 1: Delays ---
Average departure delay for late flights: 47.06 minutes
Percentage of flights that depart late: 49.61%


In [2]:

# --- 2. Flights with Ground Time Below Minimum ---
print("\n--- Question 2: Ground Time ---")
# ground_time_diff = scheduled_ground_time_minutes - minimum_turn_minutes,
# so a non-positive value means the schedule leaves no slack over the minimum turn.
has_no_slack = df['ground_time_diff'].le(0)
tight_schedule_flights = df.loc[has_no_slack]
tight_schedule_count = len(tight_schedule_flights)
print(f"Number of flights with scheduled ground time at or below minimum: {tight_schedule_count}")



--- Question 2: Ground Time ---
Number of flights with scheduled ground time at or below minimum: 652


In [3]:

# --- 3. Ratio of Transfer Bags vs. Checked Bags ---
print("\n--- Question 3: Bag Ratio ---")
# Flights without any checked bag are excluded so the per-flight ratio never
# divides by zero; the reported figure is the mean of the per-flight ratios.
has_checked_bags = df['checked_bags'].gt(0)
bag_ratio_df = df.loc[has_checked_bags].assign(
    bag_ratio=lambda flights: flights['transfer_bags'] / flights['checked_bags']
)
average_bag_ratio = bag_ratio_df['bag_ratio'].mean()
print(f"Average ratio of transfer bags to checked bags: {average_bag_ratio:.2f}")



--- Question 3: Bag Ratio ---
Average ratio of transfer bags to checked bags: 3.04


In [4]:

# --- 4. Passenger Load vs. Operational Difficulty (Delay) ---
print("\n--- Question 4: Load Factor vs. Delay ---")
# Pearson correlation (the pandas default) between how full a flight is and
# how late it departs.
load_factor_series = df['load_factor']
delay_series = df['departure_delay']
correlation = load_factor_series.corr(delay_series)
print(f"Correlation between passenger load factor and departure delay: {correlation:.2f}")



--- Question 4: Load Factor vs. Delay ---
Correlation between passenger load factor and departure delay: -0.17


In [5]:

# --- 5. High SSR Flights vs. Delays (Controlling for Load) ---
print("\n--- Question 5: SSRs and Delays ---")
# Regress departure_delay on load_factor and ssr_count. Rows missing any of
# the three columns are dropped before fitting.
analysis_df = df[['departure_delay', 'load_factor', 'ssr_count']].dropna()
X = analysis_df[['load_factor', 'ssr_count']]
X = sm.add_constant(X)  # Add a constant (intercept) to the model
y = analysis_df['departure_delay']

# Fit the Ordinary Least Squares (OLS) model
model = sm.OLS(y, X).fit()

# Threshold below which the ssr_count coefficient is called significant.
significance_level = 0.05

# Extract the p-value for the 'ssr_count' coefficient
p_value_ssr = model.pvalues['ssr_count']
# Print with three significant digits instead of two fixed decimals: ".2f"
# collapses every small p-value to "0.00" and shows 0.049 and 0.051 both as
# "0.05", which makes the printed number disagree with the conclusion below.
print(f"P-value for ssr_count's impact on delay (controlling for load factor): {p_value_ssr:.3g}")

if p_value_ssr < significance_level:
    print("Conclusion: SSRs have a statistically significant impact on delays, even after accounting for load factor.")
else:
    print("Conclusion: SSRs do not have a statistically significant impact on delays when controlling for load factor.")


--- Question 5: SSRs and Delays ---
P-value for ssr_count's impact on delay (controlling for load factor): 0.00
Conclusion: SSRs have a statistically significant impact on delays, even after accounting for load factor.
