In [19]:
import pandas as pd
import statsmodels.formula.api as smf
import numpy as np

# --- Load All Necessary Data Files ---
# Make sure these CSV files are in the same directory as your script
try:
    flight_df = pd.read_csv('Flight Level Data.csv')
    bag_df = pd.read_csv('Bag+Level+Data.csv')
    pnr_flight_df = pd.read_csv('PNR+Flight+Level+Data.csv')
    pnr_remark_df = pd.read_csv('PNR Remark Level Data.csv')
    print("Files loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading files: {e}.")
    print("Please ensure the CSV files are in the same directory as the script.")
    exit()


Files loaded successfully.


In [20]:

# --- Question 1: Flight Delays ---
print("\n--- Analyzing Question 1: Flight Delays ---")
# Convert time columns to datetime objects, coercing errors
flight_df['actual_departure_datetime_local'] = pd.to_datetime(flight_df['actual_departure_datetime_local'], errors='coerce')
flight_df['scheduled_departure_datetime_local'] = pd.to_datetime(flight_df['scheduled_departure_datetime_local'], errors='coerce')
flight_df.dropna(subset=['actual_departure_datetime_local', 'scheduled_departure_datetime_local'], inplace=True)
flight_df['departure_delay'] = (flight_df['actual_departure_datetime_local'] - flight_df['scheduled_departure_datetime_local']).dt.total_seconds() / 60
delayed_flights = flight_df[flight_df['departure_delay'] > 0]
average_delay = delayed_flights['departure_delay'].mean()
percentage_delayed = (len(delayed_flights) / len(flight_df)) * 100
print(f"Average departure delay for flights that are late: {average_delay:.2f} minutes")
print(f"Percentage of flights that depart later than scheduled: {percentage_delayed:.2f}%")



--- Analyzing Question 1: Flight Delays ---
Average departure delay for flights that are late: 47.06 minutes
Percentage of flights that depart later than scheduled: 49.61%


In [21]:

# --- Question 2: Ground Time ---
print("\n--- Analyzing Question 2: Ground Time ---")
tight_schedule_flights = flight_df[flight_df['scheduled_ground_time_minutes'] <= flight_df['minimum_turn_minutes']]
count_tight_schedule = len(tight_schedule_flights)
print(f"Number of flights with scheduled ground time <= minimum turn time: {count_tight_schedule}")



--- Analyzing Question 2: Ground Time ---
Number of flights with scheduled ground time <= minimum turn time: 652


In [22]:
# --- Question 3: Bag Ratio ---
print("\n--- Analyzing Question 3: Bag Ratio ---")
# Assuming 'Origin' bags are the primary type of "checked" bags to compare against "Transfer"
bag_df['bag_type'] = bag_df['bag_type'].replace({'Origin': 'Checked'})
flight_cols = ['company_id', 'flight_number', 'scheduled_departure_date_local']
bag_counts = bag_df.groupby(flight_cols + ['bag_type']).size().unstack(fill_value=0)
if 'Transfer' not in bag_counts.columns: bag_counts['Transfer'] = 0
if 'Checked' not in bag_counts.columns: bag_counts['Checked'] = 0
bag_counts = bag_counts[bag_counts['Checked'] > 0]
bag_counts['ratio'] = bag_counts['Transfer'] / bag_counts['Checked']
average_ratio = bag_counts['ratio'].mean()
print(f"Average ratio of Transfer vs. Checked bags: {average_ratio:.2f}")



--- Analyzing Question 3: Bag Ratio ---
Average ratio of Transfer vs. Checked bags: 3.22


In [23]:

# --- Question 4: Passenger Load vs. Operational Difficulty ---
print("\n--- Analyzing Question 4: Passenger Load vs. Operational Difficulty ---")
# Define a unique flight identifier
flight_identifier_cols = ['company_id', 'flight_number', 'scheduled_departure_date_local']
# Aggregate total passengers from PNR data
pax_by_flight = pnr_flight_df.groupby(flight_identifier_cols)['total_pax'].sum().reset_index()
# Merge flight data with passenger data
flight_analysis_df = pd.merge(flight_df, pax_by_flight, on=flight_identifier_cols)
# Calculate load factor
flight_analysis_df['load_factor'] = flight_analysis_df['total_pax'] / flight_analysis_df['total_seats']
flight_analysis_df.replace([np.inf, -np.inf], np.nan, inplace=True)
flight_analysis_df.dropna(subset=['load_factor', 'departure_delay'], inplace=True)
correlation = flight_analysis_df['load_factor'].corr(flight_analysis_df['departure_delay'])
print(f"Correlation between load factor and departure delay: {correlation:.2f}")



--- Analyzing Question 4: Passenger Load vs. Operational Difficulty ---
Correlation between load factor and departure delay: -0.15


In [24]:

# --- Question 5: SSRs vs. Delays (controlling for load) ---
print("\n--- Analyzing Question 5: SSRs vs. Delays (controlling for load) ---")
# Aggregate SSRs by flight
ssr_by_flight = pnr_remark_df.groupby(['flight_number']).size().reset_index(name='ssr_count')
# Merge SSR data into the main analysis dataframe
flight_analysis_df = pd.merge(flight_analysis_df, ssr_by_flight, on='flight_number', how='left')
flight_analysis_df['ssr_count'].fillna(0, inplace=True)
# Perform multiple regression
# Using a subset to avoid issues with perfect multicollinearity or other data problems in the full dataset
subset_df = flight_analysis_df[['departure_delay', 'load_factor', 'ssr_count']].dropna()
# Ensure the sample size is not larger than the population
sample_size = min(5000, len(subset_df))
if sample_size > 0:
    subset_df = subset_df.sample(n=sample_size, random_state=42)
    model = smf.ols('departure_delay ~ load_factor + ssr_count', data=subset_df).fit()
    # Check the p-value for ssr_count
    p_value_ssr = model.pvalues['ssr_count']
    print(f"\nFull regression model summary:\n{model.summary()}\n")
    print(f"P-value for ssr_count: {p_value_ssr:.2f}")
    if p_value_ssr < 0.05:
        print("The number of SSRs is a statistically significant predictor of delay, after controlling for load factor.")
    else:
        print("The number of SSRs is NOT a statistically significant predictor of delay, after controlling for load factor.")
else:
    print("Could not create a sample for the regression analysis due to insufficient data.")


--- Analyzing Question 5: SSRs vs. Delays (controlling for load) ---

Full regression model summary:
                            OLS Regression Results                            
Dep. Variable:        departure_delay   R-squared:                       0.035
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     89.94
Date:                Sat, 04 Oct 2025   Prob (F-statistic):           4.21e-39
Time:                        16:36:11   Log-Likelihood:                -27444.
No. Observations:                5000   AIC:                         5.489e+04
Df Residuals:                    4997   BIC:                         5.491e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  flight_analysis_df['ssr_count'].fillna(0, inplace=True)
