In [None]:
#Dataset came from https://www.transtats.bts.gov/ot_delay/OT_DelayCause1.asp?20=E

In [None]:
import pandas as pd
import numpy as np
#load data
df = pd.read_csv('Airline_Delay_Cause.csv')

#remove all years except 2009 to 2019
df = df[df['year'] >= 2009]
df = df[df['year'] <= 2019]

#create list of unique values in carrier_name column
carrier_reduce = [ 'American Airlines Inc.',
       'Alaska Airlines Inc.', 'JetBlue Airways', 'Delta Air Lines Inc.',
       'Frontier Airlines Inc.', 'Allegiant Air',
       'Hawaiian Airlines Inc.',  'Spirit Air Lines',
       'United Air Lines Inc.', 'Southwest Airlines Co.']

#reduce the dataframe to only include the carriers in the carrier_reduce list
df = df[df['carrier_name'].isin(carrier_reduce)]
#shorten the carrier names to be only the first word
df['carrier_name'] = df['carrier_name'].str.split().str[0]

#drop unnecessary columns
df = df.drop(columns=['carrier','airport'])
#change carrier_name to carrier
df = df.rename(columns={'carrier_name':'carrier'})
#upper case all column names
df.columns = df.columns.str.upper()

#Column creation for analysis
#add a column called 'delay_ct_frac' to the dataframe. This column contains the fraction of flights that were delayed over the total number of flights for each row
df['DELAY_CT_FRAC'] = df['ARR_DEL15'] / df['ARR_FLIGHTS']
#add a column called 'avg_delay' to the dataframe. This column contains the average delay time for each row for all types of delays
df['AVG_DELAY'] = df['ARR_DELAY'] / df['ARR_DEL15']
#add a column called 'fault_delay_ct_frac' which sums of the delay_ct of the following columns: 'carrier_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct' and divides the sum by total number of flights
#National Airspace (NAS), weather, carrier, late aircraft, and security were picked as being the fault of the airline. I didnt pick weather for obvious reasons.
#data value tells us how many flights were delayed out of the total number of flights done by that airline for that month at that airport.
df['FAULT_DELAY_CT_FRAC'] = (df['CARRIER_CT'] + df['NAS_CT'] + df['SECURITY_CT'] + df['LATE_AIRCRAFT_CT'])/df['ARR_FLIGHTS']
#add a column called 'airport_delay_ct_frac' which sums of the delay_ct of the following columns: 'weather_ct', 'security_ct', 'nas_ct' and divides the sum by total number of flights for that row
#National Airspace (NAS), weather, and security were picked as being due to the airports location or staff.
#if the data value is summed across all airlines and divided by the total number of flights at that airport for that month then it gives
# an indicator of the airports performance in terms of delays.
df['AIRPORT_DELAY_CT_FRAC'] = (df['WEATHER_CT'] + df['SECURITY_CT'] + df['NAS_CT'])/df['ARR_FLIGHTS']
#add a column called 'fault_delay_avg' which sums the delay length of the following columns: 'carrier_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay' and divided by the number of delayed flights
#Provides the average delay time for delays that were the fault of the airline
df['FAULT_DELAY_AVG'] = (df['CARRIER_DELAY'] + df['NAS_DELAY'] + df['SECURITY_DELAY'] + df['LATE_AIRCRAFT_DELAY'])/df['ARR_DEL15']
#add a column for delay_ct of the following columns: 'carrier_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'weather_ct' and divide by 'arr_flights'
#segments the delay_ct into the different types of delays as a fraction of the total number of flights
df['NAS_CT_FRAC'] = (df['NAS_CT'])/df['ARR_FLIGHTS']
df['CARRIER_CT_FRAC'] = (df['CARRIER_CT'])/df['ARR_FLIGHTS']
df['SECURITY_CT_FRAC'] = (df['SECURITY_CT'])/df['ARR_FLIGHTS']
df['LATE_AIRCRAFT_CT_FRAC'] = (df['LATE_AIRCRAFT_CT'])/df['ARR_FLIGHTS']
df['WEATHER_CT_FRAC'] = (df['WEATHER_CT'])/df['ARR_FLIGHTS']
df['DEL_DIV_CANC_CT_FRAC'] = df['ARR_DEL15'] + df['ARR_CANCELLED'] + df['ARR_DIVERTED']
df['DEL_DIV_CANC_CT_FRAC'] = df['DEL_DIV_CANC_CT_FRAC']/df['ARR_FLIGHTS']
#add a column for delay_avg of the following columns: 'carrier_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'weather_ct' and divide by categories count. fill NaN with 0
#segments the total average delay length into average delay lengths by delay type.
df['NAS_DELAY_AVG'] = df['NAS_DELAY'] / df['NAS_CT']
df['CARRIER_DELAY_AVG'] = df['CARRIER_DELAY'] / df['CARRIER_CT']
df['SECURITY_DELAY_AVG'] = df['SECURITY_DELAY'] / df['SECURITY_CT']
df['LATE_AIRCRAFT_DELAY_AVG'] = df['LATE_AIRCRAFT_DELAY'] / df['LATE_AIRCRAFT_CT']
df['WEATHER_DELAY_AVG'] = df['WEATHER_DELAY'] / df['WEATHER_CT']

#Create Region data
northeast = ['CT', 'ME', 'MA', 'NH', 'RI', 'VT', 'NJ', 'NY', 'PA']
midwest = ['IL', 'IN', 'MI', 'OH', 'WI', 'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD']
south = ['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'DC', 'WV', 'AL', 'KY', 'MS', 'TN', 'AR', 'LA', 'OK', 'TX']
west = ['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'AK', 'CA', 'HI', 'OR', 'WA']

# Extract state codes. Thank you CHATGPT for the regex expression
df['STATE_CODE'] = df['AIRPORT_NAME'].str.extract(r',\s([A-Z]{2}):')

# Define conditions for the regions based on the extracted state code
conditions = [
    df['STATE_CODE'].isin(northeast),
    df['STATE_CODE'].isin(midwest),
    df['STATE_CODE'].isin(south),
    df['STATE_CODE'].isin(west)
]

# Define the choices corresponding to the conditions
choices = ['Northeast', 'Midwest', 'South', 'West']

# Use np.select to assign regions based on the conditions
df['REGION'] = np.select(conditions, choices, default='Other')

#send to csv
df.to_csv('delays_cleaned_MB.csv', index=False)

avg_prob_ct_frac = df.groupby('CARRIER')['DEL_DIV_CANC_CT_FRAC'].mean().reset_index()
#create average for delay_avg by airline
avg_len_delay = df.groupby('CARRIER')['AVG_DELAY'].mean().reset_index()
#combine dataframes
avg_delay = pd.merge(avg_prob_ct_frac, avg_len_delay, on='CARRIER')
#add cost column that multiplies average delay time by 380
avg_delay['COST'] = avg_delay['AVG_DELAY']*380/120
#add cost expected cost column that multiplies cost and del_div_can_ct_frac
avg_delay['EXPECTED_COST'] = avg_delay['COST']*avg_delay['DEL_DIV_CANC_CT_FRAC']
#output to csv
avg_delay.to_csv('avg_delay.csv', index=False)

#change delay data to excel file
delay = pd.read_csv('avg_delay.csv')

#change baggage data to excel file
baggage = pd.read_csv('Baggage_Base_Data_Cleaned.csv')
#group baggage data by airline and divide  total mishandled bags by total bags carried
baggage = baggage.groupby('CARRIER').sum()
baggage['P_MISHANDLED_BAG'] = baggage['MISHANDLED_BAG']/baggage['TOTAL_BAG']

#try and except to avoid error if sheet already exists
try:
    #delay data as a new sheet to air_LP_data.xlsx
    with pd.ExcelWriter('air_LP_data.xlsx', mode='a', engine='openpyxl') as writer:
        delay.to_excel(writer, sheet_name='delay', index=False)

    #baggage data as a new sheet to air_LP_data.xlsx
    with pd.ExcelWriter('air_LP_data.xlsx', mode='a', engine='openpyxl') as writer:
        baggage.to_excel(writer, sheet_name='baggage', index=True)
except:
    pass

#change name of file air_LP_data.xlsx to air_LP_data_v2.xlsx
os.rename('air_LP_data.xlsx', 'air_LP_data_v2.xlsx')



### ALL CODE BELOW IS A SUCCESSFUL LINEAR REGRESSION MODEL THAT WAS NOT USED IN THE FINAL PROJECT###


In [39]:
### IGNORE BELOW FOR FINAL PRODCUT PURPOSES ###

import numpy as np
from sklearn.linear_model import LinearRegression

In [40]:
display(df.head())

Unnamed: 0,YEAR,MONTH,CARRIER,AIRPORT_NAME,ARR_FLIGHTS,ARR_DEL15,CARRIER_CT,WEATHER_CT,NAS_CT,SECURITY_CT,...,LATE_AIRCRAFT_CT_FRAC,WEATHER_CT_FRAC,DEL_DIV_CANC_CT_FRAC,NAS_DELAY_AVG,CARRIER_DELAY_AVG,SECURITY_DELAY_AVG,LATE_AIRCRAFT_DELAY_AVG,WEATHER_DELAY_AVG,STATE_CODE,REGION
1361,2018,1,Southwest,"Phoenix, AZ: Phoenix Sky Harbor International",5067.0,1640.0,207.38,8.22,1061.83,1.35,...,0.071287,0.001622,0.335307,37.630317,51.755232,106.666667,48.564547,90.145985,AZ,West
1362,2017,5,Southwest,"Los Angeles, CA: Los Angeles International",3671.0,1711.0,234.32,50.63,1030.82,0.76,...,0.107453,0.013792,0.491147,42.16449,50.734039,21.052632,58.3177,48.153269,CA,West
1363,2018,2,Southwest,"Phoenix, AZ: Phoenix Sky Harbor International",4534.0,1907.0,366.0,15.67,1004.73,4.47,...,0.113835,0.003456,0.435157,41.789336,52.297814,33.10962,50.690717,93.618379,AZ,West
1365,2016,10,Southwest,"Phoenix, AZ: Phoenix Sky Harbor International",4963.0,1533.0,191.84,21.19,921.51,0.0,...,0.080286,0.00427,0.312311,39.687035,48.175563,,49.194398,54.082114,AZ,West
1367,2014,10,Southwest,"Chicago, IL: Chicago Midway International",7125.0,1742.0,296.49,41.53,840.73,0.0,...,0.079054,0.005829,0.268632,37.749337,52.308678,,56.004332,47.146641,IL,Midwest


In [42]:
#create df that is grouped by carrier_name and year. Takes the average of del_div_can_ct_frac
CARRIER_YEAR = df.groupby(['CARRIER', 'YEAR'])['DEL_DIV_CANC_CT_FRAC'].mean().reset_index()
#create df that is grouped by carrier_name and year. Takes the average of avg_delay
CARRIER_YEAR_AVG_DELAY = df.groupby(['CARRIER', 'YEAR'])['AVG_DELAY'].mean().reset_index() 
#merge the two dataframes
CARRIER_YEAR = pd.merge(CARRIER_YEAR, CARRIER_YEAR_AVG_DELAY, on=['CARRIER', 'YEAR'])
display(CARRIER_YEAR.head(25))

Unnamed: 0,CARRIER,YEAR,DEL_DIV_CANC_CT_FRAC,AVG_DELAY
0,Alaska,2009,0.184049,44.65887
1,Alaska,2010,0.138785,43.626849
2,Alaska,2011,0.130657,45.214206
3,Alaska,2012,0.136313,49.285718
4,Alaska,2013,0.137765,43.269175
5,Alaska,2014,0.144966,44.596383
6,Alaska,2015,0.144369,45.376774
7,Alaska,2016,0.125458,43.205089
8,Alaska,2017,0.161917,46.935749
9,Alaska,2018,0.154797,44.081846


In [44]:
carriers = CARRIER_YEAR['CARRIER'].unique()
carriers

array(['Alaska', 'Allegiant', 'American', 'Delta', 'Frontier', 'Hawaiian',
       'JetBlue', 'Southwest', 'Spirit', 'United'], dtype=object)

In [46]:
for carrier in carriers:
    try:
        carrier_year_AA = CARRIER_YEAR[CARRIER_YEAR['CARRIER'] == carrier]
        AL_year_P = carrier_year_AA['DEL_DIV_CANC_CT_FRAC']
        AL_year_P = AL_year_P.values
        x = np.array([0,1,2,3,4,5,6,7,8,9,10]).reshape(-1, 1)
        y = AL_year_P
        model = LinearRegression()
        model.fit(x, y)
        LinearRegression()
        model = LinearRegression().fit(x, y)
        r_sq = model.score(x, y)
        print(f"coefficient of determination for {carrier}: {r_sq}")
    except:
        print(f"error for {carrier}")

coefficient of determination for Alaska: 0.0412765401265881
error for Allegiant
coefficient of determination for American: 0.09283289336447142
coefficient of determination for Delta: 0.5780511512178065
coefficient of determination for Frontier: 0.14806726505640788
coefficient of determination for Hawaiian: 0.08450356666562497
coefficient of determination for JetBlue: 0.2622289531303852
coefficient of determination for Southwest: 0.013066029441597293
error for Spirit
coefficient of determination for United: 0.022293440237556084


In [47]:
carrier_year_AA = CARRIER_YEAR[CARRIER_YEAR['CARRIER'] == 'Delta']
AL_year_P = carrier_year_AA['DEL_DIV_CANC_CT_FRAC']
AL_year_P = AL_year_P.values
x = np.array([0,1,2,3,4,5,6,7,8,9,10]).reshape(-1, 1)
y = AL_year_P
model = LinearRegression()
model.fit(x, y)
LinearRegression()
model = LinearRegression().fit(x, y)
r_sq = model.score(x, y)
print(f"coefficient of determination for {carrier}: {r_sq}")

coefficient of determination for United: 0.5780511512178065


In [48]:
print(f"intercept: {model.intercept_}")

print(f"slope: {model.coef_}")


intercept: 0.20241761518417384
slope: [-0.00783448]


In [49]:
>>> new_model = LinearRegression().fit(x, y.reshape((-1, 1)))
>>> print(f"intercept: {new_model.intercept_}")

>>> print(f"slope: {new_model.coef_}")

intercept: [0.20241762]
slope: [[-0.00783448]]


In [3]:
from pivottablejs import pivot_ui

In [7]:
import pandas as pd
import numpy as np

In [8]:
from pandasgui import show

In [5]:
df = pd.read_csv('Airline_Delay_Cause.csv')

In [9]:
gui = show(df)

PandasGUI INFO — pandasgui.gui — Opening PandasGUI


AttributeError: module 'bokeh.plotting' has no attribute 'Figure'