In [1]:
import pandas as pd
import numpy as np

In [2]:
# Rename the path as necessary to where the data is stored
flights = pd.read_csv('../data/raw_flights_100_random_sample.csv')

In [3]:
flights_summary = pd.DataFrame(flights.describe())
flights_summary[['dep_delay', 'arr_delay']]

Unnamed: 0,dep_delay,arr_delay
count,98.0,98.0
mean,24.520408,18.591837
std,138.998497,138.40303
min,-14.0,-52.0
25%,-5.0,-14.75
50%,-1.0,-5.0
75%,10.75,13.0
max,1353.0,1332.0


In [4]:
flights[['crs_elapsed_time', 'actual_elapsed_time', 'air_time']]

Unnamed: 0,crs_elapsed_time,actual_elapsed_time,air_time
0,167,196.0,140.0
1,190,212.0,177.0
2,75,70.0,55.0
3,69,68.0,53.0
4,160,153.0,137.0
...,...,...,...
95,100,100.0,83.0
96,83,72.0,50.0
97,102,,
98,150,148.0,126.0


In [5]:
# Create two new columns: 1) Air time delay (or gain) 2) Total departure delay and air time delay

flights['air_time_delay'] = flights.actual_elapsed_time - flights.crs_elapsed_time
flights['air_dep_total'] = flights.air_time_delay + flights.dep_delay

In [6]:
# Quick look to check total delay time -> yes, the total seems to equal the arr_delay
# daf = data analysis filter
daf = ['dep_delay', 'arr_delay', 'air_dep_total', 'air_time_delay']
flights[daf].sort_values(by='arr_delay').tail()

Unnamed: 0,dep_delay,arr_delay,air_dep_total,air_time_delay
47,106.0,92.0,92.0,-14.0
43,176.0,165.0,165.0,-11.0
42,1353.0,1332.0,1332.0,-21.0
22,,,,
97,,,,


In [7]:
# Double check this -> yes, only to false returns from NaN i.e. the two cancelled flights
(flights.arr_delay == flights.air_dep_total).value_counts()

# After this, I won't be using air_dep_total

True     98
False     2
dtype: int64

In [8]:
# Created function to see proportion of delay time from depature delays versus air time delays 
# Also shows if one or the other reduces delay time

def delay_proportions(df, air_time=False):
    '''Returns the proportion of the delay accounted for by the departure delay and by the air time delay.
        Also returns the percentage adjustment if either air time or early departure compensates for the delay.
        Parameters:
            a (Pandas Data Frame) Flight data frame without additional column air_total_delay
            b (boolean) If True, will use data from previously created column 
        Returns:
            Four numpy one-dimensional arrays to use for new columns in dataframe
    '''
    size = df.shape[0]
    
    departure = np.empty(size)
    airtime = np.empty(size)
    air_adjust = np.zeros(size)
    dep_adjust = np.zeros(size)
    
    for i, row in df.iterrows():
        delay = row['arr_delay']
        dep_delay = row['dep_delay']
        air_delay = 0
        
        if air_time == False:
            air_delay = row['actual_elapsed_time'] - row['crs_elapsed_time']
        else:
            air_delay = row['air_time_delay']
       
        if delay is np.NAN:
            departure[i] = np.NAN
            airtime[i] = np.NAN
            air_adjust[i] = np.NAN
            dep_adjust[i] = np.NAN
        
        elif delay <= 0: # No delay case: for now, all zero proportions
            airtime[i] = 0
            departure[i] = 0
        else:
            
            if dep_delay > delay: # Case departure delay is 100% but air time compensates
                departure[i] = 1.0
                airtime[i] = 0
                air_adjust[i] = round(air_delay/dep_delay, 2)
            
            elif dep_delay == delay: # Case departure delay is exactly 100%
                departure[i] = 1.0
                airtime[i] = 0
            
            elif dep_delay < 0: # Case air time delay is 100% but early departure compensates
                departure[i] = 0
                airtime[i] = 1.0
                dep_adjust[i] = round(dep_delay/air_delay, 2)
            
            elif dep_delay == 0: # Case air time delay is exaclty 100%
                departure[i] = 0
                airtime[i] = 1.0
            
            else: # Case where departure is from both
                departure[i] = round(dep_delay/delay, 2) 
                airtime[i] = round(air_delay/delay, 2)
    
    return departure, airtime, air_adjust, dep_adjust


In [9]:
dep, air, air_ad, dep_ad = delay_proportions(flights, air_time=True)
flights['dep_delay_prop'] = dep
flights['air_delay_prop'] = air
flights['air_adjust_prop'] = air_ad
flights['dep_adjust_prop'] = dep_ad

In [10]:
daf = ['dep_delay', 'air_time_delay', 'arr_delay', 'dep_delay_prop', 'air_delay_prop',
       'air_adjust_prop', 'dep_adjust_prop']

In [11]:
# sort by the departure delay proportion and the air time adjustment -> is the pilot flying the plane faster?

flights[daf].sort_values(by=['dep_delay_prop','air_adjust_prop'], ascending=[False, True]).head(25)

# not clear if the air plane is flying faster -> need another metric or more data

Unnamed: 0,dep_delay,air_time_delay,arr_delay,dep_delay_prop,air_delay_prop,air_adjust_prop,dep_adjust_prop
27,11.0,-7.0,4.0,1.0,0.0,-0.64,0.0
39,29.0,-16.0,13.0,1.0,0.0,-0.55,0.0
15,36.0,-16.0,20.0,1.0,0.0,-0.44,0.0
84,51.0,-22.0,29.0,1.0,0.0,-0.43,0.0
51,33.0,-10.0,23.0,1.0,0.0,-0.3,0.0
4,38.0,-7.0,31.0,1.0,0.0,-0.18,0.0
47,106.0,-14.0,92.0,1.0,0.0,-0.13,0.0
86,93.0,-11.0,82.0,1.0,0.0,-0.12,0.0
74,87.0,-9.0,78.0,1.0,0.0,-0.1,0.0
94,45.0,-4.0,41.0,1.0,0.0,-0.09,0.0


In [12]:
# sort by arrival delay ascending to see how often the plane is early and why
flights[daf].sort_values(by=['arr_delay']).head(30)

# most early arrivals are from air time compensation 
# Question: do planes always try to fly faster or head/tail wind related? 

Unnamed: 0,dep_delay,air_time_delay,arr_delay,dep_delay_prop,air_delay_prop,air_adjust_prop,dep_adjust_prop
6,4.0,-56.0,-52.0,0.0,0.0,0.0,0.0
90,-8.0,-33.0,-41.0,0.0,0.0,0.0,0.0
60,-8.0,-28.0,-36.0,0.0,0.0,0.0,0.0
44,-10.0,-25.0,-35.0,0.0,0.0,0.0,0.0
8,-5.0,-30.0,-35.0,0.0,0.0,0.0,0.0
78,-7.0,-22.0,-29.0,0.0,0.0,0.0,0.0
30,-8.0,-21.0,-29.0,0.0,0.0,0.0,0.0
31,-2.0,-26.0,-28.0,0.0,0.0,0.0,0.0
23,-4.0,-22.0,-26.0,0.0,0.0,0.0,0.0
72,-6.0,-18.0,-24.0,0.0,0.0,0.0,0.0


In [13]:

# flights[daf].sort_values(by='dep_delay_prop', ascending=False).head(25)
# flights[daf].sort_values(by=['air_adjust_prop', 'air_time_delay']).head(25)

# flights[daf].sort_values(by=['air_adjust_prop','air_time_delay']).head(25)

# flights[daf].sort_values(by=['air_delay_prop','air_adjust_prop'], ascending=[False, True]).head(25)
# very rarely the case that there was early departure and air time delay

In [14]:
new_df = flights[daf].sort_values(by=['dep_delay_prop','air_adjust_prop'], ascending=[False, True])
new_df

Unnamed: 0,dep_delay,air_time_delay,arr_delay,dep_delay_prop,air_delay_prop,air_adjust_prop,dep_adjust_prop
27,11.0,-7.0,4.0,1.0,0.0,-0.64,0.0
39,29.0,-16.0,13.0,1.0,0.0,-0.55,0.0
15,36.0,-16.0,20.0,1.0,0.0,-0.44,0.0
84,51.0,-22.0,29.0,1.0,0.0,-0.43,0.0
51,33.0,-10.0,23.0,1.0,0.0,-0.30,0.0
...,...,...,...,...,...,...,...
95,-1.0,0.0,-1.0,0.0,0.0,0.00,0.0
96,-10.0,-11.0,-21.0,0.0,0.0,0.00,0.0
98,-4.0,-2.0,-6.0,0.0,0.0,0.00,0.0
22,,,,,,0.00,0.0


In [15]:
# Given there is an arrival delay, let's get some summary stats!
delay_sum = new_df[new_df.arr_delay > 0].describe()
delay_sum

Unnamed: 0,dep_delay,air_time_delay,arr_delay,dep_delay_prop,air_delay_prop,air_adjust_prop,dep_adjust_prop
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,79.727273,1.666667,81.393939,0.717273,0.282727,-0.096667,-0.04
std,231.884841,15.158881,227.327536,0.400907,0.400907,0.173542,0.119661
min,-3.0,-22.0,2.0,0.0,0.0,-0.64,-0.5
25%,7.0,-9.0,13.0,0.38,0.0,-0.1,0.0
50%,38.0,0.0,34.0,1.0,0.0,0.0,0.0
75%,57.0,7.0,71.0,1.0,0.62,0.0,0.0
max,1353.0,49.0,1332.0,1.0,1.0,0.0,0.0


In [16]:
# only one-third the flights had a delay, in which about 70% of the delay is from departure delays!

print(f'Proportion of Flight Delays: {delay_sum.arr_delay[0]}')
print(f'The Average Proportion of Departure Delays: {delay_sum.dep_delay_prop[1]}')
print(f'The Average Proportion of Air time Delays: {delay_sum.air_delay_prop[1]}')

Proportion of Flight Delays: 33.0
The Average Proportion of Departure Delays: 0.7172727272727273
The Average Proportion of Air time Delays: 0.2827272727272727


In [18]:
# write csv file
# new_df.to_csv('flight_delays_100.csv', encoding='utf-8', index=False)