In [1]:
import pandas as pd
import numpy as np

In [2]:
# Rename the path as necessary to where the data is stored
flights = pd.read_csv('../data/raw_flights_10000_random.csv')


In [3]:
flights_summary = pd.DataFrame(flights.describe())
flights_summary[['dep_delay', 'arr_delay']]

Unnamed: 0,dep_delay,arr_delay
count,9845.0,9811.0
mean,10.622651,5.636123
std,50.545771,52.545968
min,-129.0,-158.0
25%,-6.0,-15.0
50%,-2.0,-6.0
75%,6.0,8.0
max,1431.0,1394.0


In [4]:
flights[['crs_elapsed_time', 'actual_elapsed_time', 'air_time']]

Unnamed: 0,crs_elapsed_time,actual_elapsed_time,air_time
0,156,135.0,121.0
1,75,60.0,41.0
2,199,187.0,172.0
3,57,63.0,40.0
4,80,84.0,60.0
...,...,...,...
9995,75,67.0,37.0
9996,75,64.0,42.0
9997,75,74.0,39.0
9998,158,166.0,139.0


### Note: **actual_elapsed_time** is the total of **air_time + taxi_out + taxi_in** <br>
### Taxi-out time starts from after departure time; arrival time is counted after the taxi-in time. <br>
### crs_dep_time + crs_elapsed_time = crs_arr_time<br>
### -> flight time delays are caused by a combination of **taxi out, taxi in, and air time**.

In [5]:
# Create two new columns: 1) Air time delay (or gain) 2) Total departure delay and flight time delay

# Note: actual_elapsed_time is the total of air_time + taxi_out + taxi_in 

flights['flight_time_delay'] = flights.actual_elapsed_time - flights.crs_elapsed_time
flights['air_dep_total'] = flights.flight_time_delay + flights.dep_delay

In [6]:
# Quick look to check total delay time -> yes, the total seems to equal the arr_delay
# daf = data analysis filter
daf = ['dep_delay', 'arr_delay', 'air_dep_total', 'flight_time_delay']
flights[daf].sort_values(by='arr_delay').tail()

Unnamed: 0,dep_delay,arr_delay,air_dep_total,flight_time_delay
9706,,,,
9731,12.0,,,
9777,,,,
9795,,,,
9990,130.0,,,


In [7]:
# Double check this -> yes, only to false returns from NaN i.e. the two cancelled flights
(flights.arr_delay == flights.air_dep_total).value_counts()

# After this, I won't be using air_dep_total

True     9809
False     191
dtype: int64

In [8]:
# Created function to see proportion of delay time from depature delays versus air time delays 
# Also shows if one or the other reduces delay time

def delay_proportions(df, flight_time=False):
    '''Returns the proportion of the delay accounted for by the departure delay and by the air time delay.
        Also returns the percentage adjustment if either air time or early departure compensates for the delay.
        Parameters:
            a (Pandas Data Frame) Flight data frame without additional column air_total_delay
            b (boolean) If True, will use data from previously created column 
        Returns:
            Four numpy one-dimensional arrays to use for new columns in dataframe
    '''
    size = df.shape[0]
    
    departure = np.empty(size)
    airtime = np.empty(size)
    air_adjust = np.zeros(size)
    dep_adjust = np.zeros(size)
    
    for i, row in df.iterrows():
        delay = row['arr_delay']
        dep_delay = row['dep_delay']
        air_delay = 0
        
        if flight_time == False:
            air_delay = row['actual_elapsed_time'] - row['crs_elapsed_time']
        else:
            air_delay = row['flight_time_delay']
       
        if delay is np.NAN:
            departure[i] = np.NAN
            airtime[i] = np.NAN
            air_adjust[i] = np.NAN
            dep_adjust[i] = np.NAN
        
        elif delay <= 0: # No delay case: for now, all zero proportions
            airtime[i] = 0
            departure[i] = 0
        else:
            
            if dep_delay > delay: # Case departure delay is 100% but air time compensates
                departure[i] = 1.0
                airtime[i] = 0
                air_adjust[i] = round(air_delay/dep_delay, 2)
            
            elif dep_delay == delay: # Case departure delay is exactly 100%
                departure[i] = 1.0
                airtime[i] = 0
            
            elif dep_delay < 0: # Case air time delay is 100% but early departure compensates
                departure[i] = 0
                airtime[i] = 1.0
                dep_adjust[i] = round(dep_delay/air_delay, 2)
            
            elif dep_delay == 0: # Case air time delay is exaclty 100%
                departure[i] = 0
                airtime[i] = 1.0
            
            else: # Case where departure is from both
                departure[i] = round(dep_delay/delay, 2) 
                airtime[i] = round(air_delay/delay, 2)
    
    return departure, airtime, air_adjust, dep_adjust


In [9]:
dep, air, air_ad, dep_ad = delay_proportions(flights, flight_time=True)
flights['dep_delay_prop'] = dep
flights['air_delay_prop'] = air
flights['air_adjust_prop'] = air_ad
flights['dep_adjust_prop'] = dep_ad

In [10]:
daf = ['dep_delay', 'flight_time_delay', 'arr_delay', 'dep_delay_prop', 'air_delay_prop',
       'air_adjust_prop', 'dep_adjust_prop']

In [11]:
# sort by the departure delay proportion and the air time adjustment -> is the pilot flying the plane faster?

flights[daf].sort_values(by=['dep_delay_prop','air_adjust_prop'], ascending=[False, True]).head(25)

# not clear if the air plane is flying faster -> need another metric or more data

Unnamed: 0,dep_delay,flight_time_delay,arr_delay,dep_delay_prop,air_delay_prop,air_adjust_prop,dep_adjust_prop
100,36.0,-35.0,1.0,1.0,0.0,-0.97,0.0
8937,31.0,-30.0,1.0,1.0,0.0,-0.97,0.0
639,24.0,-23.0,1.0,1.0,0.0,-0.96,0.0
1465,27.0,-26.0,1.0,1.0,0.0,-0.96,0.0
8830,23.0,-22.0,1.0,1.0,0.0,-0.96,0.0
818,19.0,-18.0,1.0,1.0,0.0,-0.95,0.0
1291,21.0,-20.0,1.0,1.0,0.0,-0.95,0.0
3111,20.0,-19.0,1.0,1.0,0.0,-0.95,0.0
9307,19.0,-18.0,1.0,1.0,0.0,-0.95,0.0
9911,19.0,-18.0,1.0,1.0,0.0,-0.95,0.0


In [12]:
# sort by arrival delay ascending to see how often the plane is early and why
flights[daf].sort_values(by=['arr_delay']).head(30)

# most early arrivals are from flight time compensation 
# Question: do planes always try to fly faster or head/tail wind related? 

Unnamed: 0,dep_delay,flight_time_delay,arr_delay,dep_delay_prop,air_delay_prop,air_adjust_prop,dep_adjust_prop
1023,-129.0,93.0,-158.0,0.0,0.0,0.0,0.0
9327,-8.0,-86.0,-94.0,0.0,0.0,0.0,0.0
9545,-6.0,-53.0,-59.0,0.0,0.0,0.0,0.0
2930,-10.0,-45.0,-55.0,0.0,0.0,0.0,0.0
6557,-2.0,-52.0,-54.0,0.0,0.0,0.0,0.0
8152,-2.0,-52.0,-54.0,0.0,0.0,0.0,0.0
8644,-20.0,-33.0,-53.0,0.0,0.0,0.0,0.0
4891,-7.0,-44.0,-51.0,0.0,0.0,0.0,0.0
8103,-8.0,-41.0,-49.0,0.0,0.0,0.0,0.0
8918,-1.0,-48.0,-49.0,0.0,0.0,0.0,0.0


In [13]:

# flights[daf].sort_values(by='dep_delay_prop', ascending=False).head(25)
# flights[daf].sort_values(by=['air_adjust_prop', 'flight_time_delay']).head(25)

# flights[daf].sort_values(by=['air_adjust_prop','flight_time_delay']).head(25)

# flights[daf].sort_values(by=['air_delay_prop','air_adjust_prop'], ascending=[False, True]).head(25)
# very rarely the case that there was early departure and air time delay

In [14]:
new_df = flights[daf].sort_values(by=['dep_delay_prop','air_adjust_prop'], ascending=[False, True])
new_df

Unnamed: 0,dep_delay,flight_time_delay,arr_delay,dep_delay_prop,air_delay_prop,air_adjust_prop,dep_adjust_prop
100,36.0,-35.0,1.0,1.0,0.0,-0.97,0.0
8937,31.0,-30.0,1.0,1.0,0.0,-0.97,0.0
639,24.0,-23.0,1.0,1.0,0.0,-0.96,0.0
1465,27.0,-26.0,1.0,1.0,0.0,-0.96,0.0
8830,23.0,-22.0,1.0,1.0,0.0,-0.96,0.0
...,...,...,...,...,...,...,...
9706,,,,,,0.00,0.0
9731,12.0,,,,,0.00,0.0
9777,,,,,,0.00,0.0
9795,,,,,,0.00,0.0


In [15]:
# Given there is an arrival delay, let's get some summary stats!
delay_sum = new_df[new_df.arr_delay > 0].describe()
delay_sum

Unnamed: 0,dep_delay,flight_time_delay,arr_delay,dep_delay_prop,air_delay_prop,air_adjust_prop,dep_adjust_prop
count,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0
mean,36.842394,3.37594,40.218334,0.618704,0.381296,-0.134101,-0.09734
std,78.171831,17.120067,76.417925,0.440694,0.440695,0.226463,0.208119
min,-18.0,-45.0,1.0,0.0,0.0,-0.97,-0.93
25%,0.0,-7.0,7.0,0.0,0.0,-0.19,0.0
50%,16.0,2.0,18.0,0.91,0.09,0.0,0.0
75%,45.0,11.0,45.0,1.0,1.0,0.0,0.0
max,1431.0,156.0,1394.0,1.0,1.0,0.0,0.0


In [16]:
# For 10,000 randomly sampled flights:
# Around one-third the flights had a delay, in which about 63% of the delay is from departure delays.

print(f'Proportion of Flight Delays: {delay_sum.arr_delay[0]/10000}')
print(f'The Average Proportion of Departure Delays: {delay_sum.dep_delay_prop[1]}')
print(f'The Average Proportion of Air time Delays: {delay_sum.air_delay_prop[1]}')

Proportion of Flight Delays: 0.3458
The Average Proportion of Departure Delays: 0.6187044534412958
The Average Proportion of Air time Delays: 0.38129554655870446


In [19]:
# write csv file
new_df.to_csv('../data/flight_delays_10000.csv', encoding='utf-8', index=False)

### Note: When the same code was run with a random sample of 1000 observations, similar results were seen. 

#### Proportion of Flight Delays: 0.307 <br>
#### The Average Proportion of Departure Delays: 0.6305537459283386 <br>
#### The Average Proportion of Air time Delavs: 0.36944625407166126