# Feature Creation Methods
Call these methods to add features to the Flights Training dataframe

In [11]:
from datetime import datetime

## 1. Is Flight Delayed Feature (0 or 1 values)

In [3]:
def is_delayed_feature(flights_df):
    """Flag each flight as delayed (1) or not delayed (0).

    The flag is based on the sum of the departure, arrival, carrier, weather,
    NAS, security and late-aircraft delay columns, with missing values counted
    as 0. A flight is flagged 1 when that sum is strictly positive and 0
    otherwise (so early departures/arrivals can offset other delays).

    Args:
        flights_df: DataFrame containing the columns ``dep_delay``,
            ``arr_delay``, ``carrier_delay``, ``weather_delay``,
            ``nas_delay``, ``security_delay`` and ``late_aircraft_delay``.
            The frame is only read, never modified.

    Returns:
        An int64 Series of 0/1 values that shares the index of
        ``flights_df`` (an empty Series when the frame has no rows).
    """
    delay_columns = [
        'dep_delay',
        'arr_delay',
        'carrier_delay',
        'weather_delay',
        'nas_delay',
        'security_delay',
        'late_aircraft_delay',
    ]

    # Column-wise arithmetic instead of a per-row ``apply``: same result,
    # without building one Series per row or copying the unrelated columns.
    total_delay = flights_df[delay_columns].fillna(0).sum(axis=1)

    return (total_delay > 0).astype('int64')
    

## 2. Flight Speed

In [28]:
def flight_speed_feature(flights_df):
    """Compute each flight's speed as distance per 60 units of elapsed time.

    The result is ``distance / (actual_elapsed_time / 60)``; presumably
    ``actual_elapsed_time`` is in minutes, which makes this distance per
    hour (confirm against the dataset's documentation).

    Flights marked as cancelled or diverted (``cancelled == 1`` or
    ``diverted == 1``) are treated as covering distance 0 in an elapsed time
    of 60, so their speed is 0.0. For the remaining flights a missing or
    zero elapsed time yields NaN, because the speed is undefined there
    (previously a zero elapsed time caused a division by zero).

    Args:
        flights_df: DataFrame containing the columns ``cancelled``,
            ``diverted``, ``distance`` and ``actual_elapsed_time``.
            The frame is only read, never modified.

    Returns:
        A float Series sharing the index of ``flights_df``.
    """
    not_flown = (flights_df['cancelled'] == 1) | (flights_df['diverted'] == 1)

    # ``where`` keeps the value when the condition holds and substitutes the
    # second argument otherwise, so flights that were not flown get 0 / 60.
    distance = flights_df['distance'].where(~not_flown, 0.0)
    elapsed = flights_df['actual_elapsed_time'].where(~not_flown, 60.0)

    # Blank out zero durations so the division below gives NaN, not inf.
    elapsed = elapsed.where(elapsed != 0)

    return distance / (elapsed / 60)

## 3. Flight Haul Type (values of 0, 1 and 2 for Short, Medium and Long)

In [30]:
def flight_haul_type_feature(flights_df):
    """Classify each flight as short (0), medium (1) or long (2) haul.

    The class is derived from ``actual_elapsed_time``:

    * 0 when the elapsed time is <= 180.0,
    * 2 when the elapsed time is >= 360.0,
    * 1 otherwise.

    Flights marked as cancelled or diverted (``cancelled == 1`` or
    ``diverted == 1``) are given an elapsed time of 0 and therefore fall
    into class 0.

    NOTE(review): a missing elapsed time on a flight that was neither
    cancelled nor diverted satisfies neither comparison and so ends up in
    class 1; this matches the previous behaviour - confirm it is intended.

    Args:
        flights_df: DataFrame containing the columns ``cancelled``,
            ``diverted`` and ``actual_elapsed_time``. The frame is only
            read, never modified.

    Returns:
        An int64 Series of 0/1/2 values sharing the index of ``flights_df``.
    """
    not_flown = (flights_df['cancelled'] == 1) | (flights_df['diverted'] == 1)
    elapsed = flights_df['actual_elapsed_time'].where(~not_flown, 0)

    is_short = elapsed <= 180.0
    is_long = elapsed >= 360.0

    # "not short" contributes 1 and "long" one more, giving 0 / 1 / 2 without
    # a per-row ``apply``.
    haul_type = (~is_short).astype('int64') + is_long.astype('int64')
    haul_type.name = None
    return haul_type

## 4. Month of the Year (1 to 12)

In [36]:
def month_feature(flights_df):
    """Extract the month number (1 to 12) from each flight date.

    Args:
        flights_df: DataFrame with an ``fl_date`` column holding dates as
            ``'YYYY-MM-DD'`` strings. The frame is only read, never
            modified.

    Returns:
        A Series of month numbers covering only the rows whose ``fl_date``
        is not null, indexed like those rows. Assigning it back to the frame
        therefore leaves NaN on the rows without a date.

    Raises:
        ValueError: If a non-null date does not match ``'%Y-%m-%d'``.
    """
    dates = flights_df['fl_date']

    # Map over the single column rather than applying over whole rows: only
    # ``fl_date`` is needed, so the other columns are never touched or copied.
    months = dates[dates.notna()].map(
        lambda value: datetime.strptime(value, '%Y-%m-%d').month)
    months.name = None
    return months

## 5. Arrival Hour of the Day (0 to 23)

In [8]:
def arrival_hour_of_day_feature(flights_df):
    """Extract the arrival hour of the day (0 to 23) for each flight.

    ``arr_time`` is read as a number in ``HHMM`` form (for example 1944.0
    for 19:44, 5.0 for 00:05). The value 2400.0 is mapped to hour 0.

    Args:
        flights_df: DataFrame with a numeric ``arr_time`` column. The frame
            is only read, never modified.

    Returns:
        A Series of hours covering only the rows whose ``arr_time`` is not
        null, indexed like those rows. Assigning it back to the frame
        therefore leaves NaN on the rows without an arrival time.

    Raises:
        ValueError: If a non-null value is not a valid ``HHMM`` time.
    """
    def hour_of(hhmm):
        # 2400 is not accepted by the '%H' directive, so handle it up front.
        if hhmm == 2400.0:
            return 0
        # Zero-pad to four digits so values such as 5.0 parse as '0005'.
        return datetime.strptime(str(int(hhmm)).zfill(4), '%H%M').hour

    # Map over the single column rather than applying over whole rows: only
    # ``arr_time`` is needed, so the other columns are never touched or copied.
    times = flights_df['arr_time']
    hours = times[times.notna()].map(hour_of)
    hours.name = None
    return hours
    

## 6. Departure Hour of the Day (0 to 23)

In [9]:
def departure_hour_of_day_feature(flights_df):
    """Extract the departure hour of the day (0 to 23) for each flight.

    ``dep_time`` is read as a number in ``HHMM`` form (for example 1802.0
    for 18:02, 5.0 for 00:05). The value 2400.0 is mapped to hour 0.

    Args:
        flights_df: DataFrame with a numeric ``dep_time`` column. The frame
            is only read, never modified.

    Returns:
        A Series of hours covering only the rows whose ``dep_time`` is not
        null, indexed like those rows. Assigning it back to the frame
        therefore leaves NaN on the rows without a departure time.

    Raises:
        ValueError: If a non-null value is not a valid ``HHMM`` time.
    """
    def hour_of(hhmm):
        # 2400 is not accepted by the '%H' directive, so handle it up front.
        if hhmm == 2400.0:
            return 0
        # Zero-pad to four digits so values such as 5.0 parse as '0005'.
        return datetime.strptime(str(int(hhmm)).zfill(4), '%H%M').hour

    # Map over the single column rather than applying over whole rows: only
    # ``dep_time`` is needed, so the other columns are never touched or copied.
    times = flights_df['dep_time']
    hours = times[times.notna()].map(hour_of)
    hours.name = None
    return hours
  

sample data of top delayed location, early location

# Test All Methods

In [10]:
import pickle

# Path of the pickled flights dataframe, relative to the working directory.
filename = 'flight_2019'

# The ``with`` block closes the file even if unpickling fails.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open(filename, 'rb') as infile:
    flight_2019 = pickle.load(infile)

#### New dataframe with a smaller sample of 1,000,000 records

In [12]:
# Draw 1,000,000 random rows; random_state=1 fixes the seed so the same sample is drawn on every run.
flight_sample = flight_2019.sample(n=1000000, random_state=1)

In [13]:
# Check the sample's dimensions as (rows, columns).
flight_sample.shape

(1000000, 38)

In [14]:
# Preview the first few flight dates to see how they are stored.
flight_sample['fl_date'].head()

5817519    2019-05-23
3440584    2019-12-05
2213756    2019-02-06
4372882    2019-09-17
2224046    2019-02-06
Name: fl_date, dtype: object

In [18]:
# Store the 0/1 delay flag as a new column, then preview the first rows.
flight_sample['is_delayed'] = is_delayed_feature(flight_sample)
flight_sample.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,...,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,fl_date,year,month,is_delayed
5817519,DL,DL_CODESHARE,4016,OO,N291SY,4016,12953,13342,1747,1944.0,...,3.0,0.0,108.0,,,,2019-05-23,2019.0,5.0,1
3440584,DL,DL_CODESHARE,4566,OO,N477CA,4566,10397,12007,2019,2017.0,...,,,,,,,2019-12-05,2019.0,12.0,0
2213756,DL,DL,2151,DL,N844DN,2151,10721,13487,1820,1802.0,...,,,,,,,2019-02-06,2019.0,2.0,0
4372882,AA,AA,439,AA,N155NN,439,13930,14107,705,656.0,...,,,,,,,2019-09-17,2019.0,9.0,0
2224046,NK,NK,318,NK,N907NK,318,14635,13487,1315,1311.0,...,,,,,,,2019-02-06,2019.0,2.0,0


In [19]:
# Store the computed speed as a new column, then preview the first rows.
# NOTE(review): the TypeError recorded under this cell is from run In [19], while
# flight_speed_feature was last defined in cell In [28]; the output looks stale - re-run to confirm.
flight_sample['flight_speed'] = flight_speed_feature(flight_sample)
flight_sample.head()

TypeError: unsupported operand type(s) for |: 'int' and 'float'

In [32]:
# Store the 0/1/2 haul class as a new column, then preview the first rows.
flight_sample['flight_haul_type'] = flight_haul_type_feature(flight_sample)
flight_sample.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,...,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,fl_date,year,month,is_delayed,flight_speed,flight_haul_type
5817519,DL,DL_CODESHARE,4016,OO,N291SY,4016,12953,13342,1747,1944.0,...,108.0,,,,2019-05-23,2019.0,5.0,1,297.181208,0
3440584,DL,DL_CODESHARE,4566,OO,N477CA,4566,10397,12007,2019,2017.0,...,,,,,2019-12-05,2019.0,12.0,0,229.52381,0
2213756,DL,DL,2151,DL,N844DN,2151,10721,13487,1820,1802.0,...,,,,,2019-02-06,2019.0,2.0,0,328.97561,1
4372882,AA,AA,439,AA,N155NN,439,13930,14107,705,656.0,...,,,,,2019-09-17,2019.0,9.0,0,374.025974,1
2224046,NK,NK,318,NK,N907NK,318,14635,13487,1315,1311.0,...,,,,,2019-02-06,2019.0,2.0,0,382.702703,1


In [37]:
# Store the month number as a new column, then preview the first rows.
flight_sample['flight_month'] = month_feature(flight_sample)
flight_sample.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,...,first_dep_time,total_add_gtime,longest_add_gtime,fl_date,year,month,is_delayed,flight_speed,flight_haul_type,flight_month
5817519,DL,DL_CODESHARE,4016,OO,N291SY,4016,12953,13342,1747,1944.0,...,,,,2019-05-23,2019.0,5.0,1,297.181208,0,5
3440584,DL,DL_CODESHARE,4566,OO,N477CA,4566,10397,12007,2019,2017.0,...,,,,2019-12-05,2019.0,12.0,0,229.52381,0,12
2213756,DL,DL,2151,DL,N844DN,2151,10721,13487,1820,1802.0,...,,,,2019-02-06,2019.0,2.0,0,328.97561,1,2
4372882,AA,AA,439,AA,N155NN,439,13930,14107,705,656.0,...,,,,2019-09-17,2019.0,9.0,0,374.025974,1,9
2224046,NK,NK,318,NK,N907NK,318,14635,13487,1315,1311.0,...,,,,2019-02-06,2019.0,2.0,0,382.702703,1,2


In [38]:
# Store the arrival hour as a new column, then preview the first rows.
# Rows without an arrival time get no value from the feature function and stay NaN.
flight_sample['arrival_hour_of_day'] = arrival_hour_of_day_feature(flight_sample)
flight_sample.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,...,total_add_gtime,longest_add_gtime,fl_date,year,month,is_delayed,flight_speed,flight_haul_type,flight_month,arrival_hour_of_day
5817519,DL,DL_CODESHARE,4016,OO,N291SY,4016,12953,13342,1747,1944.0,...,,,2019-05-23,2019.0,5.0,1,297.181208,0,5,21.0
3440584,DL,DL_CODESHARE,4566,OO,N477CA,4566,10397,12007,2019,2017.0,...,,,2019-12-05,2019.0,12.0,0,229.52381,0,12,20.0
2213756,DL,DL,2151,DL,N844DN,2151,10721,13487,1820,1802.0,...,,,2019-02-06,2019.0,2.0,0,328.97561,1,2,20.0
4372882,AA,AA,439,AA,N155NN,439,13930,14107,705,656.0,...,,,2019-09-17,2019.0,9.0,0,374.025974,1,9,8.0
2224046,NK,NK,318,NK,N907NK,318,14635,13487,1315,1311.0,...,,,2019-02-06,2019.0,2.0,0,382.702703,1,2,15.0


In [None]:
# Store the departure hour as a new column, then preview the first rows.
# Rows without a departure time get no value from the feature function and stay NaN.
flight_sample['departure_hour_of_day'] = departure_hour_of_day_feature(flight_sample)
flight_sample.head()