# Feature Engineering

In [19]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import random

In [20]:
# Adjust settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
%matplotlib inline

In [21]:
# Load preprocessed dataset

data_path = '../data/interim/'
df = pd.read_pickle(os.path.join(data_path, 'explored.pkl'))


In [22]:
df.shape

(11703, 19)

## Routing

In [23]:
# Create new feature for routing as dep arpt_arr arpt
df['routing'] = df['flt_dep_airpt'].astype(str) + '_' + df['flt_arr_airpt'].astype(str)
# Drop departure and arrival airport as all information is encoded in routing variable. Furthermore drop leg, as it is only a unique
# identifier similiar to the index.
df.drop(columns=['flt_dep_airpt', 'flt_arr_airpt', 'flt_leg'], inplace=True)

## Crew and Crew Changes

In [24]:
# Save flight crew, cabin crew names and their size in new features and drop the original variable afterwards
cp_crew = []
ca_crew = []
cp_count = []
ca_count = []

for crew in df['flt_TLC_trans']:
    cp_help = []
    ca_help = []
    for member in crew:
        if 'cp' in member:
            cp_help.append(member.partition('_')[0])
        elif 'ca' in member:
            ca_help.append(member.partition('_')[0])
    cp_crew.append(sorted(cp_help))
    ca_crew.append(sorted(ca_help))
    cp_count.append(len(cp_help))
    ca_count.append(len(ca_help))

df['cp_crew'] = cp_crew
df['ca_crew'] = ca_crew
df['cp_count'] = cp_count
df['ca_count'] = ca_count

df = df.drop(columns='flt_TLC_trans')

In [25]:
# Create two new features from flt_crew_change:
# 1. Indicator whether cabin, flight or both crews changed aircraft
# 2. Number of crew members who changed aircraft
def crew_change_cp_ca (row):
    if all(x in row['flt_crewt_change'] for x in ['ca', 'cp']):
        return 'both'
    if 'cp' in row['flt_crewt_change']:
        return 'flight_crew'
    elif 'ca' in row['flt_crewt_change']:
        return 'cabin_crew'
    else:
        return 'None'

def crew_members (row):
    return len(row['flt_crewt_change'])

df['cc_cp_ca'] = df.apply(crew_change_cp_ca, axis=1)
df['cc_count'] = df.apply(crew_members, axis=1)

## Day of the week and hour of the day features

In [26]:
# Create new features for day of the week and hour of the day
df['day_of_week'] = df['flt_sched_dep'].dt.dayofweek
df['hour_of_day_dep'] = df['flt_sched_dep'].dt.hour
df['hour_of_day_arr'] = df['flt_sched_arr'].dt.hour

## Data Checks: Crew Changing variables

In [27]:
# Check whether flt_crewt_change and flt_crewg match
pd.crosstab(df['flt_crewg'], df['cc_count'])

cc_count,0,1,2,3,4,5,6,8
flt_crewg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,5028,164,121,171,176,185,3,1
B,483,0,0,0,0,0,0,0
B2,0,0,0,55,59,292,15,0
C,0,316,182,96,95,41,1,0
Start,4219,0,0,0,0,0,0,0


There seems to be a mismatch between the variables flt_crewg and the number of crew members who changed the aircraft based on the definition of the variable flt_crewg according to the documentation:
Crew Group: assignment of what happens to the whole crew after a flight
-  Start : First flight of day
-  A : all crew members stay on the aircraft for the next flight
-  B, B2: all crew members switch aircraft for the next flight
-  C: at least one crew member switches aircraft for the next flight

For flt_crewg == A there should only be cc_count == 0 and for flt_crewg == B there should only be cc_count != 0. Therefore a further analysis is done on that.

In [28]:
# Identifying a specific aircraft wherer flt_crewg == A and cc_count != 0 to analyse the crew changes for a specific date
condition = ((df['flt_crewg'] == 'A') & (df['cc_count'] != 0))
sample = df[condition].sample(n=3, random_state=42)

sample

Unnamed: 0,flt_offblock,flt_onblock,flt_ac_reg,dep_delay,flt_ac_type,flt_tt,flt_sched_tt,flt_crewg,flt_crewt_change,flt_sched_dep,flt_sched_arr,arr_delay,routing,sched_gt,act_gt,cp_crew,ca_crew,cp_count,ca_count,cc_cp_ca,cc_count,day_of_week,hour_of_day_dep,hour_of_day_arr
7831,2019-06-20 17:04:00,2019-06-20 18:59:00,ECLWFX,89.0,E95,64.0,60.0,A,[cp],2019-06-20 15:35:00,2019-06-20 17:10:00,109.0,East Carmen_Youngland,40.0,-66.0,"[Juan Massey, Steve Johnston]","[Melanie Fuentes, Susan Franco, Tammy Mullins]",2,3,flight_crew,1,3,15,17
9751,2019-06-25 15:41:00,2019-06-25 17:02:00,ECLBRX,31.0,320,32.0,30.0,A,"[ca, ca, ca, cp, cp]",2019-06-25 15:10:00,2019-06-25 16:40:00,22.0,East Carmen_Lake Lawrencechester,70.0,49.0,"[Jennifer Adams DDS, Randall Johnson]","[David Giles, Luis Robertson, Steven Blair, Za...",2,4,both,5,1,15,16
2040,2019-06-06 07:52:00,2019-06-06 09:07:00,ECLBKX,62.0,320,41.0,45.0,A,"[ca, ca, cp, cp, ca]",2019-06-06 06:50:00,2019-06-06 08:10:00,57.0,East Carmen_Joneshaven,45.0,50.0,"[Jill Conner, Shannon Schmidt]","[Charles Patterson, Dr. Robert White, Jose Tru...",2,4,both,5,3,6,8


In [29]:
# Now looking at the crew change history of the first registration for the specific day
df[(df['flt_ac_reg'] == 'ECLWFX') & (df['flt_sched_dep'].dt.strftime('%Y-%m-%d') == '2019-06-20')].iloc[:,[10, 11, 18, 19, 20, 21, 22, 23]]

Unnamed: 0,flt_sched_arr,arr_delay,ca_count,cc_cp_ca,cc_count,day_of_week,hour_of_day_dep,hour_of_day_arr
7564,2019-06-20 05:30:00,10.0,3,,0,3,4,5
7620,2019-06-20 07:55:00,4.0,3,flight_crew,2,3,6,7
7678,2019-06-20 10:10:00,-13.0,3,,0,3,8,10
7722,2019-06-20 12:20:00,53.0,3,,0,3,10,12
7775,2019-06-20 14:35:00,90.0,3,,0,3,13,14
7831,2019-06-20 17:10:00,109.0,3,flight_crew,1,3,15,17
7867,2019-06-20 19:30:00,-9.0,3,,0,3,17,19


Based on the hypothesis that the names of the crews are being recorded correctly as this might not only be relevant for payroll accounting, but also from regulative perspective the flt_crewg and flt_crewt_change values should be the following:
|   | flt_crewg | flt_crewt_change |
| --- | --- | --- |
7564 | C | [ca, ca, ca] |
7620 | A | [] |
7678 | B | [cp, cp, ca, ca, ca] |
7722 | A | [] |
7775 | C | [cp, ca, ca, ca] |
7831 | A | [] |
7867 | NaN | NaN

In [30]:
# Let's have a detailed look at the second example
df[(df['flt_ac_reg'] == 'ECLBRX') & (df['flt_sched_dep'].dt.strftime('%Y-%m-%d') == '2019-06-25')].iloc[:,[10, 11, 18, 19, 20, 21]]

Unnamed: 0,flt_sched_arr,arr_delay,ca_count,cc_cp_ca,cc_count,day_of_week
9528,2019-06-25 07:05:00,5.0,4,,0,1
9592,2019-06-25 09:55:00,20.0,4,,0,1
9651,2019-06-25 12:20:00,35.0,4,flight_crew,2,1
9715,2019-06-25 14:40:00,74.0,4,cabin_crew,4,1
9751,2019-06-25 16:40:00,22.0,4,both,5,1
9820,2019-06-25 19:15:00,-3.0,4,,0,1


For the second example the values should be the following:
|   | flt_crewg | flt_crewt_change |
| --- | --- | --- |
9528 | A | [] |
9592 | B | [cp, cp, ca, ca, ca, ca] |
9651 | C | [cp, cp] |
9715 | C | [ca, ca, ca, ca] |
9751 | A | [] |
9820 | NaN | NaN |

In [31]:
# Let's have a detailed look at the third example
df[(df['flt_ac_reg'] == 'ECLBKX') & (df['flt_sched_dep'].dt.strftime('%Y-%m-%d') == '2019-06-06')].iloc[:,[10, 11, 18, 19, 20, 21]]

Unnamed: 0,flt_sched_arr,arr_delay,ca_count,cc_cp_ca,cc_count,day_of_week
1972,2019-06-06 06:05:00,24.0,4,,0,3
2040,2019-06-06 08:10:00,57.0,4,both,5,3
2104,2019-06-06 10:20:00,59.0,4,,0,3
2156,2019-06-06 12:20:00,69.0,4,cabin_crew,3,3
2198,2019-06-06 14:25:00,-11.0,4,cabin_crew,1,3
2232,2019-06-06 16:40:00,62.0,4,both,4,3
2296,2019-06-06 19:15:00,-7.0,4,,0,3
2356,2019-06-06 23:45:00,26.0,4,,0,3


For the second example the values should be the following:
|   | flt_crewg | flt_crewt_change |
| --- | --- | --- |
1972 | A | [] |
2040 | A | [] |
2104 | B | [cp, cp, ca, ca, ca, ca] |
2156 | B | [cp, cp, ca, ca, ca, ca] |
2198 | B | [cp, cp, ca, ca, ca, ca] |
2232 | A | [] |
2296 | B | [cp, cp, ca, ca, ca, ca] |
2359 | NaN | NaN |

Within this sample, only a few entries in flt_crewg and flt_crewt_change match with the crew data over the rotation of a single registration for a day. Furthermore, there is a general error in the variable flt_crewg as it always shows Start for the first leg of the day, however the variable shall state "what happens to the whole crew after a flight".
Due to this, new features to capture crew changes will be established based on the columns cp_crew and ca_crew for the rotation of a specific aircraft on any given day.

In [32]:
# Creating new features to track crew changes from the flight before: cc_type {0: Start - First flight of the day, 1: No crew change;
# 2: At least one member, but not all of the crew changed; 3: Whole crew changed}; cc_roles [list of elements either ca or cp with an
# entry for every crew member that changed aircraft]

def crew_changes (registrations):
    # Create an empty df that will be appended in every step and finally returned by the function
    cc = pd.DataFrame(columns=['cc_types', 'cc_roles'])
    # Loop through the aircrafts and sort their flights by sched_dep
    for aircraft in registrations:
        indexes = df[df['flt_ac_reg'] == aircraft].sort_values(by='flt_sched_dep').index
        date = '2019-01-01'
        # Loop through the indexes of a specific aircraft
        for idx in indexes:
            cc_roles = []
            # If index has new date and therefore is first flight of the day
            if date not in str(df['flt_sched_dep'].loc[idx]):
                date = str(df['flt_sched_dep'].loc[idx]).partition(' ')[0]
                cp = df['cp_crew'].loc[idx]
                ca = df['ca_crew'].loc[idx]
                help = pd.DataFrame([[0, []]], columns=['cc_types', 'cc_roles'], index=[idx])
            else:
                # Check differences between flight and cabin crew from previous flight
                dif_cp = list(set(cp) - set(df['cp_crew'].loc[idx]))
                dif_ca = list(set(ca) - set(df['ca_crew'].loc[idx]))
                # If no crew members changed
                if (len(dif_cp) == 0) and (len(dif_ca) == 0):
                    help = pd.DataFrame([[1, []]], columns=['cc_types', 'cc_roles'], index=[idx])
                # If all crew members changed
                elif (len(dif_cp) >= len(cp)) and (len(dif_ca) >= len(ca)):
                    for item in dif_cp:
                        cc_roles.append('cp')
                    for item in dif_ca:
                        cc_roles.append('ca')
                    help = pd.DataFrame([[3, cc_roles]], columns=['cc_types', 'cc_roles'], index=[idx])
                else:
                    for item in dif_cp:
                        cc_roles.append('cp')
                    for item in dif_ca:
                        cc_roles.append('ca')
                    help = pd.DataFrame([[2, cc_roles]], columns=['cc_types', 'cc_roles'], index=[idx])
                cp = df['cp_crew'].loc[idx]
                ca = df['ca_crew'].loc[idx]
            cc = cc.append(help)
    return cc

# Create a list of all registrations in the dataset
registrations = df['flt_ac_reg'].unique()

df = pd.merge(df, crew_changes(registrations), left_index=True, right_index=True)
# Finally drop old columns
df.drop(columns=['flt_crewg', 'flt_crewt_change'], inplace=True)

In [33]:
# Create two new features from flt_crew_change:
# 1. Indicator whether cabin, flight or both crews changed aircraft
# 2. Number of crew members who changed aircraft
def crew_change_cp_ca (row):
    if all(x in row['cc_roles'] for x in ['ca', 'cp']):
        return 'both'
    if 'cp' in row['cc_roles']:
        return 'flight_crew'
    elif 'ca' in row['cc_roles']:
        return 'cabin_crew'
    else:
        return 'none'

def crew_members (row):
    return len(row['cc_roles'])

df['cc_cp_ca'] = df.apply(crew_change_cp_ca, axis=1)
df['cc_count'] = df.apply(crew_members, axis=1)
df.drop(columns=['cp_crew', 'ca_crew'], inplace=True)

## Adjusting Data Types prior to further analysis

In [34]:
# Checking current data types
df.dtypes

flt_offblock       datetime64[ns]
flt_onblock        datetime64[ns]
flt_ac_reg                 object
dep_delay                 float64
flt_ac_type              category
flt_tt                    float64
flt_sched_tt              float64
flt_sched_dep      datetime64[ns]
flt_sched_arr      datetime64[ns]
arr_delay                 float64
routing                    object
sched_gt                  float64
act_gt                    float64
cp_count                    int64
ca_count                    int64
cc_cp_ca                   object
cc_count                    int64
day_of_week                 int64
hour_of_day_dep             int64
hour_of_day_arr             int64
cc_types                   object
cc_roles                   object
dtype: object

In [35]:
convert_dict = {'cp_count': 'object',
                'ca_count': 'object',
                'cc_count': 'object',
                'day_of_week': 'object',
                'hour_of_day_dep': 'object',
                'hour_of_day_arr': 'object'
}
df = df.astype(convert_dict)

In [36]:
df.to_csv('../data/interim/features.csv')
df.to_pickle('../data/interim/features.pkl')