# Feature Engineering

## Setup

In [1]:
import pathlib

import pandas as pd
import numpy as np
from pickle import dump

import preprocessing

## Read Data

In [2]:
# The combined raw dataset lives in datasets/raw/ next to the current working directory
# (i.e. under the parent of the directory the notebook is run from).
combined_data_path: pathlib.Path = pathlib.Path.cwd().parent.joinpath(
    "datasets", "raw", "combined.parquet"
)

In [3]:
# Load the combined raw dataset from its parquet file into a single DataFrame.
data_df: pd.DataFrame = pd.read_parquet(combined_data_path)

## Handle Missing Values

In [4]:
# Share of missing values per column, as computed by the project-local helper.
percent_missing: pd.Series = preprocessing.percentage_missing_values_per_column(data_df)

# Show only the columns that actually contain missing values.
has_missing = percent_missing > 0
percent_missing.loc[has_missing]

AirTime              2.703047e+01
TaxiIn               2.573043e+01
TaxiOut              2.571812e+01
Tail_Number          2.539502e+01
ArrDelay             2.261258e+00
ActualElapsedTime    2.260889e+00
ArrTime              2.220742e+00
DepDelay             2.021100e+00
DepTime              2.020133e+00
Distance             1.363867e-01
OriginCityName       2.738200e-02
OriginState          2.738200e-02
DestState            2.618895e-02
DestCityName         2.618895e-02
CRSElapsedTime       1.743252e-02
DestAirport          6.751819e-07
dtype: float64

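The four features with the most missing values (`AirTime`, `TaxiIn`, `TaxiOut` and `Tail_Number`) are each about 25–27% empty.
Since dropping the affected rows would discard roughly a quarter of the data, we drop these four columns instead. The remaining columns are missing in at most about 2% of the rows, so for those we simply drop the rows that contain missing values.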

In [5]:
# Remove the four sparsest columns (each roughly a quarter empty) rather than
# discarding the rows in which they are missing.
# The result is rebound instead of dropped in place, and errors="ignore" is
# passed, so that re-running this cell is a no-op instead of raising KeyError
# for columns that are already gone.
sparse_columns: list[str] = ['AirTime', 'TaxiIn', 'TaxiOut', 'Tail_Number']
data_df = data_df.drop(columns=sparse_columns, errors="ignore")

In [6]:
# Discard every row that still contains at least one missing value (modifies data_df in place).
data_df.dropna(axis=0, how="any", inplace=True)

## Clip negative arrival and departure delays to zero

In [7]:
# Replace negative delays with 0 and leave all other values untouched.
# Series.clip is vectorized, so it avoids calling a Python lambda once per row
# (the frame has well over a hundred million rows) while producing the same values.
for delay_column in ('ArrDelay', 'DepDelay'):
    data_df[delay_column] = data_df[delay_column].clip(lower=0)

## Convert types

Now that we do not have any missing values, we convert the column types to reduce memory consumption.

In [8]:
# Show the current column dtypes as a baseline before any type conversion.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144482939 entries, 0 to 148108239
Data columns (total 24 columns):
 #   Column             Dtype   
---  ------             -----   
 0   DayOfWeek          int64   
 1   DepDelay           float64 
 2   DepTime            float64 
 3   Origin             category
 4   DayofMonth         int64   
 5   CRSDepTime         int64   
 6   ArrTime            float64 
 7   Diverted           bool    
 8   Airline            category
 9   Distance           float64 
 10  Cancelled          bool    
 11  ActualElapsedTime  float64 
 12  OriginCityName     category
 13  OriginState        category
 14  CRSElapsedTime     float64 
 15  DestCityName       category
 16  Month              int64   
 17  ArrDelay           float64 
 18  DestAirport        category
 19  DestState          category
 20  Dest               category
 21  OriginAirport      category
 22  Year               int64   
 23  CRSArrTime         int64   
dtypes: bool(2), category

### Convert whole-number float columns to int

In [9]:
def _cast_whole_number_floats(frame: pd.DataFrame, columns: pd.Index) -> None:
    """Convert, in place, each listed float column of `frame` to int64 when that is lossless.

    A column is converted only if all of its values are finite and comparing the
    column with its int64 cast is true for every row. Other columns are left as they are.

    Working inside a function keeps the full-length temporary Series out of the
    notebook's global namespace, so they are released as soon as the call returns.
    """
    for column in columns:
        float_values = frame[column]

        # NaN/inf cannot be represented as integers (the cast would raise),
        # so such columns have to stay floating point.
        if not np.isfinite(float_values).all():
            continue

        # Cast once and reuse the result for both the check and the assignment
        # instead of converting the whole column twice.
        int_values = float_values.astype('int64')
        if (float_values == int_values).all():
            frame[column] = int_values


float_columns: pd.Index = data_df.select_dtypes(include=['float']).columns

_cast_whole_number_floats(data_df, float_columns)

In [10]:
# Re-inspect the column dtypes after the float-to-int conversion.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144482939 entries, 0 to 148108239
Data columns (total 24 columns):
 #   Column             Dtype   
---  ------             -----   
 0   DayOfWeek          int64   
 1   DepDelay           int64   
 2   DepTime            int64   
 3   Origin             category
 4   DayofMonth         int64   
 5   CRSDepTime         int64   
 6   ArrTime            int64   
 7   Diverted           bool    
 8   Airline            category
 9   Distance           int64   
 10  Cancelled          bool    
 11  ActualElapsedTime  int64   
 12  OriginCityName     category
 13  OriginState        category
 14  CRSElapsedTime     int64   
 15  DestCityName       category
 16  Month              int64   
 17  ArrDelay           int64   
 18  DestAirport        category
 19  DestState          category
 20  Dest               category
 21  OriginAirport      category
 22  Year               int64   
 23  CRSArrTime         int64   
dtypes: bool(2), category

### Downcast int columns to the smallest sufficient integer type

In [11]:
integer_columns: pd.Index = data_df.select_dtypes(include=['int']).columns

for column in integer_columns:
    lowest = data_df[column].min()
    highest = data_df[column].max()

    # Columns without negative values may use unsigned types; all others need a
    # signed type. Candidates are ordered from narrowest to widest.
    if lowest >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)

    # Pick the first (i.e. narrowest) candidate whose range covers the column's values.
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= lowest and highest <= limits.max:
            data_df[column] = data_df[column].astype(candidate)
            break

In [12]:
# Re-inspect the column dtypes after downcasting the integer columns.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144482939 entries, 0 to 148108239
Data columns (total 24 columns):
 #   Column             Dtype   
---  ------             -----   
 0   DayOfWeek          uint8   
 1   DepDelay           uint16  
 2   DepTime            uint16  
 3   Origin             category
 4   DayofMonth         uint8   
 5   CRSDepTime         uint16  
 6   ArrTime            uint16  
 7   Diverted           bool    
 8   Airline            category
 9   Distance           uint16  
 10  Cancelled          bool    
 11  ActualElapsedTime  int16   
 12  OriginCityName     category
 13  OriginState        category
 14  CRSElapsedTime     int16   
 15  DestCityName       category
 16  Month              uint8   
 17  ArrDelay           uint16  
 18  DestAirport        category
 19  DestState          category
 20  Dest               category
 21  OriginAirport      category
 22  Year               uint16  
 23  CRSArrTime         uint16  
dtypes: bool(2), category

## Encode categorical columns

Since we plan to use a random forest for the regression task, we encode the categorical columns with `LabelEncoder` rather than `OneHotEncoder`; one-hot encoding would be the better choice if the regression were performed by a neural network.

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
categorical_columns: list[str] = list(data_df.select_dtypes(include=['category']).columns)

# Report how many distinct values each categorical column holds before it is encoded.
for column in categorical_columns:
    distinct_count = len(data_df[column].unique())
    print(f"Column '{column}' has {distinct_count} unique values.")

Column 'Origin' has 422 unique values.
Column 'Airline' has 47 unique values.
Column 'OriginCityName' has 442 unique values.
Column 'OriginState' has 54 unique values.
Column 'DestCityName' has 441 unique values.
Column 'DestAirport' has 718 unique values.
Column 'DestState' has 54 unique values.
Column 'Dest' has 420 unique values.
Column 'OriginAirport' has 723 unique values.


In [18]:
# Replace every categorical column with integer codes and persist the fitted
# encoder of each column, so the same mapping can be loaded again later.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target labels;
# OrdinalEncoder is the feature-oriented equivalent - confirm before switching,
# since it would change the format of the pickled encoders.
for feature in categorical_columns:
    label_encoder = LabelEncoder()
    data_df[feature] = label_encoder.fit_transform(data_df[feature])

    # Use a context manager so each file is flushed and closed right away,
    # instead of leaving an open handle per column for the garbage collector.
    with open('../datasets/processed/label_encoder_' + feature + '.pkl', 'wb') as encoder_file:
        dump(label_encoder, encoder_file)

## Split into Train, Val, Test Datasets

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 10% of all rows as the test set.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.1,
    random_state=42,
)

# From the remaining rows, split off a validation set with exactly as many rows
# as the test set; what is left becomes the training set.
train_df, val_df = train_test_split(
    train_df,
    test_size=test_df.shape[0],
    random_state=42,
)

In [21]:
# Write the three splits as parquet files into datasets/processed/
# (under the parent of the current working directory).
processed_dir = pathlib.Path.cwd().parent / "datasets" / "processed"

for file_name, split_df in (
    ("train.parquet", train_df),
    ("val.parquet", val_df),
    ("test.parquet", test_df),
):
    split_df.to_parquet(processed_dir / file_name)