# Feature Engineering

## Setup

In [1]:
import pathlib

import pandas as pd
import numpy as np
from pickle import dump

import preprocessing

## Read Data

In [2]:
# The raw data lives in "datasets/raw" next to (one level above) the notebook's working directory.
raw_data_dir: pathlib.Path = pathlib.Path.cwd().parent / "datasets" / "raw"
combined_data_path: pathlib.Path = raw_data_dir / "combined.parquet"

In [3]:
# Load the combined raw dataset from its parquet file into a single DataFrame.
data_df: pd.DataFrame = pd.read_parquet(combined_data_path)

## Handle Missing Values

In [4]:
# Per-column percentage of missing values, computed by the project's preprocessing helper.
percent_missing: pd.Series = preprocessing.percentage_missing_values_per_column(data_df)
# Show only the columns that actually contain missing values.
percent_missing.loc[percent_missing.gt(0)]

AirTime              27.030472
TaxiIn               25.730429
TaxiOut              25.718116
Tail_Number          25.395022
ArrDelay              2.261258
ActualElapsedTime     2.260889
ArrTime               2.220742
DepDelay              2.021100
DepTime               2.020133
Distance              0.136387
OriginState           0.027382
OriginCityName        0.027382
DestCityName          0.026189
DestState             0.026189
CRSElapsedTime        0.017433
dtype: float64

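The four features with the most missing values (`AirTime`, `TaxiIn`, `TaxiOut` and `Tail_Number`) are each about 25–27% empty.
Since that is a substantial share of the data, we drop these columns entirely instead of discarding the affected rows. Every remaining column is missing at most about 2% of its values, so for those we drop the rows that contain missing values.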

In [5]:
# Columns that are each roughly a quarter (or more) empty; removed as whole columns.
high_missing_columns = ['AirTime', 'TaxiIn', 'TaxiOut', 'Tail_Number']
data_df.drop(columns=high_missing_columns, inplace=True)

In [6]:
# Remove, in place, every row that still has at least one missing value.
data_df.dropna(axis=0, how="any", inplace=True)

## Limit the minimum arrival and departure delays

In [7]:
# Floor both delay columns at zero: negative delays (presumably flights that
# arrived/departed early -- confirm against the data dictionary) are treated as
# "no delay". `Series.clip` is vectorised, unlike a per-element `apply(lambda ...)`,
# which calls a Python function once per row of this very large frame.
for delay_column in ('ArrDelay', 'DepDelay'):
    data_df[delay_column] = data_df[delay_column].clip(lower=0)

## Convert types

Now that we do not have any missing values, we convert the column types to reduce memory consumption.

In [8]:
# Baseline: show column dtypes and total memory usage before any type conversion.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144482939 entries, 0 to 148108239
Data columns (total 22 columns):
 #   Column             Dtype   
---  ------             -----   
 0   Airline            category
 1   Origin             category
 2   Year               int64   
 3   Distance           float64 
 4   Cancelled          bool    
 5   DestState          category
 6   ActualElapsedTime  float64 
 7   OriginState        category
 8   ArrTime            float64 
 9   DestCityName       category
 10  CRSElapsedTime     float64 
 11  DepTime            float64 
 12  DepDelay           float64 
 13  Diverted           bool    
 14  CRSArrTime         int64   
 15  DayOfWeek          int64   
 16  OriginCityName     category
 17  ArrDelay           float64 
 18  Dest               category
 19  Month              int64   
 20  CRSDepTime         int64   
 21  DayofMonth         int64   
dtypes: bool(2), category(7), float64(7), int64(6)
memory usage: 16.8 GB


### Convert whole-number float columns to int

In [9]:
# Float columns whose values are all whole numbers can be stored as integers.
float_columns: pd.Index = data_df.select_dtypes(include=['float']).columns

for column in float_columns:
    # Cast once and reuse the result for both the check and the assignment,
    # instead of converting the full column twice.
    as_int = data_df[column].astype('int64')
    # Only switch dtype when the cast loses nothing (every value round-trips).
    if (data_df[column] == as_int).all():
        data_df[column] = as_int
    # Release the temporary so a rejected cast does not linger in kernel memory.
    del as_int

In [10]:
# Re-check the column dtypes after the float -> int64 conversion.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144482939 entries, 0 to 148108239
Data columns (total 22 columns):
 #   Column             Dtype   
---  ------             -----   
 0   Airline            category
 1   Origin             category
 2   Year               int64   
 3   Distance           int64   
 4   Cancelled          bool    
 5   DestState          category
 6   ActualElapsedTime  int64   
 7   OriginState        category
 8   ArrTime            int64   
 9   DestCityName       category
 10  CRSElapsedTime     int64   
 11  DepTime            int64   
 12  DepDelay           int64   
 13  Diverted           bool    
 14  CRSArrTime         int64   
 15  DayOfWeek          int64   
 16  OriginCityName     category
 17  ArrDelay           int64   
 18  Dest               category
 19  Month              int64   
 20  CRSDepTime         int64   
 21  DayofMonth         int64   
dtypes: bool(2), category(7), int64(13)
memory usage: 16.8 GB


### Downcast int columns to the smallest sufficient integer type

In [11]:
integer_columns: pd.Index = data_df.select_dtypes(include=['int']).columns

# Candidate dtypes ordered from narrowest to widest; the first one whose range
# covers the column's observed minimum and maximum is used.
unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
signed_candidates = (np.int8, np.int16, np.int32, np.int64)

for column in integer_columns:
    highest = data_df[column].max()
    lowest = data_df[column].min()

    # Columns without negative values are stored as unsigned integers.
    candidates = unsigned_candidates if lowest >= 0 else signed_candidates

    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= lowest and highest <= limits.max:
            data_df[column] = data_df[column].astype(candidate)
            break

In [12]:
# Re-check dtypes and total memory usage after downcasting the integer columns.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144482939 entries, 0 to 148108239
Data columns (total 22 columns):
 #   Column             Dtype   
---  ------             -----   
 0   Airline            category
 1   Origin             category
 2   Year               uint16  
 3   Distance           uint16  
 4   Cancelled          bool    
 5   DestState          category
 6   ActualElapsedTime  int16   
 7   OriginState        category
 8   ArrTime            uint16  
 9   DestCityName       category
 10  CRSElapsedTime     int16   
 11  DepTime            uint16  
 12  DepDelay           uint16  
 13  Diverted           bool    
 14  CRSArrTime         uint16  
 15  DayOfWeek          uint8   
 16  OriginCityName     category
 17  ArrDelay           uint16  
 18  Dest               category
 19  Month              uint8   
 20  CRSDepTime         uint16  
 21  DayofMonth         uint8   
dtypes: bool(2), category(7), int16(2), uint16(8), uint8(3)
memory usage: 5.9 GB


## Encode categorical columns

Since we plan to use a random forest for the regression task, we encode the categorical columns with `LabelEncoder` rather than `OneHotEncoder`, which would be the better choice if the regression were performed by a neural network.

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Names of all columns stored with the pandas 'category' dtype.
categorical_columns: list[str] = data_df.select_dtypes(include=['category']).columns.tolist()

# Report how many distinct values each categorical column holds before encoding.
for column in categorical_columns:
    distinct_count = len(data_df[column].unique())
    print(f"Column '{column}' has {distinct_count} unique values.")

Column 'Airline' has 47 unique values.
Column 'Origin' has 422 unique values.
Column 'DestState' has 54 unique values.
Column 'OriginState' has 54 unique values.
Column 'DestCityName' has 441 unique values.
Column 'OriginCityName' has 442 unique values.
Column 'Dest' has 420 unique values.


In [15]:
# Fit a separate LabelEncoder for every categorical column, replace the column
# with its integer codes, and pickle the fitted encoder to
# '../datasets/processed/label_encoder_<column>.pkl' so the same mapping can be
# reapplied later.
for feature in categorical_columns:
    label_encoder = LabelEncoder()
    data_df[feature] = label_encoder.fit_transform(data_df[feature])
    # Use a context manager so each pickle file is flushed and closed right away
    # instead of leaving one open file handle per column.
    with open('../datasets/processed/label_encoder_' + feature + '.pkl', 'wb') as encoder_file:
        dump(label_encoder, encoder_file)

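The fitted label encoders are saved for future use by the loop above, one pickle file per categorical column; the single-file save below is kept only as a commented-out reference.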

In [16]:
# dump(label_encoder, open('../datasets/processed/label_encoder.pkl', 'wb'))

## Split into Train, Val, Test Datasets

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 10% of all rows as the test set.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.1,
    random_state=42,
)
# Take a validation set with exactly as many rows as the test set out of the
# remaining rows; whatever is left becomes the training set.
train_df, val_df = train_test_split(
    train_df,
    test_size=len(test_df),
    random_state=42,
)

In [19]:
# Write each split to "<parent of working directory>/datasets/processed/<split>.parquet".
processed_dir = pathlib.Path.cwd().parent / "datasets" / "processed"

for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_df.to_parquet(processed_dir / f"{split_name}.parquet")