# Feature Engineering

In [1]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer
)
from feature_engine.encoding import (
	RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
from feature_engine.datetime import DatetimeFeatures

In [2]:
import warnings
# Silence every warning for the rest of the session (blanket 'ignore' filter).
warnings.filterwarnings('ignore')
# Display all DataFrame columns instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# Make scikit-learn transformers return pandas DataFrames rather than NumPy arrays.
sklearn.set_config(transform_output='pandas')

In [3]:
# Root of the project on the local machine.
# Written as a raw string: '\J' is not a valid escape sequence, so the plain
# literal only produced a backslash by accident and emits a SyntaxWarning on
# recent Python versions. The resulting value is unchanged.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
PROJECT_DIR = r'R:\Jaydeep/Flight-Price-Prediction'
# Sub-directory of PROJECT_DIR that holds the CSV files.
DATA_DIR = 'data'

# Base names (without the '.csv' extension) of the raw splits.
TRAIN_DATASET_NAME = 'train'
VAL_DATASET_NAME = 'validation'
TEST_DATASET_NAME = 'test'

# Base names (without the '.csv' extension) of the feature-engineered outputs.
TRAIN_PRE_DATASET_NAME = 'train_preprocessing'
VAL_PRE_DATASET_NAME = 'val_preprocessing'
TEST_PRE_DATASET_NAME = 'test_preprocessing'

In [4]:
def get_dataset(dataset_name):
    """Read `<dataset_name>.csv` from the project's data directory.

    Parameters
    ----------
    dataset_name : str
        File name without the `.csv` extension.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    csv_path = os.path.join(PROJECT_DIR, DATA_DIR, f'{dataset_name}.csv')
    return pd.read_csv(csv_path)

In [5]:
def export_dataset(X, name):
    """Write `X` to `<name>.csv` in the project's data directory.

    The DataFrame index is not written to the file.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to persist.
    name : str
        Target file name without the `.csv` extension.
    """
    destination = os.path.join(PROJECT_DIR, DATA_DIR, f"{name}.csv")
    X.to_csv(destination, index=False)

In [6]:
# Load the raw training split (train.csv) from the project's data directory.
train_df = get_dataset(TRAIN_DATASET_NAME)

In [7]:
# Check column names, dtypes and non-null counts of the training data.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6693 entries, 0 to 6692
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   airline          6693 non-null   object
 1   date_of_journey  6693 non-null   object
 2   source           6693 non-null   object
 3   destination      6693 non-null   object
 4   dep_time         6693 non-null   object
 5   arrival_time     6693 non-null   object
 6   duration_minute  6693 non-null   int64 
 7   total_stops      6693 non-null   int64 
 8   additional_info  6693 non-null   object
 9   price            6693 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 523.0+ KB


In [8]:
# NOTE(review): identical to the previous cell's train_df.info() call — likely a leftover duplicate.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6693 entries, 0 to 6692
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   airline          6693 non-null   object
 1   date_of_journey  6693 non-null   object
 2   source           6693 non-null   object
 3   destination      6693 non-null   object
 4   dep_time         6693 non-null   object
 5   arrival_time     6693 non-null   object
 6   duration_minute  6693 non-null   int64 
 7   total_stops      6693 non-null   int64 
 8   additional_info  6693 non-null   object
 9   price            6693 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 523.0+ KB


In [9]:
# Separate the regression target from the model inputs.
# y_train is an independent copy of the 'price' column; X_train is everything
# except the target and the 'additional_info' column.
y_train = train_df['price'].copy()
X_train = train_df.drop(columns=['price', 'additional_info'])

In [10]:
# List the feature columns that remain after dropping the target and 'additional_info'.
X_train.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration_minute',
 'total_stops']

## Airline

In [11]:
# Airline: collapse infrequent carriers into a single 'other' label, then one-hot encode.
airline_transformer = Pipeline(steps=[
    # tol=0.1: labels occurring in less than 10% of the fit rows are replaced with 'other'.
    # n_categories=2 is the minimum cardinality a column needs before rare-label
    # grouping is attempted (see the feature_engine documentation).
    ('grouper', RareLabelEncoder(tol=0.1, replace_with='other', n_categories=2)),
    # Dense output; a category not seen during fit is encoded as all zeros instead of raising.
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

## Date of Journey

In [12]:
# Calendar components to derive from the journey date.
date_features = ['month', 'day_of_week', 'day_of_year']

In [13]:
# Date of journey: expand the date into the components listed in date_features,
# then rescale every component with MinMaxScaler (default range [0, 1]).
date_tranformer = Pipeline(steps=[
    # format='mixed' lets the format of each value be inferred individually;
    # yearfirst=True resolves ambiguous dates as year-first.
    ('date_to_features', DatetimeFeatures(features_to_extract=date_features, yearfirst=True, format='mixed')),
    ('min_max_scaler', MinMaxScaler())
])

## Source & Destination

In [14]:
# Every distinct city that appears as either a source or a destination.
set(np.union1d(
    X_train.source.unique(),
    X_train.destination.unique()
))


{'Banglore',
 'Chennai',
 'Cochin',
 'Delhi',
 'Hyderabad',
 'Kolkata',
 'Mumbai',
 'New Delhi'}

In [15]:
# Source/destination columns only.
# NOTE(review): location_df is not referenced again in the visible cells — confirm it is still needed.
location_df = X_train.loc[:, ['source', 'destination']]

In [16]:
# Location: group rare cities, encode each city by the target, then reshape the
# distribution of the encoded values.
location_transformer = Pipeline(steps=[
    # Cities occurring in less than 10% of the fit rows become 'other'.
    ('grouper', RareLabelEncoder(tol=0.1, replace_with='other', n_categories=2)),
    # Replaces each city with the mean target value observed for it during fit,
    # so this pipeline must be fitted with y.
    ('mean_encoder', MeanEncoder()),
    # Power transform with scikit-learn's default settings.
    ('power_transformer', PowerTransformer())
])

In [17]:
def is_north(X):
    """Replace every column of `X` with a 0/1 `<column>_is_north` flag.

    A value is flagged 1 when it is one of "Delhi", "Kolkata", "Mumbai" or
    "New Delhi", and 0 otherwise. The original columns are dropped from the
    returned frame; `X` itself is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold city names.

    Returns
    -------
    pandas.DataFrame
        One integer flag column per input column.
    """
    north_cities = {"Delhi", "Kolkata", "Mumbai", "New Delhi"}
    original_columns = X.columns.to_list()

    flags = {}
    for name in original_columns:
        flags[f"{name}_is_north"] = X.loc[:, name].isin(north_cities).astype(int)

    return X.assign(**flags).drop(columns=original_columns)

In [18]:
# Combine both views of source/destination side by side: the target-encoded
# values and the binary is_north flags.
location_union_transformer = FeatureUnion(transformer_list=[
	("location_transformer", location_transformer),
	("is_north_transformer", FunctionTransformer(func=is_north))
])

## Departure Time & Arrival Time

In [19]:
# Departure/arrival time columns, isolated for inspection in the following cells.
time_df = X_train.loc[:, ['dep_time', 'arrival_time']]

In [20]:
# Peek at 5 random rows (no random_state is passed, so the rows differ between runs).
time_df.sample(5)

Unnamed: 0,dep_time,arrival_time
4193,10:20:00,17:35:00
2695,21:05:00,22:25:00
158,08:20:00,10:35:00
882,11:10:00,14:05:00
5706,13:00:00,21:00:00


In [21]:
# Confirm dtypes and non-null counts of the two time columns.
time_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6693 entries, 0 to 6692
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   dep_time      6693 non-null   object
 1   arrival_time  6693 non-null   object
dtypes: object(2)
memory usage: 104.7+ KB


In [22]:
# Time of day: extract the hour and minute of each time column.
time_transformer = Pipeline(
    steps = [
        ('dt', DatetimeFeatures(features_to_extract=['hour', 'minute'])),
        # NOTE(review): scaling is disabled below, so hour/minute are emitted on their
        # raw scale — confirm this is intentional, then delete the dead line.
        # ('scaler', MinMaxScaler())
    ]
)

In [23]:
def part_of_day(X, start=0, mid=8, end=16):
    """Bucket every time column of `X` into 'start', 'mid' or 'end' by its hour.

    Each column is parsed with `pd.to_datetime` and reduced to its hour. The
    hour is then labelled:

    * 'start' when `start <= hour < mid`
    * 'mid'   when `mid <= hour < end`
    * 'end'   in every other case

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold values parseable by `pd.to_datetime`.
    start, mid, end : int
        Hour boundaries of the buckets (left-inclusive, right-exclusive).

    Returns
    -------
    pandas.DataFrame
        One `<column>_part_of_day` label column per input column; the original
        columns are dropped.
    """
    time_columns = X.columns.to_list()

    # Same column names as X, but holding the hour of each timestamp.
    hours = X.assign(**{
        name: pd.to_datetime(X.loc[:, name]).dt.hour
        for name in time_columns
    })

    labelled = {}
    for name in time_columns:
        hour = hours.loc[:, name]
        in_start = hour.between(start, mid, inclusive='left')
        in_mid = hour.between(mid, end, inclusive='left')
        labelled[f'{name}_part_of_day'] = np.select(
            [in_start, in_mid],
            choicelist=['start', 'mid'],
            default='end',
        )

    return hours.assign(**labelled).drop(columns=time_columns)

In [24]:
# Part of day: label each time as 'start'/'mid'/'end', encode each label by how
# often it occurs in the fit data, then rescale with MinMaxScaler.
part_of_day_transformer = Pipeline(
    steps = [
        ('part_of_day_func', FunctionTransformer(func=part_of_day)),
        # Default settings of CountFrequencyEncoder are used.
        ('count_fre_encoder', CountFrequencyEncoder()),
        ('min_max_scaler', MinMaxScaler())
    ]
)

In [25]:
# Combine the hour/minute features with the encoded part-of-day feature.
time_union_transformer = FeatureUnion(
    transformer_list=[
        ('time_transformer', time_transformer),
        ('part_of_day_transformer', part_of_day_transformer)
    ]
)

In [26]:
# Reminder of the available input columns.
X_train.columns

Index(['airline', 'date_of_journey', 'source', 'destination', 'dep_time',
       'arrival_time', 'duration_minute', 'total_stops'],
      dtype='object')

In [27]:
# Smoke-test the time feature union on arrival_time alone and preview 5 random rows.
time_union_transformer.fit_transform(X_train.loc[:,['arrival_time']]).sample(5)

Unnamed: 0,arrival_time_hour,arrival_time_minute,arrival_time_part_of_day
2850,21,0,1.0
3889,23,15,1.0
2196,7,40,0.0
1172,10,5,0.397086
4250,13,10,0.397086


## Duration

In [28]:
# Duration: natural logarithm of the value.
# NOTE(review): np.log yields -inf for 0 — assumes duration_minute is strictly positive; confirm.
duration_log_transformer = FunctionTransformer(func=np.log)

## Total Stops

In [29]:
def is_direct_flight(X):
    """Add an `is_direct_flight` flag next to the existing columns of `X`.

    The flag is 1 where `total_stops` equals 0 and 0 otherwise. `X` itself is
    not modified; a new frame is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a `total_stops` column.

    Returns
    -------
    pandas.DataFrame
        All columns of `X` plus the integer `is_direct_flight` column.
    """
    direct_flag = (X['total_stops'] == 0).astype(int)
    return X.assign(is_direct_flight=direct_flag)

In [30]:
# Total stops: keep the raw stop count and add the binary is_direct_flight flag.
total_stops_transformer = FunctionTransformer(func=is_direct_flight)

## Column Transformer

In [31]:
# Route each group of input columns to its dedicated transformer.
# Columns not listed here are dropped (ColumnTransformer's default remainder), and
# every output column is prefixed with the transformer name given below.
# NOTE(review): 'total_stops_trasformer' is misspelt and ends up in the output
# column names — renaming it would change the exported CSV header.
column_transformer = ColumnTransformer(transformers=[
    ('airline_transformer', airline_transformer, ['airline']),
    ('date_transformer', date_tranformer, ['date_of_journey']),
    ('location_union_transformer', location_union_transformer, ['source', 'destination']),
    ('time_union_transformer', time_union_transformer, ['dep_time', 'arrival_time']),
    ('duration_log_transformer', duration_log_transformer, ['duration_minute']),
    ('total_stops_trasformer', total_stops_transformer, ['total_stops'])
])

In [32]:
# Fit all transformers on the training split and build the engineered feature frame.
# y_train is passed because the target-based MeanEncoder needs it during fit.
final_input_df = column_transformer.fit_transform(X_train, y_train)

In [33]:
# Names of the engineered feature columns.
final_input_columns = final_input_df.columns.to_list()

In [34]:
# Number of engineered features.
len(final_input_columns)

21

In [35]:
# Print the full list on one line so no name is truncated.
print(final_input_columns)

['airline_transformer__airline_Air India', 'airline_transformer__airline_Indigo', 'airline_transformer__airline_Jet Airways', 'airline_transformer__airline_Multiple Carriers', 'airline_transformer__airline_other', 'date_transformer__date_of_journey_month', 'date_transformer__date_of_journey_day_of_week', 'date_transformer__date_of_journey_day_of_year', 'location_union_transformer__source', 'location_union_transformer__destination', 'location_union_transformer__source_is_north', 'location_union_transformer__destination_is_north', 'time_union_transformer__dep_time_hour', 'time_union_transformer__dep_time_minute', 'time_union_transformer__arrival_time_hour', 'time_union_transformer__arrival_time_minute', 'time_union_transformer__dep_time_part_of_day', 'time_union_transformer__arrival_time_part_of_day', 'duration_log_transformer__duration_minute', 'total_stops_trasformer__total_stops', 'total_stops_trasformer__is_direct_flight']


## Save the preprocessed CSV files

In [36]:
# Persist the engineered features of every split.
# The training features were already produced by the fit on the training split
# (final_input_df), so they are written directly instead of re-reading train.csv
# and fitting the whole transformer a second time on the same rows.
export_dataset(final_input_df, TRAIN_PRE_DATASET_NAME)

# Validation and test are only transformed with the train-fitted state, never
# fitted on. Their extra columns (e.g. 'price') are ignored because the
# ColumnTransformer selects its inputs by name and drops everything else.
for raw_name, pre_name in (
    (VAL_DATASET_NAME, VAL_PRE_DATASET_NAME),
    (TEST_DATASET_NAME, TEST_PRE_DATASET_NAME),
):
    export_dataset(column_transformer.transform(get_dataset(raw_name)), pre_name)