# 1. Importing libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
import sklearn

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from feature_engine.selection import SelectBySingleFeaturePerformance

In [3]:
from feature_engine.datetime import DatetimeFeatures
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from feature_engine.encoding import RareLabelEncoder, MeanEncoder, CountFrequencyEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.metrics.pairwise import rbf_kernel

In [4]:
sklearn.set_config(transform_output='pandas')

In [5]:
warnings.filterwarnings("ignore")

# 3. Read the data

In [6]:
file_path = r"C:\Users\User\OneDrive\Desktop\Flight-Price-Prediction\Data\train.csv"

In [7]:
train = pd.read_csv(file_path)

In [8]:
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-21,Banglore,New Delhi,08:55:00,19:10:00,615,1.0,In-flight meal not included,7832
1,Jet Airways,2019-03-27,Delhi,Cochin,17:30:00,04:25:00,655,1.0,In-flight meal not included,6540
2,Goair,2019-03-09,Banglore,New Delhi,11:40:00,14:35:00,175,0.0,No Info,7305
3,Air India,2019-06-12,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8366
4,Jet Airways,2019-03-12,Banglore,New Delhi,22:55:00,07:40:00,525,1.0,In-flight meal not included,11087
...,...,...,...,...,...,...,...,...,...,...
6690,Jet Airways,2019-03-21,Delhi,Cochin,10:45:00,18:50:00,1925,2.0,No Info,11093
6691,Air India,2019-05-01,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8891
6692,Jet Airways,2019-06-01,Delhi,Cochin,14:00:00,19:00:00,300,1.0,In-flight meal not included,10262
6693,Air Asia,2019-06-24,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No Info,6152


In [9]:
X_train = train.drop(columns='price')

In [10]:
y_train = train['price']

# 4. Transformation operations

In [11]:
X_train.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

### 4.1 airline

In [12]:
X_train.airline

0       Jet Airways
1       Jet Airways
2             Goair
3         Air India
4       Jet Airways
           ...     
6690    Jet Airways
6691      Air India
6692    Jet Airways
6693       Air Asia
6694      Air India
Name: airline, Length: 6695, dtype: object

In [13]:
X_train.airline.isna().sum()

0

In [14]:
threshold = 537

# Get value counts for the column
value_counts = X_train['airline'].value_counts()

# Create a mapping for rare labels
rare_labels = value_counts[value_counts <= threshold].index
X_train['airline'] = X_train['airline'].apply(lambda x: 'Other' if x in rare_labels else x)

In [15]:
X_train.airline.value_counts()

airline
Jet Airways          2391
Indigo               1296
Other                1196
Air India            1076
Multiple Carriers     736
Name: count, dtype: int64

In [16]:
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

In [17]:
air_transformer.fit_transform(X_train.loc[:, ["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
6690,0.0,0.0,1.0,0.0,0.0
6691,1.0,0.0,0.0,0.0,0.0
6692,0.0,0.0,1.0,0.0,0.0
6693,0.0,0.0,0.0,0.0,1.0


### 4.2 date_of_journey

In [18]:
X_train.date_of_journey

0       2019-03-21
1       2019-03-27
2       2019-03-09
3       2019-06-12
4       2019-03-12
           ...    
6690    2019-03-21
6691    2019-05-01
6692    2019-06-01
6693    2019-06-24
6694    2019-03-01
Name: date_of_journey, Length: 6695, dtype: object

In [19]:
feature_to_extract = ["month", "week", "day_of_week", "day_of_month", "weekend", "year_start", "year_end"]

In [20]:
doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

In [21]:
doj_transformer.fit_transform(X_train.loc[:, ["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_month,date_of_journey_weekend,date_of_journey_year_start,date_of_journey_year_end
0,0.000000,0.176471,0.500000,0.769231,0.0,0.0,0.0
1,0.000000,0.235294,0.333333,1.000000,0.0,0.0,0.0
2,0.000000,0.058824,0.833333,0.307692,1.0,0.0,0.0
3,1.000000,0.882353,0.333333,0.423077,0.0,0.0,0.0
4,0.000000,0.117647,0.166667,0.423077,0.0,0.0,0.0
...,...,...,...,...,...,...,...
6690,0.000000,0.176471,0.500000,0.769231,0.0,0.0,0.0
6691,0.666667,0.529412,0.333333,0.000000,0.0,0.0,0.0
6692,1.000000,0.764706,0.833333,0.000000,1.0,0.0,0.0
6693,1.000000,1.000000,0.000000,0.884615,0.0,0.0,0.0


### 4.3 source and destination

In [22]:
X_train.source

0       Banglore
1          Delhi
2       Banglore
3        Kolkata
4       Banglore
          ...   
6690       Delhi
6691     Kolkata
6692       Delhi
6693       Delhi
6694    Banglore
Name: source, Length: 6695, dtype: object

In [23]:
X_train.destination

0       New Delhi
1          Cochin
2       New Delhi
3        Banglore
4       New Delhi
          ...    
6690       Cochin
6691     Banglore
6692       Cochin
6693       Cochin
6694    New Delhi
Name: destination, Length: 6695, dtype: object

In [24]:
location_subset = X_train.loc[:, ["source", "destination"]]
location_subset

Unnamed: 0,source,destination
0,Banglore,New Delhi
1,Delhi,Cochin
2,Banglore,New Delhi
3,Kolkata,Banglore
4,Banglore,New Delhi
...,...,...
6690,Delhi,Cochin
6691,Kolkata,Banglore
6692,Delhi,Cochin
6693,Delhi,Cochin


In [25]:
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,-0.857930,-0.736484
1,1.065418,1.061694
2,-0.857930,-0.736484
3,-0.203928,-0.224351
4,-0.857930,-0.736484
...,...,...
6690,1.065418,1.061694
6691,-0.203928,-0.224351
6692,1.065418,1.061694
6693,1.065418,1.061694


In [26]:
np.union1d(
    X_train.source.unique(),
    X_train.destination.unique()
)

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [27]:
def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Mumbai", "New Delhi"]
    
    return (
        X
        .assign(**{
            f"{col}_is_north" : X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

FunctionTransformer(func=is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,0,1
3,0,0
4,0,1
...,...,...
6690,1,0
6691,0,0
6692,1,0
6693,1,0


In [28]:
is_north(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,0,1
3,0,0
4,0,1
...,...,...
6690,1,0
6691,0,0
6692,1,0
6693,1,0


In [29]:
loc_transformer = Pipeline(steps=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

loc_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source_is_north,destination_is_north
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
6690,0,0
6691,0,0
6692,0,0
6693,0,0


### 4.4 dep_time and arrival_time

In [30]:
X_train.arrival_time

0       19:10:00
1       04:25:00
2       14:35:00
3       18:30:00
4       07:40:00
          ...   
6690    18:50:00
6691    18:30:00
6692    19:00:00
6693    13:25:00
6694    08:55:00
Name: arrival_time, Length: 6695, dtype: object

In [31]:
X_train.dep_time

0       08:55:00
1       17:30:00
2       11:40:00
3       09:25:00
4       22:55:00
          ...   
6690    10:45:00
6691    09:25:00
6692    14:00:00
6693    07:55:00
6694    11:50:00
Name: dep_time, Length: 6695, dtype: object

In [32]:
time_subset = X_train.loc[:, ["arrival_time", "dep_time"]]
time_subset

Unnamed: 0,arrival_time,dep_time
0,19:10:00,08:55:00
1,04:25:00,17:30:00
2,14:35:00,11:40:00
3,18:30:00,09:25:00
4,07:40:00,22:55:00
...,...,...
6690,18:50:00,10:45:00
6691,18:30:00,09:25:00
6692,19:00:00,14:00:00
6693,13:25:00,07:55:00


In [33]:
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

time_pipe1.fit_transform(time_subset)

Unnamed: 0,arrival_time_hour,arrival_time_minute,dep_time_hour,dep_time_minute
0,0.826087,0.181818,0.347826,1.000000
1,0.173913,0.454545,0.739130,0.545455
2,0.608696,0.636364,0.478261,0.727273
3,0.782609,0.545455,0.391304,0.454545
4,0.304348,0.727273,0.956522,1.000000
...,...,...,...,...
6690,0.782609,0.909091,0.434783,0.818182
6691,0.782609,0.545455,0.391304,0.454545
6692,0.826087,0.000000,0.608696,0.000000
6693,0.565217,0.454545,0.304348,1.000000


In [34]:
def part_of_day(X, morning=4, afternoon=12, evening=16, night=20):
    columns = X.columns.to_list()
    
    X_temp = X.assign(**{
        col : pd.to_datetime(X.loc[:, col], format="%H:%M:%S").dt.hour
        for col in columns
    })
    
    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day" : np.select(
                [X_temp.loc[:, col].between(morning, afternoon, inclusive="left"),
                X_temp.loc[:, col].between(afternoon, evening, inclusive="left"),
                X_temp.loc[:, col].between(evening, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,arrival_time_part_of_day,dep_time_part_of_day
0,evening,morning
1,morning,evening
2,afternoon,morning
3,evening,morning
4,morning,night
...,...,...
6690,evening,morning
6691,evening,morning
6692,evening,afternoon
6693,afternoon,morning


In [35]:
time_pipe2 = Pipeline(steps=[
    ("funcT", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_pipe2.fit_transform(time_subset)

Unnamed: 0,arrival_time_part_of_day,dep_time_part_of_day
0,0.667335,1.000000
1,0.952906,0.202685
2,0.000000,1.000000
3,0.667335,1.000000
4,0.952906,0.174101
...,...,...
6690,0.667335,1.000000
6691,0.667335,1.000000
6692,0.667335,0.000000
6693,0.000000,1.000000


In [36]:
time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

time_transformer.fit_transform(time_subset)

Unnamed: 0,arrival_time_hour,arrival_time_minute,dep_time_hour,dep_time_minute,arrival_time_part_of_day,dep_time_part_of_day
0,0.826087,0.181818,0.347826,1.000000,0.667335,1.000000
1,0.173913,0.454545,0.739130,0.545455,0.952906,0.202685
2,0.608696,0.636364,0.478261,0.727273,0.000000,1.000000
3,0.782609,0.545455,0.391304,0.454545,0.667335,1.000000
4,0.304348,0.727273,0.956522,1.000000,0.952906,0.174101
...,...,...,...,...,...,...
6690,0.782609,0.909091,0.434783,0.818182,0.667335,1.000000
6691,0.782609,0.545455,0.391304,0.454545,0.667335,1.000000
6692,0.826087,0.000000,0.608696,0.000000,0.667335,0.000000
6693,0.565217,0.454545,0.304348,1.000000,0.000000,1.000000


### 4.5 duration

In [37]:
(
    X_train
    .duration
    .quantile([0.25, 0.5, 0.75])
    .values
    .reshape(-1, 1)
)

array([[170. ],
       [510. ],
       [922.5]])

- RBF Kernel is used to evaluate the similarity between X (input variable) and y (target/reference)

In [38]:
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma
        
    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()
            
        self.reference_values_ = {
            col : (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }
    
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile*100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data = rbf_kernel(X.loc[:, [col]], Y = self.reference_values_[col] , gamma = self.gamma),
                columns=columns
            )
                
            objects.append(obj)
                
        return pd.concat(objects, axis=1)

In [39]:
RBFPercentileSimilarity().fit_transform(X_train.loc[:, "duration"])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75
0,0.000000,0.000000e+00,0.0
1,0.000000,0.000000e+00,0.0
2,0.082085,0.000000e+00,0.0
3,0.000000,6.293989e-54,0.0
4,0.000000,1.691898e-10,0.0
...,...,...,...
6690,0.000000,0.000000e+00,0.0
6691,0.000000,6.293989e-54,0.0
6692,0.000000,0.000000e+00,0.0
6693,0.000000,0.000000e+00,0.0


In [40]:
def classify_duration(X, short=180, medium=400):
    return (
        X
        .assign(duration_modified = np.select(
                [X.duration.lt(short),
                X.duration.between(short, medium, inclusive="left")],
                ["short", "medium"],
                default="long"
            ))
        .drop(columns="duration")
    )

In [41]:
def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over{value}":X.duration.gt(value).astype(int)
        })
        .drop(columns="duration")
    )

In [42]:
duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=classify_duration)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
]) 

duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method='iqr', fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75,duration_modified,duration_over1000,duration
0,-0.364262,-0.107976,-0.107799,2.0,0,-0.033916
1,-0.364262,-0.107976,-0.107799,2.0,0,0.046422
2,2.373008,-0.107976,-0.107799,0.0,0,-0.917631
3,-0.364262,-0.107976,-0.107799,2.0,0,-0.174507
4,-0.364262,-0.107975,-0.107799,2.0,0,-0.214676
...,...,...,...,...,...,...
6690,-0.364262,-0.107976,-0.107799,2.0,1,2.597145
6691,-0.364262,-0.107976,-0.107799,2.0,0,-0.174507
6692,-0.364262,-0.107976,-0.107799,1.0,0,-0.666576
6693,-0.364262,-0.107976,-0.107799,1.0,0,-0.606322


### 4.6 total_stops

In [43]:
X_train.total_stops

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
6690    2.0
6691    1.0
6692    1.0
6693    1.0
6694    1.0
Name: total_stops, Length: 6695, dtype: float64

In [44]:
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))

In [45]:
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

In [46]:
total_stops_transformer.fit_transform(X_train.loc[:, ["total_stops"]])

Unnamed: 0,total_stops,is_direct_flight
0,1.0,0
1,1.0,0
2,0.0,1
3,1.0,0
4,1.0,0
...,...,...
6690,2.0,0
6691,1.0,0
6692,1.0,0
6693,1.0,0


### 4.7 additional_info

In [47]:
X_train.additional_info

0       In-flight meal not included
1       In-flight meal not included
2                           No Info
3                           No Info
4       In-flight meal not included
                   ...             
6690                        No Info
6691                        No Info
6692    In-flight meal not included
6693                        No Info
6694                 1 Long layover
Name: additional_info, Length: 6695, dtype: object

In [48]:
info_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

info_pipe1.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
...,...,...,...
6690,0.0,1.0,0.0
6691,0.0,1.0,0.0
6692,1.0,0.0,0.0
6693,0.0,1.0,0.0


In [49]:
def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No info").astype(int))

In [50]:
info_union = FeatureUnion(transformer_list=[
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func=have_info))
])

In [51]:
info_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("union", info_union)
])

In [52]:
info_transformer.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other,additional_info
0,1.0,0.0,0.0,1
1,1.0,0.0,0.0,1
2,0.0,1.0,0.0,1
3,0.0,1.0,0.0,1
4,1.0,0.0,0.0,1
...,...,...,...,...
6690,0.0,1.0,0.0,1
6691,0.0,1.0,0.0,1
6692,1.0,0.0,0.0,1
6693,0.0,1.0,0.0,1


# 5. Column Transformer

In [53]:
col_transformer = ColumnTransformer(transformers=[
    ("t1", air_transformer, ["airline"]),
    ("t2", doj_transformer, ["date_of_journey"]),
    ("t3", loc_transformer, ["source", "destination"]),
    ("t4", time_transformer, ["arrival_time", "dep_time"]),
    ("t5", duration_transformer, ["duration"]),
    ("t6", total_stops_transformer, ["total_stops"]),
    ("t7", info_transformer, ["additional_info"])
], remainder="passthrough")

In [54]:
col_transformer.fit_transform(X_train, y_train)

Unnamed: 0,t1__airline_Air India,t1__airline_Indigo,t1__airline_Jet Airways,t1__airline_Multiple Carriers,t1__airline_Other,t2__date_of_journey_month,t2__date_of_journey_week,t2__date_of_journey_day_of_week,t2__date_of_journey_day_of_month,t2__date_of_journey_weekend,...,t5__duration_rbf_75,t5__duration_modified,t5__duration_over1000,t5__duration,t6__total_stops,t6__is_direct_flight,t7__additional_info_In-flight meal not included,t7__additional_info_No Info,t7__additional_info_Other,t7__additional_info
0,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,0.500000,0.769231,0.0,...,-0.107799,2.0,0,-0.033916,1.0,0,1.0,0.0,0.0,1
1,0.0,0.0,1.0,0.0,0.0,0.000000,0.235294,0.333333,1.000000,0.0,...,-0.107799,2.0,0,0.046422,1.0,0,1.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,1.0,0.000000,0.058824,0.833333,0.307692,1.0,...,-0.107799,0.0,0,-0.917631,0.0,1,0.0,1.0,0.0,1
3,1.0,0.0,0.0,0.0,0.0,1.000000,0.882353,0.333333,0.423077,0.0,...,-0.107799,2.0,0,-0.174507,1.0,0,0.0,1.0,0.0,1
4,0.0,0.0,1.0,0.0,0.0,0.000000,0.117647,0.166667,0.423077,0.0,...,-0.107799,2.0,0,-0.214676,1.0,0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,0.500000,0.769231,0.0,...,-0.107799,2.0,1,2.597145,2.0,0,0.0,1.0,0.0,1
6691,1.0,0.0,0.0,0.0,0.0,0.666667,0.529412,0.333333,0.000000,0.0,...,-0.107799,2.0,0,-0.174507,1.0,0,0.0,1.0,0.0,1
6692,0.0,0.0,1.0,0.0,0.0,1.000000,0.764706,0.833333,0.000000,1.0,...,-0.107799,1.0,0,-0.666576,1.0,0,1.0,0.0,0.0,1
6693,0.0,0.0,0.0,0.0,1.0,1.000000,1.000000,0.000000,0.884615,0.0,...,-0.107799,1.0,0,-0.606322,1.0,0,0.0,1.0,0.0,1


# 6. Feature selection

In [55]:
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",
    threshold=0.1
)

In [56]:
preprocessor = Pipeline(steps=[
    ("ct", col_transformer),
    ("selector", selector)
])

preprocessor.fit_transform(X_train, y_train)

Unnamed: 0,t1__airline_Indigo,t1__airline_Jet Airways,t1__airline_Other,t2__date_of_journey_week,t5__duration_rbf_25,t5__duration_modified,t5__duration_over1000,t5__duration,t6__total_stops,t6__is_direct_flight
0,0.0,1.0,0.0,0.176471,-0.364262,2.0,0,-0.033916,1.0,0
1,0.0,1.0,0.0,0.235294,-0.364262,2.0,0,0.046422,1.0,0
2,0.0,0.0,1.0,0.058824,2.373008,0.0,0,-0.917631,0.0,1
3,0.0,0.0,0.0,0.882353,-0.364262,2.0,0,-0.174507,1.0,0
4,0.0,1.0,0.0,0.117647,-0.364262,2.0,0,-0.214676,1.0,0
...,...,...,...,...,...,...,...,...,...,...
6690,0.0,1.0,0.0,0.176471,-0.364262,2.0,1,2.597145,2.0,0
6691,0.0,0.0,0.0,0.529412,-0.364262,2.0,0,-0.174507,1.0,0
6692,0.0,1.0,0.0,0.764706,-0.364262,1.0,0,-0.666576,1.0,0
6693,0.0,0.0,1.0,1.000000,-0.364262,1.0,0,-0.606322,1.0,0


# 7. Visualisations

In [57]:
preprocessor

In [58]:
feature_performance = preprocessor.named_steps["selector"].feature_performance_

In [59]:
feature_performance

{'t1__airline_Air India': 0.0016858488187393068,
 't1__airline_Indigo': 0.12827620333701603,
 't1__airline_Jet Airways': 0.19341654864699923,
 't1__airline_Multiple Carriers': 0.01885367626900632,
 't1__airline_Other': 0.11819934849670959,
 't2__date_of_journey_month': 0.08907386829479753,
 't2__date_of_journey_week': 0.18569991444411282,
 't2__date_of_journey_day_of_week': 0.004664789869158821,
 't2__date_of_journey_day_of_month': 0.034223276788402646,
 't2__date_of_journey_weekend': -0.0009403217440686618,
 't2__date_of_journey_year_start': -0.00071587460488202,
 't2__date_of_journey_year_end': -0.00071587460488202,
 't3__source_is_north': -0.00071587460488202,
 't3__destination_is_north': -0.00071587460488202,
 't4__arrival_time_hour': 0.07867962012854395,
 't4__arrival_time_minute': 0.03229609256720356,
 't4__dep_time_hour': 0.007541502467654985,
 't4__dep_time_minute': 0.03749304101660215,
 't4__arrival_time_part_of_day': 0.0313709099091218,
 't4__dep_time_part_of_day': -0.0013166

In [60]:
sorted_feat_imp = dict(sorted(feature_performance.items(), key=lambda val: val[1]))
sorted_feat_imp

{'t4__dep_time_part_of_day': -0.0013166458315723162,
 't7__additional_info_No Info': -0.001132211161403977,
 't2__date_of_journey_weekend': -0.0009403217440686618,
 't2__date_of_journey_year_start': -0.00071587460488202,
 't2__date_of_journey_year_end': -0.00071587460488202,
 't3__source_is_north': -0.00071587460488202,
 't3__destination_is_north': -0.00071587460488202,
 't7__additional_info': -0.00071587460488202,
 't5__duration_rbf_75': 0.0007341117000158048,
 't7__additional_info_In-flight meal not included': 0.0014426619953507396,
 't1__airline_Air India': 0.0016858488187393068,
 't2__date_of_journey_day_of_week': 0.004664789869158821,
 't5__duration_rbf_50': 0.005915259152939482,
 't4__dep_time_hour': 0.007541502467654985,
 't7__additional_info_Other': 0.01760748529752132,
 't1__airline_Multiple Carriers': 0.01885367626900632,
 't4__arrival_time_part_of_day': 0.0313709099091218,
 't4__arrival_time_minute': 0.03229609256720356,
 't2__date_of_journey_day_of_month': 0.034223276788402