In [1]:
import pandas as pd
import numpy as np
import sklearn
import os

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PowerTransformer, FunctionTransformer,StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.ensemble import RandomForestRegressor

from feature_engine.encoding import RareLabelEncoder, MeanEncoder, CountFrequencyEncoder
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.datetime import DatetimeFeatures
from feature_engine.outliers import Winsorizer

import warnings
import matplotlib.pyplot as plt

In [2]:
sklearn.set_config(transform_output="pandas")

In [3]:
pd.set_option("display.max_columns",None)

In [4]:
warnings.filterwarnings("ignore")

In [5]:
## We are performing feature engineering on train dataset on;y

In [6]:
path=r"C:\Users\LENOVO\OneDrive\Desktop\Flight_price_predict_project\DataSets\train1_dataset.csv"
train_data=pd.read_csv(path)

In [7]:
train_data["total_stops"]=train_data["total_stops"].fillna(0)

In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          6695 non-null   object 
 1   date_of_journey  6695 non-null   object 
 2   source           6695 non-null   object 
 3   destination      6695 non-null   object 
 4   dep_time         6695 non-null   object 
 5   arrival_time     6695 non-null   object 
 6   duration         6695 non-null   int64  
 7   total_stops      6695 non-null   float64
 8   additional_info  6695 non-null   object 
 9   price            6695 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 523.2+ KB


In [9]:
# train_data.total_stops.info()

In [10]:
x_train=train_data.drop(columns="price")
y_train=train_data.price.copy()

### 1. Transformation Operations

In [11]:
x_train.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

#### 1.1 airline

In [12]:
train_data.airline.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6695 entries, 0 to 6694
Series name: airline
Non-Null Count  Dtype 
--------------  ----- 
6695 non-null   object
dtypes: object(1)
memory usage: 52.4+ KB


In [13]:
airline_pipe=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1,replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

In [14]:
airline_pipe.fit_transform(x_train.loc[:,["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
6690,0.0,0.0,1.0,0.0,0.0
6691,1.0,0.0,0.0,0.0,0.0
6692,0.0,0.0,1.0,0.0,0.0
6693,0.0,0.0,0.0,0.0,1.0


#### 1.2 date_of_journey

In [15]:
train_data.date_of_journey.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6695 entries, 0 to 6694
Series name: date_of_journey
Non-Null Count  Dtype 
--------------  ----- 
6695 non-null   object
dtypes: object(1)
memory usage: 52.4+ KB


In [16]:
features_to_ex=["month","week","day_of_week","day_of_year"]

doj_pipe=Pipeline(steps=[
    ("dt",DatetimeFeatures(features_to_extract=features_to_ex, yearfirst=True,format="mixed")),
    ("scalar", MinMaxScaler())
])

doj_pipe.fit_transform(x_train.loc[:,["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.000000,0.176471,0.500000,0.169492
1,0.000000,0.235294,0.333333,0.220339
2,0.000000,0.058824,0.833333,0.067797
3,1.000000,0.882353,0.333333,0.872881
4,0.000000,0.117647,0.166667,0.093220
...,...,...,...,...
6690,0.000000,0.176471,0.500000,0.169492
6691,0.666667,0.529412,0.333333,0.516949
6692,1.000000,0.764706,0.833333,0.779661
6693,1.000000,1.000000,0.000000,0.974576


#### 1.3 Source & destination

In [17]:
train_data.source.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6695 entries, 0 to 6694
Series name: source
Non-Null Count  Dtype 
--------------  ----- 
6695 non-null   object
dtypes: object(1)
memory usage: 52.4+ KB


In [18]:
location_sd=x_train.loc[:,["source","destination"]]
location_sd

Unnamed: 0,source,destination
0,Banglore,New Delhi
1,Delhi,Cochin
2,Banglore,New Delhi
3,Kolkata,Banglore
4,Banglore,New Delhi
...,...,...
6690,Delhi,Cochin
6691,Kolkata,Banglore
6692,Delhi,Cochin
6693,Delhi,Cochin


In [19]:
location_pipe=Pipeline(steps=[
    ("grouper",RareLabelEncoder(tol=0.1,replace_with="Other",n_categories=2)),
    ("encoder",MeanEncoder()),
    ("scaler",PowerTransformer())
])
location_pipe.fit_transform(location_sd,y_train)

Unnamed: 0,source,destination
0,-0.857930,-0.736484
1,1.065418,1.061694
2,-0.857930,-0.736484
3,-0.203928,-0.224351
4,-0.857930,-0.736484
...,...,...
6690,1.065418,1.061694
6691,-0.203928,-0.224351
6692,1.065418,1.061694
6693,1.065418,1.061694


In [20]:
np.union1d(
    x_train.source.unique(),
    x_train.destination.unique(),
)

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [21]:
def is_north(x):
    columns=x.columns.to_list()
    north_cities=["Delhi","kokata","Mumbai","New Delhi"]
    return(
        x
        .assign(**{
            f"{col}_is_north":x.loc[:,col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )    

# is_north(location_sd)
FunctionTransformer(func=is_north).fit_transform(location_sd)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,0,1
3,0,0
4,0,1
...,...,...
6690,1,0
6691,0,0
6692,1,0
6693,1,0


In [22]:
location_transformer=FeatureUnion(transformer_list=[
    ("part-1",location_pipe),
    ("part-2",FunctionTransformer(func=is_north))
])

location_transformer.fit_transform(location_sd,y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-0.857930,-0.736484,0,1
1,1.065418,1.061694,1,0
2,-0.857930,-0.736484,0,1
3,-0.203928,-0.224351,0,0
4,-0.857930,-0.736484,0,1
...,...,...,...,...
6690,1.065418,1.061694,1,0
6691,-0.203928,-0.224351,0,0
6692,1.065418,1.061694,1,0
6693,1.065418,1.061694,1,0


#### 1.4 dep_time & arrival_time

In [23]:
x_train.arrival_time

0       19:10:00
1       04:25:00
2       14:35:00
3       18:30:00
4       07:40:00
          ...   
6690    18:50:00
6691    18:30:00
6692    19:00:00
6693    13:25:00
6694    08:55:00
Name: arrival_time, Length: 6695, dtype: object

In [24]:
time_subset=x_train.loc[:,["dep_time","arrival_time"]]
time_subset.head()

Unnamed: 0,dep_time,arrival_time
0,08:55:00,19:10:00
1,17:30:00,04:25:00
2,11:40:00,14:35:00
3,09:25:00,18:30:00
4,22:55:00,07:40:00


In [25]:
time_pipe1=Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour","minute"],format="mixed")),
    ("scaler",MinMaxScaler())
])

time_pipe1.fit_transform(time_subset).head()

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.347826,1.0,0.826087,0.181818
1,0.73913,0.545455,0.173913,0.454545
2,0.478261,0.727273,0.608696,0.636364
3,0.391304,0.454545,0.782609,0.545455
4,0.956522,1.0,0.304348,0.727273


In [26]:
def part_of_day(x,morning=4,noon=12,eve=16,night=20):
    columns=x.columns.to_list()
    x_temp=x.assign(**{
        col:pd.to_datetime(x.loc[:,col],format="mixed").dt.hour
        for col in columns
    })
    return (
        x_temp
        .assign(**{
            f"{col}_part_of_day":np.select(
                [x_temp.loc[:,col].between(morning,noon,inclusive="left"),
                 x_temp.loc[:,col].between(noon,eve,inclusive="left"),
                 x_temp.loc[:,col].between(eve,night,inclusive="left")],
                ["morning","afternoon","evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

FunctionTransformer(func=part_of_day).fit_transform(time_subset).head()

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,morning,evening
1,evening,morning
2,morning,afternoon
3,morning,evening
4,night,morning


In [27]:
time_pipe2=Pipeline(steps=[
    ("part",FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler",MinMaxScaler())
])

time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,1.000000,0.667335
1,0.202685,0.952906
2,1.000000,0.000000
3,1.000000,0.667335
4,0.174101,0.952906
...,...,...
6690,1.000000,0.667335
6691,1.000000,0.667335
6692,0.000000,0.667335
6693,1.000000,0.000000


In [28]:
time_transformer=FeatureUnion(transformer_list=[
    ("part1",time_pipe1),
    ("part2", time_pipe2)
])
time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.347826,1.000000,0.826087,0.181818,1.000000,0.667335
1,0.739130,0.545455,0.173913,0.454545,0.202685,0.952906
2,0.478261,0.727273,0.608696,0.636364,1.000000,0.000000
3,0.391304,0.454545,0.782609,0.545455,1.000000,0.667335
4,0.956522,1.000000,0.304348,0.727273,0.174101,0.952906
...,...,...,...,...,...,...
6690,0.434783,0.818182,0.782609,0.909091,1.000000,0.667335
6691,0.391304,0.454545,0.782609,0.545455,1.000000,0.667335
6692,0.608696,0.000000,0.826087,0.000000,0.000000,0.667335
6693,0.304348,1.000000,0.565217,0.454545,1.000000,0.000000


#### 1.5 duration

In [29]:
class RBFpercentileSimi(BaseEstimator,TransformerMixin):
    def __init__(self, variables=None,percentiles=[0.2,0.5,0.75],gamma=0.1):
        self.variables=variables
        self.percentiles=percentiles
        self.gamma=gamma

    def fit(self,x,y=None):
        if not self.variables:
            self.variables=x.select_dtypes(include="number").columns.to_list()

        self.reference_values_={
            col:(
                x
                .loc[:,col]
                .quantile(self.percentiles)
                .values
                .reshape(-1,1)
            )
            for col in self.variables
        }
        return self    

    def transform(self,x):
        objects=[]
        for col in self.variables:
            columns=[f"{col}_rbf_{int(percentile*100)}" for percentile in self.percentiles]
            obj=pd.DataFrame(
                data=rbf_kernel(x.loc[:,[col]],Y=self.reference_values_[col],gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects,axis=1)

In [30]:
RBFpercentileSimi().fit_transform(x_train)

Unnamed: 0,duration_rbf_20,duration_rbf_50,duration_rbf_75,total_stops_rbf_20,total_stops_rbf_50,total_stops_rbf_75
0,0.000000,0.000000e+00,0.0,0.904837,1.000000,1.000000
1,0.000000,0.000000e+00,0.0,0.904837,1.000000,1.000000
2,0.000045,0.000000e+00,0.0,1.000000,0.904837,0.904837
3,0.000000,6.293989e-54,0.0,0.904837,1.000000,1.000000
4,0.000000,1.691898e-10,0.0,0.904837,1.000000,1.000000
...,...,...,...,...,...,...
6690,0.000000,0.000000e+00,0.0,0.670320,0.904837,0.904837
6691,0.000000,6.293989e-54,0.0,0.904837,1.000000,1.000000
6692,0.000000,0.000000e+00,0.0,0.904837,1.000000,1.000000
6693,0.000000,0.000000e+00,0.0,0.904837,1.000000,1.000000


In [31]:
def duration_category(x,short=180,med=400):
    return(
        x
        .assign(duration_cat=np.select([x.duration.lt(short),
									    x.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
        .drop(columns="duration")
    )

In [32]:
duration_category(x_train.loc[:,["duration"]])

Unnamed: 0,duration_cat
0,long
1,long
2,short
3,long
4,long
...,...
6690,long
6691,long
6692,medium
6693,medium


In [33]:
def is_over(x,value=1000):
    return (
        x
        .assign(**{
            f"duration_over_{value}":x.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

In [34]:
is_over(x_train.loc[:,["duration"]])

Unnamed: 0,duration_over_1000
0,0
1,0
2,0
3,0
4,0
...,...
6690,1
6691,0
6692,0
6693,0


In [35]:
duration_pipe1=Pipeline(steps=[
    ("rbf",RBFpercentileSimi()),
    ("scaler",PowerTransformer())
])

duration_pipe2=Pipeline(steps=[
    ("cat",FunctionTransformer(func=duration_category)),
    ("encoder",OrdinalEncoder(categories=[["short","medium","long"]])),
])

duration_union=FeatureUnion(transformer_list=[
    ("part1",duration_pipe1),
    ("part2",duration_pipe2),
    ("part3",FunctionTransformer(func=is_over)),
    ("part4",StandardScaler())
])

duration_transformer=Pipeline(steps=[
    ("outlier",Winsorizer(capping_method="iqr",fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union",duration_union)
])

duration_transformer.fit_transform(x_train.loc[:,["duration"]])

Unnamed: 0,duration_rbf_20,duration_rbf_50,duration_rbf_75,duration_cat,duration_over_1000,duration
0,-0.340164,-0.107976,-0.107799,2.0,0,-0.033916
1,-0.340164,-0.107976,-0.107799,2.0,0,0.046422
2,-0.335554,-0.107976,-0.107799,0.0,0,-0.917631
3,-0.340164,-0.107976,-0.107799,2.0,0,-0.174507
4,-0.340164,-0.107975,-0.107799,2.0,0,-0.214676
...,...,...,...,...,...,...
6690,-0.340164,-0.107976,-0.107799,2.0,1,2.597145
6691,-0.340164,-0.107976,-0.107799,2.0,0,-0.174507
6692,-0.340164,-0.107976,-0.107799,1.0,0,-0.666576
6693,-0.340164,-0.107976,-0.107799,1.0,0,-0.606322


#### 1.5 total_stops

In [36]:
def is_direct(x):
    return x.assign(direct_flight=x.total_stops.eq(0).astype(int))

total_stops_transformer=Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("",FunctionTransformer(func=is_direct))
])

In [37]:
total_stops_transformer.fit_transform(x_train.loc[:,["total_stops"]])

Unnamed: 0,total_stops,direct_flight
0,1.0,0
1,1.0,0
2,0.0,1
3,1.0,0
4,1.0,0
...,...,...
6690,2.0,0
6691,1.0,0
6692,1.0,0
6693,1.0,0


#### 1.6 additional_info

In [38]:
x_train.additional_info

0       In-flight meal not included
1       In-flight meal not included
2                           No Info
3                           No Info
4       In-flight meal not included
                   ...             
6690                        No Info
6691                        No Info
6692    In-flight meal not included
6693                        No Info
6694                 1 Long layover
Name: additional_info, Length: 6695, dtype: object

In [39]:
info_pipe1=Pipeline(steps=[
    ("group",RareLabelEncoder(tol=0.1,n_categories=2,replace_with="Other")),
    ("encoder",OneHotEncoder(handle_unknown="ignore",sparse_output=False))
])

info_pipe1.fit_transform(x_train.loc[:,["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
...,...,...,...
6690,0.0,1.0,0.0
6691,0.0,1.0,0.0
6692,1.0,0.0,0.0
6693,0.0,1.0,0.0


In [40]:
def have_info(x):
	return x.assign(additional_info=x.additional_info.ne("No Info").astype(int))

In [41]:
info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])

In [42]:
info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

info_transformer.fit_transform(x_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other,additional_info
0,1.0,0.0,0.0,1
1,1.0,0.0,0.0,1
2,0.0,1.0,0.0,0
3,0.0,1.0,0.0,0
4,1.0,0.0,0.0,1
...,...,...,...,...
6690,0.0,1.0,0.0,0
6691,0.0,1.0,0.0,0
6692,1.0,0.0,0.0,1
6693,0.0,1.0,0.0,0


### 2. Column Transformer

In [43]:
column_transformer=ColumnTransformer(transformers=[
    ("air_pipeline", airline_pipe, ["airline"]),
    ("doj_pipeline", doj_pipe,["date_of_journey"]),
    ("location_transformer", location_transformer,["source","destination"]),
    ("time_transformer", time_transformer,["dep_time","arrival_time"]),
    ("duration_transformer", duration_transformer,["duration"]),
    ("stops",total_stops_transformer,["total_stops"]),
    ("info",info_transformer,["additional_info"])
], remainder="passthrough")

In [44]:
column_transformer.fit_transform(x_train,y_train)

Unnamed: 0,air_pipeline__airline_Air India,air_pipeline__airline_Indigo,air_pipeline__airline_Jet Airways,air_pipeline__airline_Multiple Carriers,air_pipeline__airline_Other,doj_pipeline__date_of_journey_month,doj_pipeline__date_of_journey_week,doj_pipeline__date_of_journey_day_of_week,doj_pipeline__date_of_journey_day_of_year,location_transformer__source,location_transformer__destination,location_transformer__source_is_north,location_transformer__destination_is_north,time_transformer__dep_time_hour,time_transformer__dep_time_minute,time_transformer__arrival_time_hour,time_transformer__arrival_time_minute,time_transformer__dep_time_part_of_day,time_transformer__arrival_time_part_of_day,duration_transformer__duration_rbf_20,duration_transformer__duration_rbf_50,duration_transformer__duration_rbf_75,duration_transformer__duration_cat,duration_transformer__duration_over_1000,duration_transformer__duration,stops__total_stops,stops__direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_Other,info__additional_info
0,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,0.500000,0.169492,-0.857930,-0.736484,0,1,0.347826,1.000000,0.826087,0.181818,1.000000,0.667335,-0.340164,-0.107976,-0.107799,2.0,0,-0.033916,1.0,0,1.0,0.0,0.0,1
1,0.0,0.0,1.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,1.065418,1.061694,1,0,0.739130,0.545455,0.173913,0.454545,0.202685,0.952906,-0.340164,-0.107976,-0.107799,2.0,0,0.046422,1.0,0,1.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,1.0,0.000000,0.058824,0.833333,0.067797,-0.857930,-0.736484,0,1,0.478261,0.727273,0.608696,0.636364,1.000000,0.000000,-0.335554,-0.107976,-0.107799,0.0,0,-0.917631,0.0,1,0.0,1.0,0.0,0
3,1.0,0.0,0.0,0.0,0.0,1.000000,0.882353,0.333333,0.872881,-0.203928,-0.224351,0,0,0.391304,0.454545,0.782609,0.545455,1.000000,0.667335,-0.340164,-0.107976,-0.107799,2.0,0,-0.174507,1.0,0,0.0,1.0,0.0,0
4,0.0,0.0,1.0,0.0,0.0,0.000000,0.117647,0.166667,0.093220,-0.857930,-0.736484,0,1,0.956522,1.000000,0.304348,0.727273,0.174101,0.952906,-0.340164,-0.107975,-0.107799,2.0,0,-0.214676,1.0,0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,0.500000,0.169492,1.065418,1.061694,1,0,0.434783,0.818182,0.782609,0.909091,1.000000,0.667335,-0.340164,-0.107976,-0.107799,2.0,1,2.597145,2.0,0,0.0,1.0,0.0,0
6691,1.0,0.0,0.0,0.0,0.0,0.666667,0.529412,0.333333,0.516949,-0.203928,-0.224351,0,0,0.391304,0.454545,0.782609,0.545455,1.000000,0.667335,-0.340164,-0.107976,-0.107799,2.0,0,-0.174507,1.0,0,0.0,1.0,0.0,0
6692,0.0,0.0,1.0,0.0,0.0,1.000000,0.764706,0.833333,0.779661,1.065418,1.061694,1,0,0.608696,0.000000,0.826087,0.000000,0.000000,0.667335,-0.340164,-0.107976,-0.107799,1.0,0,-0.666576,1.0,0,1.0,0.0,0.0,1
6693,0.0,0.0,0.0,0.0,1.0,1.000000,1.000000,0.000000,0.974576,1.065418,1.061694,1,0,0.304348,1.000000,0.565217,0.454545,1.000000,0.000000,-0.340164,-0.107976,-0.107799,1.0,0,-0.606322,1.0,0,0.0,1.0,0.0,0


In [45]:
from sklearn import set_config
set_config(display="diagram")

### 5. Feature Selection

In [46]:
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
) 

In [47]:
preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])

preprocessor.fit(x_train, y_train)

In [48]:
preprocessor.fit_transform(x_train,y_train)

Unnamed: 0,air_pipeline__airline_Indigo,air_pipeline__airline_Jet Airways,air_pipeline__airline_Other,doj_pipeline__date_of_journey_week,doj_pipeline__date_of_journey_day_of_year,location_transformer__source,location_transformer__destination,duration_transformer__duration_rbf_20,duration_transformer__duration_cat,duration_transformer__duration_over_1000,duration_transformer__duration,stops__total_stops,stops__direct_flight
0,0.0,1.0,0.0,0.176471,0.169492,-0.857930,-0.736484,-0.340164,2.0,0,-0.033916,1.0,0
1,0.0,1.0,0.0,0.235294,0.220339,1.065418,1.061694,-0.340164,2.0,0,0.046422,1.0,0
2,0.0,0.0,1.0,0.058824,0.067797,-0.857930,-0.736484,-0.335554,0.0,0,-0.917631,0.0,1
3,0.0,0.0,0.0,0.882353,0.872881,-0.203928,-0.224351,-0.340164,2.0,0,-0.174507,1.0,0
4,0.0,1.0,0.0,0.117647,0.093220,-0.857930,-0.736484,-0.340164,2.0,0,-0.214676,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,1.0,0.0,0.176471,0.169492,1.065418,1.061694,-0.340164,2.0,1,2.597145,2.0,0
6691,0.0,0.0,0.0,0.529412,0.516949,-0.203928,-0.224351,-0.340164,2.0,0,-0.174507,1.0,0
6692,0.0,1.0,0.0,0.764706,0.779661,1.065418,1.061694,-0.340164,1.0,0,-0.666576,1.0,0
6693,0.0,0.0,1.0,1.000000,0.974576,1.065418,1.061694,-0.340164,1.0,0,-0.606322,1.0,0


In [49]:
feature_performances = preprocessor.named_steps["selector"].feature_performance_
feature_performances

{'air_pipeline__airline_Air India': np.float64(0.0016858488187393068),
 'air_pipeline__airline_Indigo': np.float64(0.12827620333701603),
 'air_pipeline__airline_Jet Airways': np.float64(0.19341654864699923),
 'air_pipeline__airline_Multiple Carriers': np.float64(0.01885367626900632),
 'air_pipeline__airline_Other': np.float64(0.11819934849670959),
 'doj_pipeline__date_of_journey_month': np.float64(0.08907386829479753),
 'doj_pipeline__date_of_journey_week': np.float64(0.18569991444411282),
 'doj_pipeline__date_of_journey_day_of_week': np.float64(0.004664789869158821),
 'doj_pipeline__date_of_journey_day_of_year': np.float64(0.22920092220662194),
 'location_transformer__source': np.float64(0.1267120700518518),
 'location_transformer__destination': np.float64(0.13064205945980614),
 'location_transformer__source_is_north': np.float64(0.019103940953350513),
 'location_transformer__destination_is_north': np.float64(0.009833075378672654),
 'time_transformer__dep_time_hour': np.float64(0.0075

In [50]:
train_data.total_stops.unique()

array([1., 0., 2., 3.])

In [55]:
train_data["airline"].unique()

array(['Jet Airways', 'Goair', 'Air India', 'Spicejet',
       'Multiple Carriers', 'Indigo', 'Vistara', 'Air Asia', 'Trujet'],
      dtype=object)

In [51]:
project_dir=r"C:\Users\LENOVO\OneDrive\Desktop\Flight_price_predict_project"
data_dir="DataSets"

In [54]:
file_name="train1_data_new.csv"
file_path=os.path.join(project_dir, data_dir, file_name)

In [56]:
train_data.to_csv(file_path,index=False)