# Import the packages

In [1]:
import pandas as pd
import numpy as np
from sklearn import set_config,get_config
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from feature_engine.datetime import DatetimeFeatures

from sklearn.preprocessing import OneHotEncoder
from feature_engine.encoding import RareLabelEncoder
from sklearn.pipeline import FunctionTransformer

from feature_engine.discretisation import ArbitraryDiscretiser

import category_encoders as ce

from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from feature_engine.outliers import Winsorizer

import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.metrics.pairwise import rbf_kernel

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# get the config

get_config()

{'assume_finite': False,
 'working_memory': 1024,
 'print_changed_only': True,
 'display': 'diagram',
 'pairwise_dist_chunk_size': 256,
 'enable_cython_pairwise_dist': True,
 'array_api_dispatch': False,
 'transform_output': 'default',
 'enable_metadata_routing': False,
 'skip_parameter_validation': False}

In [4]:
set_config(transform_output='pandas')

In [5]:
get_config()['transform_output']

'pandas'

# Load and split the data

In [6]:
# load the data

data_path = "../data/cleaned_data.csv"

df = pd.read_csv(data_path)

df.head()

Unnamed: 0,airline,source,destination,departure_time,arrival_time,duration,total_stops,price,day_of_journey,month_of_journey
0,Indigo,Banglore,Delhi,22:20:00,01:10:00,170,0,3897,24,3
1,Air_India,Kolkata,Banglore,05:50:00,13:15:00,445,2,7662,1,5
2,Jet_Airways,Delhi,Cochin,09:25:00,04:25:00,1140,2,13882,9,6
3,Indigo,Kolkata,Banglore,18:05:00,23:30:00,325,1,6218,12,5
4,Indigo,Banglore,Delhi,16:50:00,21:35:00,285,1,13302,1,3


In [7]:
# check for missing values in the data

df.isna().sum().sum()

0

**There are no missing values in the data**

In [8]:
# check for duplicate rows in data

df.duplicated().sum()

2

In [9]:
# check rows which have duplicate values

(
    df
    .loc[
    df.duplicated(keep=False),:
    ]
)

Unnamed: 0,airline,source,destination,departure_time,arrival_time,duration,total_stops,price,day_of_journey,month_of_journey
1020,Air_Asia,Banglore,Delhi,23:25:00,02:10:00,165,0,4482,24,3
5309,Vistara,Banglore,Delhi,21:10:00,00:05:00,175,0,7608,3,3
6720,Vistara,Banglore,Delhi,21:10:00,00:05:00,175,0,7608,3,3
8595,Air_Asia,Banglore,Delhi,23:25:00,02:10:00,165,0,4482,24,3


In [10]:
# drop the rows which have duplicate values

def drop_duplicate_rows(df:pd.DataFrame):
    return (
        df.drop_duplicates()
    )

df = drop_duplicate_rows(df)

# check for duplicates

df.duplicated().sum()

0

In [11]:
# data types of df

df.dtypes

airline             object
source              object
destination         object
departure_time      object
arrival_time        object
duration             int64
total_stops          int64
price                int64
day_of_journey       int64
month_of_journey     int64
dtype: object

In [12]:
# split the data into train and test splits

train_data, test_data = train_test_split(df,test_size=0.2,random_state=42)

print('The shape of the train data is ',train_data.shape)
print('The shape of the test data is ',test_data.shape)

The shape of the train data is  (8368, 10)
The shape of the test data is  (2092, 10)


# Column wise transformations

## Airline

In [13]:
(
    train_data['airline']
    .value_counts(normalize=True)
)

airline
Jet_Airways          0.354924
Indigo               0.194551
Air_India            0.163121
Multiple_Carriers    0.114006
Spicejet             0.075645
Vistara              0.048040
Air_Asia             0.030832
Goair                0.018881
Name: proportion, dtype: float64

In [14]:
airline_pipe = Pipeline(steps=[
    ('rare_cat',RareLabelEncoder(tol=0.1,n_categories=3,replace_with='Others')),
    ('encode',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

airline_pipe.fit_transform(train_data.loc[:,['airline']])

Unnamed: 0,airline_Air_India,airline_Indigo,airline_Jet_Airways,airline_Multiple_Carriers,airline_Others
3457,1.0,0.0,0.0,0.0,0.0
611,1.0,0.0,0.0,0.0,0.0
6553,1.0,0.0,0.0,0.0,0.0
4521,1.0,0.0,0.0,0.0,0.0
511,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
5734,0.0,1.0,0.0,0.0,0.0
5191,0.0,0.0,1.0,0.0,0.0
5390,0.0,1.0,0.0,0.0,0.0
860,0.0,1.0,0.0,0.0,0.0


In [15]:
airline_pipe.named_steps['rare_cat'].encoder_dict_

{'airline': ['Jet_Airways', 'Indigo', 'Air_India', 'Multiple_Carriers']}

In [16]:
airline_pipe

## Departure and Arrival times

In [17]:
time_subset = train_data.loc[:,['arrival_time','departure_time']]

In [18]:
time_subset

Unnamed: 0,arrival_time,departure_time
3457,19:15:00,12:20:00
611,19:15:00,23:00:00
6553,17:10:00,14:10:00
4521,07:40:00,06:50:00
511,10:15:00,08:50:00
...,...,...
5734,11:20:00,08:30:00
5191,21:20:00,07:00:00
5390,15:35:00,10:35:00
860,08:50:00,05:35:00


In [19]:
times_pipe1 = Pipeline(steps=[
    ('dt',DatetimeFeatures(features_to_extract=['hour','minute'],format='mixed')),
    ('scale',MinMaxScaler())
])


times_pipe1.fit_transform(time_subset)

Unnamed: 0,arrival_time_hour,arrival_time_minute,departure_time_hour,departure_time_minute
3457,0.826087,0.272727,0.521739,0.363636
611,0.826087,0.272727,1.000000,0.000000
6553,0.739130,0.181818,0.608696,0.181818
4521,0.304348,0.727273,0.260870,0.909091
511,0.434783,0.272727,0.347826,0.909091
...,...,...,...,...
5734,0.478261,0.363636,0.347826,0.545455
5191,0.913043,0.363636,0.304348,0.000000
5390,0.652174,0.636364,0.434783,0.636364
860,0.347826,0.909091,0.217391,0.636364


In [20]:
times_pipe1

In [21]:
time_subset

Unnamed: 0,arrival_time,departure_time
3457,19:15:00,12:20:00
611,19:15:00,23:00:00
6553,17:10:00,14:10:00
4521,07:40:00,06:50:00
511,10:15:00,08:50:00
...,...,...
5734,11:20:00,08:30:00
5191,21:20:00,07:00:00
5390,15:35:00,10:35:00
860,08:50:00,05:35:00


In [22]:
def binning(dataframe,morning=6,noon=12,evening=17,night=20):
    columns = dataframe.columns.to_list()
    X_temp = (
        dataframe
        .assign(**{
            col : pd.to_datetime(dataframe.loc[:,col],format='mixed').dt.hour
            for col in columns
        })
    )
    return (
        X_temp
        .assign(**{
            f'{col}_part_of_day': np.select(condlist=[
                X_temp.loc[:,col].between(morning,noon,inclusive='left'),
                X_temp.loc[:,col].between(noon,evening,inclusive='left'),
                X_temp.loc[:,col].between(evening,night,inclusive='left')
            ],
                          choicelist=[
                              "morning",
                              'noon',
                              'evening'
                          ],
                          default='night')
            for col in columns
        }
        )
        .drop(columns=columns)
    )

FunctionTransformer(binning).fit_transform(time_subset)

Unnamed: 0,arrival_time_part_of_day,departure_time_part_of_day
3457,evening,noon
611,evening,night
6553,evening,noon
4521,morning,morning
511,morning,morning
...,...,...
5734,morning,morning
5191,night,morning
5390,noon,morning
860,morning,night


In [23]:
times_pipe2 = Pipeline(steps=[
    ('binning',FunctionTransformer(func=binning)),
    ('count_encoding',ce.CountEncoder()),
    ('scaling',MinMaxScaler())
])

times_pipe2.fit_transform(time_subset)

Unnamed: 0,arrival_time_part_of_day,departure_time_part_of_day
3457,0.130982,0.062561
611,0.130982,0.405670
6553,0.130982,0.062561
4521,0.115869,1.000000
511,0.115869,1.000000
...,...,...
5734,0.115869,1.000000
5191,1.000000,1.000000
5390,0.000000,1.000000
860,0.115869,0.405670


In [24]:
times_pipe2

In [25]:
time_union = FeatureUnion(transformer_list=[
    ('pipe_1',times_pipe1),
    ('pipe_2',times_pipe2)
],n_jobs=-1)

time_union.fit_transform(time_subset)

Unnamed: 0,arrival_time_hour,arrival_time_minute,departure_time_hour,departure_time_minute,arrival_time_part_of_day,departure_time_part_of_day
3457,0.826087,0.272727,0.521739,0.363636,0.130982,0.062561
611,0.826087,0.272727,1.000000,0.000000,0.130982,0.405670
6553,0.739130,0.181818,0.608696,0.181818,0.130982,0.062561
4521,0.304348,0.727273,0.260870,0.909091,0.115869,1.000000
511,0.434783,0.272727,0.347826,0.909091,0.115869,1.000000
...,...,...,...,...,...,...
5734,0.478261,0.363636,0.347826,0.545455,0.115869,1.000000
5191,0.913043,0.363636,0.304348,0.000000,1.000000,1.000000
5390,0.652174,0.636364,0.434783,0.636364,0.000000,1.000000
860,0.347826,0.909091,0.217391,0.636364,0.115869,0.405670


In [26]:
time_union

## Source and Destination

In [27]:
location_subset = train_data.loc[:,['source','destination']]

location_subset

Unnamed: 0,source,destination
3457,Delhi,Cochin
611,Delhi,Cochin
6553,Delhi,Cochin
4521,Delhi,Cochin
511,Mumbai,Hyderabad
...,...,...
5734,Banglore,Delhi
5191,Banglore,Delhi
5390,Delhi,Cochin
860,Delhi,Cochin


In [28]:
(
    location_subset
    .loc[:,['source']]
    .value_counts(normalize=True)
)

source  
Delhi       0.413360
Kolkata     0.275813
Banglore    0.207576
Mumbai      0.066444
Chennai     0.036807
Name: proportion, dtype: float64

In [29]:
(
    location_subset
    .loc[:,['destination']]
    .value_counts(normalize=True)
)

destination
Cochin         0.413360
Banglore       0.275813
Delhi          0.207576
Hyderabad      0.066444
Kolkata        0.036807
Name: proportion, dtype: float64

In [30]:
loc_pipe1 = Pipeline(steps=[
    ('rare_encode',RareLabelEncoder(tol=0.2,n_categories=3,replace_with='Other')),
    ('mean_encode',TargetEncoder(target_type='continuous',smooth=0.1,random_state=30)),
    ('power_transform',PowerTransformer())
])

loc_pipe1

In [31]:
loc_pipe1.fit_transform(location_subset,train_data.loc[:,'price'])

Unnamed: 0,source,destination
3457,1.055535,1.055535
611,0.984361,0.984361
6553,1.055535,1.055535
4521,1.051186,1.051186
511,-1.894648,-1.894648
...,...,...
5734,-0.945825,-0.945825
5191,-0.889059,-0.889059
5390,1.055535,1.055535
860,1.086759,1.086759


In [32]:
def is_north_city(df):
    columns = df.columns.to_list()
    north_cities = ['Delhi','Kolkata']

    return (
        df
        .assign(**{
            f'{col}_is_north' : df.loc[:,col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )
    
is_north_city(location_subset)

Unnamed: 0,source_is_north,destination_is_north
3457,1,0
611,1,0
6553,1,0
4521,1,0
511,0,0
...,...,...
5734,0,1
5191,0,1
5390,1,0
860,1,0


In [33]:
location_union = FeatureUnion(transformer_list=[
    ('pipe_1',loc_pipe1),
    ('pipe_2',FunctionTransformer(is_north_city))
],n_jobs=-1)

In [34]:
location_union

In [35]:
location_union.fit_transform(location_subset,train_data.loc[:,'price'])

Unnamed: 0,source,destination,source_is_north,destination_is_north
3457,1.055535,1.055535,1,0
611,0.984361,0.984361,1,0
6553,1.055535,1.055535,1,0
4521,1.051186,1.051186,1,0
511,-1.894648,-1.894648,0,0
...,...,...,...,...
5734,-0.945825,-0.945825,0,1
5191,-0.889059,-0.889059,0,1
5390,1.055535,1.055535,1,0
860,1.086759,1.086759,1,0


## Duration

In [36]:
(
    train_data
    .loc[:,'duration']
)

3457    1855
611     1215
6553     180
4521    1490
511       85
        ... 
5734     170
5191     860
5390     300
860      195
7271    1215
Name: duration, Length: 8368, dtype: int64

In [37]:
(
    train_data
    .loc[:,['duration']]
    .quantile([0.25,0.5,0.75])
    .values
    .shape
)

(3, 1)

In [38]:
class RbfSimilarity(TransformerMixin,BaseEstimator,OneToOneFeatureMixin):

    def __init__(self,variables,percentiles,gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma
        
    def fit(self,X,y=None):
        self.quantile_values_ = {
            col : (
                X.loc[:,[col]]
                .quantile(self.percentiles)
                .values
            )
            for col in self.variables
        }
        
        return self

    def transform(self,X):
        objects = []
        for col in self.variables:
            columns = [f'{col}_rbf_{int(percentile*100)}' for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:,[col]],Y=self.quantile_values_[col],gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects,axis=1)

In [39]:
rbf = RbfSimilarity(variables=['duration'],percentiles=[0.25,0.5,0.75])
rbf

In [40]:
rbf.fit(train_data.loc[:,['duration']])

In [41]:
rbf.quantile_values_['duration']

array([[170.],
       [510.],
       [925.]])

In [42]:
rbf.transform(train_data.loc[:,['duration']])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75
0,0.000000e+00,0.0,0.000000e+00
1,0.000000e+00,0.0,0.000000e+00
2,4.539993e-05,0.0,0.000000e+00
3,0.000000e+00,0.0,0.000000e+00
4,1.668157e-314,0.0,0.000000e+00
...,...,...,...
8363,1.000000e+00,0.0,0.000000e+00
8364,0.000000e+00,0.0,3.240271e-184
8365,0.000000e+00,0.0,0.000000e+00
8366,7.187782e-28,0.0,0.000000e+00


In [43]:
# #duration_pipe = Pipeline(steps=[
#     #('rbf_similarity',RbfSimilarity(variables=['duration'],percentiles=[0.25,0.5,0.75])),
#     #('transform',PowerTransformer())
# ])

# #duration_pipe.fit_transform(train_data.loc[:,['duration']])

In [45]:
#duration_pipe

In [46]:
duration_union = FeatureUnion(transformer_list=[
    ('power_transform',PowerTransformer()),
    ('scaling',StandardScaler())
])

duration_union

In [53]:
train_data.columns

Index(['airline', 'source', 'destination', 'departure_time', 'arrival_time',
       'duration', 'total_stops', 'price', 'day_of_journey',
       'month_of_journey'],
      dtype='object')

## Total Stops

In [57]:
def no_stops(X):
    columns_list = X.columns.to_list()

    return (
        X
        .assign(**{
            col : np.where(X[col]==0,1,0)
            for col in columns_list
        })
    )

no_stops(train_data.loc[:,['total_stops']]).value_counts()

total_stops
0              5612
1              2756
Name: count, dtype: int64

In [56]:
(
    train_data
    .loc[:,['total_stops']]
    .value_counts()
)

total_stops
1              4506
0              2756
2              1074
3                32
Name: count, dtype: int64

# Column Transformers

In [47]:
train_data.columns

Index(['airline', 'source', 'destination', 'departure_time', 'arrival_time',
       'duration', 'total_stops', 'price', 'day_of_journey',
       'month_of_journey'],
      dtype='object')

In [58]:
preprocessor = ColumnTransformer(transformers=[
    ('airline',airline_pipe,['airline']),
    ('journey',MinMaxScaler(),['day_of_journey','month_of_journey']),
    ('time',time_union,['arrival_time','departure_time']),
    ('location',location_union,['source','destination']),
    ('duration',duration_union,['duration']),
    ('total_stops',FunctionTransformer(no_stops),['total_stops'])
],n_jobs=-1,verbose_feature_names_out=False)

preprocessor

In [59]:
y_train = train_data.loc[:,'price']

In [60]:
preprocessor.fit_transform(train_data,y_train)

Unnamed: 0,airline_Air_India,airline_Indigo,airline_Jet_Airways,airline_Multiple_Carriers,airline_Others,day_of_journey,month_of_journey,arrival_time_hour,arrival_time_minute,departure_time_hour,departure_time_minute,arrival_time_part_of_day,departure_time_part_of_day,source,destination,source_is_north,destination_is_north,power_transform__duration,scaling__duration,total_stops
3457,1.0,0.0,0.0,0.0,0.0,1.000000,0.666667,0.826087,0.272727,0.521739,0.363636,0.130982,0.062561,1.055535,1.055535,1,0,1.678467,2.424604,0
611,1.0,0.0,0.0,0.0,0.0,0.769231,0.000000,0.826087,0.272727,1.000000,0.000000,0.130982,0.405670,0.984361,0.984361,1,0,1.138053,1.151704,0
6553,1.0,0.0,0.0,0.0,0.0,1.000000,0.333333,0.739130,0.181818,0.608696,0.181818,0.130982,0.062561,1.055535,1.055535,1,0,-0.998730,-0.906813,1
4521,1.0,0.0,0.0,0.0,0.0,0.538462,1.000000,0.304348,0.727273,0.260870,0.909091,0.115869,1.000000,1.051186,1.051186,1,0,1.395373,1.698654,0
511,0.0,0.0,0.0,0.0,1.0,0.307692,0.000000,0.434783,0.272727,0.347826,0.909091,0.115869,1.000000,-1.894648,-1.894648,0,0,-1.715677,-1.095759,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.0,1.0,0.0,0.0,0.0,0.076923,1.000000,0.478261,0.363636,0.347826,0.545455,0.115869,1.000000,-0.945825,-0.945825,0,1,-1.055650,-0.926702,1
5191,0.0,0.0,1.0,0.0,0.0,0.769231,0.000000,0.913043,0.363636,0.304348,0.000000,1.000000,1.000000,-0.889059,-0.889059,0,1,0.715703,0.445643,0
5390,0.0,1.0,0.0,0.0,0.0,0.076923,0.333333,0.652174,0.636364,0.434783,0.636364,0.000000,1.000000,1.055535,1.055535,1,0,-0.472645,-0.668145,0
860,0.0,1.0,0.0,0.0,0.0,1.000000,0.333333,0.347826,0.909091,0.217391,0.636364,0.115869,0.405670,1.086759,1.086759,1,0,-0.918372,-0.876980,1


In [61]:
preprocessor_pipe = Pipeline(steps=[
    ('remove_outliers',Winsorizer(capping_method='iqr',fold=1.5,tail='both',variables=['duration'])),
    ('preprocessor',preprocessor)
])

preprocessor_pipe

In [62]:
preprocessor_pipe.fit_transform(train_data,y_train)

Unnamed: 0,airline_Air_India,airline_Indigo,airline_Jet_Airways,airline_Multiple_Carriers,airline_Others,day_of_journey,month_of_journey,arrival_time_hour,arrival_time_minute,departure_time_hour,departure_time_minute,arrival_time_part_of_day,departure_time_part_of_day,source,destination,source_is_north,destination_is_north,power_transform__duration,scaling__duration,total_stops
3457,1.0,0.0,0.0,0.0,0.0,1.000000,0.666667,0.826087,0.272727,0.521739,0.363636,0.130982,0.062561,1.055535,1.055535,1,0,1.686381,2.450037,0
611,1.0,0.0,0.0,0.0,0.0,0.769231,0.000000,0.826087,0.272727,1.000000,0.000000,0.130982,0.405670,0.984361,0.984361,1,0,1.141704,1.165328,0
6553,1.0,0.0,0.0,0.0,0.0,1.000000,0.333333,0.739130,0.181818,0.608696,0.181818,0.130982,0.062561,1.055535,1.055535,1,0,-0.999708,-0.912288,1
4521,1.0,0.0,0.0,0.0,0.0,0.538462,1.000000,0.304348,0.727273,0.260870,0.909091,0.115869,1.000000,1.051186,1.051186,1,0,1.400909,1.717352,0
511,0.0,0.0,0.0,0.0,1.0,0.307692,0.000000,0.434783,0.272727,0.347826,0.909091,0.115869,1.000000,-1.894648,-1.894648,0,0,-1.713324,-1.102987,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.0,1.0,0.0,0.0,0.0,0.076923,1.000000,0.478261,0.363636,0.347826,0.545455,0.115869,1.000000,-0.945825,-0.945825,0,1,-1.056459,-0.932361,1
5191,0.0,0.0,1.0,0.0,0.0,0.769231,0.000000,0.913043,0.363636,0.304348,0.000000,1.000000,1.000000,-0.889059,-0.889059,0,1,0.716846,0.452716,0
5390,0.0,1.0,0.0,0.0,0.0,0.076923,0.333333,0.652174,0.636364,0.434783,0.636364,0.000000,1.000000,1.055535,1.055535,1,0,-0.474420,-0.671405,0
860,0.0,1.0,0.0,0.0,0.0,1.000000,0.333333,0.347826,0.909091,0.217391,0.636364,0.115869,0.405670,1.086759,1.086759,1,0,-0.919558,-0.882177,1
