In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression

In [3]:
# Custom selector to extract columns for transformation
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks out a single column of ``X``.

    Parameters
    ----------
    key : hashable
        Label of the column to select.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed with the one-element list ``[key]``.

        Indexing with a list (rather than a bare label) keeps the selection
        two-dimensional when ``X`` is a pandas DataFrame.
        """
        wanted_columns = [self.key]
        return X[wanted_columns]

# Custom selector for boolean columns (can be treated directly as numeric with scaling)
class BoolSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one column of ``X`` and casts it to float.

    Parameters
    ----------
    key : hashable
        Label of the column to select.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[key]]`` converted with ``astype(float)``."""
        selected = X[[self.key]]
        as_float = selected.astype(float)
        return as_float
    
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a single column of ``X`` and optionally cast it to ``dtype``.

    Parameters
    ----------
    key : hashable
        Label of the column to select.
    dtype : optional
        Anything accepted by ``astype`` (e.g. ``'str'``, ``float``).
        ``None`` (the default) leaves the column's dtype unchanged.
    """

    def __init__(self, key, dtype=None):
        self.key = key
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[key]]``, cast to ``dtype`` when one was given.

        Indexing with a one-element list keeps the result two-dimensional
        when ``X`` is a pandas DataFrame.
        """
        selected = X[[self.key]]
        # Compare against None explicitly: a truthiness test would silently
        # skip the cast for dtype specifications that evaluate as falsy
        # (NumPy dtype objects without fields report a length of 0).
        if self.dtype is not None:
            return selected.astype(self.dtype)
        return selected

class UniqueAirlines(BaseEstimator, TransformerMixin):
    """Normalise '||'-delimited strings to their sorted, distinct parts.

    Each string is split on ``'||'``, duplicates are removed, the remaining
    parts are sorted and re-joined with ``'||'``; for example
    ``'Delta||Delta'`` becomes ``'Delta'`` and ``'United||Alaska Airlines'``
    becomes ``'Alaska Airlines||United'``.

    Accepted inputs are a pandas Series, a single-column DataFrame, an
    ``(n, 1)`` NumPy array, or any 1-D input ``pd.Series`` can wrap.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    @staticmethod
    def _dedupe(value):
        """Return ``value`` with repeated '||'-separated parts collapsed."""
        if not isinstance(value, str):
            # Missing or non-string entries (e.g. NaN) have no .split();
            # pass them through unchanged instead of raising.
            return value
        return '||'.join(sorted(set(value.split('||'))))

    def transform(self, X):
        """Return the normalised values as an object array of shape ``(n, 1)``.

        Raises
        ------
        ValueError
            If ``X`` is a DataFrame or 2-D array with more than one column.
        """
        if isinstance(X, pd.Series):
            column = X
        elif isinstance(X, pd.DataFrame):
            # A preceding selector step hands over a one-column DataFrame,
            # which pd.Series() cannot wrap directly.
            if X.shape[1] != 1:
                raise ValueError(
                    "UniqueAirlines expects exactly one column, got %d" % X.shape[1]
                )
            column = X.iloc[:, 0]
        elif isinstance(X, np.ndarray) and X.ndim == 2:
            # Column vectors produced by other transformers have shape (n, 1).
            if X.shape[1] != 1:
                raise ValueError(
                    "UniqueAirlines expects exactly one column, got %d" % X.shape[1]
                )
            column = pd.Series(X[:, 0])
        else:
            column = pd.Series(X)

        transformed = column.apply(self._dedupe)
        # Downstream estimators expect a 2-D (n_samples, 1) array.
        return transformed.to_numpy(dtype=object).reshape(-1, 1)


In [4]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None


In [5]:
# Load the training data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
dataset = pd.read_pickle('train.pkl')
# Preview the first rows to confirm the available columns.
dataset.head()

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,fareBasisCode,travelDuration,elapsedDays,isBasicEconomy,isRefundable,isNonStop,baseFare,totalFare,seatsRemaining,totalTravelDistance,segmentsDepartureTimeEpochSeconds,segmentsDepartureTimeRaw,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode,days_until_flight,days_since_last_search,fare_change,seats_change,weighted_fare_change,weighted_seats_change,is_search_weekend,is_flight_weekend,Multiple_Carriers,distinct_airlines,combination_code,journeyStartTime,searchDayName,flightDayName,searchDay,flightDay
84572,20dc57dc714100c3acf0fc3400df937c,2022-06-24,2022-06-25,SFO,LGA,MA0QA0MQ,PT12H19M,1.0,0,0,0,714.42,797.2,2.0,2897.0,1656217800||1656253800,2022-06-25T21:30:00.000-07:00||2022-06-26T10:3...,1656234540||1656262140,2022-06-26T05:09:00.000-04:00||2022-06-26T12:4...,ATL||LGA,SFO||ATL,Delta||Delta,DL||DL,Boeing 737-900||Airbus A320,16740||8340,2135||762,coach||coach,1,9.0,44.0,-7.0,4.888889,-0.777778,0,1,0,{Delta},4,21.5,Friday,Saturday,4,5
21171,1eb522898e3c567b7d62a9d8d7e61152,2022-05-08,2022-06-23,SFO,BOS,UAA0OHEN,PT5H37M,1.0,0,0,1,524.65,578.6,9.0,2698.0,1656046020,2022-06-23T21:47:00.000-07:00,1656066240,2022-06-24T06:24:00.000-04:00,BOS,SFO,United,UA,Boeing 737 MAX 9,20220,2698,coach,46,1.0,0.0,0.0,0.0,0.0,1,0,0,{United},10,21.783333,Sunday,Thursday,6,3
47459,86f3b8bf39cfea9b09a0e46fa75d6e16,2022-07-06,2022-07-31,SFO,EWR,KH0OASMN,PT5H50M,0.0,0,0,1,478.14,528.6,7.0,2566.0,1659291000,2022-07-31T11:10:00.000-07:00,1659312000,2022-07-31T20:00:00.000-04:00,EWR,SFO,Alaska Airlines,AS,Boeing 737-900,21000,2566,coach,25,1.0,-100.0,0.0,-100.0,0.0,0,1,0,{Alaska Airlines},1,11.166667,Wednesday,Sunday,2,6
106870,e0eb783f205cbc64edaa1f186ada7bba,2022-08-24,2022-09-17,SFO,CLT,SAA2AFEN,PT10H21M,0.0,0,0,0,188.84,226.6,9.0,3111.0,1663423200||1663452840,2022-09-17T07:00:00.000-07:00||2022-09-17T18:1...,1663442460||1663460460,2022-09-17T15:21:00.000-04:00||2022-09-17T20:2...,EWR||CLT,SFO||EWR,United||United,UA||UA,BOEING 777-300ER||Boeing 737-900,19260||7620,2566||545,coach||coach,24,1.0,0.0,0.0,0.0,0.0,0,1,0,{United},10,7.0,Wednesday,Saturday,2,5
51925,65e124bd8ce6db3fafbf41c3f554caef,2022-04-26,2022-05-06,SFO,ORD,VAA7OKEN,PT4H15M,1.0,0,0,1,282.79,318.6,1.0,1847.0,1651886700,2022-05-06T18:25:00.000-07:00,1651902000,2022-05-07T00:40:00.000-05:00,ORD,SFO,United,UA,Boeing 737-900,15300,1847,coach,10,1.0,-24.0,-8.0,-24.0,-8.0,0,0,0,{United},10,18.416667,Tuesday,Friday,1,4


In [6]:
# Load the held-out test data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
test_dataset = pd.read_pickle('test.pkl')

In [7]:
# Peek at the airline-name column; as the output shows, multi-segment
# itineraries are stored as names joined with '||' (e.g. 'Delta||Delta').
dataset['segmentsAirlineName']

84572                             Delta||Delta
21171                                   United
47459                          Alaska Airlines
106870                          United||United
51925                                   United
                          ...                 
101163                            Delta||Delta
62181     American Airlines||American Airlines
162717                 Alaska Airlines||United
72390                           United||United
45470                                   United
Name: segmentsAirlineName, Length: 4409182, dtype: object

In [8]:
# Columns fed to the model, and the column it should predict.
features = [
    "days_until_flight",
    "is_search_weekend",
    "is_flight_weekend",
    "searchDayName",
    "flightDayName",
    "isBasicEconomy",
    "segmentsAirlineName",
    "destinationAirport",
]
target = "totalFare"

# Apply the same feature/target selection to both splits.
X_train, y_train = dataset[features], dataset[target]
X_test, y_test = test_dataset[features], test_dataset[target]

In [9]:
# Check the dtype of each selected feature column before building the pipelines.
X_train.dtypes

days_until_flight       int64
is_search_weekend       int64
is_flight_weekend       int64
searchDayName          object
flightDayName          object
isBasicEconomy          int64
segmentsAirlineName    object
destinationAirport     object
dtype: object

In [10]:
# Pipeline Setup for Each Feature
def _scaled_pipe(column):
    """Build a pipeline that selects ``column`` and standardises it."""
    return Pipeline([
        ('selector', ColumnSelector(key=column)),
        ('standard', StandardScaler())
    ])


def _one_hot_pipe(column, dtype=None):
    """Build a pipeline that selects ``column`` (optionally cast) and one-hot encodes it."""
    return Pipeline([
        ('selector', ColumnSelector(key=column, dtype=dtype)),
        # handle_unknown='ignore': a category that never appeared in the
        # training data is encoded as all zeros at predict time instead of
        # raising an error. Fitting on the training data is unaffected.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])


# Numeric and 0/1 indicator columns are standardised.
days_until_flight_pipe = _scaled_pipe('days_until_flight')
is_search_weekend_pipe = _scaled_pipe('is_search_weekend')
is_flight_weekend_pipe = _scaled_pipe('is_flight_weekend')
is_basic_economy_pipe = _scaled_pipe('isBasicEconomy')

# Categorical columns are one-hot encoded; the airline and airport columns
# are cast to str first, as in the original setup.
search_day_pipe = _one_hot_pipe('searchDayName')
flight_day_pipe = _one_hot_pipe('flightDayName')
segments_airline_name_pipe = _one_hot_pipe('segmentsAirlineName', dtype='str')
destination_airport_pipe = _one_hot_pipe('destinationAirport', dtype='str')

# Combine Features Using FeatureUnion
combined_features = FeatureUnion([
    ('days_until_flight', days_until_flight_pipe),
    ('is_search_weekend', is_search_weekend_pipe),
    ('is_flight_weekend', is_flight_weekend_pipe),
    ('search_day', search_day_pipe),
    ('flight_day', flight_day_pipe),
    ('is_basic_economy', is_basic_economy_pipe),
    ('segments_airline_name', segments_airline_name_pipe),
    ('destination_airport', destination_airport_pipe)
])

# Final Pipeline with Regression
pipeline = Pipeline([
    ('features', combined_features),
    ('regressor', LinearRegression())
])

# Fit on the training split, then predict fares for the test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Predicted fares:", y_pred[:10])


Predicted fares: [446.73084359 473.12163201 273.98820563 439.77912899 556.52741731
 431.16313489 519.33763278 302.67326559 414.65495215 580.98064785]


In [55]:
# First ten true fares, for side-by-side comparison with the predictions above.
y_test.head(10)

128169    428.60
28053     810.60
104331    158.60
19892     266.60
45944     802.60
44434     436.60
82794     553.59
88110     232.60
92390     372.20
88012     543.60
Name: totalFare, dtype: float64

In [11]:
# Persist the fitted pipeline (feature transformers + regressor) to disk.
# NOTE: the pickle stores the custom ColumnSelector class by reference, so
# that class must be defined/importable in any process that later loads
# 'flight_fare_predictor.pkl'.
joblib.dump(pipeline, 'flight_fare_predictor.pkl')


['flight_fare_predictor.pkl']