In [31]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score



In [32]:
# Cell 1: Define Custom Transformers

class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.key]]

class BoolSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.key]].astype(float)
    
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key, dtype=None):
        self.key = key
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if self.dtype:
            return X[[self.key]].astype(self.dtype)
        return X[[self.key]]


In [33]:
pd.set_option('display.max_columns', None)


In [34]:
dataset_train = pd.read_pickle('processed.pkl')

In [35]:
dataset_train.head()

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,fareBasisCode,travelDuration,elapsedDays,isBasicEconomy,isRefundable,isNonStop,baseFare,totalFare,seatsRemaining,totalTravelDistance,segmentsDepartureTimeEpochSeconds,segmentsDepartureTimeRaw,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode,days_until_flight,days_since_last_search,fare_change,seats_change,weighted_fare_change,weighted_seats_change,is_search_weekend,is_flight_weekend,Multiple_Carriers,distinct_airlines,combination_code,journeyStartTime,searchDayName,flightDayName,searchDay,flightDay
152195,00002659e6bc297c2cc3337dba41ff1a,2022-06-28,2022-07-10,SFO,DFW,QAA0OKEN,PT6H14M,0.0,0,0,0,320.0,367.6,2.0,1874.0,1657468980||1657486800,2022-07-10T09:03:00.000-07:00||2022-07-10T16:0...,1657483680||1657491420,2022-07-10T15:08:00.000-05:00||2022-07-10T17:1...,IAH||DFW,SFO||IAH,United||United,UA||UA,Boeing 737-800||Embraer 175 (Enhanced Winglets),14700||4620,1641||233,coach||coach,12,0.0,0.0,0.0,0.0,0.0,0,1,0,{United},10,9.05,Tuesday,Sunday,1,6
16907,00002659e6bc297c2cc3337dba41ff1a,2022-06-29,2022-07-10,SFO,DFW,QAA0OKEN,PT6H14M,0.0,0,0,0,320.0,367.6,3.0,1874.0,1657468980||1657486800,2022-07-10T09:03:00.000-07:00||2022-07-10T16:0...,1657483680||1657491420,2022-07-10T15:08:00.000-05:00||2022-07-10T17:1...,IAH||DFW,SFO||IAH,United||United,UA||UA,Boeing 737-800||Embraer 175 (Enhanced Winglets),14700||4620,1641||233,coach||coach,11,1.0,0.0,1.0,0.0,1.0,0,1,0,{United},10,9.05,Wednesday,Sunday,2,6
6107,000037aea1029976f73c4afefb8fb9cb,2022-08-20,2022-09-16,SFO,DTW,SAA7AWBN,PT12H2M,1.0,1,0,0,212.09,261.7,9.0,2173.0,1663378620||1663400100||1663417020,2022-09-16T18:37:00.000-07:00||2022-09-17T00:3...,1663384500||1663412700||1663421940,2022-09-16T20:15:00.000-07:00||2022-09-17T06:0...,LAS||ORD||DTW,SFO||LAS||ORD,United||United||United,UA||UA||UA,Boeing 737-900||Boeing 737-800||Embraer 170,5880||12600||4920,424||1509||240,coach||coach||coach,27,0.0,0.0,0.0,0.0,0.0,1,0,0,{United},10,18.616667,Saturday,Friday,5,4
80703,000037aea1029976f73c4afefb8fb9cb,2022-08-22,2022-09-16,SFO,DTW,SAA7AWBN,PT12H2M,1.0,1,0,0,212.09,261.7,9.0,2173.0,1663378620||1663400100||1663417020,2022-09-16T18:37:00.000-07:00||2022-09-17T00:3...,1663384500||1663412700||1663421940,2022-09-16T20:15:00.000-07:00||2022-09-17T06:0...,LAS||ORD||DTW,SFO||LAS||ORD,United||United||United,UA||UA||UA,Boeing 737-900||Boeing 737-800||Embraer 170,5880||12600||4920,424||1509||240,coach||coach||coach,25,2.0,0.0,0.0,0.0,0.0,0,0,0,{United},10,18.616667,Monday,Friday,0,4
97084,000037aea1029976f73c4afefb8fb9cb,2022-08-24,2022-09-16,SFO,DTW,WAD7PKBN,PT12H2M,1.0,1,0,0,240.0,291.7,9.0,2173.0,1663378620||1663400100||1663417020,2022-09-16T18:37:00.000-07:00||2022-09-17T00:3...,1663384500||1663412700||1663421940,2022-09-16T20:15:00.000-07:00||2022-09-17T06:0...,LAS||ORD||DTW,SFO||LAS||ORD,United||United||United,UA||UA||UA,Boeing 737-900||Boeing 737-800||Embraer 170,5880||12600||4920,424||1509||240,coach||coach||coach,23,2.0,30.0,0.0,15.0,0.0,0,0,0,{United},10,18.616667,Wednesday,Friday,2,4


- 'destinationAirport'                        G
- 'travelDuration',                           E
- 'elapsedDays',                              E
- 'isBasicEconomy',                           G
- 'isRefundable',                             D
- 'isNonStop',                                E
- 'baseFare',                                 E
- 'totalFare',                                T
- 'seatsRemaining',                           E
- 'totalTravelDistance',                      E
- 'segmentsDepartureTimeEpochSeconds',        D
- 'segmentsDepartureTimeRaw',                 D
- 'segmentsArrivalTimeEpochSeconds',          D
- 'segmentsArrivalTimeRaw',                   D
- 'segmentsArrivalAirportCode',               D
- 'segmentsDepartureAirportCode',             D
- 'segmentsAirlineName',                      G
- 'segmentsAirlineCode',                      D
- 'segmentsEquipmentDescription',             D
- 'segmentsDurationInSeconds',                D
- 'segmentsDistance',                         D
- 'segmentsCabinCode',                        D
- 'days_until_flight',                        G
- 'days_since_last_search',                   -
- 'fare_change',                              -
- 'seats_change',                             -
- 'weighted_fare_change',                     -
- 'weighted_seats_change',                    -
- 'is_search_weekend',                        G   
- 'is_flight_weekend',                        G
- 'Multiple_Carriers',                        E
- 'distinct_airlines',                        D
- 'combination_code',                         E
- 'journeyStartTime',                         E
- 'searchDayName',                            G
- 'flightDayName',                            G
- 'searchDay',                                G
- 'flightDay']                                G

In [36]:
dataset_train["distinct_airlines"] = dataset_train["distinct_airlines"].apply(lambda x: list(x)[0] if x and isinstance(x, set) else None)

dataset_train["distinct_airlines"]


152195    United
16907     United
6107      United
80703     United
97084     United
           ...  
125868     Delta
3301       Delta
32844      Delta
64539      Delta
145143     Delta
Name: distinct_airlines, Length: 5511478, dtype: object

In [37]:
# target = ['totalFare']
# got_elements = ['flightDayName', 'searchDayName', 'destinationAirport', 'distinct_airlines', 'is_search_weekend', 'is_flight_weekend', 'days_until_flight', 'isBasicEconomy']
# to_be_estimated_elements = ['baseFare', 'travelDuration', 'elapsedDays', 'isNonStop', 'Multiple_Carriers']
# categorical_tbe = ['isNonStop', 'Multiple_Carriers']
# numerical_tbe = ['baseFare', 'travelDuration', 'elapsedDays']

# len(got_elements) + len(to_be_estimated_elements) + len(target)

In [38]:
got_elements = ['flightDayName', 'searchDayName', 'destinationAirport', 'distinct_airlines', 'is_search_weekend', 'is_flight_weekend', 'days_until_flight', 'isBasicEconomy']
categorical_tbe = ['isNonStop', 'Multiple_Carriers']
numerical_tbe = ['baseFare', 'totalTravelDistance', 'elapsedDays']

X = dataset_train[got_elements]

In [39]:
X.head()

Unnamed: 0,flightDayName,searchDayName,destinationAirport,distinct_airlines,is_search_weekend,is_flight_weekend,days_until_flight,isBasicEconomy
152195,Sunday,Tuesday,DFW,United,0,1,12,0
16907,Sunday,Wednesday,DFW,United,0,1,11,0
6107,Friday,Saturday,DTW,United,1,0,27,1
80703,Friday,Monday,DTW,United,0,0,25,1
97084,Friday,Wednesday,DTW,United,0,0,23,1


In [40]:
X.isna().sum()

flightDayName         0
searchDayName         0
destinationAirport    0
distinct_airlines     0
is_search_weekend     0
is_flight_weekend     0
days_until_flight     0
isBasicEconomy        0
dtype: int64

## Pipelines

In [41]:
# Pipeline Setup for Each Feature
days_until_flight_pipe = Pipeline([
    ('selector', ColumnSelector(key='days_until_flight')),
    ('standard', StandardScaler())
])

is_search_weekend_pipe = Pipeline([
    ('selector', ColumnSelector(key='is_search_weekend')),
    ('standard', StandardScaler())
])

is_flight_weekend_pipe = Pipeline([
    ('selector', ColumnSelector(key='is_flight_weekend')),
    ('standard', StandardScaler())
])

search_day_pipe = Pipeline([
    ('selector', ColumnSelector(key='searchDayName')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

flight_day_pipe = Pipeline([
    ('selector', ColumnSelector(key='flightDayName')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

is_basic_economy_pipe = Pipeline([
    ('selector', ColumnSelector(key='isBasicEconomy')),
    ('standard', StandardScaler())
])

distinct_airlines_pipiline = Pipeline([
    ('selector', ColumnSelector(key='distinct_airlines', dtype='str')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

destination_airport_pipe = Pipeline([
    ('selector', ColumnSelector(key='destinationAirport', dtype='str')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine Features Using FeatureUnion
combined_features = FeatureUnion([
    ('days_until_flight', days_until_flight_pipe),
    ('is_search_weekend', is_search_weekend_pipe),
    ('is_flight_weekend', is_flight_weekend_pipe),
    ('search_day', search_day_pipe),
    ('flight_day', flight_day_pipe),
    ('is_basic_economy', is_basic_economy_pipe),
    ('distinct_airlines', distinct_airlines_pipiline),
    ('destination_airport', destination_airport_pipe)
])

In [42]:
# estimators = {
#     'baseFare': RandomForestRegressor(n_estimators=100, random_state=42),
#     'travelDuration': RandomForestRegressor(n_estimators=100, random_state=42),
#     'elapsedDays': LinearRegression(),
#     'isNonStop': LogisticRegression(max_iter=500),
#     'Multiple_Carriers': LogisticRegression(max_iter=500),
# }

# models = {}
# for feature, estimator in estimators.items():
#     pipeline = Pipeline([
#         ('features', combined_features),
#         ('estimator', estimator)
#     ])
    
#     X_train, X_test, y_train, y_test = train_test_split(
#         X.drop(to_be_estimated_elements, axis=1), X[feature], test_size=0.2, random_state=42)
    
#     models[feature] = pipeline.fit(X_train, y_train)
#     print(f"Model trained for predicting: {feature}")


In [43]:
X["distinct_airlines"].unique()

array(['United', 'Delta', 'American Airlines', 'JetBlue Airways',
       'Alaska Airlines', 'Frontier Airlines', 'Sun Country Airlines',
       'Cape Air'], dtype=object)

- change to random forest

In [44]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

y_baseFare = dataset_train['baseFare']

# Define the pipeline for baseFare
base_fare_pipeline = Pipeline([
    ('features', combined_features),
    ('estimator', LinearRegression())
])

base_fare_pipeline.fit(X, y_baseFare)

# Perform cross-validation
scores = cross_val_score(base_fare_pipeline, X, y_baseFare, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for baseFare: {mean_score}")


Average MSE for baseFare: 29212.711798512108


- change to random forest

In [45]:
X.head()

Unnamed: 0,flightDayName,searchDayName,destinationAirport,distinct_airlines,is_search_weekend,is_flight_weekend,days_until_flight,isBasicEconomy
152195,Sunday,Tuesday,DFW,United,0,1,12,0
16907,Sunday,Wednesday,DFW,United,0,1,11,0
6107,Friday,Saturday,DTW,United,1,0,27,1
80703,Friday,Monday,DTW,United,0,0,25,1
97084,Friday,Wednesday,DTW,United,0,0,23,1


In [46]:
X_test = pd.read_pickle('test.pkl')
dataset_test = X_test[got_elements]
dataset_test["distinct_airlines"] = dataset_test["distinct_airlines"].apply(lambda x: list(x)[0] if x and isinstance(x, set) else None)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_test["distinct_airlines"] = dataset_test["distinct_airlines"].apply(lambda x: list(x)[0] if x and isinstance(x, set) else None)


In [47]:
dataset_test.head()

Unnamed: 0,flightDayName,searchDayName,destinationAirport,distinct_airlines,is_search_weekend,is_flight_weekend,days_until_flight,isBasicEconomy
128169,Sunday,Monday,JFK,Alaska Airlines,0,1,27,0
28053,Thursday,Monday,ORD,American Airlines,0,0,31,0
104331,Thursday,Friday,LAX,United,0,0,48,0
19892,Tuesday,Saturday,BOS,American Airlines,1,0,10,0
45944,Saturday,Tuesday,LGA,Delta,0,1,25,0


In [48]:
dataset_test[0:1]

# Predict baseFare


Unnamed: 0,flightDayName,searchDayName,destinationAirport,distinct_airlines,is_search_weekend,is_flight_weekend,days_until_flight,isBasicEconomy
128169,Sunday,Monday,JFK,Alaska Airlines,0,1,27,0


In [49]:
one_prediction = base_fare_pipeline.predict(dataset_test[0:1])

In [50]:
one_prediction

array([484.78748884])

In [51]:
# Define the pipeline for travelDuration
y_totalTravelDistance = dataset_train['totalTravelDistance']

totalTravelDistance_pipelines = Pipeline([
    ('features', combined_features),
    ('estimator', LinearRegression())
])

totalTravelDistance_pipelines.fit(X, y_totalTravelDistance)

# Perform cross-validation
scores = cross_val_score(totalTravelDistance_pipelines, X, y_totalTravelDistance, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for travelDuration: {mean_score}")


Average MSE for travelDuration: 92507.44337221891


In [57]:
totalTravelDistance_pipelines.predict(dataset_test[0:1])

array([2718.54462717])

In [52]:
from sklearn.linear_model import LinearRegression

y_elapsedDays = dataset_train['elapsedDays']

# Define the pipeline for elapsedDays
elapsed_days_pipeline = Pipeline([
    ('features', combined_features),
    ('estimator', LinearRegression())
])

elapsed_days_pipeline.fit(X, y_elapsedDays)

# Perform cross-validation
scores = cross_val_score(elapsed_days_pipeline, X, y_elapsedDays, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for elapsedDays: {mean_score}")


Average MSE for elapsedDays: 0.2100306740879952


In [58]:
elapsed_days_pipeline.predict(dataset_test[0:1])

array([0.35613696])

In [53]:
X

Unnamed: 0,flightDayName,searchDayName,destinationAirport,distinct_airlines,is_search_weekend,is_flight_weekend,days_until_flight,isBasicEconomy
152195,Sunday,Tuesday,DFW,United,0,1,12,0
16907,Sunday,Wednesday,DFW,United,0,1,11,0
6107,Friday,Saturday,DTW,United,1,0,27,1
80703,Friday,Monday,DTW,United,0,0,25,1
97084,Friday,Wednesday,DTW,United,0,0,23,1
...,...,...,...,...,...,...,...,...
125868,Wednesday,Friday,EWR,Delta,0,0,5,1
3301,Wednesday,Saturday,EWR,Delta,1,0,4,1
32844,Wednesday,Sunday,EWR,Delta,1,0,3,1
64539,Wednesday,Monday,EWR,Delta,0,0,2,1


In [54]:
from sklearn.linear_model import LogisticRegression

y_isNonStop = dataset_train['isNonStop']
# Define the pipeline for isNonStop
is_non_stop_pipeline = Pipeline([
    ('features', combined_features),
    ('estimator', LogisticRegression(max_iter=500))
])

is_non_stop_pipeline.fit(X, y_isNonStop)

# Perform cross-validation
scores = cross_val_score(is_non_stop_pipeline, X, y_isNonStop, cv=5, scoring='accuracy')
mean_score = scores.mean()
print(f"Average accuracy for isNonStop: {mean_score}")


Average accuracy for isNonStop: 0.8374085859295761


In [55]:
# Define the pipeline for Multiple_Carriers

y_multipleCarriers = dataset_train['Multiple_Carriers']
multiple_carriers_pipeline = Pipeline([
    ('features', combined_features),
    ('estimator', LogisticRegression(max_iter=500))
])

multiple_carriers_pipeline.fit(X, y_multipleCarriers)
# Perform cross-validation
scores = cross_val_score(multiple_carriers_pipeline, X, y_multipleCarriers, cv=5, scoring='accuracy')
mean_score = scores.mean()
print(f"Average accuracy for Multiple_Carriers: {mean_score}")


Average accuracy for Multiple_Carriers: 0.9076695940946422


In [56]:
import joblib

# Assuming 'pipeline' is your trained model
joblib.dump(base_fare_pipeline, 'base_fare_pipeline.pkl')
joblib.dump(totalTravelDistance_pipelines, 'travel_duration_pipeline.pkl')
joblib.dump(elapsed_days_pipeline, 'elapsed_days_pipeline.pkl')
joblib.dump(is_non_stop_pipeline, 'is_non_stop_pipeline.pkl')
joblib.dump(multiple_carriers_pipeline, 'multiple_carriers_pipeline.pkl')


['multiple_carriers_pipeline.pkl']