In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score



In [2]:
# Cell 1: Define Custom Transformers

class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.key]]

class BoolSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.key]].astype(float)
    
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key, dtype=None):
        self.key = key
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if self.dtype:
            return X[[self.key]].astype(self.dtype)
        return X[[self.key]]


In [3]:
pd.set_option('display.max_columns', None)


In [4]:
dataset_train = pd.read_pickle('processed.pkl')

In [5]:
dataset_train.head()

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,fareBasisCode,travelDuration,elapsedDays,isBasicEconomy,isRefundable,isNonStop,baseFare,totalFare,seatsRemaining,totalTravelDistance,segmentsDepartureTimeEpochSeconds,segmentsDepartureTimeRaw,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode,days_until_flight,days_since_last_search,fare_change,seats_change,weighted_fare_change,weighted_seats_change,is_search_weekend,is_flight_weekend,Multiple_Carriers,distinct_airlines,combination_code,journeyStartTime,searchDayName,flightDayName,searchDay,flightDay
152195,00002659e6bc297c2cc3337dba41ff1a,2022-06-28,2022-07-10,SFO,DFW,QAA0OKEN,PT6H14M,0.0,0,0,0,320.0,367.6,2.0,1874.0,1657468980||1657486800,2022-07-10T09:03:00.000-07:00||2022-07-10T16:0...,1657483680||1657491420,2022-07-10T15:08:00.000-05:00||2022-07-10T17:1...,IAH||DFW,SFO||IAH,United||United,UA||UA,Boeing 737-800||Embraer 175 (Enhanced Winglets),14700||4620,1641||233,coach||coach,12,0.0,0.0,0.0,0.0,0.0,0,1,0,{United},10,9.05,Tuesday,Sunday,1,6
16907,00002659e6bc297c2cc3337dba41ff1a,2022-06-29,2022-07-10,SFO,DFW,QAA0OKEN,PT6H14M,0.0,0,0,0,320.0,367.6,3.0,1874.0,1657468980||1657486800,2022-07-10T09:03:00.000-07:00||2022-07-10T16:0...,1657483680||1657491420,2022-07-10T15:08:00.000-05:00||2022-07-10T17:1...,IAH||DFW,SFO||IAH,United||United,UA||UA,Boeing 737-800||Embraer 175 (Enhanced Winglets),14700||4620,1641||233,coach||coach,11,1.0,0.0,1.0,0.0,1.0,0,1,0,{United},10,9.05,Wednesday,Sunday,2,6
6107,000037aea1029976f73c4afefb8fb9cb,2022-08-20,2022-09-16,SFO,DTW,SAA7AWBN,PT12H2M,1.0,1,0,0,212.09,261.7,9.0,2173.0,1663378620||1663400100||1663417020,2022-09-16T18:37:00.000-07:00||2022-09-17T00:3...,1663384500||1663412700||1663421940,2022-09-16T20:15:00.000-07:00||2022-09-17T06:0...,LAS||ORD||DTW,SFO||LAS||ORD,United||United||United,UA||UA||UA,Boeing 737-900||Boeing 737-800||Embraer 170,5880||12600||4920,424||1509||240,coach||coach||coach,27,0.0,0.0,0.0,0.0,0.0,1,0,0,{United},10,18.616667,Saturday,Friday,5,4
80703,000037aea1029976f73c4afefb8fb9cb,2022-08-22,2022-09-16,SFO,DTW,SAA7AWBN,PT12H2M,1.0,1,0,0,212.09,261.7,9.0,2173.0,1663378620||1663400100||1663417020,2022-09-16T18:37:00.000-07:00||2022-09-17T00:3...,1663384500||1663412700||1663421940,2022-09-16T20:15:00.000-07:00||2022-09-17T06:0...,LAS||ORD||DTW,SFO||LAS||ORD,United||United||United,UA||UA||UA,Boeing 737-900||Boeing 737-800||Embraer 170,5880||12600||4920,424||1509||240,coach||coach||coach,25,2.0,0.0,0.0,0.0,0.0,0,0,0,{United},10,18.616667,Monday,Friday,0,4
97084,000037aea1029976f73c4afefb8fb9cb,2022-08-24,2022-09-16,SFO,DTW,WAD7PKBN,PT12H2M,1.0,1,0,0,240.0,291.7,9.0,2173.0,1663378620||1663400100||1663417020,2022-09-16T18:37:00.000-07:00||2022-09-17T00:3...,1663384500||1663412700||1663421940,2022-09-16T20:15:00.000-07:00||2022-09-17T06:0...,LAS||ORD||DTW,SFO||LAS||ORD,United||United||United,UA||UA||UA,Boeing 737-900||Boeing 737-800||Embraer 170,5880||12600||4920,424||1509||240,coach||coach||coach,23,2.0,30.0,0.0,15.0,0.0,0,0,0,{United},10,18.616667,Wednesday,Friday,2,4


- 'destinationAirport'                        G
- 'travelDuration',                           E
- 'elapsedDays',                              E
- 'isBasicEconomy',                           G
- 'isRefundable',                             D
- 'isNonStop',                                E
- 'baseFare',                                 E
- 'totalFare',                                T
- 'seatsRemaining',                           E
- 'totalTravelDistance',                      E
- 'segmentsDepartureTimeEpochSeconds',        D
- 'segmentsDepartureTimeRaw',                 D
- 'segmentsArrivalTimeEpochSeconds',          D
- 'segmentsArrivalTimeRaw',                   D
- 'segmentsArrivalAirportCode',               D
- 'segmentsDepartureAirportCode',             D
- 'segmentsAirlineName',                      G
- 'segmentsAirlineCode',                      D
- 'segmentsEquipmentDescription',             D
- 'segmentsDurationInSeconds',                D
- 'segmentsDistance',                         D
- 'segmentsCabinCode',                        D
- 'days_until_flight',                        G
- 'days_since_last_search',                   -
- 'fare_change',                              -
- 'seats_change',                             -
- 'weighted_fare_change',                     -
- 'weighted_seats_change',                    -
- 'is_search_weekend',                        G   
- 'is_flight_weekend',                        G
- 'Multiple_Carriers',                        E
- 'distinct_airlines',                        D
- 'combination_code',                         E
- 'journeyStartTime',                         E
- 'searchDayName',                            G
- 'flightDayName',                            G
- 'searchDay',                                G
- 'flightDay']                                G

In [6]:
def parse_duration(duration):
    hours = 0
    minutes = 0
    
    # Check if the duration contains hours
    if 'H' in duration:
        hours = int(duration.split('H')[0].split('T')[-1])
    
    # Check if the duration contains minutes
    if 'M' in duration:
        minutes = int(duration.split('M')[0].split('H')[-1] if 'H' in duration else duration.split('T')[-1])
    
    # Convert total duration to hours as a decimal
    total_hours = hours + minutes / 60
    return total_hours

In [7]:
dataset_train["distinct_airlines"] = dataset_train["distinct_airlines"].apply(lambda x: list(x)[0] if x and isinstance(x, set) else None)
dataset_train['totalTravelDuration'] = dataset_train['travelDuration'].apply(parse_duration)

dataset_train["distinct_airlines"]


152195    United
16907     United
6107      United
80703     United
97084     United
           ...  
125868     Delta
3301       Delta
32844      Delta
64539      Delta
145143     Delta
Name: distinct_airlines, Length: 5511478, dtype: object

In [8]:
got_elements = ['flightDayName', 'searchDayName', 'destinationAirport', 'distinct_airlines', 'is_search_weekend', 'is_flight_weekend', 'days_until_flight', 'isBasicEconomy']
categorical_tbe = ['isNonStop', 'Multiple_Carriers']
numerical_tbe = ['baseFare', 'totalTravelDistance', 'elapsedDays']

added_features = (['journeyStartTime',
                   'totalTravelDuration',
                   ])

new_estimations = (['combination_code',
                    'seatsRemaining',
                    'weighted_fare_change',
                    'weighted_seats_change'
                    ])

X = dataset_train[got_elements + added_features]

In [9]:
X.head()

Unnamed: 0,flightDayName,searchDayName,destinationAirport,distinct_airlines,is_search_weekend,is_flight_weekend,days_until_flight,isBasicEconomy,journeyStartTime,totalTravelDuration
152195,Sunday,Tuesday,DFW,United,0,1,12,0,9.05,6.233333
16907,Sunday,Wednesday,DFW,United,0,1,11,0,9.05,6.233333
6107,Friday,Saturday,DTW,United,1,0,27,1,18.616667,12.033333
80703,Friday,Monday,DTW,United,0,0,25,1,18.616667,12.033333
97084,Friday,Wednesday,DTW,United,0,0,23,1,18.616667,12.033333


In [10]:
X.isna().sum()

flightDayName          0
searchDayName          0
destinationAirport     0
distinct_airlines      0
is_search_weekend      0
is_flight_weekend      0
days_until_flight      0
isBasicEconomy         0
journeyStartTime       0
totalTravelDuration    0
dtype: int64

## Pipelines

In [11]:
# Pipeline Setup for Each Feature
days_until_flight_pipe = Pipeline([
    ('selector', ColumnSelector(key='days_until_flight')),
    ('standard', StandardScaler())
])

is_search_weekend_pipe = Pipeline([
    ('selector', ColumnSelector(key='is_search_weekend')),
    ('standard', StandardScaler())
])

is_flight_weekend_pipe = Pipeline([
    ('selector', ColumnSelector(key='is_flight_weekend')),
    ('standard', StandardScaler())
])

search_day_pipe = Pipeline([
    ('selector', ColumnSelector(key='searchDayName')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

flight_day_pipe = Pipeline([
    ('selector', ColumnSelector(key='flightDayName')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

is_basic_economy_pipe = Pipeline([
    ('selector', ColumnSelector(key='isBasicEconomy')),
    ('standard', StandardScaler())
])

distinct_airlines_pipiline = Pipeline([
    ('selector', ColumnSelector(key='distinct_airlines', dtype='str')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

destination_airport_pipe = Pipeline([
    ('selector', ColumnSelector(key='destinationAirport', dtype='str')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

journeyStartTime_pipe = Pipeline([
    ('selector', ColumnSelector(key='journeyStartTime')),
    ('standard', StandardScaler())
])



In [12]:
# Combine Features Using FeatureUnion
combined_features = FeatureUnion([
    ('days_until_flight', days_until_flight_pipe),
    ('is_search_weekend', is_search_weekend_pipe),
    ('is_flight_weekend', is_flight_weekend_pipe),
    ('search_day', search_day_pipe),
    ('flight_day', flight_day_pipe),
    ('is_basic_economy', is_basic_economy_pipe),
    ('distinct_airlines', distinct_airlines_pipiline),
    ('destination_airport', destination_airport_pipe),
    ('journeyStartTime', journeyStartTime_pipe)
])

## Different model for Each Predcitor

- 1. Categorical Variables:
- - For binary or categorical variables, logistic regression or classification trees are commonly used.

Variables: isNonStop, Multiple_Carriers

Model: Logistic Regression
Rationale: These are binary categorical variables, and logistic regression is suitable for predicting binary outcomes.
- 2. Continuous Variables:
For continuous variables, various regression models can be applied based on the distribution and relationship of the features.

- - Variables: baseFare, totalTravelDistance, elapsedDays

Model: Random Forest Regressor
Rationale: Random Forest is robust to outliers and can model non-linear relationships, making it suitable for continuous data that may not necessarily follow a linear trend.
- - Variable: totalTravelDuration

Model: Gradient Boosting Regressor
Rationale: This is a continuous variable, and Gradient Boosting can efficiently handle varied data distributions and interactions, often providing high accuracy.
3. Special/Composite Variables:
These might include categorical or continuous outputs and require specific approaches, possibly even custom models or transformations.

- - Variable: combination_code

Model: Ridge Regression
Rationale: If combination_code is a float but derived from multiple categories or conditions, Ridge can help manage multicollinearity among features.
Variable: seatsRemaining

Model: Lasso Regression
Rationale: Useful if you have a lot of features and want to simplify the model by eliminating less important features, which is common in scenarios predicting counts or quantities.
- - Variables: weighted_fare_change, weighted_seats_change

Model: Elastic Net Regression
Rationale: These are likely continuous but might benefit from a model that balances between feature coefficient reduction (like Lasso) and regularization (like Ridge), especially if there are many correlated predictors.

In [13]:
X["distinct_airlines"].unique()

array(['United', 'Delta', 'American Airlines', 'JetBlue Airways',
       'Alaska Airlines', 'Frontier Airlines', 'Sun Country Airlines',
       'Key Lime Air', 'Cape Air'], dtype=object)

- change to random forest

- predicting base fare

## Logistic Regression Models

In [14]:
from sklearn.linear_model import LogisticRegression

y_isNonStop = dataset_train['isNonStop']
# Define the pipeline for isNonStop
is_non_stop_pipeline = Pipeline([
    ('features', combined_features),
    ('estimator', LogisticRegression(max_iter=500))
])

is_non_stop_pipeline.fit(X, y_isNonStop)

# Perform cross-validation
scores = cross_val_score(is_non_stop_pipeline, X, y_isNonStop, cv=5, scoring='accuracy')
mean_score = scores.mean()
print(f"Average accuracy for isNonStop: {mean_score}")


Average accuracy for isNonStop: 0.8382823986974618


In [15]:
# Define the pipeline for Multiple_Carriers

y_multipleCarriers = dataset_train['Multiple_Carriers']
multiple_carriers_pipeline = Pipeline([
    ('features', combined_features),
    ('estimator', LogisticRegression(max_iter=500))
])

multiple_carriers_pipeline.fit(X, y_multipleCarriers)
# Perform cross-validation
scores = cross_val_score(multiple_carriers_pipeline, X, y_multipleCarriers, cv=5, scoring='accuracy')
mean_score = scores.mean()
print(f"Average accuracy for Multiple_Carriers: {mean_score}")


Average accuracy for Multiple_Carriers: 0.9030104082926254


## Random Forest Models

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

y_baseFare = dataset_train['baseFare']

# Define the pipeline for baseFare
base_fare_pipeline = Pipeline([
    ('features', combined_features),
    # ('estimator', RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42))
    ('estimator', LinearRegression())
    
])

base_fare_pipeline.fit(X, y_baseFare)

# Perform cross-validation
scores = cross_val_score(base_fare_pipeline, X, y_baseFare, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for baseFare: {mean_score}")


Average MSE for baseFare: 29526.462954179868


- change to random forest

In [17]:
# Define the pipeline for travelDuration
y_totalTravelDistance = dataset_train['totalTravelDistance']

totalTravelDistance_pipelines = Pipeline([
    ('features', combined_features),
    # ('estimator', RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42))
    ('estimator', LinearRegression())

])

totalTravelDistance_pipelines.fit(X, y_totalTravelDistance)

# Perform cross-validation
scores = cross_val_score(totalTravelDistance_pipelines, X, y_totalTravelDistance, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for travelDuration: {mean_score}")


Average MSE for travelDuration: 92397.93867726906


## Linear Regression Models

In [18]:
from sklearn.linear_model import LinearRegression

y_elapsedDays = dataset_train['elapsedDays']

# Define the pipeline for elapsedDays
elapsed_days_pipeline = Pipeline([
    ('features', combined_features),
    ('estimator', LinearRegression())
])

elapsed_days_pipeline.fit(X, y_elapsedDays)

# Perform cross-validation
scores = cross_val_score(elapsed_days_pipeline, X, y_elapsedDays, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for elapsedDays: {mean_score}")


Average MSE for elapsedDays: 0.06338932220509909


## Gradient Boost Models

In [19]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

y_totalTravelDuration = dataset_train['totalTravelDuration']

totalTravelDuration_pipeline = Pipeline([
    ('features', combined_features),
    # ('scaler', StandardScaler(with_mean=False)),  # Adjust for sparse data
    # ('estimator', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))
    ('estimator', LinearRegression())

])

totalTravelDuration_pipeline.fit(X, y_totalTravelDuration)

# Perform cross-validation
scores = cross_val_score(totalTravelDuration_pipeline, X, y_totalTravelDuration, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for Total Travel Duration: {mean_score}")


Average MSE for Total Travel Duration: 7.072954908136813


## Ridge Model

In [21]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

y_combination_code = dataset_train['combination_code']

combination_code_pipeline = Pipeline([
    ('features', combined_features),
    (('scaler', StandardScaler(with_mean=False))),  # Scaling features
    ('estimator', Ridge(alpha=1.0))
])

combination_code_pipeline.fit(X, y_combination_code)

# Perform cross-validation
scores = cross_val_score(combination_code_pipeline, X, y_combination_code, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for Combination Code: {mean_score}")


Average MSE for Combination Code: 469.18458194639925


## Lasso

In [22]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

y_seatsRemaining = dataset_train['seatsRemaining']

seatsRemaining_pipeline = Pipeline([
    ('features', combined_features),
    (('scaler', StandardScaler(with_mean=False))),  # Scaling features
    ('estimator', Lasso(alpha=0.1))
])

seatsRemaining_pipeline.fit(X, y_seatsRemaining)

# Perform cross-validation
scores = cross_val_score(seatsRemaining_pipeline, X, y_seatsRemaining, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for Seats Remaining: {mean_score}")


Average MSE for Seats Remaining: 5.477318313122534


## Elastic Models

In [25]:
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# For weighted_fare_change
y_weighted_fare_change = dataset_train['weighted_fare_change']

weighted_fare_change_pipeline = Pipeline([
    ('features', combined_features),
    ('scaler', StandardScaler(with_mean=False)),  # Ensure with_mean=False for handling sparse matrices
    ('estimator', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

weighted_fare_change_pipeline.fit(X, y_weighted_fare_change)

# Perform cross-validation
scores = cross_val_score(weighted_fare_change_pipeline, X, y_weighted_fare_change, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for Weighted Fare Change: {mean_score}")

# For weighted_seats_change
y_weighted_seats_change = dataset_train['weighted_seats_change']

weighted_seats_change_pipeline = Pipeline([
    ('features', combined_features),
    ('scaler', StandardScaler(with_mean=False)),  # Ensure with_mean=False for handling sparse matrices
    ('estimator', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

weighted_seats_change_pipeline.fit(X, y_weighted_seats_change)

# Perform cross-validation
scores = cross_val_score(weighted_seats_change_pipeline, X, y_weighted_seats_change, cv=5, scoring='neg_mean_squared_error')
mean_score = -scores.mean()
print(f"Average MSE for Weighted Seats Change: {mean_score}")


Average MSE for Weighted Fare Change: 2030.2355650999307
Average MSE for Weighted Seats Change: 3.3932228409012573


In [26]:
import joblib

# Assuming 'pipeline' is your trained model
joblib.dump(base_fare_pipeline, 'Models/base_fare_pipeline.pkl'),
joblib.dump(totalTravelDistance_pipelines, 'Models/totalTravelDistance_pipelines.pkl'),
joblib.dump(elapsed_days_pipeline, 'Models/elapsed_days_pipeline.pkl'),
joblib.dump(totalTravelDuration_pipeline, 'Models/totalTravelDuration_pipeline.pkl'),
joblib.dump(combination_code_pipeline, 'Models/combination_code_pipeline.pkl'),
joblib.dump(seatsRemaining_pipeline, 'Models/seatsRemaining_pipeline.pkl'),
joblib.dump(weighted_fare_change_pipeline, 'Models/weighted_fare_change_pipeline.pkl'),
joblib.dump(weighted_seats_change_pipeline, 'Models/weighted_seats_change_pipeline.pkl'),
joblib.dump(is_non_stop_pipeline, 'Models/is_non_stop_pipeline.pkl'),
joblib.dump(multiple_carriers_pipeline, 'Models/multiple_carriers_pipeline.pkl')





['Models/multiple_carriers_pipeline.pkl']