## Importing the data

In [1]:
import pandas as pd
import numpy as np
import sys
import joblib
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from dateutil import parser
from build_features import get_date_features, get_time_features

reading data from zipped file

In [2]:
data = pd.read_pickle("../../data/processed/data_cleaned.pkl.gz", compression='gzip')

## Exploring the data

In [3]:
data.sample(20)

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,segmentsDepartureTimeEpochSeconds_Leg1,...,segmentsDurationInSeconds_Leg4,segmentsDistance_Leg1,segmentsDistance_Leg2,segmentsDistance_Leg3,segmentsDistance_Leg4,segmentsCabinCode_Leg1,segmentsCabinCode_Leg2,segmentsCabinCode_Leg3,segmentsCabinCode_Leg4,traveltime_hours
1314842,2022-04-29,2022-06-24,BOS,LGA,False,False,False,251.610001,627.0,1656065000.0,...,0.0,406.0,221.0,0.0,0.0,coach,coach,0,0,4.316667
9422900,2022-05-13,2022-07-10,LGA,DFW,False,False,True,245.600006,1380.0,1657471000.0,...,0.0,1380.0,0.0,0.0,0.0,coach,0,0,0,3.783333
10112885,2022-05-06,2022-06-18,MIA,DTW,False,False,False,407.190002,1589.0,1655557000.0,...,0.0,1104.0,485.0,0.0,0.0,coach,coach,0,0,12.483333
1178911,2022-04-25,2022-06-13,BOS,ORD,False,False,True,213.600006,862.0,1655122000.0,...,0.0,862.0,0.0,0.0,0.0,coach,0,0,0,2.9
10337748,2022-05-18,2022-06-15,MIA,BOS,False,False,True,258.600006,1260.0,1655318000.0,...,0.0,1260.0,0.0,0.0,0.0,coach,0,0,0,3.3
2668212,2022-05-14,2022-05-26,CLT,PHL,False,False,False,517.109985,895.0,1653600000.0,...,0.0,228.0,667.0,0.0,0.0,coach,coach,0,0,5.7
9762409,2022-04-25,2022-06-21,MIA,IAD,False,False,False,388.600006,1137.0,1655806000.0,...,0.0,596.0,541.0,0.0,0.0,coach,coach,0,0,5.866667
4712201,2022-04-29,2022-06-01,DTW,LGA,False,False,True,208.610001,485.0,1654092000.0,...,0.0,485.0,0.0,0.0,0.0,coach,0,0,0,1.8
10430794,2022-04-20,2022-05-02,OAK,DEN,False,False,False,386.609985,1689.0,1651537000.0,...,0.0,672.0,1017.0,0.0,0.0,coach,coach,0,0,5.583333
3070711,2022-04-30,2022-05-16,DEN,CLT,False,False,False,408.600006,1623.0,1652732000.0,...,0.0,693.0,930.0,0.0,0.0,coach,coach,0,0,7.65


In [4]:
data.shape

(13519999, 58)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13519999 entries, 0 to 13519998
Data columns (total 58 columns):
 #   Column                                  Dtype   
---  ------                                  -----   
 0   searchDate                              object  
 1   flightDate                              object  
 2   startingAirport                         category
 3   destinationAirport                      category
 4   isBasicEconomy                          bool    
 5   isRefundable                            bool    
 6   isNonStop                               bool    
 7   totalFare                               float32 
 8   totalTravelDistance                     float32 
 9   segmentsDepartureTimeEpochSeconds_Leg1  float32 
 10  segmentsDepartureTimeEpochSeconds_Leg2  float32 
 11  segmentsDepartureTimeEpochSeconds_Leg3  float32 
 12  segmentsDepartureTimeEpochSeconds_Leg4  float32 
 13  segmentsDepartureTimeRaw_Leg1           object  
 14  segmentsDepartur

converting the flightDate column fro object to datetime format

In [5]:
data['flightDate'] = pd.to_datetime(data['flightDate'])

In [6]:

data['segmentsDepartureTimeRaw_Leg1'] = data['segmentsDepartureTimeRaw_Leg1'].apply(parser.parse)
# 'H:M' formatin new column
data['departure_time'] = data['segmentsDepartureTimeRaw_Leg1'].apply(lambda x: x.strftime('%H:%M'))

creating new feature 'departure_time' to store the 30 minute round-offs of the flight departure timings

In [7]:
data['departure_time'] = pd.to_datetime(data['departure_time']).dt.round('30min').dt.strftime('%H:%M')

  data['departure_time'] = pd.to_datetime(data['departure_time']).dt.round('30min').dt.strftime('%H:%M')


splitting features from target

In [8]:
features = ['startingAirport', 'destinationAirport','flightDate', 'departure_time', 'segmentsCabinCode_Leg1']
target = 'totalFare'

In [11]:
fin_data = data.groupby(features)[target].mean().reset_index()
fin_data.dropna(subset=['totalFare'], inplace=True)
fin_data.rename(columns = {'segmentsCabinCode_Leg1':'cabin_type'}, inplace = True)

In [12]:
flight_data=fin_data.copy()

converting the features into pandas datatypes

In [13]:
mapping = {'startingAirport': 'category','destinationAirport': 'category','cabin_type': 'category''totalFare': 'float32'}
flight_data = flight_data.astype(mapping)

In [None]:
sys.path.append("../../src/features")

In [14]:
flight_data = get_date_features(flight_data)
flight_data = get_time_features(flight_data)

  flight_data['departure_time'] = pd.to_datetime(flight_data['departure_time'])


In [15]:
flight_data.head()

Unnamed: 0,startingAirport,destinationAirport,cabin_type,totalFare,month,day,weekday,departure_time_sin,departure_time_cos,departure_time_category
189,ATL,BOS,coach,271.589996,4,17,6,0.965926,0.258819,night
193,ATL,BOS,coach,252.600006,4,17,6,1.0,6.123234000000001e-17,night
197,ATL,BOS,coach,248.600006,4,17,6,1.0,6.123234000000001e-17,night
201,ATL,BOS,coach,251.100006,4,17,6,0.965926,-0.258819,morning
213,ATL,BOS,coach,251.100006,4,17,6,0.866025,-0.5,morning


In [None]:
X = flight_data.drop('totalFare', axis=1)
y = flight_data['totalFare']

In [None]:
# Label encoding categorical variables
le = {}
for col in ['startingAirport', 'destinationAirport', 'cabin_type', 'departure_time_category']:
    le[col] = LabelEncoder()
    X[col] = le[col].fit_transform(X[col])

joblib.dump(le, '../../models/ANIKA/label_encoder.joblib')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=23)

In [None]:
# Scale the data using StandardScaler
scl = StandardScaler()
X_train_scaled = scl.fit_transform(X_train)
X_test_scaled = scl.transform(X_test)


joblib.dump(scl, '../../models/ANIKA/standard_scaler.joblib')

## Assessing the baseline

In [18]:
# mean of totalFare
mtf = y_train.mean()
baseline_pred = [mtf] * len(X_test)
# Root Mean Absolute Error of the baseline model
rmse_baseline = np.sqrt(mean_squared_error(y_test, baseline_pred))
print(f"Root Mean Squared Error of the baseline model: {rmse_baseline}")

Root Mean Squared Error of the baseline model: 268.4026184082031


## Modelling

path to save models

In [19]:

sys.path.append('../../src/models')

#### exp 1 XGBOOST with default parameters

In [22]:
xgb_default_model = XGBRegressor()
xgb_default_model.fit(X_train_scaled, y_train)

In [24]:
train_predictions_xgb_default = xgb_default_model.predict(X_train_scaled)
test_predictions_xgb_default = xgb_default_model.predict(X_test_scaled)

In [25]:
train_rmse_xgb_default = mean_squared_error(y_train, train_predictions_xgb_default, squared=False)
test_rmse_xgb_default = mean_squared_error(y_test, test_predictions_xgb_default, squared=False)

In [26]:
print(f"Training RMSE: {train_rmse_xgb_default}")
print(f"Test RMSE: {test_rmse_xgb_default}")

Training RMSE: 131.5935516357422
Test RMSE: 136.42172241210938


In [27]:
joblib.dump(xgb_default_model, '../../models/ANIKA/xgb_default_model.joblib')

['../../models/ANIKA/xgb_default_model.joblib']

#### exp 2 XGBOOST with best parameters

In [28]:
xgb_tuned_model = XGBRegressor()
param_grid_xgb_tuned = {
    'n_estimators': [150],  
    'max_depth': [3, 4, 5],          
    'learning_rate': [0.01, 0.1, 0.2]  
}

In [29]:
grid_search__xgb_tuned = GridSearchCV(xgb_tuned_model, param_grid_xgb_tuned, cv=5, scoring='neg_mean_squared_error')
grid_search__xgb_tuned.fit(X_train_scaled, y_train)

In [30]:
best_params__xgb_tuned = grid_search__xgb_tuned.best_params_
print("Best Hyperparameters:", best_params__xgb_tuned)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150}


In [31]:
xgb_tuned_model = XGBRegressor(**best_params__xgb_tuned)
xgb_tuned_model.fit(X_train_scaled, y_train)

In [32]:
train_predictions_xgb_tuned = xgb_tuned_model.predict(X_train_scaled)
test_predictions_xgb_tuned = xgb_tuned_model.predict(X_test_scaled)

In [33]:
train_rmse_xgb_tuned = mean_squared_error(y_train, train_predictions_xgb_tuned, squared=False)
test_rmse_xgb_tuned = mean_squared_error(y_test, test_predictions_xgb_tuned, squared=False)

In [34]:
print(f"Training RMSE: {train_rmse_xgb_tuned}")
print(f"Test RMSE: {test_rmse_xgb_tuned}")

Training RMSE: 141.4088134765625
Test RMSE: 143.48294067382812


In [35]:
joblib.dump(xgb_tuned_model, '../../models/ANIKA/xgb_tuned_model.joblib')

['../../models/ANIKA/xgb_tuned_model.joblib']

#### exp 3 LightGBM with default parameters

In [36]:

lgb_default_model = LGBMRegressor()
lgb_default_model.fit(X_train_scaled, y_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 389241, number of used features: 9
[LightGBM] [Info] Start training from score 409.143536


In [37]:
train_predictions_lgb_dafault = lgb_default_model.predict(X_train_scaled)
test_predictions_lgb_dafault = lgb_default_model.predict(X_test_scaled)

In [38]:
train_rmse_lgb_dafault = mean_squared_error(y_train, train_predictions_lgb_dafault, squared=False)
test_rmse_lgb_dafault = mean_squared_error(y_test, test_predictions_lgb_dafault, squared=False)

In [39]:
print(f"Training RMSE: {train_rmse_lgb_dafault}")
print(f"Test RMSE: {test_rmse_lgb_dafault}")

Training RMSE: 148.22775388411736
Test RMSE: 149.05714680809274


In [40]:
joblib.dump(lgb_default_model, '../../models/ANIKA/lgb_default_model.joblib')

['../../models/ANIKA/lgb_default_model.joblib']

#### exp 4 LightGBM with best parameters

In [41]:
lgb_tuned_model = LGBMRegressor()
param_grid_lgbm = {
    'n_estimators': [150],  
    'max_depth': [3, 4, 5],          
    'learning_rate': [0.01, 0.1, 0.2]  
}

In [42]:
grid_search_LGB_tuned = GridSearchCV(lgb_tuned_model, param_grid_lgbm, cv=5, scoring='neg_mean_squared_error')
grid_search_LGB_tuned.fit(X_train_scaled, y_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311392, number of used features: 9
[LightGBM] [Info] Start training from score 409.137002
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.171884
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.364161
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311392, number of used features: 9
[LightGBM] [Info] Start training from score 409.137002
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.171884
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.364161
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.364161
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 408.861179
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.183454
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.171884
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.364161
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 408.861179
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 409.364161
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9
[LightGBM] [Info] Start training from score 408.861179
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 311393, number of used features: 9


[LightGBM] [Info] Start training from score 409.183454
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 389241, number of used features: 9
[LightGBM] [Info] Start training from score 409.143536


In [43]:
best_params_LGB_tuned = grid_search_LGB_tuned.best_params_
print("Best Hyperparameters:", best_params_LGB_tuned)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150}


In [44]:
lgb_tuned_model = LGBMRegressor(**best_params_LGB_tuned)
lgb_tuned_model.fit(X_train_scaled, y_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 389241, number of used features: 9
[LightGBM] [Info] Start training from score 409.143536


In [45]:
train_predictions_lgbm_tuned = lgb_tuned_model.predict(X_train_scaled)
test_predictions_lgbm_tuned = lgb_tuned_model.predict(X_test_scaled)



In [46]:
train_rmse_lgbm_tuned = mean_squared_error(y_train, train_predictions_lgbm_tuned, squared=False)
test_rmse_lgbm_tuned = mean_squared_error(y_test, test_predictions_lgbm_tuned, squared=False)

In [47]:
print(f"Training RMSE: {train_rmse_lgbm_tuned}")
print(f"Test RMSE: {test_rmse_lgbm_tuned}")

Training RMSE: 141.9264866752889
Test RMSE: 143.6805183913013


In [48]:
joblib.dump(lgb_tuned_model, '../../models/ANIKA/lgb_tuned_model.joblib')

['../../models/ANIKA/lgb_tuned_model.joblib']

#### exp 5 XGB with regularization

In [49]:
xgb_model_reg = XGBRegressor()

In [50]:
param_grid_reg = {
    'alpha': [0.1, 0.01, 0.001],
    'lambda': [1.0, 0.1, 0.01]
}

In [51]:
grid_search_reg = GridSearchCV(estimator=xgb_model_reg, param_grid=param_grid_reg, scoring='neg_mean_squared_error', cv=5)
grid_search_reg.fit(X_train_scaled, y_train)

In [52]:
best_alpha = grid_search_reg.best_params_['alpha']
best_lambda = grid_search_reg.best_params_['lambda']

In [53]:
best_xgb_model_reg = XGBRegressor(alpha=best_alpha, reg_lambda=best_lambda)

In [54]:
best_xgb_model_reg.fit(X_train_scaled, y_train)

In [56]:
train_predictions_xgb_reg = best_xgb_model_reg.predict(X_train_scaled)
test_predictions_xgb_reg = best_xgb_model_reg.predict(X_test_scaled)

In [58]:
train_rmse_reg = mean_squared_error(y_train, train_predictions_xgb_reg, squared=False)
test_rmse_reg = mean_squared_error(y_test, test_predictions_xgb_reg, squared=False)

In [60]:
print(f"Training RMSE: {train_rmse_reg}")
print(f"Test RMSE: {test_rmse_reg}")

Training RMSE: 131.59356689453125
Test RMSE: 136.42173767089844


In [61]:
joblib.dump(best_xgb_model_reg, '../../models/ANIKA/best_xgb_model_reg.joblib')

['../../models/ANIKA/best_xgb_model_reg.joblib']