In [1]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime, timedelta
from google.cloud import bigquery
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV


In [2]:
# 📌 Set Up BigQuery Connection
PROJECT_ID = "travel-insider-452211"
DATASET_NAME = "travel_insider_dataset"
TABLE_NAME = "filtered_flights"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/sebastian/code/JPYY-96/travel_insider/raw_data/travel-insider-452211-181bd2eba48e.json"

# 📌 Initialize BigQuery Client
client = bigquery.Client()
query = f"SELECT * FROM `{PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}` ORDER BY RAND()  LIMIT 2000000"

# 📌 Load Data in Chunks with Timer
chunk_size = 1000000
data_chunks = []
start_time = time.time()

print(f"⏳ [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Loading data from BigQuery in chunks...")
query_job = client.query(query)
result_iter = query_job.result(page_size=chunk_size)

for page in result_iter.pages:
    chunk_data = [dict(row) for row in page]
    df_chunk = pd.DataFrame(chunk_data)
    data_chunks.append(df_chunk)
    print(f"✅ [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Loaded {len(df_chunk)} rows (total: {sum(len(c) for c in data_chunks)})")

data_query = pd.concat(data_chunks, ignore_index=True)
print(f"✅ [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Successfully loaded {len(data_query)} rows from BigQuery!")


⏳ [2025-03-08 11:11:02] Loading data from BigQuery in chunks...
✅ [2025-03-08 11:11:36] Loaded 31818 rows (total: 31818)
✅ [2025-03-08 11:11:45] Loaded 31818 rows (total: 63636)
✅ [2025-03-08 11:11:55] Loaded 31818 rows (total: 95454)
✅ [2025-03-08 11:12:08] Loaded 31818 rows (total: 127272)
✅ [2025-03-08 11:12:17] Loaded 31818 rows (total: 159090)
✅ [2025-03-08 11:12:25] Loaded 31818 rows (total: 190908)
✅ [2025-03-08 11:12:33] Loaded 31818 rows (total: 222726)
✅ [2025-03-08 11:12:45] Loaded 31818 rows (total: 254544)
✅ [2025-03-08 11:12:54] Loaded 31818 rows (total: 286362)
✅ [2025-03-08 11:13:04] Loaded 31818 rows (total: 318180)
✅ [2025-03-08 11:13:15] Loaded 31818 rows (total: 349998)
✅ [2025-03-08 11:13:24] Loaded 31818 rows (total: 381816)
✅ [2025-03-08 11:13:33] Loaded 31818 rows (total: 413634)
✅ [2025-03-08 11:13:41] Loaded 31818 rows (total: 445452)
✅ [2025-03-08 11:13:51] Loaded 31818 rows (total: 477270)
✅ [2025-03-08 11:14:00] Loaded 31818 rows (total: 509088)
✅ [2025-03-

In [3]:
# 📌 Convert Date Columns
data_query["searchDate"] = pd.to_datetime(data_query["searchDate"])
data_query["flightDate"] = pd.to_datetime(data_query["flightDate"])

# 📌 Feature Engineering
data_query["days_to_flight"] = (data_query["flightDate"] - data_query["searchDate"]).dt.days
data_query["day_of_week"] = data_query["flightDate"].dt.dayofweek
data_query["is_weekend"] = (data_query["day_of_week"] >= 5).astype(int)
data_query["is_holiday_season"] = data_query["flightDate"].dt.month.isin([6, 7, 12]).astype(int)
data_query["days_to_flight_squared"] = data_query["days_to_flight"] ** 2
data_query["flight_month"] = data_query["flightDate"].dt.month
data_query["flight_year"] = data_query["flightDate"].dt.year
data_query["search_month"] = data_query["searchDate"].dt.month
data_query["search_day"] = data_query["searchDate"].dt.day
data_query["days_to_flight_log"] = np.log1p(data_query["days_to_flight"])

# Remove invalid rows
data_query = data_query[data_query["days_to_flight"] > 0]

# 📌 One-Hot Encode Categorical Features (Airports & Airlines)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = encoder.fit_transform(data_query[['startingAirport', 'destinationAirport', 'segmentsAirlineName']])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# 📌 Ordinal Encoding for Cabin Class
data_query['segmentsCabinCode'] = data_query['segmentsCabinCode'].fillna('Coach').str.strip().str.title()
cabin_classes = [['Coach', 'Premium Coach', 'Business', 'First']]
ordinal_encoder = OrdinalEncoder(categories=cabin_classes)
data_query['cabin_class_encoded'] = ordinal_encoder.fit_transform(data_query[['segmentsCabinCode']])

# 📌 Merge Encoded Data
data_query = data_query.reset_index(drop=True)
data_query = pd.concat([data_query, encoded_df], axis=1)

: 

In [None]:
# 📌 Select Features
X = data_query[['days_to_flight', 'days_to_flight_squared', 'day_of_week', 'is_weekend', 'is_holiday_season',
                'flight_month', 'flight_year', 'search_month', 'search_day', 'days_to_flight_log',
                'seatsRemaining', 'isRefundable', 'cabin_class_encoded'] + list(encoded_df.columns)]
y = data_query['totalFare']

# 📌 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 📌 Train XGBoost Model
xgb_model = XGBRegressor(
    n_estimators=500, learning_rate=0.05, max_depth=8,
    subsample=0.8, colsample_bytree=0.8, random_state=42
)
xgb_model.fit(X_train, y_train)

# 📌 Train LightGBM Model
lgbm_model = LGBMRegressor(
    n_estimators=700, learning_rate=0.03, max_depth=10,
    subsample=0.8, colsample_bytree=0.8, random_state=42
)
lgbm_model.fit(X_train, y_train)

# 📌 Make Predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_lgbm = lgbm_model.predict(X_test)

In [42]:
# 📌 Evaluate Models
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mape_xgb = (mae_xgb / y_test.mean()) * 100

mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
mape_lgbm = (mae_lgbm / y_test.mean()) * 100

print(f"\n📊 XGBoost MAE: ${mae_xgb:.2f}, MAPE: {mape_xgb:.2f}%")
print(f"📊 LightGBM MAE: ${mae_lgbm:.2f}, MAPE: {mape_lgbm:.2f}%")

# 📌 Combine XGBoost and LightGBM Predictions (Weighted Average)
y_pred_blended = (0.7 * y_pred_xgb) + (0.3 * y_pred_lgbm)

# 📌 Evaluate Blended Model
mae_blended = mean_absolute_error(y_test, y_pred_blended)
mape_blended = (mae_blended / y_test.mean()) * 100
print(f"\n📊 Blended Model MAE: ${mae_blended:.2f}, MAPE: {mape_blended:.2f}%")


📊 XGBoost MAE: $53.89, MAPE: 20.33%
📊 LightGBM MAE: $61.08, MAPE: 23.05%

📊 Blended Model MAE: $55.43, MAPE: 20.91%


In [40]:
# 📌 Find Best Booking Date for Any Flight
flight_date = datetime(2025, 5, 30)
search_dates = [flight_date - timedelta(days=i) for i in range(1, 61)]

# 📌 Create DataFrame for Predictions with All Required Features
search_df = pd.DataFrame({
    'days_to_flight': [(flight_date - d).days for d in search_dates],
    'days_to_flight_squared': [(flight_date - d).days ** 2 for d in search_dates],
    'day_of_week': [d.weekday() for d in search_dates],
    'is_weekend': [1 if d.weekday() >= 5 else 0 for d in search_dates],
    'is_holiday_season': [1 if d.month in [6, 7, 12] else 0 for d in search_dates],
    'flight_month': [d.month for d in search_dates],  # ✅ Added
    'flight_year': [d.year for d in search_dates],  # ✅ Added
    'search_month': [flight_date.month] * len(search_dates),  # ✅ Added
    'search_day': [flight_date.day] * len(search_dates),  # ✅ Added
    'days_to_flight_log': [np.log1p((flight_date - d).days) for d in search_dates],  # ✅ Added
    'seatsRemaining': np.median(data_query['seatsRemaining']),
    'isRefundable': False,
    'cabin_class_encoded': 0  # Default to Coach
})

# 📌 Add Encoded Features (Default Values)
for col in encoded_df.columns:
    search_df[col] = 0

# 📌 Reorder Columns to Match `X_train`
search_df = search_df[X_train.columns]
# 📌 Predict Prices for Different Booking Dates
predicted_fares = xgb_model.predict(search_df)

# 📌 Find Best Date to Book
best_search_date = search_dates[np.argmin(predicted_fares)]
print(f"\n📅 Best date to book for any flight (May 1st flight): {best_search_date.strftime('%Y-%m-%d')}")

# 📌 Print Total Execution Time
print(f"\n⏳ Total script execution time: {time.time() - start_time:.2f} seconds")


📅 Best date to book for any flight (May 1st flight): 2025-05-24

⏳ Total script execution time: 861.74 seconds
