In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tpot import TPOTRegressor
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import mean_absolute_error


In [4]:
# 📌 Set Up BigQuery Connection
PROJECT_ID = "travel-insider-452211"
DATASET_NAME = "travel_insider_dataset"
TABLE_NAME = "filtered_flights"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/sebastian/code/JPYY-96/travel_insider/raw_data/travel-insider-452211-181bd2eba48e.json"

# Initialize BigQuery Client
client = bigquery.Client()

# Initialize AutoML Model (TPOT)
auto_ml = TPOTRegressor(generations=5, population_size=20,  random_state=42)

# Parameters for querying data
chunk_size = 1000000  # Query 1 million rows at a time
offset = 0
total_rows = 16839840  # Total number of rows to process

# Initialize overall batch number
overall_batch_num = 0


In [None]:
# Loop over chunks of data
while offset < total_rows:
    print(f"\n[INFO] Querying data chunk {offset} to {offset + chunk_size}...")
    query = f"""
    SELECT * FROM `{PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}`
    LIMIT {chunk_size} OFFSET {offset}
    """
    chunk_data = client.query(query).to_dataframe()

    if chunk_data.empty:
        break

    # Feature Engineering
    chunk_data["searchDate"] = pd.to_datetime(chunk_data["searchDate"])
    chunk_data["flightDate"] = pd.to_datetime(chunk_data["flightDate"])
    chunk_data["days_to_flight"] = (chunk_data["flightDate"] - chunk_data["searchDate"]).dt.days
    chunk_data["day_of_week"] = chunk_data["flightDate"].dt.dayofweek
    chunk_data["is_weekend"] = (chunk_data["day_of_week"] >= 5).astype(int)
    chunk_data["is_holiday_season"] = chunk_data["flightDate"].dt.month.isin([6, 7, 12]).astype(int)
    chunk_data["days_to_flight_squared"] = chunk_data["days_to_flight"] ** 2
    chunk_data["days_to_flight_log"] = np.log1p(chunk_data["days_to_flight"])
    chunk_data = chunk_data[chunk_data["days_to_flight"] > 0]

    # Convert `isRefundable` to numeric
    chunk_data['isRefundable'] = chunk_data['isRefundable'].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)

    # One-Hot Encode Categorical Features
    if overall_batch_num == 0:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded_features = encoder.fit_transform(chunk_data[['startingAirport', 'destinationAirport', 'segmentsAirlineName']])
    else:
        encoded_features = encoder.transform(chunk_data[['startingAirport', 'destinationAirport', 'segmentsAirlineName']])

    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

    feature_columns = ['days_to_flight', 'days_to_flight_squared', 'day_of_week', 'is_weekend', 'is_holiday_season',
                       'days_to_flight_log', 'seatsRemaining', 'isRefundable'] + list(encoded_df.columns)

    chunk_data = chunk_data.reset_index(drop=True)
    chunk_data = pd.concat([chunk_data, encoded_df], axis=1)

    # Select Features and Target
    X_chunk = chunk_data[feature_columns]
    y_chunk = chunk_data['totalFare']

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X_chunk, y_chunk, test_size=0.3, random_state=42)

    # Train AutoML Model (TPOT)
    if overall_batch_num == 0:
        auto_ml.fit(X_train, y_train)
    else:
        auto_ml.fit(X_train, y_train)  # TPOT automatically updates the model

    print(f"\n[INFO] Training completed for batch {overall_batch_num + 1}")
    overall_batch_num += 1

    # Increment offset for the next chunk
    offset += chunk_size



[INFO] Querying data chunk 0 to 1000000...


KeyboardInterrupt: 

Process Dask Worker process (from Nanny):
Traceback (most recent call last):
  File "/home/sebastian/.pyenv/versions/3.10.6/envs/travel_insider/lib/python3.10/site-packages/distributed/compatibility.py", line 236, in asyncio_run
    return loop.run_until_complete(main)
  File "/home/sebastian/.pyenv/versions/3.10.6/lib/python3.10/asyncio/base_events.py", line 633, in run_until_complete
    self.run_forever()
  File "/home/sebastian/.pyenv/versions/3.10.6/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
    self._run_once()
  File "/home/sebastian/.pyenv/versions/3.10.6/lib/python3.10/asyncio/base_events.py", line 1860, in _run_once
    event_list = self._selector.select(timeout)
  File "/home/sebastian/.pyenv/versions/3.10.6/lib/python3.10/selectors.py", line 469, in select
    fd_event_list = self._selector.poll(timeout, max_ev)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/sebas

In [None]:
# Final Evaluation
print("\n[INFO] Final Evaluation on Test Data...")
y_pred = auto_ml.predict(X_test)
mae_auto_ml = mean_absolute_error(y_test, y_pred)
mape_auto_ml = (mae_auto_ml / y_test.mean()) * 100
print(f"\n[RESULT] AutoML MAE: ${mae_auto_ml:.2f}, MAPE: {mape_auto_ml:.2f}%")

# Export Best Model
auto_ml.export('best_model_pipeline.py')
print("\n[INFO] Best Model Saved as best_model_pipeline.py")


✅ [2025-03-13 21:32:35] Loaded 31818 rows (total: 1031818)
✅ [2025-03-13 21:32:42] Loaded 31818 rows (total: 1063636)
✅ [2025-03-13 21:32:50] Loaded 31818 rows (total: 1095454)
✅ [2025-03-13 21:32:57] Loaded 31818 rows (total: 1127272)
✅ [2025-03-13 21:33:05] Loaded 31818 rows (total: 1159090)
✅ [2025-03-13 21:33:13] Loaded 31818 rows (total: 1190908)
✅ [2025-03-13 21:33:21] Loaded 31818 rows (total: 1222726)
✅ [2025-03-13 21:33:28] Loaded 31818 rows (total: 1254544)
✅ [2025-03-13 21:33:36] Loaded 31818 rows (total: 1286362)
✅ [2025-03-13 21:33:44] Loaded 31818 rows (total: 1318180)
✅ [2025-03-13 21:33:52] Loaded 31818 rows (total: 1349998)
✅ [2025-03-13 21:33:59] Loaded 31818 rows (total: 1381816)
✅ [2025-03-13 21:34:07] Loaded 31818 rows (total: 1413634)
✅ [2025-03-13 21:34:15] Loaded 31818 rows (total: 1445452)
✅ [2025-03-13 21:34:23] Loaded 31818 rows (total: 1477270)
✅ [2025-03-13 21:34:31] Loaded 31818 rows (total: 1509088)
✅ [2025-03-13 21:34:38] Loaded 31818 rows (total: 154090

In [20]:
# 📌 Convert Date Columns
data_query["searchDate"] = pd.to_datetime(data_query["searchDate"])
data_query["flightDate"] = pd.to_datetime(data_query["flightDate"])

# 📌 Feature Engineering
data_query["days_to_flight"] = (data_query["flightDate"] - data_query["searchDate"]).dt.days
data_query["day_of_week"] = data_query["flightDate"].dt.dayofweek
data_query["is_weekend"] = (data_query["day_of_week"] >= 5).astype(int)
data_query["is_holiday_season"] = data_query["flightDate"].dt.month.isin([6, 7, 12]).astype(int)
data_query["days_to_flight_squared"] = data_query["days_to_flight"] ** 2
data_query["flight_month"] = data_query["flightDate"].dt.month
data_query["flight_year"] = data_query["flightDate"].dt.year
data_query["search_month"] = data_query["searchDate"].dt.month
data_query["search_day"] = data_query["searchDate"].dt.day
data_query["days_to_flight_log"] = np.log1p(data_query["days_to_flight"])
data_query['isRefundable'] = data_query['isRefundable'].map({'true': 1, 'false': 0}).fillna(0).astype(int)

# Remove invalid rows
data_query = data_query[data_query["days_to_flight"] > 0]

# 📌 One-Hot Encode Categorical Features (Airports & Airlines)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = encoder.fit_transform(data_query[['startingAirport', 'destinationAirport', 'segmentsAirlineName']])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# 📌 Merge Encoded Data
data_query = data_query.reset_index(drop=True)
data_query = pd.concat([data_query, encoded_df], axis=1)

In [21]:
# 📌 Select Features
X = data_query[['days_to_flight', 'days_to_flight_squared', 'day_of_week', 'is_weekend', 'is_holiday_season',
                'flight_month', 'flight_year', 'search_month', 'search_day', 'days_to_flight_log',
                'seatsRemaining', 'isRefundable'] + list(encoded_df.columns)]
y = data_query['totalFare']

# 📌 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# 📌 Batch Training Function
def train_in_batches(model, X_train, y_train, batch_size=10000, n_iter=3):
    # Training in batches
    for i in range(n_iter):
        for start in range(0, X_train.shape[0], batch_size):
            end = min(start + batch_size, X_train.shape[0])
            X_batch, y_batch = X_train.iloc[start:end], y_train.iloc[start:end]
            model.fit(X_batch, y_batch, verbose=False)
            print(f"✅ Batch {i+1}, Training on rows {start} to {end}...")

# 📌 Initialize Models
xgb_model = XGBRegressor(
    n_estimators=500, learning_rate=0.05, max_depth=8,
    subsample=0.8, colsample_bytree=0.8, random_state=42
)


# 📌 Train Models in Batches
train_in_batches(xgb_model, X_train, y_train, batch_size=5000, n_iter=3)


✅ Batch 1, Training on rows 0 to 5000...
✅ Batch 1, Training on rows 5000 to 10000...
✅ Batch 1, Training on rows 10000 to 15000...
✅ Batch 1, Training on rows 15000 to 20000...
✅ Batch 1, Training on rows 20000 to 25000...
✅ Batch 1, Training on rows 25000 to 30000...
✅ Batch 1, Training on rows 30000 to 35000...
✅ Batch 1, Training on rows 35000 to 40000...
✅ Batch 1, Training on rows 40000 to 45000...
✅ Batch 1, Training on rows 45000 to 50000...
✅ Batch 1, Training on rows 50000 to 55000...
✅ Batch 1, Training on rows 55000 to 60000...
✅ Batch 1, Training on rows 60000 to 65000...
✅ Batch 1, Training on rows 65000 to 70000...
✅ Batch 1, Training on rows 70000 to 75000...
✅ Batch 1, Training on rows 75000 to 80000...
✅ Batch 1, Training on rows 80000 to 85000...
✅ Batch 1, Training on rows 85000 to 90000...
✅ Batch 1, Training on rows 90000 to 95000...
✅ Batch 1, Training on rows 95000 to 100000...
✅ Batch 1, Training on rows 100000 to 105000...
✅ Batch 1, Training on rows 105000 to

In [23]:
# 📌 Make Predictions
y_pred_xgb = xgb_model.predict(X_test)


# 📌 Evaluate Models
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mape_xgb = (mae_xgb / y_test.mean()) * 100


print(f"\n📊 XGBoost MAE: ${mae_xgb:.2f}, MAPE: {mape_xgb:.2f}%")



📊 XGBoost MAE: $73.06, MAPE: 27.56%


In [None]:

# 📌 Evaluate Blended Model
mae_blended = mean_absolute_error(y_test, y_pred_blended)
mape_blended = (mae_blended / y_test.mean()) * 100
print(f"\n📊 Blended Model MAE: ${mae_blended:.2f}, MAPE: {mape_blended:.2f}%")