# Importation

In [105]:
!pip install -q --upgrade catboost
!pip install -q scikit-learn
!pip install -q --upgrade pycaret

In [106]:
import os
import math
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime, timedelta
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from google.colab import drive
from pydantic import BaseModel
from pycaret.regression import setup, compare_models
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from scipy import signal
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from typing import Dict
from typing_extensions import Annotated
from sklearn.model_selection import GridSearchCV
import joblib



pio.templates.default = "plotly_dark"

In [107]:
class Config(BaseModel):
    SPLIT_BY_WTC: bool = False
    SPLIT_BY_MTOW: bool = True
    SPLIT_BY_FLOWN_DISTANCE: bool = False
    SPLIT_BY_TRAIN_TEST:bool = False

    USE_ENSEMBLE: bool = False
    PREDICT_ONLY: bool = True

    FLIGHT_PHASES_REFINEMENT: bool = False
    USE_TRAJECTORY: bool = False
    FIND_BEST_MODEL: bool = False
    FIND_BEST_PARAMETERS: bool = False

    CATBOOST_MODEL_PATH: str = "/content/drive/MyDrive/PRCModels/catboost_model_{}.pkl"
    XGBOOST_MODEL_PATH: str = "/content/drive/MyDrive/PRCModels/xgboost_model_{}.pkl"
    TRAINED_CATBOOST_MODEL_PATH: str = "/content/drive/MyDrive/PRCModels/Catboost.pkl"
    TRAINED_XGBOOST_MODEL_PATH: str = "/content/drive/MyDrive/PRCModels/XGBoost.pkl"
    TRAINED_CATBOOST_MTOW_MODEL_PATH: list[str] = ["/content/drive/MyDrive/PRCModels/catboost_model_very_low_mtow.pkl",
                                                   "/content/drive/MyDrive/PRCModels/catboost_model_low_mtow.pkl",
                                                   "/content/drive/MyDrive/PRCModels/catboost_model_medium_mtow.pkl",
                                                   "/content/drive/MyDrive/PRCModels/catboost_model_high_mtow.pkl",
                                                   "/content/drive/MyDrive/PRCModels/catboost_model_b77w.pkl",
                                                   "/content/drive/MyDrive/PRCModels/catboost_model_non_b77w.pkl",
                                                   ]
    TRAINED_XGBOOST_MTOW_MODEL_PATH: list[str] = ["/content/drive/MyDrive/PRCModels/xgboost_model_very_low_mtow.pkl",
                                                   "/content/drive/MyDrive/PRCModels/xgboost_model_low_mtow.pkl",
                                                   "/content/drive/MyDrive/PRCModels/xgboost_model_medium_mtow.pkl",
                                                   "/content/drive/MyDrive/PRCModels/xgboost_model_high_mtow.pkl",
                                                   "/content/drive/MyDrive/PRCModels/xgboost_model_b77w.pkl",
                                                   "/content/drive/MyDrive/PRCModels/xgboost_model_non_b77w.pkl",
                                                   ]


config = Config()

# Getting Data

In [108]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [109]:
# parquet_files = glob.glob('/content/drive/MyDrive/PRC/Data/*.parquet')
parquet_file_path = '/content/drive/MyDrive/PRC/Data/2022-01-01.parquet'
challenge_file_path = '/content/drive/MyDrive/PRC/Data/challenge_set.csv'
submisstion_file_path = '/content/drive/MyDrive/PRC/Data/final_submission_set.csv'

trajectory_df = pd.read_parquet(parquet_file_path) # Parquet stores dtype by default
trajectory_df = trajectory_df[:len(trajectory_df)]#//10]  # Small data for the baseline
train_df = pd.read_csv(challenge_file_path, parse_dates=['date', 'actual_offblock_time', 'arrival_time'])
test_df = pd.read_csv(submisstion_file_path, parse_dates=['date', 'actual_offblock_time', 'arrival_time']).drop(["tow"], axis=1)

In [110]:
trajectory_df.tail(3)

Unnamed: 0,flight_id,timestamp,latitude,longitude,altitude,groundspeed,track,vertical_rate,u_component_of_wind,v_component_of_wind,temperature,specific_humidity,icao24
5909616,248772010,2022-01-02 01:59:57+00:00,44.06575,-58.601074,40975.0,543.0,76.040526,-64.0,49.665905,-31.158336,210.208643,1.3e-05,248772010
5909617,248772010,2022-01-02 01:59:58+00:00,44.06575,-58.601074,40975.0,543.0,76.040526,-64.0,49.66589,-31.158134,210.208632,1.3e-05,248772010
5909618,248772010,2022-01-02 01:59:59+00:00,44.06575,-58.601074,40975.0,543.0,76.040526,-64.0,49.665875,-31.157932,210.208621,1.3e-05,248772010


In [111]:
trajectory_df.sort_values("altitude").head()

Unnamed: 0,flight_id,timestamp,latitude,longitude,altitude,groundspeed,track,vertical_rate,u_component_of_wind,v_component_of_wind,temperature,specific_humidity,icao24
4976611,248753716,2022-01-01 08:43:57+00:00,50.600886,5.94108,-1000.0,405.0,298.587902,-3008.0,-4.676256,1.139285,285.208894,0.007699,248753716
3994953,248757164,2022-01-01 20:19:19+00:00,42.919441,23.02597,-825.0,448.0,107.544832,0.0,1.886192,-0.520333,285.60048,0.005527,248757164
4075988,248765597,2022-01-01 15:02:22+00:00,40.093441,-2.839466,-800.0,433.0,191.854602,0.0,0.476404,0.213125,296.964754,0.005604,248765597
886139,248764524,2022-01-01 11:30:58+00:00,48.489928,2.258124,-800.0,412.0,197.818889,0.0,-4.016883,2.499475,279.057464,0.00891,248764524
2741367,248765281,2022-01-01 08:24:33+00:00,50.893443,4.499359,-800.0,118.0,249.727314,-640.0,0.865356,11.374119,282.421174,0.008033,248765281


In [112]:
trajectory_df.flight_id.nunique(), trajectory_df.shape

(790, (5909619, 13))

In [113]:
train_df.head(3)

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01 13:46:00+00:00,2022-01-01 15:04:56+00:00,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.0
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01 09:55:00+00:00,2022-01-01 19:37:56+00:00,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.0
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01 09:39:00+00:00,2022-01-01 19:08:13+00:00,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.0


In [114]:
train_df.flight_id.nunique(), train_df.aircraft_type.nunique()

(369013, 30)

In [115]:
test_df.head(3)

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance
0,248753821,2022-01-01,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,Istanbul Sabiha Gokcen,TR,LFLL,Lyon,FR,2022-01-01 09:44:00+00:00,2022-01-01 12:48:33+00:00,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,170,15,1122
1,248753822,2022-01-01,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,Brussels,BE,KJFK,New York JFK,US,2022-01-01 09:45:00+00:00,2022-01-01 17:49:51+00:00,A333,H,bdeeef3a675587d530de70a25d7118d2,470,15,3205
2,248754498,2022-01-01,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,Miami,US,EGLL,London Heathrow,GB,2022-01-01 01:52:00+00:00,2022-01-01 09:55:16+00:00,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,473,10,3965


In [116]:
train_df.shape, test_df.shape

((369013, 18), (158149, 17))

In [117]:
test_df.aircraft_type.unique()

array(['B738', 'A333', 'B77W', 'BCS3', 'B38M', 'A320', 'E190', 'CRJ9',
       'A20N', 'A21N', 'B789', 'B739', 'A319', 'E195', 'A321', 'A359',
       'BCS1', 'A332', 'B788', 'B763', 'AT76', 'B772', 'B737', 'A343',
       'B39M', 'B752', 'B773', 'E290'], dtype=object)

# EDA

## Understanding the Flight Metadata

### Examine the shape of the datasets

In [118]:
print("Train Flights Shape:", train_df.shape)
print("Test Flights Shape:", test_df.shape)

Train Flights Shape: (369013, 18)
Test Flights Shape: (158149, 17)


### Summary statistics

In [119]:
train_df.describe()

Unnamed: 0,flight_id,date,flight_duration,taxiout_time,flown_distance,tow
count,369013.0,369013,369013.0,369013.0,369013.0,369013.0
mean,253522000.0,2022-07-14 06:48:45.496933632,145.876779,13.489709,1021.728581,79482.257229
min,248750600.0,2022-01-01 00:00:00,8.0,0.0,19.0,14944.0
25%,251229600.0,2022-04-29 00:00:00,59.0,10.0,338.0,55836.0
50%,253620000.0,2022-07-20 00:00:00,100.0,12.0,647.0,63852.0
75%,255905900.0,2022-10-04 00:00:00,164.0,16.0,1113.0,73756.0
max,258074500.0,2022-12-31 00:00:00,1013.0,90.0,7272.0,351327.0
std,2688565.0,,139.337587,5.779555,1128.171163,53250.919631


In [120]:
train_df.describe()

Unnamed: 0,flight_id,date,flight_duration,taxiout_time,flown_distance,tow
count,369013.0,369013,369013.0,369013.0,369013.0,369013.0
mean,253522000.0,2022-07-14 06:48:45.496933632,145.876779,13.489709,1021.728581,79482.257229
min,248750600.0,2022-01-01 00:00:00,8.0,0.0,19.0,14944.0
25%,251229600.0,2022-04-29 00:00:00,59.0,10.0,338.0,55836.0
50%,253620000.0,2022-07-20 00:00:00,100.0,12.0,647.0,63852.0
75%,255905900.0,2022-10-04 00:00:00,164.0,16.0,1113.0,73756.0
max,258074500.0,2022-12-31 00:00:00,1013.0,90.0,7272.0,351327.0
std,2688565.0,,139.337587,5.779555,1128.171163,53250.919631


### Missing Values

In [121]:
train_df.isnull().sum()

Unnamed: 0,0
flight_id,0
date,0
callsign,0
adep,0
name_adep,0
country_code_adep,0
ades,0
name_ades,0
country_code_ades,0
actual_offblock_time,0


In [122]:
test_df.isnull().sum()

Unnamed: 0,0
flight_id,0
date,0
callsign,0
adep,0
name_adep,0
country_code_adep,0
ades,0
name_ades,0
country_code_ades,0
actual_offblock_time,0


### TOW Histogram

In [123]:
fig = px.histogram(train_df['tow'], title="Distribution of TakeOff Weight (TOW)")
fig.update_xaxes(title_text='TakeOff Weight (kg)')
fig.update_yaxes(title_text='Frequency')
fig.show()

### Correlation Matrix

In [124]:
corr_matrix = train_df.select_dtypes(include=[np.number]).corr()
fig = px.imshow(corr_matrix, title='Correlation Matrix')
fig.show()

### Does any flight in the test not belong in the trajectory?

In [125]:
test_flight_ids = set(test_df['flight_id'])
trajectory_flight_ids = set(trajectory_df['flight_id'])

missing_flight_ids = test_flight_ids - trajectory_flight_ids

if missing_flight_ids:
    print(f"There are {len(missing_flight_ids)} flight IDs in the test data that don't appear in the trajectory data:")
    print(missing_flight_ids)
else:
    print("All flight IDs in the test data are present in the trajectory data.")

There are 157900 flight IDs in the test data that don't appear in the trajectory data:
{254803980, 249036815, 252706836, 256376854, 250609690, 255852571, 256901148, 252182559, 254804000, 255852576, 251658275, 254804004, 255328295, 255328300, 250085426, 250609715, 253755448, 250085440, 255852609, 254804035, 254804036, 250609736, 257425488, 254804054, 249036898, 254279784, 250085493, 249561212, 256901245, 256376960, 253755526, 257949830, 250085513, 253755529, 254279831, 257949854, 254279855, 255328432, 255328436, 256377022, 253755586, 250085572, 251658439, 256901328, 252182741, 252182744, 252182749, 252707037, 254279903, 254804189, 256901345, 254279906, 250609895, 250609896, 256377065, 255328490, 250609909, 254279933, 250609922, 255852810, 253755674, 249561372, 252182814, 250085669, 253755686, 257949989, 249561389, 252707132, 249561407, 254804293, 252707148, 256377167, 253755729, 252182868, 254804310, 249561436, 249037149, 250085725, 252707165, 255328606, 255328609, 256901474, 257950044,

## Understanding the Trajectory Data

### Number of unique flights in trajectory data

In [126]:
trajectory_df['flight_id'].nunique()

790

### Missing Value

In [127]:
trajectory_df.isnull().sum()

Unnamed: 0,0
flight_id,0
timestamp,0
latitude,0
longitude,0
altitude,0
groundspeed,19432
track,19432
vertical_rate,19432
u_component_of_wind,0
v_component_of_wind,0


In [128]:
print(f"Missing: {trajectory_df.groundspeed.isnull().sum() / trajectory_df.shape[0] * 100:.2f}%")

Missing: 0.33%


Since we have a lot of data, we might consider removing null later.

### Plot sample trajectory

In [129]:
sample_flight_id = trajectory_df['flight_id'].iloc[0]
sample_flight_data = trajectory_df[trajectory_df['flight_id'] == sample_flight_id]

fig = px.line(sample_flight_data, x='timestamp', y='altitude',
            title=f'Altitude over Time for Flight ID: {sample_flight_id}')
fig.update_xaxes(title_text='Timestamp')
fig.update_yaxes(title_text='Altitude')
fig.show()

There should be some sort of sensor error. (Remove the outlier)

# Data Preprocessing

## Find duration

In [130]:
def get_duration(df):
    df['actual_offblock_time'] = pd.to_datetime(df['actual_offblock_time'])
    df['arrival_time'] = pd.to_datetime(df['arrival_time'])
    df['duration'] = (df['arrival_time'] - df[
        'actual_offblock_time']).dt.total_seconds() / 60
    return df

train_df = get_duration(train_df)
test_df = get_duration(test_df)

In [131]:
train_df.head()

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,duration
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01 13:46:00+00:00,2022-01-01 15:04:56+00:00,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.0,78.933333
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01 09:55:00+00:00,2022-01-01 19:37:56+00:00,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.0,582.933333
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01 09:39:00+00:00,2022-01-01 19:08:13+00:00,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.0,569.216667
3,248753852,2022-01-01,509dc61bb54fbab0e5406067c95603e2,LSZH,Zurich,CH,KPHL,Philadelphia,US,2022-01-01 11:04:00+00:00,2022-01-01 19:32:13+00:00,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.0,508.216667
4,248755934,2022-01-01,d0610d000dcf26b1d7bba8103ecc393d,EIDW,Dublin,IE,EGLL,London Heathrow,GB,2022-01-01 12:36:00+00:00,2022-01-01 13:44:32+00:00,A21N,M,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226,68.533333


## Can we get initial weight for each aircraft_type?

- https://contentzone.eurocontrol.int/aircraftperformance/details.aspx?ICAO=B38M

In [132]:
external_information = {
  "B738": {
                "MTOW(kg)": 70530,
                "passengers": 162,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 145,
            },
            "A333": {
                "MTOW(kg)": 230000,
                "passengers": 295,
                "ROC_Initial_Climb(ft/min)": 2000,
                "V2 (IAS)": 145,
            },
            "B77W": {
                "MTOW(kg)": 351500,
                "passengers": 365,
                "ROC_Initial_Climb(ft/min)": 2000,
                "V2 (IAS)": 149,
            },
            "B38M": {
                "MTOW(kg)": 82600,
                "passengers": 162,
                "ROC_Initial_Climb(ft/min)": 2500,
                "V2 (IAS)": 145,
            },
            "A320": {
                "MTOW(kg)": 73900,
                "passengers": 150,
                "ROC_Initial_Climb(ft/min)": 2500,
                "V2 (IAS)": 145,
            },
            "E190": {
                "MTOW(kg)": 45995,
                "passengers": 94,
                "ROC_Initial_Climb(ft/min)": 3400,
                "V2 (IAS)": 138,
            },
            "CRJ9": {
                "MTOW(kg)": 38330,
                "passengers": 80,
                "ROC_Initial_Climb(ft/min)": 2500,
                "V2 (IAS)": 140,
            },
            "A21N": {
                "MTOW(kg)": 97000,
                "passengers": 180,
                "ROC_Initial_Climb(ft/min)": 2000,
                "V2 (IAS)": 145,
            },
            "A20N": {
                "MTOW(kg)": 79000,
                "passengers": 150,
                "ROC_Initial_Climb(ft/min)": 2200,
                "V2 (IAS)": 145,
            },
            "B739": {
                "MTOW(kg)": 79015,
                "passengers": 177,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 149,
            },
            "BCS3": {
                "MTOW(kg)": 69900,
                "passengers": 120,
                "ROC_Initial_Climb(ft/min)": 3100,
                "V2 (IAS)": 165,
            },
            "E195": {
                "MTOW(kg)": 52290,
                "passengers": 100,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 140,
            },
            "A321": {
                "MTOW(kg)": 83000,
                "passengers": 185,
                "ROC_Initial_Climb(ft/min)": 2500,
                "V2 (IAS)": 145,
            },
            "A359": {
                "MTOW(kg)": 268000,
                "passengers": 314,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 150,
            },
            "A319": {
                "MTOW(kg)": 64000,
                "passengers": 124,
                "ROC_Initial_Climb(ft/min)": 2500,
                "V2 (IAS)": 135,
            },
            "A332": {
                "MTOW(kg)": 230000,
                "passengers": 253,
                "ROC_Initial_Climb(ft/min)": 2000,
                "V2 (IAS)": 145,
            },
            "B788": {
                "MTOW(kg)": 228000,
                "passengers": 210,
                "ROC_Initial_Climb(ft/min)": 2700,
                "V2 (IAS)": 165,
            },
            "B789": {
                "MTOW(kg)": 253000,
                "passengers": 406,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 165,
            },
            "BCS1": {
                "MTOW(kg)": 63100,
                "passengers": 100,
                "ROC_Initial_Climb(ft/min)": 3500,
                "V2 (IAS)": 140,
            },
            "B763": {
                "MTOW(kg)": 186880,
                "passengers": 269,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 160,
            },
            "AT76": {
                "MTOW(kg)": 23000,
                "passengers": 78,
                "ROC_Initial_Climb(ft/min)": 1350,
                "V2 (IAS)": 116,
            },
            "B772": {
                "MTOW(kg)": 247210,
                "passengers": 305,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 170,
            },
            "B737": {
                "MTOW(kg)": 66320,
                "passengers": 128,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 150,
            },
            "A343": {
                "MTOW(kg)": 275000,
                "passengers": 295,
                "ROC_Initial_Climb(ft/min)": 1400,
                "V2 (IAS)": 145,
            },
            "B39M": {
                "MTOW(kg)": 88300,
                "passengers": 178,
                "ROC_Initial_Climb(ft/min)": 2300,
                "V2 (IAS)": 150,
            },
            "B752": {
                "MTOW(kg)": 115680,
                "passengers": 200,
                "ROC_Initial_Climb(ft/min)": 3500,
                "V2 (IAS)": 145,
            },
            "B773": {
                "MTOW(kg)": 299370,
                "passengers": 368,
                "ROC_Initial_Climb(ft/min)": 3000,
                "V2 (IAS)": 168,
            },
            "E290": {
                "MTOW(kg)": 45995,
                "passengers": 94,
                "ROC_Initial_Climb(ft/min)": 3400,
                "V2 (IAS)": 138,
            },
}

### Make sure it covers all aircraft in the test data. (Not the train data)

In [133]:
unique_aircraft_types = set(test_df['aircraft_type'].unique())
external_info_keys = set(external_information.keys())

missing_aircraft_types = unique_aircraft_types - external_info_keys
missing_aircraft_types

set()

In [134]:
external_df = pd.DataFrame.from_dict(external_information, orient='index')
external_df.reset_index(inplace=True)
external_df.rename(columns={'index': 'aircraft_type'}, inplace=True)
external_df.head()

Unnamed: 0,aircraft_type,MTOW(kg),passengers,ROC_Initial_Climb(ft/min),V2 (IAS)
0,B738,70530,162,3000,145
1,A333,230000,295,2000,145
2,B77W,351500,365,2000,149
3,B38M,82600,162,2500,145
4,A320,73900,150,2500,145


In [135]:
external_df.columns

Index(['aircraft_type', 'MTOW(kg)', 'passengers', 'ROC_Initial_Climb(ft/min)',
       'V2 (IAS)'],
      dtype='object')

In [136]:
columns_to_plot = ['MTOW(kg)', 'passengers', 'ROC_Initial_Climb(ft/min)', 'V2 (IAS)']

fig = make_subplots(rows=4, cols=4,
                  shared_xaxes=True, shared_yaxes=True,
                  column_titles=columns_to_plot,
                  row_titles=columns_to_plot)

for i, col1 in enumerate(columns_to_plot):
  for j, col2 in enumerate(columns_to_plot):
      fig.add_trace(
          go.Scatter(
              x=external_df[col2],
              y=external_df[col1],
              mode='markers+text',
              # text=external_df['aircraft_type'],
              textposition='top center',
              name=f'{col1} vs {col2}',
              showlegend=False
          ),
          row=i+1, col=j+1
      )

fig.update_layout(
  title='Relationships Among Aircraft Characteristics',
  height=800,
  width=1000,
  showlegend=False,
)

fig.show()

### Merge external information

In [137]:
def merge_external_info(df, external_df):
  return pd.merge(df, external_df, on='aircraft_type', how='left')

train_df = merge_external_info(train_df, external_df)
test_df = merge_external_info(test_df, external_df)

### Filter train data to only external information we have

In [138]:
train_df = train_df[train_df['aircraft_type'].isin(external_information.keys())]
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
train_df.isnull().sum(), test_df.isnull().sum()

(flight_id                    0
 date                         0
 callsign                     0
 adep                         0
 name_adep                    0
 country_code_adep            0
 ades                         0
 name_ades                    0
 country_code_ades            0
 actual_offblock_time         0
 arrival_time                 0
 aircraft_type                0
 wtc                          0
 airline                      0
 flight_duration              0
 taxiout_time                 0
 flown_distance               0
 tow                          0
 duration                     0
 MTOW(kg)                     0
 passengers                   0
 ROC_Initial_Climb(ft/min)    0
 V2 (IAS)                     0
 dtype: int64,
 flight_id                    0
 date                         0
 callsign                     0
 adep                         0
 name_adep                    0
 country_code_adep            0
 ades                         0
 name_ades               

## Clean df with isolation forest

In [139]:
def clean_dataframe_with_isolation_forest(df, contamination=0.01):
  print("Shape before clean: ", df.shape)
  cleaned_df = df.copy()
  cleaned_df = cleaned_df.dropna()

  numeric_columns = cleaned_df.select_dtypes(include=[np.number]).columns

  # Standardize the numeric columns
  scaler = StandardScaler()
  scaled_data = scaler.fit_transform(cleaned_df[numeric_columns])

  # Use Isolation Forest for outlier detection
  iso_forest = IsolationForest(contamination=contamination, random_state=42)
  outlier_labels = iso_forest.fit_predict(scaled_data)

  # Keep only the non-outlier data points
  cleaned_df = cleaned_df[outlier_labels == 1].reset_index(drop=True)

  print("Shape after clean: ", cleaned_df.shape)
  return cleaned_df

In [140]:
def clean_trajectory_with_isolation_forest(df, contamination=0.01):
  print("Shape before clean: ", df.shape)
  cleaned_df = df.copy()

  # Filter rows based on altitude (From the ISA code)
  cleaned_df = cleaned_df[(cleaned_df['altitude'] >= 0) & (cleaned_df['altitude'] <= 47000)]
  cleaned_df = cleaned_df.dropna()

  numeric_columns = cleaned_df.select_dtypes(include=[np.number]).columns

  def clean_group(group):
      if len(group) <= 1:
          return group

      scaler = StandardScaler()
      scaled_data = scaler.fit_transform(group[numeric_columns])

      iso_forest = IsolationForest(contamination=contamination, random_state=42)
      outlier_labels = iso_forest.fit_predict(scaled_data)

      return group[outlier_labels == 1]

  # Apply the cleaning function to each flight_id group
  cleaned_df = cleaned_df.groupby('flight_id').apply(clean_group)

  # Reset index once before returning
  cleaned_df = cleaned_df.reset_index(drop=True)

  print("Shape after clean: ", cleaned_df.shape)
  return cleaned_df

In [141]:
train_df = clean_dataframe_with_isolation_forest(train_df)
# test_df = clean_dataframe_with_isolation_forest(test_df)
# trajectory_df = clean_trajectory_with_isolation_forest(trajectory_df)

Shape before clean:  (369010, 23)
Shape after clean:  (365319, 23)


In [142]:
sample_flight_id = random.choice(trajectory_df['flight_id'].unique())
sample_flight_data = trajectory_df[trajectory_df['flight_id'] == sample_flight_id]

fig = px.line(sample_flight_data, x='timestamp', y='altitude',
            title=f'Altitude over Time for Flight ID: {sample_flight_id}')
fig.update_xaxes(title_text='Timestamp')
fig.update_yaxes(title_text='Altitude')

fig.show()

## Find Thrust - Drag

In [143]:
g = 32.17405  # Gravitational acceleration (ft/s^2)
kt_to_ft_per_sec = 1.6878098571012  # knots to ft/s
m_to_ft = 3.28084

### Find the standard temperature (ISA)

In [144]:
def isa(alt):
  T0 = 288.15
  p0 = 101325
  rho0 = 1.225
  a0 = 340.294
  k = 1.4
  R = 287.05287
  betabelow = -0.0065
  trop = 11000

  if alt < 0 or alt > 47000:
      print("Altitude must be in [0, 47000]")
      return None, None

  if alt == 0:
      return T0, rho0

  if 0 < alt <= trop:
      temperature = T0 + betabelow * alt
      pressure = p0 * (temperature / T0) ** ((-1) * g / (betabelow * R))
  elif trop < alt < 47000:
      temperature = T0 + betabelow * trop
      pressure = (p0 * (temperature / T0) ** ((-1) * g / (betabelow * R))) * math.exp(-g * (alt - trop) / (R * temperature))

  density = pressure / (R * temperature)
  return temperature, density

### Find V

In [145]:
def calculate_true_airspeed(row):
  GS = row['groundspeed']
  track = np.radians(row['track'])
  u_wind = row['u_component_of_wind']
  v_wind = row['v_component_of_wind']

  V = np.sqrt((GS * np.sin(track) - u_wind)**2 + (GS * np.cos(track) - v_wind)**2) * ktstofts
  return V

### Find dh/dt

In [146]:
def calculate_vertical_speed(row):
  return row['vertical_rate'] / 60  # Convert from ft/min to ft/s

### Find temperature deviation (Δt)

In [147]:
def calculate_temp_deviation(row):
  isa_temp, _ = isa(row['altitude'])
  return row['temperature'] - isa_temp

### Find horizontal acceleration (dV/dt)

In [148]:
def calculate_horizontal_acceleration(group):
  group = group.sort_values('timestamp')
  group['V'] = group.apply(calculate_true_airspeed, axis=1)
  group['dV_dt'] = group['V'].diff() / group['timestamp'].diff().dt.total_seconds() # .diff() = current - previous, .diff(-1) = current - next
  return group


### Find wind acceleration (dWi/dt)

In [149]:
def calculate_wind_acceleration(group):
  group = group.sort_values('timestamp')
  group['W_long'] = (group['u_component_of_wind'] * np.sin(np.radians(group['track'])) +
                     group['v_component_of_wind'] * np.cos(np.radians(group['track']))) * ktstofts
  group['dWi_dt'] = group['W_long'].diff() / group['timestamp'].diff().dt.total_seconds()
  return group

### Segment the Trajectory into Flight Phases
Aggregating the trajectory data by computing averages or maxima over the entire flight might dilute the crucial information specific to the takeoff phase. Since flights consist of multiple phases (takeoff, climb, cruise, descent, landing), combining all phases might mask the effects that are directly influenced by ATOW.

So, we will focus on the `takeoff and initial climb phases`because:

- During takeoff, the aircraft accelerates both horizontally and vertically. This acceleration is directly influenced by the mass.
- Pilots use maximum or near-maximum thrust during takeoff, and any variations in T-D are more likely due to differences in mass rather than throttle settings.
- In cruise, the aircraft's acceleration is minimal, and variations in T-D are more affected by changes in drag due to atmospheric conditions rather than mass.


In [150]:
def identify_flight_phases(group, flight_phases_refinement=False):
  group = group.sort_values('timestamp').reset_index(drop=True)
  group['altitude_diff'] = group['altitude'].diff()

  # Applies a Savitzky-Golay filter to smooth the altitude profile, reducing noise.
  altitude_smooth = signal.savgol_filter(group['altitude'],
                                         window_length=min(21, len(group) // 2 * 2 + 1),
                                         polyorder=3)

  # Calculate the rate of climb (ROC) (gradient[i] = (f[i+1] - f[i-1]) / (x[i+1] - x[i-1]))
  group['ROC'] = np.gradient(altitude_smooth, group['timestamp'].astype(int) / 10**9)
  max_altitude = group['altitude'].max()
  takeoff_end = group[group['altitude'] > group['altitude'].quantile(0.1)].index[0]
  top_of_climb = group[group['altitude'] > max_altitude * 0.95].index[0]

  takeoff_phase = group.loc[:takeoff_end]
  initial_climb_phase = group.loc[takeoff_end:top_of_climb]
  cruise_phase = group.loc[top_of_climb:]

  if flight_phases_refinement:
    # Refine takeoff phase (focus on the most significant part of the takeoff)
    # The assumption is that the aircraft's weight (ATOW) will have the most significant impact during this high-ROC portion of takeoff.
    takeoff_phase = takeoff_phase[takeoff_phase['ROC'] > takeoff_phase['ROC'].quantile(0.5)]

    # Refine initial climb phase
    initial_climb_phase = initial_climb_phase[
        (initial_climb_phase['ROC'] > initial_climb_phase['ROC'].quantile(0.25)) &
        (initial_climb_phase['altitude'] < max_altitude * 0.8)
    ]

  return takeoff_phase, initial_climb_phase, cruise_phase

### Thrust - Drag (T-D)

In [151]:
def calculate_thrust_minus_drag_for_phases(trajectory_df):
  td_list = []

  for flight_id, group in tqdm(trajectory_df.groupby('flight_id'), desc="Calculating T-D for each flight"):
      takeoff_phase, initial_climb_phase, _ = identify_flight_phases(group, FLIGHT_PHASES_REFINEMENT)

      # You can combine takeoff and initial climb if desired
      relevant_phase = pd.concat([takeoff_phase, initial_climb_phase])

      if relevant_phase.empty:
          continue

      # Perform calculations only on the relevant phase
      relevant_phase['V'] = relevant_phase.apply(calculate_true_airspeed, axis=1)
      relevant_phase['dh_dt'] = relevant_phase.apply(calculate_vertical_speed, axis=1)
      relevant_phase['delta_t'] = relevant_phase.apply(calculate_temp_deviation, axis=1)

      relevant_phase = calculate_horizontal_acceleration(relevant_phase)
      relevant_phase = calculate_wind_acceleration(relevant_phase)

      # Calculate T - D
      relevant_phase['T_minus_D'] = (
          g * relevant_phase['dh_dt'] / relevant_phase['V'] * (relevant_phase['temperature'] / (relevant_phase['temperature'] - relevant_phase['delta_t']))
          + relevant_phase['dV_dt'] + relevant_phase['dWi_dt']
      )

      # Add flight ID for linking later
      relevant_phase['flight_id'] = flight_id

      # Collect the relevant data
      td_list.append(relevant_phase)

  # Concatenate all flights' data
  td_df = pd.concat(td_list, ignore_index=True)

  # Drop any rows with missing T_minus_D values
  td_df = td_df.dropna(subset=['T_minus_D'])

  return td_df

## Aggregate Features (Still not find the important variables)

In [152]:
def aggregate_features(td_df):
    numerical_cols = td_df.select_dtypes(include=np.number).columns.tolist()
    if 'flight_id' in numerical_cols:
        numerical_cols.remove('flight_id')
    agg_funcs = {col: ['mean', 'max', 'std'] for col in numerical_cols}
    aggregated_features = td_df.groupby('flight_id').agg(agg_funcs).reset_index()
    aggregated_features.columns = ['_'.join(col).rstrip('_') for col in aggregated_features.columns.values]

    return aggregated_features

## Merging Datasets

In [153]:
if config.USE_TRAJECTORY:
  td_df = calculate_thrust_minus_drag_for_phases(trajectory_df, config.FLIGHT_PHASES_REFINEMENT)
  aggregated_features = aggregate_features(td_df)
  merged_df = pd.merge(train_df, aggregated_features, left_on='flight_id', right_on='flight_id', how='inner')
else:
  merged_df = train_df.copy()
merged_df.head()

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,...,airline,flight_duration,taxiout_time,flown_distance,tow,duration,MTOW(kg),passengers,ROC_Initial_Climb(ft/min),V2 (IAS)
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01 13:46:00+00:00,...,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.0,78.933333,73900.0,150.0,2500.0,145.0
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01 09:55:00+00:00,...,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.0,582.933333,247210.0,305.0,3000.0,170.0
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01 09:39:00+00:00,...,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.0,569.216667,230000.0,295.0,2000.0,145.0
3,248753852,2022-01-01,509dc61bb54fbab0e5406067c95603e2,LSZH,Zurich,CH,KPHL,Philadelphia,US,2022-01-01 11:04:00+00:00,...,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.0,508.216667,228000.0,210.0,2700.0,165.0
4,248755934,2022-01-01,d0610d000dcf26b1d7bba8103ecc393d,EIDW,Dublin,IE,EGLL,London Heathrow,GB,2022-01-01 12:36:00+00:00,...,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226,68.533333,97000.0,180.0,2000.0,145.0


In [154]:
merged_df["MTOW(kg)"].describe()

Unnamed: 0,MTOW(kg)
count,365319.0
mean,94514.174037
std,65033.169358
min,23000.0
25%,69900.0
50%,73900.0
75%,83000.0
max,351500.0


In [155]:
merged_df.aircraft_type.nunique(), test_df.aircraft_type.nunique()

(28, 28)

## Normalization

In [156]:
def normalize_dataframe(df, exclude_columns=None):
    df_normalized = df.copy()

    if exclude_columns is None:
        exclude_columns = []

    columns_to_normalize = df.select_dtypes(include=[np.number]).columns.difference(exclude_columns)
    scaler = StandardScaler()
    df_normalized[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

    return df_normalized

In [157]:
exclude_cols = ['MTOW(kg)', 'flight_id', 'tow', 'date', 'callsign', 'adep', 'ades', 'actual_offblock_time', 'arrival_time', 'aircraft_type', 'wtc', 'airline']

if config.SPLIT_BY_FLOWN_DISTANCE:
  exclude_cols.extend(['flown_distance'])

norm_df = normalize_dataframe(merged_df, exclude_columns=exclude_cols)
norm_df.head()

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,...,airline,flight_duration,taxiout_time,flown_distance,tow,duration,MTOW(kg),passengers,ROC_Initial_Climb(ft/min),V2 (IAS)
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01 13:46:00+00:00,...,a73f82288988b79be490c6322f4c32ed,-0.616852,0.793523,-0.630854,54748.0,-0.573622,73900.0,-0.222165,-0.084663,-0.028796
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01 09:55:00+00:00,...,5543e4dc327359ffaf5b9c0e6faaf0e1,3.336506,-0.079192,3.093692,185441.0,3.284415,247210.0,2.167263,1.18297,3.077931
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01 09:39:00+00:00,...,8be5c854fd664bcb97fb543339f74770,3.212236,0.269894,2.686801,230396.0,3.179416,230000.0,2.013107,-1.352296,-0.028796
3,248753852,2022-01-01,509dc61bb54fbab0e5406067c95603e2,LSZH,Zurich,CH,KPHL,Philadelphia,US,2022-01-01 11:04:00+00:00,...,5543e4dc327359ffaf5b9c0e6faaf0e1,2.769522,-0.428278,2.530009,157615.0,2.712471,228000.0,0.702775,0.422391,2.456586
4,248755934,2022-01-01,d0610d000dcf26b1d7bba8103ecc393d,EIDW,Dublin,IE,EGLL,London Heathrow,GB,2022-01-01 12:36:00+00:00,...,a73f82288988b79be490c6322f4c32ed,-0.663454,0.095351,-0.646244,70318.447226,-0.653233,97000.0,0.240305,-1.352296,-0.028796


In [158]:
test_df = normalize_dataframe(test_df, exclude_columns=exclude_cols)
test_df.head()

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,...,wtc,airline,flight_duration,taxiout_time,flown_distance,duration,MTOW(kg),passengers,ROC_Initial_Climb(ft/min),V2 (IAS)
0,248753821,2022-01-01,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,Istanbul Sabiha Gokcen,TR,LFLL,Lyon,FR,2022-01-01 09:44:00+00:00,...,M,6351ec1b849adacc0cbb3b1313d8d39b,0.172633,0.2607,0.088191,0.177912,70530,-0.066785,1.176746,-0.041601
1,248753822,2022-01-01,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,Brussels,BE,KJFK,New York JFK,US,2022-01-01 09:45:00+00:00,...,H,bdeeef3a675587d530de70a25d7118d2,2.321494,0.2607,1.930059,2.301228,230000,1.888052,-1.340051,-0.041601
2,248754498,2022-01-01,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,Miami,US,EGLL,London Heathrow,GB,2022-01-01 01:52:00+00:00,...,H,5543e4dc327359ffaf5b9c0e6faaf0e1,2.342982,-0.604563,2.602079,2.290033,351500,2.916913,-1.340051,0.451127
3,248763650,2022-01-01,35f7721f68bf85128195547ae38b0f04,EBBR,Brussels,BE,LEAL,Alicante,ES,2022-01-01 12:02:00+00:00,...,M,f53c55b5cf0cbb3be755bf50df6fa52d,-0.164022,-0.777615,-0.194765,-0.194122,70530,-0.066785,1.176746,-0.041601
4,248763651,2022-01-01,eb56918bee9bc5204624186b9bcc4391,LSZH,Zurich,CH,LFPG,Paris Charles de Gaulle,FR,2022-01-01 12:03:00+00:00,...,M,2d5def0a5a844b343ba1b7cc9cb28fa9,-0.643934,-0.43151,-0.645726,-0.655129,69900,-0.684102,1.428425,2.422041


## Categorical Encoding

In [159]:
def encode_categorical_features(df, preserve_columns=None):
  if preserve_columns is None:
      preserve_columns = []

  df_encoded = df.copy()

  categorical_col = [
      "adep",
      "country_code_adep",
      "ades",
      "country_code_ades",
      "aircraft_type",
      "airline",
  ]

  encoder = LabelEncoder()

  for col in categorical_col:
      df_encoded[col + "_encoded"] = encoder.fit_transform(df_encoded[col])
      # Only drop columns that are not in preserve_columns
      if col not in preserve_columns:
          df_encoded = df_encoded.drop(columns=[col])

  oneHot_col = ["wtc"]
  df_encoded = pd.get_dummies(df_encoded, columns=oneHot_col)

  df_encoded["wtc_M"] = df_encoded["wtc_M"].astype(int)
  df_encoded["wtc_H"] = df_encoded["wtc_H"].astype(int)

  return df_encoded

In [160]:
encoded_df = encode_categorical_features(norm_df, preserve_columns=['aircraft_type'])
test_df = encode_categorical_features(test_df, preserve_columns=['aircraft_type'])

## Feature Selection

### Drop Features

In [161]:
def drop_features(norm_df, final_drop=False):
  drop_cols = [
      "date",
      "callsign",
      "name_adep",
      "name_ades",
      "actual_offblock_time",
      "arrival_time",
  ]

  # If this is the final drop (after MTOW splitting), add preserved columns to drop
  if final_drop:
      drop_cols.extend(['aircraft_type'])

  dropped_df = norm_df.drop(columns=drop_cols, errors='ignore')
  return dropped_df

In [162]:
drop_df = drop_features(encoded_df, final_drop=False)

In [163]:
test_df = drop_features(test_df, final_drop=False)
test_df.head()

Unnamed: 0,flight_id,aircraft_type,flight_duration,taxiout_time,flown_distance,duration,MTOW(kg),passengers,ROC_Initial_Climb(ft/min),V2 (IAS),adep_encoded,country_code_adep_encoded,ades_encoded,country_code_ades_encoded,aircraft_type_encoded,airline_encoded,wtc_H,wtc_M
0,248753821,B738,0.172633,0.2607,0.088191,0.177912,70530,-0.066785,1.176746,-0.041601,347,84,194,23,13,11,0,1
1,248753822,A333,2.321494,0.2607,1.930059,2.301228,230000,1.888052,-1.340051,-0.041601,12,9,145,65,6,16,1,0
2,248754498,B77W,2.342982,-0.604563,2.602079,2.290033,351500,2.916913,-1.340051,0.451127,168,88,52,24,19,8,1,0
3,248763650,B738,-0.164022,-0.777615,-0.194765,-0.194122,70530,-0.066785,1.176746,-0.041601,12,9,168,21,13,21,0,1
4,248763651,BCS3,-0.643934,-0.43151,-0.645726,-0.655129,69900,-0.684102,1.428425,2.422041,302,14,204,23,23,2,0,1


### Select Feature

In [164]:
def feature_selection(
  X_train,
  y_train,
  X_test,
  k=15
):
  """Selects the best features from the training data and applies them to the test data."""

  selector = SelectKBest(score_func=f_regression, k=k)

  selector.fit(X_train, y_train)
  selected_features = X_train.columns[selector.get_support()]

  selected_features_df = pd.DataFrame(
      selected_features, columns=["Selected Features"]
  )

  feature_scores_df = pd.DataFrame(
      {
          "Feature": X_train.columns,
          "Score": selector.scores_,
          "p-value": selector.pvalues_,
      }
  )

  X_train_selected = selector.transform(X_train)
  X_test_selected = selector.transform(X_test)

  X_train_selected = pd.DataFrame(
      X_train_selected, columns=selected_features, index=X_train.index
  )
  X_test_selected = pd.DataFrame(
      X_test_selected, columns=selected_features, index=X_test.index
  )

  return X_train_selected, X_test_selected, selected_features_df, feature_scores_df

In [165]:
def process_category_split(X, y, category_name):
  """Helper function to perform train-test split and print shapes"""
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=42
  )

  print(f"{category_name} Train shape:", X_train.shape)
  print(f"{category_name} Test shape:", X_test.shape)
  print(f"{category_name} y Train shape:", y_train.shape)
  print(f"{category_name} y Test shape:", y_test.shape)
  print()

  return X_train, X_test, y_train, y_test

In [166]:
def process_feature_selection(X_train, y_train, X_test, category_name):
  """Helper function to perform feature selection and print results"""
  X_train_selected, X_test_selected, selected_features_df, feature_scores_df = feature_selection(
      X_train, y_train, X_test
  )
  print(f"Selected Features for {category_name}:\n", selected_features_df)
  print()

  return X_train_selected, X_test_selected, selected_features_df, feature_scores_df

### Split the data by wtc

In [167]:
def split_wtc(df):
  X = df.drop(['tow'], axis=1)
  y = df[['flight_id', 'tow']]

  X_wtc_M = X[X['wtc_M'] == 1].drop(["wtc_H", "wtc_M"], axis=1).reset_index(drop=True)
  X_wtc_H = X[X['wtc_H'] == 1].drop(["wtc_H", "wtc_M"], axis=1).reset_index(drop=True)

  y_wtc_M = y[y['flight_id'].isin(X_wtc_M.flight_id.unique())].drop('flight_id', axis=1).reset_index(drop=True)
  y_wtc_H = y[y['flight_id'].isin(X_wtc_H.flight_id.unique())].drop('flight_id', axis=1).reset_index(drop=True)

  X_wtc_M = X_wtc_M.drop('flight_id', axis=1).reset_index(drop=True)
  X_wtc_H = X_wtc_H.drop('flight_id', axis=1).reset_index(drop=True)

  return X_wtc_M, X_wtc_H, y_wtc_M, y_wtc_H

In [168]:
if config.SPLIT_BY_WTC:
  X_wtc_M, X_wtc_H, y_wtc_M, y_wtc_H = split_wtc(drop_df)

  X_wtc_H_train, X_wtc_H_test, y_wtc_H_train, y_wtc_H_test = train_test_split(
    X_wtc_H, y_wtc_H, test_size=0.2, random_state=42
  )

  X_wtc_M_train, X_wtc_M_test, y_wtc_M_train, y_wtc_M_test = train_test_split(
    X_wtc_M, y_wtc_M, test_size=0.2, random_state=42
  )

  print("X_wtc_H_train shape:", X_wtc_H_train.shape)
  print("X_wtc_H_test shape:", X_wtc_H_test.shape)
  print("y_wtc_H_train shape:", y_wtc_H_train.shape)
  print("y_wtc_H_test shape:", y_wtc_H_test.shape)

  print("X_wtc_M_train shape:", X_wtc_M_train.shape)
  print("X_wtc_M_test shape:", X_wtc_M_test.shape)
  print("y_wtc_M_train shape:", y_wtc_M_train.shape)
  print("y_wtc_M_test shape:", y_wtc_M_test.shape)

  X_train_selected_H, X_test_selected_H, selected_features_df_H, feature_scores_df_H = feature_selection(X_wtc_H_train, y_wtc_H_train, X_wtc_H_test)
  X_train_selected_M, X_test_selected_M, selected_features_df_M, feature_scores_df_M = feature_selection(X_wtc_M_train, y_wtc_M_train, X_wtc_M_test)

  print("Selected Features for wtc_H:\n", selected_features_df_H)
  print("Feature Scores for wtc_H:\n", feature_scores_df_H)

  print("Selected Features for wtc_M:\n", selected_features_df_M)
  print("Feature Scores for wtc_M:\n", feature_scores_df_M)

### Split the data by mtow

In [169]:
def split_by_mtow(df, is_test=False):

  def process_subset(subset_df):

      subset_df = subset_df.copy()
      scaler = StandardScaler()
      subset_df['MTOW(kg)'] = scaler.fit_transform(subset_df[['MTOW(kg)']])

      if not is_test:
          X = subset_df.drop(['tow', 'MTOW_range'], axis=1).reset_index(drop=True)
          y = subset_df[['flight_id', 'tow']].drop('flight_id', axis=1).reset_index(drop=True)
          return X, y
      else:
          X = subset_df.drop(['MTOW_range'], axis=1).reset_index(drop=True)
          return X, None

  def get_category_data(data_df, category_col, category_value):
      """Helper function to filter data by category and process it"""
      category_df = data_df[data_df[category_col] == category_value]
      return process_subset(category_df)

  narrow_body_df = df[df["MTOW(kg)"] <= 115680].copy()
  wide_body_df = df[df["MTOW(kg)"] > 115680].copy()

  narrow_body_df['MTOW_range'] = pd.qcut(narrow_body_df['MTOW(kg)'],
                                          q=4,
                                          labels=['Very Low', 'Low', 'Medium', 'High'])

  wide_body_df['MTOW_range'] = wide_body_df['aircraft_type'].apply(
      lambda x: 'B77W' if x == 'B77W' else 'NonB77W'
  )

  X_very_low, y_very_low = get_category_data(narrow_body_df, 'MTOW_range', 'Very Low')
  X_low, y_low = get_category_data(narrow_body_df, 'MTOW_range', 'Low')
  X_medium, y_medium = get_category_data(narrow_body_df, 'MTOW_range', 'Medium')
  X_high, y_high = get_category_data(narrow_body_df, 'MTOW_range', 'High')

  X_non_b77w, y_non_b77w = get_category_data(wide_body_df, 'MTOW_range', 'NonB77W')
  X_b77w, y_b77w = get_category_data(wide_body_df, 'MTOW_range', 'B77W')

  return (X_very_low, X_low, X_medium, X_high, X_non_b77w, X_b77w,
          y_very_low, y_low, y_medium, y_high, y_non_b77w, y_b77w)

In [170]:
# temp_1, temp_2 = split_by_mtow(merged_df)
# temp_1.head()

In [171]:
# temp_1.groupby('MTOW_range').agg({'MTOW(kg)': ['min', 'max', 'mean', 'count']})

In [172]:
# temp_2.groupby('MTOW_range').agg({'MTOW(kg)': ['min', 'max', 'mean', 'count']})

In [173]:
if config.SPLIT_BY_MTOW:
  (X_very_low, X_low, X_medium, X_high, X_non_b77w, X_b77w,
   y_very_low, y_low, y_medium, y_high, y_non_b77w, y_b77w) = split_by_mtow(drop_df)

  X_very_low = drop_features(X_very_low, final_drop=True)
  X_low = drop_features(X_low, final_drop=True)
  X_medium = drop_features(X_medium, final_drop=True)
  X_high = drop_features(X_high, final_drop=True)
  X_non_b77w = drop_features(X_non_b77w, final_drop=True)
  X_b77w = drop_features(X_b77w, final_drop=True)

  # Very Low MTOW
  X_very_low_train, X_very_low_test, y_very_low_train, y_very_low_test = process_category_split(
      X_very_low, y_very_low, "Very Low MTOW"
  )

  # Low MTOW
  X_low_train, X_low_test, y_low_train, y_low_test = process_category_split(
      X_low, y_low, "Low MTOW"
  )

  # Medium MTOW
  X_medium_train, X_medium_test, y_medium_train, y_medium_test = process_category_split(
      X_medium, y_medium, "Medium MTOW"
  )

  # High MTOW
  X_high_train, X_high_test, y_high_train, y_high_test = process_category_split(
      X_high, y_high, "High MTOW"
  )

  # Non-B77W
  X_non_b77w_train, X_non_b77w_test, y_non_b77w_train, y_non_b77w_test = process_category_split(
      X_non_b77w, y_non_b77w, "Non-B77W"
  )

  # B77W
  X_b77w_train, X_b77w_test, y_b77w_train, y_b77w_test = process_category_split(
      X_b77w, y_b77w, "B77W"
  )

  # Feature selection for each category
  X_train_selected_very_low, X_test_selected_very_low, selected_features_df_very_low, feature_scores_df_very_low = process_feature_selection(
      X_very_low_train, y_very_low_train, X_very_low_test, "Very Low MTOW"
  )

  X_train_selected_low, X_test_selected_low, selected_features_df_low, feature_scores_df_low = process_feature_selection(
      X_low_train, y_low_train, X_low_test, "Low MTOW"
  )

  X_train_selected_medium, X_test_selected_medium, selected_features_df_medium, feature_scores_df_medium = process_feature_selection(
      X_medium_train, y_medium_train, X_medium_test, "Medium MTOW"
  )

  X_train_selected_high, X_test_selected_high, selected_features_df_high, feature_scores_df_high = process_feature_selection(
      X_high_train, y_high_train, X_high_test, "High MTOW"
  )

  X_train_selected_non_b77w, X_test_selected_non_b77w, selected_features_df_non_b77w, feature_scores_df_non_b77w = process_feature_selection(
      X_non_b77w_train, y_non_b77w_train, X_non_b77w_test, "Non-B77W"
  )

  X_train_selected_b77w, X_test_selected_b77w, selected_features_df_b77w, feature_scores_df_b77w = process_feature_selection(
      X_b77w_train, y_b77w_train, X_b77w_test, "B77W"
  )

Very Low MTOW Train shape: (66157, 17)
Very Low MTOW Test shape: (16540, 17)
Very Low MTOW y Train shape: (66157, 1)
Very Low MTOW y Test shape: (16540, 1)

Low MTOW Train shape: (103353, 17)
Low MTOW Test shape: (25839, 17)
Low MTOW y Train shape: (103353, 1)
Low MTOW y Test shape: (25839, 1)

Medium MTOW Train shape: (30355, 17)
Medium MTOW Test shape: (7589, 17)
Medium MTOW y Train shape: (30355, 1)
Medium MTOW y Test shape: (7589, 1)

High MTOW Train shape: (53880, 17)
High MTOW Test shape: (13471, 17)
High MTOW y Train shape: (53880, 1)
High MTOW y Test shape: (13471, 1)

Non-B77W Train shape: (33179, 17)
Non-B77W Test shape: (8295, 17)
Non-B77W y Train shape: (33179, 1)
Non-B77W y Test shape: (8295, 1)

B77W Train shape: (5328, 17)
B77W Test shape: (1333, 17)
B77W y Train shape: (5328, 1)
B77W y Test shape: (1333, 1)

Selected Features for Very Low MTOW:
             Selected Features
0                   flight_id
1             flight_duration
2                taxiout_time
3     

In [174]:
if config.SPLIT_BY_MTOW and config.PREDICT_ONLY:
  (X_predict_very_low, X_predict_low, X_predict_medium, X_predict_high, X_predict_non_b77w, X_predict_b77w, *_) = split_by_mtow(test_df, is_test=True)

  X_predict_very_low = drop_features(X_predict_very_low, final_drop=True)
  X_predict_low = drop_features(X_predict_low, final_drop=True)
  X_predict_medium = drop_features(X_predict_medium, final_drop=True)
  X_predict_high = drop_features(X_predict_high, final_drop=True)
  X_predict_non_b77w = drop_features(X_predict_non_b77w, final_drop=True)
  X_predict_b77w = drop_features(X_predict_b77w, final_drop=True)

  X_predict_selected_very_low = X_predict_very_low[X_train_selected_very_low.columns]
  X_predict_selected_low = X_predict_low[X_train_selected_low.columns]
  X_predict_selected_medium = X_predict_medium[X_train_selected_medium.columns]
  X_predict_selected_high = X_predict_high[X_train_selected_high.columns]
  X_predict_selected_non_b77w = X_predict_non_b77w[X_train_selected_non_b77w.columns]
  X_predict_selected_b77w = X_predict_b77w[X_train_selected_b77w.columns]

### Split the data by distance

In [175]:
def split_by_flown_distance(df):
  """
  Splits the DataFrame into narrow-body and wide-body aircraft,
  then separates each category into X (features) and y (target) components.
  For narrow-body, splits by flown_distance instead of MTOW.
  """
  def process_subset(subset_df):
      """Helper function to process X and y for a subset of data"""
      subset_df = subset_df.copy()
      scaler = StandardScaler()
      subset_df[['MTOW(kg)', 'flown_distance']] = scaler.fit_transform(subset_df[['MTOW(kg)', 'flown_distance']])

      X = subset_df.drop(['tow', 'flown_distance_range'], axis=1).reset_index(drop=True)
      y = subset_df[['flight_id', 'tow']].drop('flight_id', axis=1).reset_index(drop=True)
      return X, y

  def get_category_data(data_df, category_col, category_value):
      """Helper function to filter data by category and process it"""
      category_df = data_df[data_df[category_col] == category_value]
      return process_subset(category_df)

  narrow_body_df = df[df["MTOW(kg)"] <= 115680].copy()
  wide_body_df = df[df["MTOW(kg)"] > 115680].copy()

  bins = [0, 500, 1000, float('inf')]
  labels = ['0-500', '500-1000', '>1000']
  narrow_body_df['flown_distance_range'] = pd.cut(narrow_body_df['flown_distance'], bins=bins, labels=labels, right=False)

  wide_body_df['flown_distance_range'] = wide_body_df['aircraft_type'].apply(
      lambda x: 'B77W' if x == 'B77W' else 'NonB77W'
  )
  # return narrow_body_df, wide_body_df

  X_0_500, y_0_500 = get_category_data(narrow_body_df, 'flown_distance_range', '0-500')
  X_500_1000, y_500_1000 = get_category_data(narrow_body_df, 'flown_distance_range', '500-1000')
  X_above_1000, y_above_1000 = get_category_data(narrow_body_df, 'flown_distance_range', '>1000')


  X_non_b77w, y_non_b77w = get_category_data(wide_body_df, 'flown_distance_range', 'NonB77W')
  X_b77w, y_b77w = get_category_data(wide_body_df, 'flown_distance_range', 'B77W')

  return (X_0_500, X_500_1000, X_above_1000, X_non_b77w, X_b77w,
          y_0_500, y_500_1000, y_above_1000, y_non_b77w, y_b77w)


In [176]:
# temp_1, temp_2 = split_by_flown_distance(merged_df)
# temp_1.head()

In [177]:
# temp_2.groupby('flown_distance_range').agg({'flown_distance': ['min', 'max', 'mean', 'count']})

In [178]:
if config.SPLIT_BY_FLOWN_DISTANCE:
  (X_0_500, X_500_1000, X_above_1000, X_non_b77w, X_b77w,
   y_0_500, y_500_1000, y_above_1000, y_non_b77w, y_b77w) = split_by_flown_distance(drop_df)

  X_0_500 = drop_features(X_0_500, final_drop=True)
  X_500_1000 = drop_features(X_500_1000, final_drop=True)
  X_above_1000 = drop_features(X_above_1000, final_drop=True)
  X_non_b77w = drop_features(X_non_b77w, final_drop=True)
  X_b77w = drop_features(X_b77w, final_drop=True)

  # 0-500 Flown Distance
  X_0_500_train, X_0_500_test, y_0_500_train, y_0_500_test = process_category_split(
      X_0_500, y_0_500, "0-500 Flown Distance"
  )

  # 500-1000 Flown Distance
  X_500_1000_train, X_500_1000_test, y_500_1000_train, y_500_1000_test = process_category_split(
      X_500_1000, y_500_1000, "500-1000 Flown Distance"
  )

  # Above 1000 Flown Distance
  X_above_1000_train, X_above_1000_test, y_above_1000_train, y_above_1000_test = process_category_split(
      X_above_1000, y_above_1000, "Above 1000 Flown Distance"
  )

  # Non-B77W
  X_non_b77w_train, X_non_b77w_test, y_non_b77w_train, y_non_b77w_test = process_category_split(
      X_non_b77w, y_non_b77w, "Non-B77W"
  )

  # B77W
  X_b77w_train, X_b77w_test, y_b77w_train, y_b77w_test = process_category_split(
      X_b77w, y_b77w, "B77W"
  )

  # Feature selection for each category
  X_train_selected_0_500, X_test_selected_0_500, selected_features_df_0_500, feature_scores_df_0_500 = process_feature_selection(
      X_0_500_train, y_0_500_train, X_0_500_test, "0-500 Flown Distance"
  )

  X_train_selected_500_1000, X_test_selected_500_1000, selected_features_df_500_1000, feature_scores_df_500_1000 = process_feature_selection(
      X_500_1000_train, y_500_1000_train, X_500_1000_test, "500-1000 Flown Distance"
  )

  X_train_selected_above_1000, X_test_selected_above_1000, selected_features_df_above_1000, feature_scores_df_above_1000 = process_feature_selection(
      X_above_1000_train, y_above_1000_train, X_above_1000_test, "Above 1000 Flown Distance"
  )

  X_train_selected_non_b77w, X_test_selected_non_b77w, selected_features_df_non_b77w, feature_scores_df_non_b77w = process_feature_selection(
      X_non_b77w_train, y_non_b77w_train, X_non_b77w_test, "Non-B77W"
  )

  X_train_selected_b77w, X_test_selected_b77w, selected_features_df_b77w, feature_scores_df_b77w = process_feature_selection(
      X_b77w_train, y_b77w_train, X_b77w_test, "B77W"
  )

### Split the data by train test

In [179]:
def split_train_test(df):
  X = df.drop(['tow'], axis=1)
  y = df[['tow']]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  return X_train, X_test, y_train, y_test


In [180]:
if config.SPLIT_BY_TRAIN_TEST:
  X_train, X_test, y_train, y_test = split_train_test(drop_df)

# Modelling

## Find best model

In [181]:
def select_best_model(
  X_train,
  y_train,
) -> Dict[str, any]:
  """Selects the best regression model using PyCaret's automated model selection with GPU acceleration."""

  train_data = pd.concat([X_train, y_train], axis=1)

  # Setup with GPU acceleration and other useful parameters
  reg = setup(
      data=train_data,
      target="tow",
      session_id=42,
      use_gpu=False,
      fold=3,
      remove_multicollinearity=True,  # Remove multicollinearity
      multicollinearity_threshold=0.95,  # Threshold for multicollinearity
      normalize=True,  # Normalize the data
      transformation=True,
      n_jobs=-1,

  )

  best_model = compare_models(
      n_select=1,
      sort="RMSE",
      fold=3,
      round=4,
      cross_validation=True,
      budget_time=600,
      turbo=False,
      errors='ignore'
  )

  best_model_name = best_model.__class__.__name__
  best_model_params = best_model.get_params()

  return {
      "model": best_model,
      "model_name": best_model_name,
      "params": best_model_params,
  }

In [182]:
if config.FIND_BEST_MODEL:
  model_info = select_best_model(X_train_selected, y_train)
  model_info

## Train one model

### Catboost

In [183]:
def train_catboost_model(X_train, y_train, X_test, y_test, model_name):

  if config.FIND_BEST_PARAMETERS:
      model = CatBoostRegressor(random_state=42)
      param_grid = {
          'iterations': [1000, 2000],
          'learning_rate': [0.01, 0.1],
          'depth': [4, 6, 8]
      }

      grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=3, verbose=100)
      grid_search.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

      best_model = grid_search.best_estimator_
      y_pred = best_model.predict(X_test)
      rmse = np.sqrt(mean_squared_error(y_test, y_pred))
      print(f"CatBoost Model ({model_name}) RMSE: {rmse}")
      print(f"Best Parameters: {grid_search.best_params_}")
      joblib.dump(best_model, config.CATBOOST_MODEL_PATH.format(model_name))
      print(f"CatBoost model ({model_name}) saved to {config.CATBOOST_MODEL_PATH.format(model_name)}")

      feature_importance = pd.DataFrame({
          'feature': X_train.columns,
          'importance': best_model.feature_importances_
      }).sort_values('importance', ascending=False)

  else:
      model = CatBoostRegressor(random_state=42, depth=9, iterations=2000, learning_rate=0.15)
      model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100)
      y_pred = model.predict(X_test)
      rmse = np.sqrt(mean_squared_error(y_test, y_pred))
      print(f"CatBoost Model ({model_name}) RMSE: {rmse}")
      joblib.dump(model, config.CATBOOST_MODEL_PATH.format(model_name))
      print(f"CatBoost model ({model_name}) saved to {config.CATBOOST_MODEL_PATH.format(model_name)}")

      feature_importance = pd.DataFrame({
          'feature': X_train.columns,
          'importance': model.feature_importances_
      }).sort_values('importance', ascending=False)

  print("\nTop 10 Most Important Features:")
  print(feature_importance.head(10))
  return y_pred

#### Train by wtc

In [184]:
if not config.USE_ENSEMBLE and config.SPLIT_BY_WTC:
  print("Training CatBoost Model for wtc_H:")
  train_catboost_model(X_train_selected_H, y_wtc_H_train, X_test_selected_H, y_wtc_H_test, "wtc_H")

  print("\nTraining CatBoost Model for wtc_M:")
  train_catboost_model(X_train_selected_M, y_wtc_M_train, X_test_selected_M, y_wtc_M_test, "wtc_M")

#### Train by MTOW

In [185]:
if not config.USE_ENSEMBLE and config.SPLIT_BY_MTOW and not config.PREDICT_ONLY:
  print("Training CatBoost Model for Very Low MTOW:")
  train_catboost_model(X_train_selected_very_low, y_very_low_train, X_test_selected_very_low, y_very_low_test, "Very Low MTOW")

  print("\nTraining CatBoost Model for Low MTOW:")
  train_catboost_model(X_train_selected_low, y_low_train, X_test_selected_low, y_low_test, "Low MTOW")

  print("\nTraining CatBoost Model for Medium MTOW:")
  train_catboost_model(X_train_selected_medium, y_medium_train, X_test_selected_medium, y_medium_test, "Medium MTOW")

  print("\nTraining CatBoost Model for High MTOW:")
  train_catboost_model(X_train_selected_high, y_high_train, X_test_selected_high, y_high_test, "High MTOW")

  print("\nTraining CatBoost Model for Non-B77W:")
  train_catboost_model(X_train_selected_non_b77w, y_non_b77w_train, X_test_selected_non_b77w, y_non_b77w_test, "Non-B77W")

  print("\nTraining CatBoost Model for B77W:")
  train_catboost_model(X_train_selected_b77w, y_b77w_train, X_test_selected_b77w, y_b77w_test, "B77W")

#### Train by flown distance

In [186]:
if not config.USE_ENSEMBLE and config.SPLIT_BY_FLOWN_DISTANCE:
  print("Training CatBoost Model for 0-500 Flown Distance:")
  train_catboost_model(X_train_selected_0_500, y_0_500_train, X_test_selected_0_500, y_0_500_test, "0-500 Flown Distance")

  print("\nTraining CatBoost Model for 500-1000 Flown Distance:")
  train_catboost_model(X_train_selected_500_1000, y_500_1000_train, X_test_selected_500_1000, y_500_1000_test, "500-1000 Flown Distance")

  print("\nTraining CatBoost Model for Above 1000 Flown Distance:")
  train_catboost_model(X_train_selected_above_1000, y_above_1000_train, X_test_selected_above_1000, y_above_1000_test, "Above 1000 Flown Distance")

  print("\nTraining CatBoost Model for Non-B77W:")
  train_catboost_model(X_train_selected_non_b77w, y_non_b77w_train, X_test_selected_non_b77w, y_non_b77w_test, "Non-B77W")

  print("\nTraining CatBoost Model for B77W:")
  train_catboost_model(X_train_selected_b77w, y_b77w_train, X_test_selected_b77w, y_b77w_test, "B77W")

#### Train by train_test

In [187]:
if not config.USE_ENSEMBLE and config.SPLIT_BY_TRAIN_TEST:
  train_catboost_model(X_train, y_train, X_test, y_test, "Train Test Catboost")

### XGBoost

In [188]:
def train_xgboost_model(X_train, y_train, X_test, y_test, model_name):

  if config.FIND_BEST_PARAMETERS:
      model = XGBRegressor(random_state=42)
      param_grid = {
          'n_estimators': [1000, 2000],
          'learning_rate': [0.01, 0.1],
          'max_depth': [4, 6, 8]
      }

      grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=3, verbose=100)
      grid_search.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

      best_model = grid_search.best_estimator_
      y_pred = best_model.predict(X_test)
      rmse = np.sqrt(mean_squared_error(y_test, y_pred))
      print(f"XGBoost Model ({model_name}) RMSE: {rmse}")
      print(f"Best Parameters: {grid_search.best_params_}")

      # Save the model
      joblib.dump(best_model, config.XGBOOST_MODEL_PATH.format(model_name))
      print(f"XGBoost model ({model_name}) saved to {config.XGBOOST_MODEL_PATH.format(model_name)}")

      feature_importance = pd.DataFrame({
          'feature': X_train.columns,
          'importance': best_model.feature_importances_
      }).sort_values('importance', ascending=False)

  else:
      model = XGBRegressor(random_state=42, learning_rate=0.05, max_depth=8, n_estimators=2000)
      model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)
      y_pred = model.predict(X_test)
      rmse = np.sqrt(mean_squared_error(y_test, y_pred))
      print(f"XGBoost Model ({model_name}) RMSE: {rmse}")

      joblib.dump(model, config.XGBOOST_MODEL_PATH.format(model_name))
      print(f"XGBoost model ({model_name}) saved to {config.XGBOOST_MODEL_PATH.format(model_name)}")

      feature_importance = pd.DataFrame({
          'feature': X_train.columns,
          'importance': model.feature_importances_
      }).sort_values('importance', ascending=False)

  print("\nTop 10 Most Important Features:")
  print(feature_importance.head(10))
  return y_pred

#### Train by wtc

In [189]:
if not config.USE_ENSEMBLE and config.SPLIT_BY_WTC:
  print("Training XGBoost Model for wtc_H:")
  train_xgboost_model(X_train_selected_H, y_wtc_H_train, X_test_selected_H, y_wtc_H_test, "wtc_H")
  print("\nTraining XGBoost Model for wtc_M:")
  train_xgboost_model(X_train_selected_M, y_wtc_M_train, X_test_selected_M, y_wtc_M_test, "wtc_M")

#### Train by MTOW

In [190]:
if not config.USE_ENSEMBLE and config.SPLIT_BY_MTOW and not config.PREDICT_ONLY:
  print("Training XGBoost Model for Very Low MTOW:")
  train_xgboost_model(X_train_selected_very_low, y_very_low_train, X_test_selected_very_low, y_very_low_test, "Very Low MTOW")

  print("\nTraining XGBoost Model for Low MTOW:")
  train_xgboost_model(X_train_selected_low, y_low_train, X_test_selected_low, y_low_test, "Low MTOW")

  print("\nTraining XGBoost Model for Medium MTOW:")
  train_xgboost_model(X_train_selected_medium, y_medium_train, X_test_selected_medium, y_medium_test, "Medium MTOW")

  print("\nTraining XGBoost Model for High MTOW:")
  train_xgboost_model(X_train_selected_high, y_high_train, X_test_selected_high, y_high_test, "High MTOW")

  print("\nTraining XGBoost Model for Non-B77W:")
  train_xgboost_model(X_train_selected_non_b77w, y_non_b77w_train, X_test_selected_non_b77w, y_non_b77w_test, "Non-B77W")

  print("\nTraining XGBoost Model for B77W:")
  train_xgboost_model(X_train_selected_b77w, y_b77w_train, X_test_selected_b77w, y_b77w_test, "B77W")


#### Train by flown distance

In [191]:
if not config.USE_ENSEMBLE and config.SPLIT_BY_FLOWN_DISTANCE:
  print("Training XGBoost Model for 0-500 Flown Distance:")
  train_xgboost_model(X_train_selected_0_500, y_0_500_train, X_test_selected_0_500, y_0_500_test, "0-500 Flown Distance")

  print("\nTraining XGBoost Model for 500-1000 Flown Distance:")
  train_xgboost_model(X_train_selected_500_1000, y_500_1000_train, X_test_selected_500_1000, y_500_1000_test, "500-1000 Flown Distance")

  print("\nTraining XGBoost Model for Above 1000 Flown Distance:")
  train_xgboost_model(X_train_selected_above_1000, y_above_1000_train, X_test_selected_above_1000, y_above_1000_test, "Above 1000 Flown Distance")

  print("\nTraining XGBoost Model for Non-B77W:")
  train_xgboost_model(X_train_selected_non_b77w, y_non_b77w_train, X_test_selected_non_b77w, y_non_b77w_test, "Non-B77W")

  print("\nTraining XGBoost Model for B77W:")
  train_xgboost_model(X_train_selected_b77w, y_b77w_train, X_test_selected_b77w, y_b77w_test, "B77W")

#### Train by train_test

In [192]:
if not config.USE_ENSEMBLE and config.SPLIT_BY_TRAIN_TEST:
  train_catboost_model(X_train, y_train, X_test, y_test, "Train Test XGBoost")

## Ensemble Model

In [193]:
def ensemble_models(X_train, y_train, X_test, y_test, model_name):
  y_pred_catboost = train_catboost_model(X_train, y_train, X_test, y_test, model_name)
  y_pred_xgboost = train_xgboost_model(X_train, y_train, X_test, y_test, model_name)

  y_pred_ensemble = (y_pred_catboost + y_pred_xgboost) / 2
  ensemble_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ensemble))
  print(f"Ensemble Model ({model_name}) RMSE: {ensemble_rmse}")

  feature_importance = pd.DataFrame({
      'feature': X_train.columns,
  }).set_index('feature')

  return y_pred_ensemble

### Ensemble on MTOW split

In [194]:
if config.USE_ENSEMBLE and config.SPLIT_BY_MTOW and not config.PREDICT_ONLY:
  print("Training ensemble_models for Very Low MTOW:")
  y_pred_very_low = ensemble_models(X_train_selected_very_low, y_very_low_train, X_test_selected_very_low, y_very_low_test, "very_low_mtow")

  print("\nTraining ensemble_models for Low MTOW:")
  y_pred_low = ensemble_models(X_train_selected_low, y_low_train, X_test_selected_low, y_low_test, "low_mtow")

  print("\nTraining ensemble_models for Medium MTOW:")
  y_pred_medium = ensemble_models(X_train_selected_medium, y_medium_train, X_test_selected_medium, y_medium_test, "medium_mtow")

  print("\nTraining ensemble_models for High MTOW:")
  y_pred_high = ensemble_models(X_train_selected_high, y_high_train, X_test_selected_high, y_high_test, "high_mtow")

  print("\nTraining ensemble_models for Non-B77W:")
  y_pred_non_b77w = ensemble_models(X_train_selected_non_b77w, y_non_b77w_train, X_test_selected_non_b77w, y_non_b77w_test, "non_b77w")

  print("\nTraining ensemble_models for B77W:")
  y_pred_b77w= ensemble_models(X_train_selected_b77w, y_b77w_train, X_test_selected_b77w, y_b77w_test, "b77w")

### Ensemble on flown distance split

In [195]:
if config.USE_ENSEMBLE and config.SPLIT_BY_FLOWN_DISTANCE:
  print("Training ensemble_models for 0-500 Flown Distance:")
  ensemble_models(X_train_selected_0_500, y_0_500_train, X_test_selected_0_500, y_0_500_test, "0_500_flown_distance")

  print("\nTraining ensemble_models for 500-1000 Flown Distance:")
  ensemble_models(X_train_selected_500_1000, y_500_1000_train, X_test_selected_500_1000, y_500_1000_test, "500_1000_flown_distance")

  print("\nTraining ensemble_models for Above 1000 Flown Distance:")
  ensemble_models(X_train_selected_above_1000, y_above_1000_train, X_test_selected_above_1000, y_above_1000_test, "above_1000_flown_distance")

  print("\nTraining ensemble_models for Non-B77W:")
  ensemble_models(X_train_selected_non_b77w, y_non_b77w_train, X_test_selected_non_b77w, y_non_b77w_test, "non_b77w")

  print("\nTraining ensemble_models for B77W:")
  ensemble_models(X_train_selected_b77w, y_b77w_train, X_test_selected_b77w, y_b77w_test, "b77w")

### Ensemble on train test split

In [196]:
if config.USE_ENSEMBLE and config.SPLIT_BY_TRAIN_TEST:
  print("Training ensemble_models for Train Test:")
  ensemble_models(X_train, y_train, X_test, y_test, "train_test")

# Predicting

In [197]:
def evaluate_model(model_path, X_test, y_test):
  best_model = joblib.load(model_path)
  y_pred = best_model.predict(X_test)
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  print(f"Model RMSE: {rmse}")
  return y_pred


In [198]:
def predict_model(model_path, X_test):
  best_model = joblib.load(model_path)
  y_pred = best_model.predict(X_test)
  return y_pred


In [199]:
def average_and_save_predictions(
      y_pred1: np.ndarray,
      y_pred2: np.ndarray,
      X_test: pd.DataFrame,
      output_dir: str = "results"
  ) -> str:
  os.makedirs(output_dir, exist_ok=True)
  avg_predictions = (y_pred1 + y_pred2) / 2

  results_df = pd.DataFrame({
      'flight_id': X_test.flight_id,
      'tow': avg_predictions
  })

  csv_path = f"/content/drive/MyDrive/PRC/team_nice_jacket_v6_43c9f53d-3900-4d9a-b19f-42b1c388ca71.csv"

  results_df.to_csv(csv_path, index=False)

  print(f"Average predictions saved to {csv_path}")

  print(f"Created file: {csv_path}")

  return csv_path, results_df


In [200]:
def save_predictions(all_y_preds: list[np.ndarray], flight_ids: list, output_dir: str = "results"):
  os.makedirs(output_dir, exist_ok=True)

  y_pred_very_low = np.array(all_y_preds[0])
  y_pred_low = np.array(all_y_preds[1])
  y_pred_medium = np.array(all_y_preds[2])
  y_pred_high = np.array(all_y_preds[3])
  y_pred_non_b77w = np.array(all_y_preds[4])
  y_pred_b77w = np.array(all_y_preds[5])

  total_length = (len(y_pred_very_low) + len(y_pred_low) + len(y_pred_medium) +
                  len(y_pred_high) + len(y_pred_non_b77w) + len(y_pred_b77w))

  avg_predictions = (y_pred_very_low * len(y_pred_very_low) +
                     y_pred_low * len(y_pred_low) +
                     y_pred_medium * len(y_pred_medium) +
                     y_pred_high * len(y_pred_high) +
                     y_pred_non_b77w * len(y_pred_non_b77w) +
                     y_pred_b77w * len(y_pred_b77w)) / total_length

  results_df = pd.DataFrame({
      'flight_id': flight_ids,
      'tow': avg_predictions
  })

  results_df.to_csv(os.path.join(output_dir, 'predictions.csv'), index=False)

In [201]:
def save_predictions(all_results, output_dir="results", output_filename="all_ensemble_preds.csv"):
    os.makedirs(output_dir, exist_ok=True)

    all_results_df = pd.DataFrame(all_results)

    all_results_df.to_csv(os.path.join(output_dir, output_filename), index=False)


In [202]:
_sum = 0
for i in all_x_tests:
  _sum += len(i)
_sum

156567

In [203]:
if config.SPLIT_BY_MTOW and config.PREDICT_ONLY:
    all_x_tests = [
        X_predict_selected_very_low,
        X_predict_selected_low,
        X_predict_selected_medium,
        X_predict_selected_high,
        X_predict_selected_b77w,
        X_predict_selected_non_b77w
    ]

    categories = ['very_low', 'low', 'edium', 'high', 'b77w', 'non_b77w']


    all_ensemble_preds = []

    for (cat, X_test, catboost_path, xgboost_path) in zip(categories, all_x_tests,
                                                           config.TRAINED_CATBOOST_MTOW_MODEL_PATH,
                                                           config.TRAINED_XGBOOST_MTOW_MODEL_PATH):

        y_pred_catboost = predict_model(catboost_path, X_test)
        y_pred_xgboost = predict_model(xgboost_path, X_test)


        if len(y_pred_catboost) == len(y_pred_xgboost):
            ensemble_pred = (y_pred_catboost + y_pred_xgboost) / 2

            flight_ids = X_test['flight_id'].values


            cat_preds_df = pd.DataFrame({
                'flight_id': flight_ids,
                'tow': ensemble_pred,
            })

            all_ensemble_preds.append(cat_preds_df)
        else:
            print(f"Skipping ensemble for {cat} due to shape mismatch: "
                  f"CatBoost shape {len(y_pred_catboost)}, "
                  f"XGBoost shape {len(y_pred_xgboost)}")


    if all_ensemble_preds:
        complete_results_df = pd.concat(all_ensemble_preds, ignore_index=True)
        save_predictions(complete_results_df, output_filename="all_ensemble_predictions.csv")
    else:
        print("No ensemble predictions to save.")


In [204]:
if config.SPLIT_BY_TRAIN_TEST:
  cat_y_pred = predict_model(config.TRAINED_CATBOOST_MODEL_PATH, test_df)
  xg_y_pred = predict_model(config.TRAINED_XGBOOST_MODEL_PATH, test_df)
  path, results_df = average_and_save_predictions(cat_y_pred, xg_y_pred, test_df)
  results_df.tail()