In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import plot_importance
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)

In [2]:
identifier_name = 'flight_id'

flight_features_names = [
    'wtc', 
    'flown_distance', 
    'aircraft_type', 
    'airline',
    'flight_duration',
    'ades', 
    'adep',
    'country_code_adep',
    'country_code_ades',
    'month_day',
    'actual_offblock_time_hour',
]  
 
trajectory_features_names = [    
    'groundspeed_max', 
    'groundspeed_75percentile', 
    'altitude_25percentile', 
    'longitude_max', 
    'latitude_min',  
    'vertical_rate_std', 
    'altitude_median', 
    'longitude_mean', 
    'altitude_75percentile', 
    'longitude_std', 
    'latitude_max', 
    'latitude_std', 
    'vertical_rate_max', 
    'vertical_rate_25percentile', 
    'latitude_mean', 
    'longitude_min', 
    'longitude_25percentile', 
    'vertical_rate_75percentile', 
    'latitude_median', 
    'altitude_mean', 
    'latitude_25percentile', 
    'groundspeed_min',
    'longitude_75percentile',
    'vertical_rate_median'
]

target_name = 'tow'

global_random_state = 123

In [3]:
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv')

flight_features = encoded_challenge_set[flight_features_names].values
print(f"{len(flight_features) = }")

trajectory_features = encoded_challenge_set[trajectory_features_names].values
print(f"{len(trajectory_features) = }")

target = encoded_challenge_set[[target_name]].values
print(f"{len(target) = }")

len(flight_features) = 369013
len(trajectory_features) = 369013
len(target) = 369013


In [4]:
# Function to score model using Root Mean Square Error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
    
# Create a scorer
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Make cross validation strategy
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=global_random_state)

In [5]:
fold_num = 1

flight_rmse_scores = []
trajectory_rmse_scores = []
for train_index, test_index in cv_strategy.split(flight_features):
    print(f"Running fold number: {fold_num}")
    y_train, y_test = target[train_index], target[test_index]

    print("Running XGBoost with flight info only")
    flight_X_train, flight_X_test = flight_features[train_index], flight_features[test_index]
    flight_model = xgb.XGBRegressor(
        objective='reg:squarederror', 
        eval_metric='rmse',
        random_state=global_random_state,
        n_estimators=3000,
        colsample_bytree=1.0,
        learning_rate=0.1, 
        max_depth=10, 
        subsample=1.0
    )
    flight_model.fit(flight_X_train, y_train)
    flight_y_pred = flight_model.predict(flight_X_test)
    flight_rmse_score = rmse(y_test, flight_y_pred)
    flight_rmse_scores.append(flight_rmse_score)

    print("Running XGBoost with trajectory info only")
    trajectory_X_train, trajectory_X_test = trajectory_features[train_index], trajectory_features[test_index]
    trajectory_model = xgb.XGBRegressor(
        objective='reg:squarederror', 
        eval_metric='rmse',
        random_state=global_random_state,
        n_estimators=3000,
        colsample_bytree=1.0,
        learning_rate=0.1, 
        max_depth=10, 
        subsample=1.0
    )
    trajectory_model.fit(trajectory_X_train, y_train)
    trajectory_y_pred = trajectory_model.predict(trajectory_X_test)
    trajectory_rmse_score = rmse(y_test, trajectory_y_pred)
    trajectory_rmse_scores.append(trajectory_rmse_score)


    fold_num += 1

print("Average RMSE Scores with flight info only:", np.mean(trajectory_rmse_scores))
print("Average RMSE Scores with trajectory info only:", np.mean(trajectory_rmse_scores))

Running fold number: 1
Running XGBoost with flight info only
Running XGBoost with trajectory info only
Running fold number: 2
Running XGBoost with flight info only
Running XGBoost with trajectory info only
Running fold number: 3
Running XGBoost with flight info only
Running XGBoost with trajectory info only
Running fold number: 4
Running XGBoost with flight info only
Running XGBoost with trajectory info only
Running fold number: 5
Running XGBoost with flight info only
Running XGBoost with trajectory info only
Average RMSE Scores with flight info only: 11163.645891886506
Average RMSE Scores with trajectory info only: 11163.645891886506
