In [1]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
# Read the encoded challenge set from its CSV file and render it as the cell output.
challenge_set_csv = 'Data Processing/encoded_challenge_set.csv'
encoded_challenge_set = pd.read_csv(challenge_set_csv)
display(encoded_challenge_set)

Unnamed: 0,flight_id,month_day,month,day,actual_offblock_time_hour_minute,actual_offblock_time_hour,actual_offblock_time_minute,adep,country_code_adep,arrival_time_hour_minute,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
0,248763780,0,1,1,826,13,46,69,37,904,...,268.526083,286.978764,3614,0.001527,0.002250,0.000019,0.000022,0.000103,0.002845,0.008063
1,248760618,0,1,1,595,9,55,216,33,1177,...,221.683289,298.023307,12450,0.000523,0.002016,0.000017,0.000028,0.000040,0.000075,0.012768
2,248753824,0,1,1,579,9,39,137,88,1148,...,237.352948,272.897641,14324,0.000467,0.000943,0.000004,0.000017,0.000022,0.000160,0.003214
3,248753852,0,1,1,664,11,4,338,19,1172,...,233.394513,286.518770,18044,0.000931,0.002149,0.000004,0.000011,0.000020,0.000244,0.008750
4,248755934,0,1,1,756,12,36,92,44,824,...,270.592638,285.461825,577,0.001545,0.002045,0.000052,0.000088,0.000669,0.002149,0.006452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,364,12,31,578,9,38,253,36,1143,...,229.386113,297.970672,7125,0.001169,0.002870,0.000041,0.000045,0.000052,0.000169,0.014661
369009,258071247,364,12,31,567,9,27,385,96,749,...,217.291206,289.822850,13146,0.000440,0.001517,0.000010,0.000032,0.000040,0.000046,0.008234
369010,258059152,364,12,31,592,9,52,30,26,701,...,252.439682,289.316079,8489,0.001106,0.001981,0.000018,0.000019,0.000026,0.001182,0.008736
369011,258072276,364,12,31,577,9,37,253,36,673,...,235.684174,281.311973,5094,0.000713,0.001509,0.000012,0.000021,0.000028,0.000263,0.006041


In [3]:
# Name of the column that identifies a flight.
identifier_name = 'flight_id'

# Columns that are listed individually (one column per name).
per_flight_feature_names = [
    'month_day', 'month', 'day',
    'actual_offblock_time_hour_minute',
    'actual_offblock_time_hour',
    'actual_offblock_time_minute',
    'adep', 'country_code_adep',
    'arrival_time_hour_minute',
    'arrival_time_hour',
    'arrival_time_minute',
    'ades', 'country_code_ades',
    'aircraft_type', 'wtc', 'airline',
    'flight_duration', 'taxiout_time', 'flown_distance',
]

# Quantities that each contribute one column per summary statistic,
# named '<quantity>_<statistic>'.
aggregated_quantities = [
    'latitude', 'longitude', 'altitude', 'groundspeed',
    'track', 'vertical_rate', 'track_unwrapped',
    'u_component_of_wind', 'v_component_of_wind',
    'temperature', 'specific_humidity',
]
summary_statistics = [
    'count', 'mean', 'std', 'min',
    '25percentile', 'median', '75percentile', 'max',
]

# Full ordered feature list: the individual columns first, then every
# quantity expanded over all statistics (quantity-major order).
features_names = per_flight_feature_names + [
    f'{quantity}_{statistic}'
    for quantity in aggregated_quantities
    for statistic in summary_statistics
]

# Name of the column selected as the prediction target.
target_name = 'tow'

# Seed passed to the randomised components (model and fold shuffling).
global_random_state = 123

# Split the loaded table into the target column and the feature columns.
target = encoded_challenge_set.loc[:, target_name]
features = encoded_challenge_set.loc[:, features_names]

In [4]:
# 3-fold cross-validation of an XGBoost regressor, reporting per-fold and mean RMSE.
X = features.values
y = target.values

# Full-data scaling is kept only so that `scaler` and `X_scaled` stay defined
# exactly as before for any other cell that reads them. The cross-validation
# loop below does NOT use them, because a scaler fitted on all rows has
# already seen the held-out fold.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=global_random_state
)

kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)

rmse_scores = []

# The folds are index-based, so splitting the raw matrix yields the same
# partitions as splitting the scaled one.
for train_index, test_index in kf.split(X):
    X_train_raw, X_test_raw = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training fold only, then apply the same
    # transformation to the held-out fold, so no test statistics leak in.
    fold_scaler = StandardScaler().fit(X_train_raw)
    X_train = fold_scaler.transform(X_train_raw)
    X_test = fold_scaler.transform(X_test_raw)

    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)

    # RMSE on the held-out fold.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

# Calculate average RMSE across folds
avg_rmse = np.mean(rmse_scores)

print(f"RMSE scores for each fold: {rmse_scores}")
print(f"Average RMSE: {avg_rmse}")

RMSE scores for each fold: [3584.2610776865295, 3544.429760224991, 3557.027977983379]
Average RMSE: 3561.9062719649664
