In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import plot_importance
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)

In [2]:
# Name of the flight identifier column (not referenced by the cells visible here).
identifier_name = 'flight_id'

# Feature columns passed to the model as `features_columns`.
# NOTE(review): presumably ordered by importance from an earlier
# feature-selection run — confirm where this ranking came from.
# 'aircraft_type' must stay in this list: evaluate_model_with_limit reads it
# from the test features to look up the per-aircraft limits.
most_important_features_names = [
     'wtc',
     'flown_distance',
     'aircraft_type',
     'longitude_max',
     'altitude_median',
     'groundspeed_max',
     'airline',
     'groundspeed_75percentile',
     'altitude_25percentile',
     'flight_duration',
     'latitude_min',
     'vertical_rate_std',
     'altitude_75percentile',
     'longitude_median',
     'longitude_std',
     'vertical_rate_25percentile',
     'longitude_min',
     'longitude_mean',
     'adep',
     'vertical_rate_max',
     'ades',
     'latitude_std',
     'latitude_max',
     'longitude_25percentile',
     'altitude_mean',
     'latitude_mean',
     'vertical_rate_75percentile',
     'latitude_median',
     'groundspeed_min',
     'country_code_adep',
     'country_code_ades',
     'latitude_25percentile',
     'longitude_count',
     'groundspeed_25percentile',
     'vertical_rate_min',
     'longitude_75percentile',
     'track_75percentile',
     'taxiout_time',
     'track_median',
     'vertical_rate_median',
     'latitude_75percentile',
     'track_25percentile',
     'month_day',
     'latitude_count',
     'altitude_std',
     'arrival_time_hour',
     'track_mean',
     'arrival_time_hour_minute',
     'vertical_rate_mean'
]

# Column holding the regression target.
target_name = 'tow'

# Single seed shared by the KFold splitters and the XGBoost model.
global_random_state = 123

In [3]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    The mean squared error is computed by scikit-learn's
    `mean_squared_error` and its square root is taken with NumPy.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

In [4]:
def evaluate_model(model, data_df, features_columns, target_column):
    """Cross-validate `model` with a shuffled 3-fold split and return the mean RMSE.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
        Refitted in place on every fold, so after the call it holds the fit
        from the last fold.
    data_df : pd.DataFrame
        Frame containing both the feature columns and the target column.
    features_columns : list
        Names of the columns used as model inputs.
    target_column : str
        Name of the column to predict.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    features = data_df[features_columns]
    target = data_df[target_column]
    rmse_scores = []
    kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
    # KFold.split only needs the number of rows, so the DataFrame is passed
    # directly instead of materialising a full `.values` copy of it.
    for k_fold_number, (train_index, test_index) in enumerate(kf.split(features), start=1):
        print("Evaluating k-fold number: ", k_fold_number)
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse_scores.append(rmse(y_test, y_pred))
    return np.mean(rmse_scores)

In [5]:
# Load the challenge set from a path relative to the notebook's working directory.
# NOTE(review): the file name suggests categorical columns are already
# numerically encoded — confirm against the notebook that produced this CSV.
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv')

In [6]:
def calculate_stat(
    dataframe: pd.DataFrame, 
    group_by_column: str, 
    target_column: str,
    stat_type: str
) -> pd.Series | None:
    group = dataframe.groupby(group_by_column)[target_column]
    if stat_type == 'count':
        return group.size()
    elif stat_type == 'mean':
        return group.mean()
    elif stat_type == 'std':
        return group.std()
    elif stat_type == 'min':
        return group.min()
    elif stat_type == '25percentile':
        return group.quantile(0.25)
    elif stat_type == 'median':
        return group.median()
    elif stat_type == '75percentile':
        return group.quantile(0.75)
    elif stat_type == 'max':
        return group.max()
    return None

In [7]:
def summarize_by_aircraft_type(
    data_df: pd.DataFrame,
    stat_types: list | None = None
) -> dict[int, dict]:
    """Collect per-aircraft-type statistics of the 'tow' column.

    Parameters
    ----------
    data_df : pd.DataFrame
        Must contain the 'aircraft_type' and 'tow' columns.
    stat_types : list | None
        Stat names understood by `calculate_stat`. When omitted (or None),
        all eight are computed: 'count', 'mean', 'std', 'min',
        '25percentile', 'median', '75percentile', 'max'.

    Returns
    -------
    dict
        `{aircraft_type: {stat_type: value}}`, with aircraft types inserted
        in sorted order.

    Raises
    ------
    ValueError
        If a requested stat type is not supported by `calculate_stat`.
    """
    # A None sentinel replaces the former list default so that no mutable
    # object is shared between calls.
    if stat_types is None:
        stat_types = ['count', 'mean', 'std', 'min', '25percentile', 'median', '75percentile', 'max']

    stats_by_aircraft_df = {
        aircraft_type: {} for aircraft_type in sorted(data_df['aircraft_type'].unique())
    }

    for stat_type in stat_types:
        stat_by_aircraft = calculate_stat(
            dataframe=data_df,
            group_by_column='aircraft_type',
            target_column='tow',
            stat_type=stat_type
        )
        if stat_by_aircraft is None:
            # calculate_stat signals an unknown stat with None; fail with a
            # clear message instead of "'NoneType' object is not iterable".
            raise ValueError(f"Unsupported stat type: {stat_type!r}")
        for aircraft_type, value in stat_by_aircraft.items():
            stats_by_aircraft_df[aircraft_type][stat_type] = value

    return stats_by_aircraft_df

In [8]:
# Per-aircraft-type statistics of the target, computed over every row of the
# challenge set.
# NOTE(review): because all rows are used, these statistics also cover rows
# that later land in cross-validation test folds.
stats_by_aircraft_df = summarize_by_aircraft_type(
    data_df=encoded_challenge_set[most_important_features_names+[target_name]]
)

# Sanity check: the per-type 'count' values should add up to the row count.
total_count_before_summarization = len(encoded_challenge_set)
print(f"{total_count_before_summarization = }")

total_count_after_summarization = 0
for aircraft_type, aircraft_stats in stats_by_aircraft_df.items():
    total_count_after_summarization += aircraft_stats['count']
print(f"{total_count_after_summarization = }")

total_count_before_summarization = 369013
total_count_after_summarization = 369013


In [9]:
def _clamp_prediction(value, lower, upper):
    """Return `value` limited to the range [`lower`, `upper`].

    `value` is returned unchanged when neither comparison holds, which is
    also what happens when a bound is NaN.
    """
    if value < lower:
        return lower
    if value > upper:
        return upper
    return value


def evaluate_model_with_limit(model, data_df, features_columns, target_column, stats_by_aircraft_df):
    """Cross-validate `model` and score three per-aircraft-type corrections of its predictions.

    A shuffled 3-fold split is used. On each fold the model is refitted and
    the test predictions are scored as-is and after being clamped to three
    different ranges looked up per aircraft type in `stats_by_aircraft_df`.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
        Refitted in place on every fold.
    data_df : pd.DataFrame
        Frame containing the feature columns and the target column.
    features_columns : list
        Model input columns; must include 'aircraft_type'.
    target_column : str
        Name of the column to predict.
    stats_by_aircraft_df : dict
        `{aircraft_type: {stat_name: value}}` providing 'min', 'max',
        '25percentile', '75percentile', 'mean', 'std' and 'median'.

    Returns
    -------
    tuple of 4 floats
        Mean RMSE over the folds for: the raw predictions, the min/max
        clamped predictions, the 25th/75th percentile clamped predictions,
        and the mean/std/median clamped predictions.

    Notes
    -----
    NOTE(review): if `stats_by_aircraft_df` was computed from `data_df`
    itself, the limits are derived from targets that include each test fold,
    so the corrected scores are optimistic. Computing the statistics from
    the training rows of each fold would avoid that.
    """
    features = data_df[features_columns]
    target = data_df[target_column]

    before_rmse_scores = []
    min_max_corrected_rmse_scores = []
    percentile_corrected_rmse_scores = []
    mean_std_median_corrected_rmse_scores = []

    kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
    # KFold.split only needs the number of rows, so the DataFrame is passed
    # directly instead of materialising a full `.values` copy of it.
    for k_fold_number, (train_index, test_index) in enumerate(kf.split(features), start=1):
        print("Evaluating k-fold number: ", k_fold_number)
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        before_rmse_scores.append(rmse(y_test, y_pred))

        min_max_corrected = []
        percentile_corrected = []
        mean_std_median_corrected = []
        for aircraft_type, predicted_value in zip(X_test['aircraft_type'], y_pred):
            aircraft_stats = stats_by_aircraft_df[aircraft_type]

            # Correct for min and max per aircraft type
            min_max_corrected.append(
                _clamp_prediction(predicted_value, aircraft_stats['min'], aircraft_stats['max'])
            )

            # Correct for 25 and 75 percentile per aircraft type
            percentile_corrected.append(
                _clamp_prediction(
                    predicted_value,
                    aircraft_stats['25percentile'],
                    aircraft_stats['75percentile']
                )
            )

            # Correct for mean, std, and median per aircraft type
            # NOTE(review): std is a spread, not a location, so it is not on
            # the same footing as mean and median; whenever std is the
            # smallest of the three the lower bound becomes std itself.
            # Bounds such as mean - std / mean + std may have been intended
            # — confirm before relying on this score.
            mean = aircraft_stats['mean']
            std = aircraft_stats['std']
            median = aircraft_stats['median']
            correction_max = np.max([mean, std, median])
            correction_min = np.min([mean, std, median])
            mean_std_median_corrected.append(
                _clamp_prediction(predicted_value, correction_min, correction_max)
            )

        min_max_corrected_rmse_scores.append(rmse(y_test, np.array(min_max_corrected)))
        percentile_corrected_rmse_scores.append(rmse(y_test, np.array(percentile_corrected)))
        mean_std_median_corrected_rmse_scores.append(rmse(y_test, np.array(mean_std_median_corrected)))

    return (
        np.mean(before_rmse_scores),
        np.mean(min_max_corrected_rmse_scores),
        np.mean(percentile_corrected_rmse_scores),
        np.mean(mean_std_median_corrected_rmse_scores),
    )
    

In [10]:
# XGBoost regressor with fixed hyperparameters, seeded with the shared seed.
# NOTE(review): the origin of these hyperparameter values is not shown in
# this notebook — presumably an earlier tuning run; confirm.
xgb_model = xgb.XGBRegressor(
    max_depth=9,
    n_estimators=3000,
    learning_rate=0.1,
    subsample=1.0,
    colsample_bytree=0.8,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)

# Each returned value is a single mean RMSE over the folds, despite the
# plural "_scores" in the variable names.
before_rmse_scores, min_max_corrected_rmse_scores, percentile_corrected_rmse_scores, mean_std_median_corrected_rmse_scores = evaluate_model_with_limit(
    model=xgb_model, 
    data_df=encoded_challenge_set, 
    features_columns=most_important_features_names, 
    target_column=target_name,
    stats_by_aircraft_df=stats_by_aircraft_df
)

Evaluating k-fold number:  1
Evaluating k-fold number:  2
Evaluating k-fold number:  3


In [11]:
# Report the mean RMSE of the raw predictions and of each corrected variant.
print(f"{before_rmse_scores = }")
print(f"{min_max_corrected_rmse_scores = }")
print(f"{percentile_corrected_rmse_scores = }")
print(f"{mean_std_median_corrected_rmse_scores = }")

before_rmse_scores = 2771.52439347872
min_max_corrected_rmse_scores = 2759.0006563240963
percentile_corrected_rmse_scores = 5386.340891878303
mean_std_median_corrected_rmse_scores = 8215.151384350223
