In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import plot_importance
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np
import itertools

# Lift pandas' display truncation: a value of None removes the limit on the
# number of rows / columns shown when a frame is rendered.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [2]:
# Name of the identifier column.
# NOTE(review): not referenced by any cell visible in this notebook excerpt.
identifier_name = "flight_id"

# Candidate feature columns; every non-empty subset of this list is evaluated
# by the exhaustive search cell further down. The order is kept as-is because
# it determines the order in which combinations are generated.
most_important_features_names = [
    "wtc",
    "aircraft_type",
    "flown_distance",
    "groundspeed_max",
    "latitude_min",
    "altitude_25percentile",
    "airline",
    "flight_duration",
    "longitude_max",
    "vertical_rate_75percentile",
    "altitude_median",
    "ades",
]

# Column the regressor is trained to predict.
target_name = "tow"

# Single seed shared by the K-fold shuffling and the XGBoost models.
global_random_state = 123

In [3]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of scikit-learn's `mean_squared_error`.
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

In [4]:
def evaluate_model(model, data_df, features_columns, target_column, n_splits=3):
    """Return the mean cross-validated RMSE of `model` on `data_df`.

    The rows of `data_df` are partitioned with a shuffled `KFold` seeded by
    the module-level `global_random_state`. For each fold, `model.fit` is
    called on the training rows and the predictions for the held-out rows
    are scored with `rmse`.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
        The same object is fitted once per fold, so the caller's instance is
        modified by this function.
    data_df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    features_columns : list of str
        Column labels of `data_df` used as model inputs.
    target_column : str
        Column label of `data_df` holding the value to predict.
    n_splits : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    The arithmetic mean of the per-fold RMSE values.
    """
    features = data_df[features_columns]
    target = data_df[target_column]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=global_random_state)

    rmse_scores = []
    # KFold only needs the number of samples, so the frame is passed directly
    # instead of materialising intermediate NumPy copies of the data.
    for train_index, test_index in kf.split(features):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse_scores.append(rmse(y_test, y_pred))
    return np.mean(rmse_scores)

In [5]:
# Load the encoded challenge set and replace every missing value with 0 in a
# single chained expression, so the name is bound only once.
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv').fillna(0)

In [6]:
# Exhaustive feature-subset search: train and cross-validate one XGBoost
# regressor for every non-empty subset of `most_important_features_names`.
num_top_important_features = len(most_important_features_names)
# 2**n subsets exist in total, but the empty one is skipped below, hence the -1.
print(f"We need to build {2**num_top_important_features - 1} models: trying all non-empty combinations of the top {num_top_important_features} important features")

# Subsets of every size r = 0..n, in increasing size; the first entry
# (r = 0) is the empty tuple and is included in the count printed here.
all_combinations = list(itertools.chain.from_iterable(
    itertools.combinations(most_important_features_names, r) for r in range(num_top_important_features + 1)
))
print(f"Total number of generated combinations: {len(all_combinations)}")

# The hyperparameters are the same for every candidate subset, so they are
# defined once here; a fresh model is still created per combination.
xgb_params = dict(
    n_estimators=91,
    max_depth=10,
    learning_rate=0.1,
    subsample=1.0,
    colsample_bytree=1.0,
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=global_random_state,
)

# Parallel lists: tried_combination[i] produced combinations_rmse_scores[i].
tried_combination = []
combinations_rmse_scores = []
# [1:] skips the empty combination at position 0.
for index, combination in enumerate(all_combinations[1:]):
    current_features = list(combination)

    print(f"Trying the combination number {index+1}...")
    print(current_features)

    xgb_model = xgb.XGBRegressor(**xgb_params)

    rmse_score = evaluate_model(
        model=xgb_model,
        data_df=encoded_challenge_set,
        features_columns=current_features,
        target_column=target_name
    )
    print("Evaluation RMSE:", rmse_score)

    tried_combination.append(current_features)
    combinations_rmse_scores.append(rmse_score)

We need to build 4096 Trying all combinations of the top 12 important features
Total number of generated combinations: 4096
Trying the combination number 1...
['wtc']
Evaluation RMSE: 20350.17311080975
Trying the combination number 2...
['aircraft_type']
Evaluation RMSE: 10827.753782628288
Trying the combination number 3...
['flown_distance']
Evaluation RMSE: 27477.065008386544
Trying the combination number 4...
['groundspeed_max']
Evaluation RMSE: 45284.66483403879
Trying the combination number 5...
['latitude_min']
Evaluation RMSE: 36605.25314532768
Trying the combination number 6...
['altitude_25percentile']
Evaluation RMSE: 44163.29536818614
Trying the combination number 7...
['airline']
Evaluation RMSE: 43931.55855543888
Trying the combination number 8...
['flight_duration']
Evaluation RMSE: 28424.577113701594
Trying the combination number 9...
['longitude_max']
Evaluation RMSE: 45191.24966573509
Trying the combination number 10...
['vertical_rate_75percentile']
Evaluation RMSE: 4

In [7]:
# Position of the lowest cross-validated RMSE among all evaluated combinations.
lowest_rmse_index = np.argmin(combinations_rmse_scores)
print("Minimum RMSE: ", combinations_rmse_scores[lowest_rmse_index])

Minimum RMSE:  3521.641592899745


In [8]:
print("Top features: ")
# The score list and the combination list are index-aligned, so the argmin of
# the scores selects the feature subset that achieved the lowest RMSE.
best_combination_index = np.argmin(combinations_rmse_scores)
top_features_with_lowest_rmse = tried_combination[best_combination_index]
print(top_features_with_lowest_rmse)

Top features: 
['wtc', 'aircraft_type', 'flown_distance', 'latitude_min', 'altitude_25percentile', 'airline', 'flight_duration', 'longitude_max', 'vertical_rate_75percentile', 'altitude_median', 'ades']


In [9]:
# Size of the best-scoring feature subset selected in the previous cell.
num_features_in_best_combination = len(top_features_with_lowest_rmse)
print("Number of top features giving minimum RMSE: ", num_features_in_best_combination)

Number of top features giving minimum RMSE:  11
